From 8ef0837ce91e5f840225037d39d1c2428280e542 Mon Sep 17 00:00:00 2001 From: Steffen Moeller Date: Mon, 10 Aug 2020 16:01:48 +0200 Subject: Import daligner_1.0+git20200727.ed40ce5.orig.tar.xz [dgit import orig daligner_1.0+git20200727.ed40ce5.orig.tar.xz] --- DB.c | 2902 ++++++++++++++++++++++++++++++ DB.h | 728 ++++++++ HPC.daligner.c | 1159 ++++++++++++ LAa2b.c | 104 ++ LAb2a.c | 107 ++ LAcat.c | 199 +++ LAcheck.c | 397 +++++ LAdump.c | 507 ++++++ LAmerge.c | 524 ++++++ LAshow.c | 650 +++++++ LAsort.c | 413 +++++ LAsplit.c | 229 +++ LICENSE | 34 + Makefile | 58 + QV.c | 1481 +++++++++++++++ QV.h | 99 + README.md | 536 ++++++ align.c | 5453 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ align.h | 377 ++++ daligner.c | 758 ++++++++ dumpLA.c | 177 ++ filter.c | 2677 ++++++++++++++++++++++++++++ filter.h | 39 + lsd.sort.c | 268 +++ lsd.sort.h | 8 + 25 files changed, 19884 insertions(+) create mode 100644 DB.c create mode 100644 DB.h create mode 100644 HPC.daligner.c create mode 100644 LAa2b.c create mode 100644 LAb2a.c create mode 100644 LAcat.c create mode 100644 LAcheck.c create mode 100644 LAdump.c create mode 100644 LAmerge.c create mode 100644 LAshow.c create mode 100644 LAsort.c create mode 100644 LAsplit.c create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 QV.c create mode 100644 QV.h create mode 100644 README.md create mode 100644 align.c create mode 100644 align.h create mode 100644 daligner.c create mode 100644 dumpLA.c create mode 100644 filter.c create mode 100644 filter.h create mode 100644 lsd.sort.c create mode 100644 lsd.sort.h diff --git a/DB.c b/DB.c new file mode 100644 index 0000000..e519ea7 --- /dev/null +++ b/DB.c @@ -0,0 +1,2902 @@ +/******************************************************************************************* + * + * Compressed data base module. 
Auxiliary routines to open and manipulate a data base for + * which the sequence and read information are separated into two separate files, and the + * sequence is compressed into 2-bits for each base. Support for tracks of additional + * information, and trimming according to the current partition. + * + * Author : Gene Myers + * Date : July 2013 + * Revised: April 2014 + * + ********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" + +#ifdef HIDE_FILES +#define PATHSEP "/." +#else +#define PATHSEP "/" +#endif + + +/******************************************************************************************* + * + * GENERAL UTILITIES + * + ********************************************************************************************/ + +char *Prog_Name; + +#ifdef INTERACTIVE + +char Ebuffer[1000]; + +#endif + +int Count_Args(char *var) +{ int cnt, lev; + char *s; + + cnt = 1; + lev = 0; + for (s = var; *s != '\0'; s++) + if (*s == ',') + { if (lev == 0) + cnt += 1; + } + else if (*s == '(') + lev += 1; + else if (*s == ')') + lev -= 1; + return (cnt); +} + +void *Malloc(int64 size, char *mesg) +{ void *p; + + if ((p = malloc(size)) == NULL) + { if (mesg == NULL) + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); + else + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); + } + return (p); +} + +void *Realloc(void *p, int64 size, char *mesg) +{ if (size <= 0) + size = 1; + if ((p = realloc(p,size)) == NULL) + { if (mesg == NULL) + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); + else + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); + } + return (p); +} + +char *Strdup(char *name, char *mesg) +{ char *s; + + if (name == NULL) + return (NULL); + if ((s = strdup(name)) == NULL) + { if (mesg == NULL) + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); + else + EPRINTF(EPLACE,"%s: Out of memory 
(%s)\n",Prog_Name,mesg); + } + return (s); +} + +FILE *Fopen(char *name, char *mode) +{ FILE *f; + + if (name == NULL || mode == NULL) + return (NULL); + if ((f = fopen(name,mode)) == NULL) + EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); + return (f); +} + +char *PathTo(char *name) +{ char *path, *find; + + if (name == NULL) + return (NULL); + if ((find = rindex(name,'/')) != NULL) + { *find = '\0'; + path = Strdup(name,"Extracting path from"); + *find = '/'; + } + else + path = Strdup(".","Allocating default path"); + return (path); +} + +char *Root(char *name, char *suffix) +{ char *path, *find, *dot; + int epos; + + if (name == NULL) + return (NULL); + find = rindex(name,'/'); + if (find == NULL) + find = name; + else + find += 1; + if (suffix == NULL) + { dot = strchr(find,'.'); + if (dot != NULL) + *dot = '\0'; + path = Strdup(find,"Extracting root from"); + if (dot != NULL) + *dot = '.'; + } + else + { epos = strlen(find); + epos -= strlen(suffix); + if (epos > 0 && strcasecmp(find+epos,suffix) == 0) + { find[epos] = '\0'; + path = Strdup(find,"Extracting root from"); + find[epos] = suffix[0]; + } + else + path = Strdup(find,"Allocating root"); + } + return (path); +} + +char *Catenate(char *path, char *sep, char *root, char *suffix) +{ static char *cat = NULL; + static int max = -1; + int len; + + if (path == NULL || root == NULL || sep == NULL || suffix == NULL) + return (NULL); + len = strlen(path); + len += strlen(sep); + len += strlen(root); + len += strlen(suffix); + if (len > max) + { max = ((int) (1.2*len)) + 100; + cat = (char *) realloc(cat,max+1); + if (cat == NULL) + { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); + return (NULL); + } + } + sprintf(cat,"%s%s%s%s",path,sep,root,suffix); + return (cat); +} + +char *Numbered_Suffix(char *left, int num, char *right) +{ static char *sfx = NULL; + static int max = -1; + int len; + + if (left == NULL || right == NULL) + return (NULL); + len = 
strlen(left); + len += strlen(right) + 40; + if (len > max) + { max = ((int) (1.2*len)) + 100; + sfx = (char *) realloc(sfx,max+1); + if (sfx == NULL) + { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); + return (NULL); + } + } + sprintf(sfx,"%s%d%s",left,num,right); + return (sfx); +} + +static char *MyCatenate(char *path, char *sep, char *root, char *suffix) +{ static char *cat = NULL; + static int max = -1; + int len; + + if (path == NULL || root == NULL || sep == NULL || suffix == NULL) + return (NULL); + len = strlen(path); + len += strlen(sep); + len += strlen(root); + len += strlen(suffix); + if (len > max) + { max = ((int) (1.2*len)) + 100; + cat = (char *) realloc(cat,max+1); + if (cat == NULL) + { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); + return (NULL); + } + } + sprintf(cat,"%s%s%s%s",path,sep,root,suffix); + return (cat); +} + +static char *MyNumbered_Suffix(char *left, int num, char *right) +{ static char *sfx = NULL; + static int max = -1; + int len; + + if (left == NULL || right == NULL) + return (NULL); + len = strlen(left); + len += strlen(right) + 40; + if (len > max) + { max = ((int) (1.2*len)) + 100; + sfx = (char *) realloc(sfx,max+1); + if (sfx == NULL) + { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); + return (NULL); + } + } + sprintf(sfx,"%s%d%s",left,num,right); + return (sfx); +} + + +#define COMMA ',' + +// Print big integers with commas/periods for better readability + +void Print_Number(int64 num, int width, FILE *out) +{ if (width == 0) + { if (num < 1000ll) + fprintf(out,"%lld",num); + else if (num < 1000000ll) + fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); + else if (num < 1000000000ll) + fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, + COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); + else + fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, + COMMA,(num%1000000000ll)/1000000ll, + 
COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); + } + else + { if (num < 1000ll) + fprintf(out,"%*lld",width,num); + else if (num < 1000000ll) + { if (width <= 4) + fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); + else + fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); + } + else if (num < 1000000000ll) + { if (width <= 8) + fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, + COMMA,num%1000ll); + else + fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, + COMMA,num%1000ll); + } + else + { if (width <= 12) + fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, + (num%1000000000ll)/1000000ll,COMMA, + (num%1000000ll)/1000ll,COMMA,num%1000ll); + else + fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, + (num%1000000000ll)/1000000ll,COMMA, + (num%1000000ll)/1000ll,COMMA,num%1000ll); + } + } +} + +// Return the number of digits, base 10, of num + +int Number_Digits(int64 num) +{ int digit; + + digit = 0; + while (num >= 1) + { num /= 10; + digit += 1; + } + return (digit); +} + + +/******************************************************************************************* + * + * READ COMPRESSION/DECOMPRESSION UTILITIES + * + ********************************************************************************************/ + +// Compress read into 2-bits per base (from [0-3] per byte representation + +void Compress_Read(int len, char *s) +{ int i; + char c, d; + char *s0, *s1, *s2, *s3; + + s0 = s; + s1 = s0+1; + s2 = s1+1; + s3 = s2+1; + + c = s1[len]; + d = s2[len]; + s0[len] = s1[len] = s2[len] = 0; + + for (i = 0; i < len; i += 4) + *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); + + s1[len] = c; + s2[len] = d; +} + +// Uncompress read form 2-bits per base into [0-3] per byte representation + +void Uncompress_Read(int len, char *s) +{ int i, tlen, byte; + char *s0, *s1, *s2, *s3; + char *t; + + s0 = s; + s1 = s0+1; + s2 
= s1+1; + s3 = s2+1; + + tlen = (len-1)/4; + + t = s+tlen; + for (i = tlen*4; i >= 0; i -= 4) + { byte = *t--; + s0[i] = (char) ((byte >> 6) & 0x3); + s1[i] = (char) ((byte >> 4) & 0x3); + s2[i] = (char) ((byte >> 2) & 0x3); + s3[i] = (char) (byte & 0x3); + } + s[len] = 4; +} + +// Convert read in [0-3] representation to ascii representation (end with '\n') + +void Lower_Read(char *s) +{ static char letter[4] = { 'a', 'c', 'g', 't' }; + + for ( ; *s != 4; s++) + *s = letter[(int) *s]; + *s = '\0'; +} + +void Upper_Read(char *s) +{ static char letter[4] = { 'A', 'C', 'G', 'T' }; + + for ( ; *s != 4; s++) + *s = letter[(int) *s]; + *s = '\0'; +} + +void Letter_Arrow(char *s) +{ static char letter[4] = { '1', '2', '3', '4' }; + + for ( ; *s != 4; s++) + *s = letter[(int) *s]; + *s = '\0'; +} + +// Convert read in ascii representation to [0-3] representation (end with 4) + +void Number_Read(char *s) +{ static char number[128] = + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + + for ( ; *s != '\0'; s++) + *s = number[(int) *s]; + *s = 4; +} + +void Number_Arrow(char *s) +{ static char arrow[128] = + { 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 0, 1, 2, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + }; + + for ( ; *s != '\0'; s++) + *s = arrow[(int) *s]; + *s = 4; +} + +void Change_Read(char 
*s) +{ static char change[128] = + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 'a', 0, 'c', 0, 0, 0, 'g', + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 't', 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 'A', 0, 'C', 0, 0, 0, 'G', + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 'T', 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + + for ( ; *s != '\0'; s++) + *s = change[(int) *s]; +} + + +/******************************************************************************************* + * + * DB STUB HANDLING ROUTINES + * + ********************************************************************************************/ + + // Read the contents of the DB stub file at "path" and return it encoded in a DAZZ_STUB + // structure. This is allocated by the routine. "path" is assumed to be the complete + // name of the file. + +DAZZ_STUB *Read_DB_Stub(char *path, int what) +{ FILE *dbfile; + DAZZ_STUB *stub; + + char buf1[MAX_NAME+100]; + char buf2[MAX_NAME+100]; + int nread; + + int i; + int nfiles; + int nblocks; + int64 size; + int all, cutoff; + + dbfile = Fopen(path,"r"); + if (dbfile == NULL) + EXIT(NULL); + + stub = Malloc(sizeof(DAZZ_STUB),"Allocating DB stub record"); + if (stub == NULL) + EXIT(NULL); + + stub->nreads = NULL; + stub->fname = NULL; + stub->prolog = NULL; + stub->ublocks = NULL; + stub->tblocks = NULL; + + if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) + goto stub_trash; + + if (what & DB_STUB_NREADS) + { stub->nreads = (int *) Malloc(sizeof(int)*(nfiles+1),"Allocating DB stub record"); + if (stub->nreads == NULL) + goto stub_error; + stub->nreads += 1; + } + + if (what & DB_STUB_FILES) + { stub->fname = (char **) Malloc(sizeof(char *)*(nfiles+1),"Allocating DB stub record"); + if (stub->fname == NULL) + goto stub_error; + stub->fname += 1; + + stub->nfiles = nfiles; + for (i = 0; i < nfiles; i++) + stub->fname[i] = NULL; 
+ } + + if (what & DB_STUB_PROLOGS) + { stub->prolog = (char **) Malloc(sizeof(char *)*(nfiles+1),"Allocating DB stub record"); + if (stub->prolog == NULL) + goto stub_error; + stub->prolog += 1; + + for (i = 0; i < nfiles; i++) + stub->prolog[i] = NULL; + } + + for (i = 0; i < nfiles; i++) + { if (fscanf(dbfile,DB_FDATA,&nread,buf1,buf2) != 3) + goto stub_trash; + if (what & DB_STUB_NREADS) + stub->nreads[i] = nread; + if (what & DB_STUB_FILES) + { stub->fname[i] = Strdup(buf1,"Alloacting DB stub record"); + if (stub->fname[i] == NULL) + goto stub_error; + } + if (what & DB_STUB_PROLOGS) + { stub->prolog[i] = Strdup(buf2,"Alloacting DB stub record"); + if (stub->prolog[i] == NULL) + goto stub_error; + } + } + + if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) + goto stub_trash; + + if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) + goto stub_trash; + + if (what & DB_STUB_BLOCKS) + { stub->ublocks = (int *) Malloc(sizeof(int)*(nblocks+1),"Allocating DB stub record"); + stub->tblocks = (int *) Malloc(sizeof(int)*(nblocks+1),"Allocating DB stub record"); + if (stub->ublocks == NULL || stub->tblocks == NULL) + goto stub_error; + + for (i = 0; i <= nblocks; i++) + if (fscanf(dbfile,DB_BDATA,stub->ublocks+i,stub->tblocks+i) != 2) + goto stub_trash; + } + + fclose(dbfile); + + stub->nfiles = nfiles; + stub->all = all; + stub->cutoff = cutoff; + stub->bsize = size; + stub->nblocks = nblocks; + return (stub); + +stub_trash: + EPRINTF(EPLACE,"%s: Stub file %s is junk\n",Prog_Name,path); +stub_error: + Free_DB_Stub(stub); + EXIT(NULL); +} + + // Read the DB stub file "path" and extract the read index range [*first,*last) + // for block n, for the trimmed DB if trim is set, the untrimmed DB otherwise. 
+ +int Fetch_Block_Range(char *path, int trim, int n, int *first, int *last) +{ FILE *dbfile; + char buffer[2*MAX_NAME+100]; + int nfiles; + int nblocks; + int64 size; + int all, cutoff; + int tfirst, tlast; + int ufirst, ulast; + int i; + + dbfile = Fopen(path,"r"); + if (dbfile == NULL) + EXIT(1); + if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) + goto stub_error; + for (i = 0; i < nfiles; i++) + if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) + goto stub_error; + if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) + goto stub_error; + + if (n < 0 || n >= nblocks) + { *first = *last = -1; + return (0); + } + + if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) + goto stub_error; + for (i = 1; i <= n; i++) + if (fscanf(dbfile,DB_BDATA,&ufirst,&tfirst) != 2) + goto stub_error; + if (fscanf(dbfile,DB_BDATA,&ulast,&tlast) != 2) + goto stub_error; + fclose(dbfile); + + if (trim) + { *first = tfirst; + *last = tlast; + } + else + { *first = ufirst; + *last = ulast; + } + + return (0); + +stub_error: + EPRINTF(EPLACE,"%s: Stub file %s is junk\n",Prog_Name,path); + EXIT(1); +} + + // Free a DAZZ_STUB data structure returned by Read_DB_Stub + +void Free_DB_Stub(DAZZ_STUB *stub) +{ int i; + + if (stub == NULL) + return; + if (stub->fname != NULL) + { for (i = 0; i < stub->nfiles; i++) + free(stub->fname[i]); + free(stub->fname-1); + } + if (stub->prolog != NULL) + { for (i = 0; i < stub->nfiles; i++) + free(stub->prolog[i]); + free(stub->prolog-1); + } + if (stub->nreads != NULL) + free(stub->nreads-1); + free(stub->ublocks); + free(stub->tblocks); + free(stub); +} + + +/******************************************************************************************* + * + * DB OPEN, TRIM, SIZE_OF, LIST_FILES & CLOSE ROUTINES + * + ********************************************************************************************/ + + +// Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has +// a part # in it then just the part is opened. 
The index array is allocated (for all or +// just the part) and read in. +// Return status of routine: +// -1: The DB could not be opened for a reason reported by the routine to EPLACE +// 0: Open of DB proceeded without mishap +// 1: Open of DAM proceeded without mishap + +static char *atrack_name = ".@arw"; +static char *qtrack_name = ".@qvs"; + +int Open_DB(char* path, DAZZ_DB *db) +{ DAZZ_DB dbcopy; + char *root, *pwd, *bptr, *fptr, *cat; + int nreads; + FILE *index, *dbvis, *bases; + int status, plen, isdam; + int part, cutoff, all; + int ufirst, tfirst, ulast, tlast; + + status = -1; + dbcopy = *db; + + plen = strlen(path); + if (strcmp(path+(plen-4),".dam") == 0) + { root = Root(path,".dam"); + isdam = 1; + } + else + { if (strcmp(path+(plen-3),".db") == 0) + isdam = -1; + else + isdam = 0; + root = Root(path,".db"); + } + pwd = PathTo(path); + + bptr = rindex(root,'.'); + if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') + { part = strtol(bptr+1,&fptr,10); + if (*fptr != '\0' || part == 0) + part = 0; + else + *bptr = '\0'; + } + else + part = 0; + + if (isdam > 0) + cat = MyCatenate(pwd,"/",root,".dam"); + else + cat = MyCatenate(pwd,"/",root,".db"); + if (cat == NULL) + return (-1); + if ((dbvis = fopen(cat,"r")) == NULL) + { if (isdam < 0) + { EPRINTF(EPLACE,"%s: Could not open DB %s\n",Prog_Name,path); + goto error; + } + if (isdam > 0) + { EPRINTF(EPLACE,"%s: Could not open DAM %s\n",Prog_Name,path); + goto error; + } + cat = MyCatenate(pwd,"/",root,".dam"); + if (cat == NULL) + return (-1); + if ((dbvis = fopen(cat,"r")) == NULL) + { EPRINTF(EPLACE,"%s: Could not open %s as a DB or a DAM\n",Prog_Name,path); + goto error; + } + isdam = 1; + } + if (isdam < 0) + isdam = 0; + + if ((index = Fopen(MyCatenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) + goto error1; + if (fread(db,sizeof(DAZZ_DB),1,index) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + goto error2; + } + + { int p, nblocks, nfiles; + int64 size; + char 
fname[MAX_NAME], prolog[MAX_NAME]; + + nblocks = 0; + if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + for (p = 0; p < nfiles; p++) + if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) + if (part == 0) + { cutoff = 0; + all = DB_ALL; + } + else + { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", + Prog_Name,root); + goto error2; + } + else + { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + if (part > nblocks) + { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); + goto error2; + } + } + + if (part > 0) + { for (p = 1; p <= part; p++) + if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + } + else + { ufirst = tfirst = 0; + ulast = db->ureads; + tlast = db->treads; + } + } + + db->trimmed = 0; + db->tracks = NULL; + db->part = part; + db->cutoff = cutoff; + db->allarr |= all; + db->ufirst = ufirst; + db->tfirst = tfirst; + + nreads = ulast-ufirst; + if (part <= 0) + { db->reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); + if (db->reads == NULL) + goto error2; + + db->reads += 1; + if (fread(db->reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + free(db->reads-1); + goto error2; + } + } + else + { DAZZ_READ *reads; + int i, r, maxlen; + int64 totlen; + + reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB 
index"); + if (reads == NULL) + goto error2; + reads += 1; + + fseeko(index,sizeof(DAZZ_READ)*ufirst,SEEK_CUR); + if (fread(reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + free(reads-1); + goto error2; + } + + totlen = 0; + maxlen = 0; + for (i = 0; i < nreads; i++) + { r = reads[i].rlen; + totlen += r; + if (r > maxlen) + maxlen = r; + } + + db->maxlen = maxlen; + db->totlen = totlen; + db->reads = reads; + } + + ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part + ((int *) (db->reads))[-2] = tlast - tfirst; + + db->nreads = nreads; + db->path = Strdup(MyCatenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); + if (db->path == NULL) + { free(db->reads-1); + goto error2; + } + bases = Fopen(MyCatenate(db->path,"","",".bps"),"r"); + if (bases == NULL) + { free(db->path); + free(db->reads-1); + goto error2; + } + db->bases = (void *) bases; + db->loaded = 0; + + status = isdam; + +error2: + fclose(index); +error1: + fclose(dbvis); +error: + if (bptr != NULL) + *bptr = '.'; + + free(pwd); + free(root); + + if (status < 0) + *db = dbcopy; + + return (status); +} + + +// Trim the DB or part thereof and all opened tracks according to the cuttof and all settings +// of the current DB partition. Reallocate smaller memory blocks for the information kept +// for the retained reads. 
+ +void Trim_DB(DAZZ_DB *db) +{ int i, j, r, f; + int allflag, cutoff, css; + int64 totlen; + int maxlen, nreads; + DAZZ_TRACK *record; + DAZZ_READ *reads; + + if (db->trimmed) return; + + if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return; + + { int load_error; + + load_error = db->loaded; + for (record = db->tracks; record != NULL; record = record->next) + if (record->name == atrack_name) + { if (((DAZZ_ARROW *) record)->loaded) + load_error = 1; + } + else if (record->name != qtrack_name) + { if (record->loaded) + load_error = 1; + } + if (load_error) + { EPRINTF(EPLACE,"%s: Cannot load anything before trim (Trim_DB)\n",Prog_Name); + return; + } + } + + cutoff = db->cutoff; + if ((db->allarr & DB_ALL) != 0) + allflag = 0; + else + allflag = DB_BEST; + + reads = db->reads; + nreads = db->nreads; + + for (record = db->tracks; record != NULL; record = record->next) + if (record->name == qtrack_name) + { uint16 *table = ((DAZZ_QV *) record)->table; + + j = 0; + for (i = 0; i < db->nreads; i++) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + table[j++] = table[i]; + } + else if (record->name == atrack_name) + { DAZZ_ARROW *atrack = (DAZZ_ARROW *) record; + int64 *aoff = atrack->aoff; + + for (j = i = 0; i < nreads; i++) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + aoff[j++] = aoff[i]; + atrack->aoff = Realloc(aoff,sizeof(int64)*j,NULL); + } + else + { int size; + + size = record->size; + if (record->data == NULL) + { char *anno = (char *) record->anno; + j = 0; + for (i = r = 0; i < db->nreads; i++, r += size) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + { memmove(anno+j,anno+r,size); + j += size; + } + record->anno = Realloc(record->anno,record->size*j,NULL); + } + else if (size == 4) + { int *anno4 = (int *) (record->anno); + int *alen = record->alen; + + j = 0; + for (i = 0; i < db->nreads; i++) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + { 
anno4[j] = anno4[i]; + alen[j] = alen[i]; + j += 1; + } + record->alen = Realloc(record->alen,sizeof(int)*j,NULL); + record->anno = Realloc(record->anno,record->size*(j+1),NULL); + } + else // size == 8 + { int64 *anno8 = (int64 *) (record->anno); + int *alen = record->alen; + + j = 0; + for (i = 0; i < db->nreads; i++) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + { anno8[j] = anno8[i]; + alen[j] = alen[i]; + j += 1; + } + record->alen = Realloc(record->alen,sizeof(int)*j,NULL); + record->anno = Realloc(record->anno,record->size*(j+1),NULL); + } + record->nreads = j; + } + + css = 0; + totlen = maxlen = 0; + for (j = i = 0; i < nreads; i++) + { f = reads[i].flags; + if ((f & DB_CCS) == 0) + css = 0; + r = reads[i].rlen; + if ((f & DB_BEST) >= allflag && r >= cutoff) + { totlen += r; + if (r > maxlen) + maxlen = r; + reads[j] = reads[i]; + if (css) + reads[j++].flags |= DB_CCS; + else + reads[j++].flags &= ~DB_CCS; + css = 1; + } + } + + db->totlen = totlen; + db->maxlen = maxlen; + db->nreads = j; + db->trimmed = 1; + + if (j < nreads) + { db->reads = Realloc(reads-1,sizeof(DAZZ_READ)*(j+2),NULL); + db->reads += 1; + } +} + + +// Return the size in bytes of the memory occupied by a given DB + +int64 sizeof_DB(DAZZ_DB *db) +{ int64 s; + DAZZ_TRACK *t; + + s = sizeof(DAZZ_DB) + + sizeof(DAZZ_READ)*(db->nreads+2) + + strlen(db->path)+1 + + (db->totlen+db->nreads+4); + + t = db->tracks; + if (t != NULL && strcmp(t->name,".@qvs") == 0) + { DAZZ_QV *q = (DAZZ_QV *) t; + s += sizeof(DAZZ_QV) + + sizeof(uint16) * db->nreads + + q->ncodes * sizeof(QVcoding) + + 6; + t = t->next; + } + + for (; t != NULL; t = t->next) + { s += sizeof(DAZZ_TRACK) + + strlen(t->name)+1 + + t->size * (db->nreads+1); + if (t->data != NULL) + { if (t->size == 8) + s += sizeof(int)*((int64 *) t->anno)[db->nreads]; + else // t->size == 4 + s += sizeof(int)*((int *) t->anno)[db->nreads]; + } + } + + return (s); +} + + +// For the DB or DAM "path" = 
"prefix/root.[db|dam]", find all the files for that DB, i.e. all +// those of the form "prefix/[.]root.part" and call actor with the complete path to each file +// pointed at by path, and the suffix of the path by extension. The . proceeds the root +// name if the defined constant HIDE_FILES is set. Always the first call is with the +// path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for +// "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and +// so this routine gives one a way to know all the tracks associated with a given DB. +// -1 is returned if the path could not be found, and 1 is returned if an error (reported +// to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. + +int List_DB_Files(char *path, void actor(char *path, char *extension)) +{ int status, plen, rlen, dlen; + char *root, *pwd, *name; + int isdam; + DIR *dirp; + struct dirent *dp; + + status = 0; + pwd = PathTo(path); + plen = strlen(path); + if (strcmp(path+(plen-4),".dam") == 0) + root = Root(path,".dam"); + else + root = Root(path,".db"); + rlen = strlen(root); + + if (root == NULL || pwd == NULL) + { free(pwd); + free(root); + EXIT(1); + } + + if ((dirp = opendir(pwd)) == NULL) + { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); + status = -1; + goto error; + } + + isdam = 0; + while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) + { name = dp->d_name; + if (strcmp(name,MyCatenate("","",root,".db")) == 0) + break; + if (strcmp(name,MyCatenate("","",root,".dam")) == 0) + { isdam = 1; + break; + } + } + if (dp == NULL) + { status = -1; + closedir(dirp); + goto error; + } + + if (isdam) + actor(MyCatenate(pwd,"/",root,".dam"),"dam"); + else + actor(MyCatenate(pwd,"/",root,".db"),"db"); + + rewinddir(dirp); // Report each auxiliary file + while ((dp = readdir(dirp)) != NULL) + { name = dp->d_name; + dlen = strlen(name); +#ifdef HIDE_FILES + if 
(name[0] != '.') + continue; + dlen -= 1; + name += 1; +#endif + if (dlen < rlen+1) + continue; + if (name[rlen] != '.') + continue; + if (strncmp(name,root,rlen) != 0) + continue; + actor(MyCatenate(pwd,PATHSEP,name,""),name+(rlen+1)); + } + closedir(dirp); + +error: + free(pwd); + free(root); + return (status); +} + +void Print_Read(char *s, int width) +{ int i; + + if (s[0] < 4) + { for (i = 0; s[i] != 4; i++) + { if (i%width == 0 && i != 0) + printf("\n"); + printf("%d",s[i]); + } + printf("\n"); + } + else + { for (i = 0; s[i] != '\0'; i++) + { if (i%width == 0 && i != 0) + printf("\n"); + printf("%c",s[i]); + } + printf("\n"); + } +} + + +// Shut down an open 'db' by freeing all associated space, including tracks and QV structures, +// and any open file pointers. The record pointed at by db however remains (the user +// supplied it and so should free it). + +void Close_DB(DAZZ_DB *db) +{ if (db->loaded) + free(((char *) (db->bases)) - 1); + else if (db->bases != NULL) + fclose((FILE *) db->bases); + if (db->reads != NULL) + free(db->reads-1); + free(db->path); + + Close_QVs(db); + + Close_Arrow(db); + + while (db->tracks != NULL) + Close_Track(db,db->tracks); +} + + +/******************************************************************************************* + * + * READ AND ARROW BUFFER ALLOCATION, LOAD, & LOAD_ALL + * + ********************************************************************************************/ + +// Allocate and return a buffer big enough for the largest read in 'db', leaving room +// for an initial delimiter character + +char *New_Read_Buffer(DAZZ_DB *db) +{ char *read; + + read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); + if (read == NULL) + EXIT(NULL); + return (read+1); +} + +// Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a +// lower-case ASCII string is ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and +// 3(T) otherwise. 
+// +// **NB**, the byte before read will be set to a delimiter character! + +int Load_Read(DAZZ_DB *db, int i, char *read, int ascii) +{ FILE *bases = (FILE *) db->bases; + int64 off; + int len, clen; + DAZZ_READ *r = db->reads; + + if (i < 0 || i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); + EXIT(1); + } + + if (db->loaded) + { len = r[i].rlen; + strncpy(read,(char *) bases + r[i].boff,len); + if (ascii == 0) + { if (*read < 4) + read[-1] = read[len] = 4; + else + { read[len] = '\0'; + Number_Read(read); + read[-1] = 4; + } + } + else + { if (*read < 4) + { read[len] = 4; + if (ascii == 1) + Lower_Read(read); + else + Upper_Read(read); + read[-1] = '\0'; + } + else + { read[len] = '\0'; + if ((ascii == 1) != islower(*read)) + Change_Read(read); + } + read[-1] = '\0'; + } + return (0); + } + + off = r[i].boff; + len = r[i].rlen; + + if (ftello(bases) != off) + fseeko(bases,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(read,clen,1,bases) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); + EXIT(1); + } + } + Uncompress_Read(len,read); + if (ascii == 1) + { Lower_Read(read); + read[-1] = '\0'; + } + else if (ascii == 2) + { Upper_Read(read); + read[-1] = '\0'; + } + else + read[-1] = 4; + return (0); +} + + +// Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the +// the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii +// string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string +// over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to +// the string holding the substring so it has a delimeter for traversals in either direction. +// A NULL pointer is returned if an error occured and INTERACTIVE is defined. 
+ +char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii) +{ FILE *bases = (FILE *) db->bases; + int64 off; + int len, clen; + int bbeg, bend; + DAZZ_READ *r = db->reads; + + if (i < 0 || i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); + EXIT(NULL); + } + + if (db->loaded) + { len = end-beg; + strncpy(read,(char *) bases + r[i].boff + beg,len); + if (ascii == 0) + { if (*read < 4) + read[-1] = read[len] = 4; + else + { read[len] = '\0'; + Number_Read(read); + read[-1] = 4; + } + } + else + { if (*read < 4) + { read[len] = 4; + if (ascii == 1) + Lower_Read(read); + else + Upper_Read(read); + read[-1] = '\0'; + } + else + { read[len] = '\0'; + if ((ascii == 1) != islower(*read)) + Change_Read(read); + } + read[-1] = '\0'; + } + return (read); + } + + bbeg = beg/4; + bend = (end-1)/4+1; + + off = r[i].boff + bbeg; + len = end - beg; + + if (ftello(bases) != off) + fseeko(bases,off,SEEK_SET); + clen = bend-bbeg; + if (clen > 0) + { if (fread(read,clen,1,bases) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); + EXIT(NULL); + } + } + Uncompress_Read(4*clen,read); + read += beg%4; + read[len] = 4; + if (ascii == 1) + { Lower_Read(read); + read[-1] = '\0'; + } + else if (ascii == 2) + { Upper_Read(read); + read[-1] = '\0'; + } + else + read[-1] = 4; + + return (read); +} + +// Allocate a block big enough for all the uncompressed sequences, read them into it, +// reset the 'off' in each read record to be its in-memory offset, and set the +// bases pointer to point at the block after closing the bases file. If ascii is +// non-zero then the reads are converted to ACGT ascii, otherwise the reads are left +// as numeric strings over 0(A), 1(C), 2(G), and 3(T). 
+ +int Load_All_Reads(DAZZ_DB *db, int ascii) +{ FILE *bases = (FILE *) db->bases; + int nreads = db->nreads; + DAZZ_READ *reads = db->reads; + void (*translate)(char *s); + + char *seq; + int64 o, off; + int i, len, clen; + + if (db->loaded) + return (0); + + seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); + if (seq == NULL) + EXIT(1); + + *seq++ = 4; + + if (ascii == 1) + translate = Lower_Read; + else + translate = Upper_Read; + + o = 0; + for (i = 0; i < nreads; i++) + { len = reads[i].rlen; + off = reads[i].boff; + if (ftello(bases) != off) + fseeko(bases,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(seq+o,clen,1,bases) != 1) + { EPRINTF(EPLACE,"%s: Read of .bps file failed (Load_All_Sequences)\n",Prog_Name); + free(seq-1); + EXIT(1); + } + } + Uncompress_Read(len,seq+o); + if (ascii) + translate(seq+o); + reads[i].boff = o; + o += (len+1); + } + reads[nreads].boff = o; + + fclose(bases); + + db->bases = (void *) seq; + db->loaded = 1; + + return (0); +} + + +/******************************************************************************************* + * + * ARROW OPEN, LOAD, LOAD_ALL, & CLOSE + * + ********************************************************************************************/ + +DAZZ_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow" +DAZZ_ARROW *Arrow_Ptr; // Becomes invalid after closing + +// If the Arrow pseudo track is not already in db's track list, then load it and set it up. +// The database reads must not have been loaded with Load_All_Reads yet. +// -1 is returned if a .arw file is not present, and 1 is returned if an error (reported +// to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. 
+ +int Open_Arrow(DAZZ_DB *db) +{ int64 *avector; + DAZZ_ARROW *atrack; + FILE *afile; + DAZZ_READ *reads; + int i, nreads; + + if (db->tracks != NULL && db->tracks->name == atrack_name) + return (0); + + if ((db->allarr & DB_ARROW) == 0) + { EPRINTF(EPLACE,"%s: The DB is not an Arrow database (Open_Arrow)\n",Prog_Name); + EXIT(1); + } + if (db->loaded) + { EPRINTF(EPLACE,"%s: Cannot open Arrow vectors after loading all reads (Open_Arrow)\n", + Prog_Name); + EXIT(1); + } + + afile = Fopen(MyCatenate(db->path,"","",".arw"),"r"); + if (afile == NULL) + return (-1); + + nreads = db->nreads; + avector = (int64 *) Malloc(sizeof(int64)*nreads,"Allocating Arrow index"); + atrack = (DAZZ_ARROW *) Malloc(sizeof(DAZZ_ARROW),"Allocating Arrow track"); + if (avector == NULL || atrack == NULL) + { fclose(afile); + if (avector != NULL) + free(avector); + EXIT(1); + } + db->tracks = (DAZZ_TRACK *) atrack; + atrack->next = NULL; + atrack->name = atrack_name; + atrack->aoff = avector; + atrack->arrow = (void *) afile; + atrack->loaded = 0; + + + reads = db->reads; + for (i = 0; i < nreads; i++) + avector[i] = reads[i].boff; + return (0); +} + +// Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1, +// and as a numeric string otherwise. 
+ +int Load_Arrow(DAZZ_DB *db, int i, char *arrow, int ascii) +{ FILE *afile; + int64 off; + int len, clen; + + if (db != Arrow_DB) + { if (db->tracks == NULL || db->tracks->name != atrack_name) + { EPRINTF(EPLACE,"%s: Arrow data is not available (Load_Arrow)\n",Prog_Name); + EXIT(1); + } + Arrow_Ptr = (DAZZ_ARROW *) db->tracks; + Arrow_DB = db; + } + + if (i < 0 || i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name); + EXIT(1); + } + + afile = (FILE *) Arrow_Ptr->arrow; + off = Arrow_Ptr->aoff[i]; + len = db->reads[i].rlen; + + if (ftello(afile) != off) + fseeko(afile,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(arrow,clen,1,afile) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .arw file (Load_Arrow)\n",Prog_Name); + EXIT(1); + } + } + Uncompress_Read(len,arrow); + if (ascii == 1) + { Letter_Arrow(arrow); + arrow[-1] = '\0'; + } + else + arrow[-1] = 4; + return (0); +} + +// Allocate a block big enough for all the uncompressed Arrow vectors, read them into it, +// reset the 'off' in each arrow record to be its in-memory offset, and set the +// arrow pointer to point at the block after closing the arrow file. If ascii is +// non-zero then the arrows are converted to 0123 ascii, otherwise the arrows are left +// as numeric strings over [0-3]. 
+ +int Load_All_Arrows(DAZZ_DB *db, int ascii) +{ int nreads = db->nreads; + DAZZ_READ *reads = db->reads; + FILE *afile; + int64 *aoff; + + char *seq; + int64 o, off; + int i, len, clen; + + if (db != Arrow_DB) + { if (db->tracks == NULL || db->tracks->name != atrack_name) + { EPRINTF(EPLACE,"%s: Arrow data is not available (Load_All_Arrows)\n",Prog_Name); + EXIT(1); + } + Arrow_Ptr = (DAZZ_ARROW *) db->tracks; + Arrow_DB = db; + } + + if (Arrow_Ptr->loaded) + return (0); + + afile = (FILE *) Arrow_Ptr->arrow; + aoff = Arrow_Ptr->aoff; + + seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Arrows"); + if (seq == NULL) + EXIT(1); + + *seq++ = 4; + o = 0; + for (i = 0; i < nreads; i++) + { len = reads[i].rlen; + off = aoff[i]; + if (ftello(afile) != off) + fseeko(afile,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(seq+o,clen,1,afile) != 1) + { EPRINTF(EPLACE,"%s: Read of .bps file failed (Load_All_Sequences)\n",Prog_Name); + free(seq-1); + EXIT(1); + } + } + Uncompress_Read(len,seq+o); + if (ascii) + Letter_Arrow(seq+o); + aoff[i] = o; + o += (len+1); + } + aoff[nreads] = o; + + fclose(afile); + + Arrow_Ptr->arrow = (void *) seq; + Arrow_Ptr->loaded = 1; + + return (0); +} + +// Remove the Arrow pseudo track, all space associated with it, and close the .arw file. 
+ +void Close_Arrow(DAZZ_DB *db) +{ DAZZ_ARROW *atrack; + + Arrow_DB = NULL; + if (db->tracks != NULL && db->tracks->name == atrack_name) + { atrack = (DAZZ_ARROW *) db->tracks; + if (atrack->loaded) + free(atrack->arrow); + else + fclose((FILE *) atrack->arrow); + free(atrack->aoff); + db->tracks = db->tracks->next; + free(atrack); + } +} + + +/******************************************************************************************* + * + * TRACK CHECK, OPEN, BUFFER ALLOCATION, LOAD, LOAD_ALL & CLOSE ROUTINES + * TRACK EXTRAS READING & WRITING + * + ********************************************************************************************/ + +// Return status of track: +// 1: Track is for trimmed DB +// 0: Track is for untrimmed DB +// -1: Track is not the right size of DB either trimmed or untrimmed +// -2: Could not find the track + +int Check_Track(DAZZ_DB *db, char *track, int *kind) +{ FILE *afile; + int tracklen, size, ispart; + int ureads, treads; + + afile = NULL; + if (db->part > 0) + { afile = fopen(MyCatenate(db->path,MyNumbered_Suffix(".",db->part,"."),track,".anno"),"r"); + ispart = 1; + } + if (afile == NULL) + { afile = fopen(MyCatenate(db->path,".",track,".anno"),"r"); + ispart = 0; + } + if (afile == NULL) + return (-2); + + if (fread(&tracklen,sizeof(int),1,afile) != 1) + { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); + exit (1); + } + if (fread(&size,sizeof(int),1,afile) != 1) + { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); + exit (1); + } + + if (size == 0) + *kind = MASK_TRACK; + else if (size > 0) + *kind = CUSTOM_TRACK; + else + { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); + exit (1); + } + + fclose(afile); + + if (ispart) + { ureads = ((int *) (db->reads))[-1]; + treads = ((int *) (db->reads))[-2]; + } + else + { ureads = db->ureads; + treads = db->treads; + } + + if (tracklen == ureads) + return (0); + else if (tracklen == treads) + return (1); + 
else + return (-1); +} + +// The DB has already been trimmed, but a track over the untrimmed DB needs to be opened. +// Trim the track by rereading the untrimmed DB index from the file system. + +static int Late_Track_Trim(DAZZ_DB *db, DAZZ_TRACK *track, int ispart) +{ int i, j, r; + int allflag, cutoff; + int ureads; + char *root; + DAZZ_READ read; + FILE *indx; + + if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0); + + cutoff = db->cutoff; + if ((db->allarr & DB_ALL) != 0) + allflag = 0; + else + allflag = DB_BEST; + + root = rindex(db->path,'/') + 2; + indx = Fopen(MyCatenate(db->path,"","",".idx"),"r"); + fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*db->ufirst,SEEK_SET); + if (ispart) + ureads = ((int *) (db->reads))[-1]; + else + ureads = db->ureads; + + { int size; + + size = track->size; + if (track->data == NULL) + { char *anno = (char *) track->anno; + j = r = 0; + for (i = r = 0; i < ureads; i++, r += size) + { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + fclose(indx); + EXIT(1); + } + if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) + { memmove(anno+j,anno+r,size); + j += size; + } + r += size; + } + track->anno = Realloc(track->anno,track->size*j,NULL); + } + else if (size == 4) + { int *anno4 = (int *) (track->anno); + int *alen = track->alen; + + j = 0; + for (i = 0; i < ureads; i++) + { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + fclose(indx); + EXIT(1); + } + if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) + { anno4[j] = anno4[i]; + alen[j] = alen[i]; + j += 1; + } + } + track->alen = Realloc(track->alen,sizeof(int)*j,NULL); + track->anno = Realloc(track->anno,track->size*(j+1),NULL); + } + else // size == 8 + { int64 *anno8 = (int64 *) (track->anno); + int *alen = track->alen; + + j = 0; + for (i = 0; i < ureads; i++) + { if 
(fread(&read,sizeof(DAZZ_READ),1,indx) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + fclose(indx); + EXIT(1); + } + if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) + { anno8[j] = anno8[i]; + alen[j] = alen[i]; + j += 1; + } + } + track->alen = Realloc(track->alen,sizeof(int)*j,NULL); + track->anno = Realloc(track->anno,track->size*(j+1),NULL); + } + } + + fclose(indx); + return (0); +} + +// If track is not already in the db's track list, then allocate all the storage for it, +// read it in from the appropriate file, add it to the track list, and return a pointer +// to the newly created DAZZ_TRACK record. If the track does not exist or cannot be +// opened for some reason, then NULL is returned. + +DAZZ_TRACK *Open_Track(DAZZ_DB *db, char *track) +{ FILE *afile, *dfile; + int tracklen, size; + int nreads, ispart; + int treads, ureads; + int64 dmax; + void *anno; + int *alen; + void *data; + char *name; + DAZZ_TRACK *record; + + if (track[0] == '.') + { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); + EXIT(NULL); + } + + for (record = db->tracks; record != NULL; record = record->next) + if (strcmp(record->name,track) == 0) + return (record); + + afile = NULL; + if (db->part) + { afile = fopen(MyCatenate(db->path,MyNumbered_Suffix(".",db->part,"."),track,".anno"),"r"); + ispart = 1; + } + if (afile == NULL) + { afile = fopen(MyCatenate(db->path,".",track,".anno"),"r"); + ispart = 0; + } + if (afile == NULL) + { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); + return (NULL); + } + + dfile = NULL; + anno = NULL; + alen = NULL; + data = NULL; + record = NULL; + + if (ispart) + name = MyCatenate(db->path,MyNumbered_Suffix(".",db->part,"."),track,".data"); + else + name = MyCatenate(db->path,".",track,".data"); + if (name == NULL) + goto error; + dfile = fopen(name,"r"); + + if (fread(&tracklen,sizeof(int),1,afile) != 1) + { EPRINTF(EPLACE,"%s: Track '%s' annotation 
file is junk\n",Prog_Name,track); + goto error; + } + if (fread(&size,sizeof(int),1,afile) != 1) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + + if (size < 0) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + if (size == 0) + size = 8; + + if (ispart) + { ureads = ((int *) (db->reads))[-1]; + treads = ((int *) (db->reads))[-2]; + } + else + { ureads = db->ureads; + treads = db->treads; + } + + if (db->trimmed) + { if (tracklen != treads && tracklen != ureads) + { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); + goto error; + } + if ( ! ispart && db->part > 0) + { if (tracklen == treads) + fseeko(afile,size*db->tfirst,SEEK_CUR); + else + fseeko(afile,size*db->ufirst,SEEK_CUR); + } + } + else + { if (tracklen != ureads) + { if (tracklen == treads) + EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track); + else + EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); + goto error; + } + if ( ! 
ispart && db->part > 0) + fseeko(afile,size*db->ufirst,SEEK_CUR); + } + if (tracklen == treads) + nreads = ((int *) (db->reads))[-2]; + else + nreads = ((int *) (db->reads))[-1]; + + anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); + if (anno == NULL) + goto error; + + if (dfile != NULL) + { int64 *anno8; + int *anno4; + int64 x, y; + int i; + + alen = (int *) Malloc(sizeof(int)*nreads,"Allocating Track Anno Lengths"); + if (alen == NULL) + goto error; + + if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + + dmax = 0; + if (size == 4) + { anno4 = (int *) anno; + y = anno4[0]; + for (i = 1; i <= nreads; i++) + { x = anno4[i]; + y = x-y; + if (y > dmax) + dmax = y; + alen[i-1] = y; + y = x; + } + } + else + { anno8 = (int64 *) anno; + y = anno8[0]; + for (i = 1; i <= nreads; i++) + { x = anno8[i]; + y = x-y; + if (y > dmax) + dmax = y; + alen[i-1] = y; + y = x; + } + } + } + else + { dmax = 0; + if (fread(anno,size,nreads,afile) != (size_t) nreads) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + } + + fclose(afile); + + record = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating Track Record"); + if (record == NULL) + goto error; + record->name = Strdup(track,"Allocating Track Name"); + if (record->name == NULL) + goto error; + if (dfile == NULL) + record->data = NULL; + else + record->data = (void *) dfile; + record->anno = anno; + record->alen = alen; + record->size = size; + record->nreads = nreads; + record->loaded = 0; + record->dmax = dmax; + + if (db->trimmed && tracklen != treads) + { if (Late_Track_Trim(db,record,ispart)) + goto error; + } + + if (db->tracks != NULL && (db->tracks->name == qtrack_name || db->tracks->name == atrack_name)) + { record->next = db->tracks->next; + db->tracks->next = record; + } + else + { record->next = db->tracks; + db->tracks = record; + } + + 
return (record); + +error: + if (record != NULL) + free(record); + if (data != NULL) + free(data); + if (alen != NULL) + free(alen); + if (anno != NULL) + free(anno); + if (dfile != NULL) + fclose(dfile); + fclose(afile); + EXIT (NULL); +} + +// Allocate a data buffer large enough to hold the longest read data block that will occur +// in the track. If cannot allocate memory then return NULL if INTERACTIVE is defined, +// or print error to stderr and exit otherwise. + +void *New_Track_Buffer(DAZZ_TRACK *track) +{ void *data; + + data = (void *) Malloc(track->dmax,"Allocating New Track Data Buffer"); + if (data == NULL) + EXIT(NULL); + return (data); +} + +// Load into 'data' the read data block for read i's "track" data. Return the length of +// the data in bytes, unless an error occurs and INTERACTIVE is defined in which case +// return wtih -1. + +int Load_Track_Data(DAZZ_TRACK *track, int i, void *data) +{ FILE *dfile; + int64 off; + int len; + + if (i < 0 || i >= track->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Track_Data)\n",Prog_Name); + EXIT(-1); + } + + if (track->size == 4) + off = ((int *) track->anno)[i]; + else + off = ((int64 *) track->anno)[i]; + len = track->alen[i]; + + if (track->loaded) + { strncpy(data,(void *) track->data + off,len); + return (len); + } + + dfile = (FILE *) track->data; + if (ftello(dfile) != off) + fseeko(dfile,off,SEEK_SET); + if (len > 0) + if (fread(data,len,1,dfile) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .data file (Load_Track_Data)\n",Prog_Name); + EXIT(-1); + } + return (len); +} + +// Allocate a block big enough for all the track data and read the data into it, +// reset the 'off' in each anno pointer to be its in-memory offset, and set the +// data pointer to point at the block after closing the data file. Return with a +// zero, except when an error occurs and INTERACTIVE is defined in which +// case return wtih 1. 
+ +int Load_All_Track_Data(DAZZ_TRACK *track) +{ FILE *dfile; + void *data; + int *alen; + int64 dlen, off, o; + int i, len, nreads; + + if (track->loaded || track->data == NULL) + return (0); + + nreads = track->nreads; + dfile = (FILE *) track->data; + alen = track->alen; + + dlen = 0; + for (i = 0; i < nreads; i++) + dlen += alen[i]; + + data = (void *) Malloc(dlen,"Allocating All Track Data"); + if (data == NULL) + EXIT(1); + + o = 0; + if (track->size == 4) + { int *anno4 = (int *) track->anno; + + for (i = 0; i < nreads; i++) + { len = alen[i]; + off = anno4[i]; + if (ftello(dfile) != off) + fseeko(dfile,off,SEEK_SET); + if (len > 0) + { if (fread(data+o,len,1,dfile) != 1) + { EPRINTF(EPLACE,"%s: Read of .data failed (Load_All_Track_Data)\n",Prog_Name); + free(data); + EXIT(1); + } + } + anno4[i] = o; + o += len; + } + anno4[nreads] = o; + } + else + { int64 *anno8 = (int64 *) track->anno; + + for (i = 0; i < nreads; i++) + { len = alen[i]; + off = anno8[i]; + if (ftello(dfile) != off) + fseeko(dfile,off,SEEK_SET); + if (len > 0) + { if (fread(data+o,len,1,dfile) != 1) + { EPRINTF(EPLACE,"%s: Read of .data failed (Load_All_Track_Data)\n",Prog_Name); + free(data); + EXIT(1); + } + } + anno8[i] = o; + o += len; + } + anno8[nreads] = o; + } + + fclose(dfile); + + track->data = (void *) data; + track->loaded = 1; + + return (0); +} + + +// Assumming file pointer for afile is correctly positioned at the start of a extra item, +// and aname is the name of the .anno file, decode the value present and places it in +// extra if extra->nelem == 0, otherwise reduce the value just read into extra according +// according the to the directive given by 'accum'. Leave the read poinrt at the next +// extra or end-of-file. 
+// Returns: +// 1 if at the end of file, +// 0 if item was read and folded correctly, +// -1 if there was a system IO or allocation error (if interactive), and +// -2 if the new value could not be reduced into the currenct value of extra (interactive) + +int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra) +{ int vtype, nelem, accum, slen; + char *name; + void *value; + +#define EREAD(v,s,n,file,ret) \ + { if (fread(v,s,n,file) != (size_t) n) \ + { if (ferror(file)) \ + fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ + else if (ret) \ + return (1); \ + else \ + fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,aname); \ + EXIT(-1); \ + } \ + } + + EREAD(&vtype,sizeof(int),1,afile,1) + EREAD(&nelem,sizeof(int),1,afile,0) + EREAD(&accum,sizeof(int),1,afile,0) + EREAD(&slen,sizeof(int),1,afile,0) + + if (extra == NULL) + { if (fseeko(afile,slen+8*nelem,SEEK_CUR) < 0) + { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); + EXIT(-1); + } + return (0); + } + + name = (char *) Malloc(slen+1,"Allocating extra name"); + value = Malloc(8*nelem,"Allocating extra value"); + if (name == NULL || value == NULL) + EXIT(-1); + + EREAD(name,1,slen,afile,0); + EREAD(value,8,nelem,afile,0); + name[slen] = '\0'; + + if (extra->nelem == 0) + { extra->vtype = vtype; + extra->nelem = nelem; + extra->accum = accum; + extra->name = name; + extra->value = value; + return (0); + } + + if (vtype != extra->vtype) + { fprintf(stderr,"%s: Type of extra %s does not agree with previous .anno block files\n", + Prog_Name,name); + goto error; + } + if (nelem != extra->nelem) + { fprintf(stderr,"%s: Length of extra %s does not agree with previous .anno block files\n", + Prog_Name,name); + goto error; + } + if (accum != extra->accum) + { fprintf(stderr,"%s: Reduction indicator of extra %s does not agree with",Prog_Name,name); + fprintf(stderr," previos .anno block files\n"); + goto error; + } + if (strcmp(name,extra->name) != 0) + { fprintf(stderr,"%s: 
Expecting extra %s in .anno block file, not %s\n", + Prog_Name,extra->name,name); + goto error; + } + + if (vtype == DB_INT) + { int64 *ival = (int64 *) value; + int64 *eval = (int64 *) (extra->value); + int j; + + if (accum == DB_EXACT) + { for (j = 0; j < nelem; j++) + if (eval[j] != ival[j]) + { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); + fprintf(stderr," with previous .anno block files\n"); + goto error; + } + } + else + { for (j = 0; j < nelem; j++) + eval[j] += ival[j]; + } + } + + else + { double *ival = (double *) value; + double *eval = (double *) (extra->value); + int j; + + if (accum == DB_EXACT) + { for (j = 0; j < nelem; j++) + if (eval[j] != ival[j]) + { fprintf(stderr,"%s: Value of extra %s doe not agree",Prog_Name,name); + fprintf(stderr," with previous .anoo block files\n"); + goto error; + } + } + else + { for (j = 0; j < nelem; j++) + eval[j] += ival[j]; + } + } + + free(value); + free(name); + return (0); + +error: + free(value); + free(name); + EXIT(1); +} + +// Write extra record to end of file afile and advance write pointer +// If interactive, then return non-zero on error, if bash, then print +// and halt if an error + +int Write_Extra(FILE *afile, DAZZ_EXTRA *extra) +{ int slen; + + FFWRITE(&(extra->vtype),sizeof(int),1,afile) + FFWRITE(&(extra->nelem),sizeof(int),1,afile) + FFWRITE(&(extra->accum),sizeof(int),1,afile) + slen = strlen(extra->name); + FFWRITE(&slen,sizeof(int),1,afile) + FFWRITE(extra->name,1,slen,afile) + FFWRITE(extra->value,8,extra->nelem,afile) + + return (0); +} + +void Close_Track(DAZZ_DB *db, DAZZ_TRACK *track) +{ DAZZ_TRACK *record, *prev; + + prev = NULL; + for (record = db->tracks; record != NULL; record = record->next) + { if (track == record) + { free(record->anno); + free(record->alen); + if (record->loaded) + free(record->data); + else + fclose((FILE *) record->data); + free(record->name); + if (prev == NULL) + db->tracks = record->next; + else + prev->next = record->next; + 
free(record); + return; + } + prev = record; + } + return; +} + + +/******************************************************************************************* + * + * QV OPEN, BUFFER ALLOCATION, LOAD, & CLOSE ROUTINES + * + ********************************************************************************************/ + +DAZZ_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" +DAZZ_QV *Active_QV; // Becomes invalid after closing + +int Open_QVs(DAZZ_DB *db) +{ FILE *quiva, *istub, *indx; + char *root; + uint16 *table; + DAZZ_QV *qvtrk; + QVcoding *coding, *nx; + int ncodes = 0; + + if (db->tracks != NULL && db->tracks->name == qtrack_name) + return (0); + + if (db->trimmed) + { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); + EXIT(1); + } + + if (db->reads[db->nreads-1].coff < 0) + { if (db->part > 0) + { EPRINTF(EPLACE,"%s: All QVs for this block have not been added to the DB!\n",Prog_Name); + EXIT(1); + } + else + { EPRINTF(EPLACE,"%s: All QVs for this DB have not been added!\n",Prog_Name); + EXIT(1); + } + } + + // Open .qvs, .idx, and .db files + + quiva = Fopen(MyCatenate(db->path,"","",".qvs"),"r"); + if (quiva == NULL) + return (-1); + + istub = NULL; + indx = NULL; + table = NULL; + coding = NULL; + qvtrk = NULL; + + root = rindex(db->path,'/'); + if (root[1] == '.') + { *root = '\0'; + istub = Fopen(MyCatenate(db->path,"/",root+2,".db"),"r"); + *root = '/'; + } + else + istub = Fopen(MyCatenate(db->path,"","",".db"),"r"); + if (istub == NULL) + goto error; + + { int first, last, nfiles; + char prolog[MAX_NAME], fname[MAX_NAME]; + int i, j; + + if (fscanf(istub,DB_NFILE,&nfiles) != 1) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } + + if (db->part > 0) + { int pfirst, plast; + int fbeg, fend; + int n, k; + FILE *indx; + + // Determine first how many and which files span the block (fbeg to fend) + + pfirst = db->ufirst; + plast = pfirst + db->nreads; + + first = 0; + for (fbeg = 0; 
fbeg < nfiles; fbeg++) + { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } + if (last > pfirst) + break; + first = last; + } + for (fend = fbeg+1; fend <= nfiles; fend++) + { if (last >= plast) + break; + if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } + first = last; + } + + indx = Fopen(MyCatenate(db->path,"","",".idx"),"r"); + ncodes = fend-fbeg; + coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); + table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); + if (indx == NULL || coding == NULL || table == NULL) + { ncodes = 0; + goto error; + } + + // Carefully get the first coding scheme (its offset is most likely in a DAZZ_RECORD + // in .idx that is *not* in memory). Get all the other coding schemes normally and + // assign the tables # for each read in the block in "tables". 
+ + rewind(istub); + (void) fscanf(istub,DB_NFILE,&nfiles); + + first = 0; + for (n = 0; n < fbeg; n++) + { (void) fscanf(istub,DB_FDATA,&last,fname,prolog); + first = last; + } + + for (n = fbeg; n < fend; n++) + { (void) fscanf(istub,DB_FDATA,&last,fname,prolog); + + i = n-fbeg; + if (first < pfirst) + { DAZZ_READ read; + + fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*first,SEEK_SET); + if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + ncodes = i; + goto error; + } + fseeko(quiva,read.coff,SEEK_SET); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; + } + else + { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; + db->reads[first-pfirst].coff = ftello(quiva); + } + + j = first-pfirst; + if (j < 0) + j = 0; + k = last-pfirst; + if (k > db->nreads) + k = db->nreads; + while (j < k) + table[j++] = (uint16) i; + + first = last; + } + + fclose(indx); + indx = NULL; + } + + else + { // Load in coding scheme for each file, adjust .coff of first read in the file, and + // record which table each read uses + + ncodes = nfiles; + coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); + table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); + if (coding == NULL || table == NULL) + goto error; + + first = 0; + for (i = 0; i < nfiles; i++) + { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } + + fseeko(quiva,db->reads[first].coff,SEEK_SET); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; + db->reads[first].coff = ftello(quiva); + + for (j = first; j < last; j++) + table[j] = (uint16) i; + + first = last; + } + } + + // Allocate and fill in the 
DAZZ_QV record and add it to the front of the + // track list + + qvtrk = (DAZZ_QV *) Malloc(sizeof(DAZZ_QV),"Allocating QV pseudo-track"); + if (qvtrk == NULL) + goto error; + qvtrk->name = qtrack_name; + if (qvtrk->name == NULL) + goto error; + qvtrk->next = db->tracks; + db->tracks = (DAZZ_TRACK *) qvtrk; + qvtrk->ncodes = ncodes; + qvtrk->table = table; + qvtrk->coding = coding; + qvtrk->quiva = quiva; + } + + fclose(istub); + return (0); + +error: + if (qvtrk != NULL) + free(qvtrk); + if (table != NULL) + free(table); + if (coding != NULL) + { int i; + for (i = 0; i < ncodes; i++) + Free_QVcoding(coding+i); + free(coding); + } + if (indx != NULL) + fclose(indx); + if (istub != NULL) + fclose(istub); + fclose(quiva); + EXIT(1); +} + +// Allocate and return a buffer of 5 vectors big enough for the largest read in 'db' + +char **New_QV_Buffer(DAZZ_DB *db) +{ char **entry; + char *qvs; + int i; + + qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); + entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); + if (qvs == NULL || entry == NULL) + EXIT(NULL); + for (i = 0; i < 5; i++) + entry[i] = qvs + i*db->maxlen; + return (entry); +} + +// Load into entry the QV streams for the i'th read from db. The parameter ascii applies to +// the DELTAG stream as described for Load_Read. 
+ +int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii) +{ DAZZ_READ *reads; + FILE *quiva; + int rlen; + + if (db != Active_DB) + { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) + { EPRINTF(EPLACE,"%s: QV's have not been opened (Load_QVentry)\n",Prog_Name); + EXIT(1); + } + Active_QV = (DAZZ_QV *) db->tracks; + Active_DB = db; + } + + if (i < 0 || i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); + EXIT(1); + } + + reads = db->reads; + quiva = Active_QV->quiva; + rlen = reads[i].rlen; + + fseeko(quiva,reads[i].coff,SEEK_SET); + if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) + EXIT(1); + + if (ascii != 1) + { char *deltag = entry[1]; + + if (ascii != 2) + { char x = deltag[rlen]; + deltag[rlen] = '\0'; + Number_Read(deltag); + deltag[rlen] = x; + } + else + { int j; + int u = 'A'-'a'; + + for (j = 0; j < rlen; j++) + deltag[j] = (char) (deltag[j]+u); + } + } + + return (0); +} + +// Close the QV stream, free the QV pseudo track and all associated memory + +void Close_QVs(DAZZ_DB *db) +{ DAZZ_TRACK *track; + DAZZ_QV *qvtrk; + int i; + + Active_DB = NULL; + + track = db->tracks; + if (track != NULL && strcmp(track->name,".@qvs") == 0) + { qvtrk = (DAZZ_QV *) track; + for (i = 0; i < qvtrk->ncodes; i++) + Free_QVcoding(qvtrk->coding+i); + free(qvtrk->coding); + free(qvtrk->table); + fclose(qvtrk->quiva); + db->tracks = track->next; + free(track); + } + return; +} + + +/******************************************************************************************* + * + * COMMAND LINE @-EXPANSION PARSER + * Take a command line argument and interpret the '@' block number ranges. + * Parse_Block_Arg produces an Block_Looper iterator object that can then + * be invoked multiple times to iterate through all the files implied by + * the @ pattern/range. 
+ * + ********************************************************************************************/ + +typedef struct + { int first, last, next; + char *root, *pwd, *ppnt; + int isDB; + char *slice; + } _Block_Looper; + +// Advance the iterator e_parse to the next file, open it, and return the file pointer +// to it. Return NULL if at the end of the list of files. + +int Next_Block_Exists(Block_Looper *e_parse) +{ _Block_Looper *parse = (_Block_Looper *) e_parse; + + char *disp; + struct stat sts; + + if (parse->isDB) + { if (parse->next+1 > parse->last) + return (0); + else + return (1); + } + + if (parse->next+1 > parse->last) + return (0); + + if (parse->next < 0) + disp = parse->root; + else + disp = MyNumbered_Suffix(parse->root,parse->next+1,parse->ppnt); + + if (stat(MyCatenate(parse->pwd,"/",disp,".las"),&sts)) + return (0); + else + return (1); +} + + +FILE *Next_Block_Arg(Block_Looper *e_parse) +{ _Block_Looper *parse = (_Block_Looper *) e_parse; + + char *disp; + FILE *input; + + if (parse->isDB) + { fprintf(stderr,"%s: Cannot open a DB block as a file (Next_Block_Arg)\n",Prog_Name); + exit (1); + } + + parse->next += 1; + if (parse->next > parse->last) + return (NULL); + + if (parse->next < 0) + disp = parse->root; + else + disp = MyNumbered_Suffix(parse->root,parse->next,parse->ppnt); + + if ((input = fopen(MyCatenate(parse->pwd,"/",disp,".las"),"r")) == NULL) + { if (parse->last != INT_MAX) + { fprintf(stderr,"%s: %s.las is not present\n",Prog_Name,disp); + exit (1); + } + return (NULL); + } + return (input); +} + +// Reset the iterator e_parse to the first file + +void Reset_Block_Arg(Block_Looper *e_parse) +{ _Block_Looper *parse = (_Block_Looper *) e_parse; + + parse->next = parse->first - 1; +} + +// Advance the iterator e_parse to the next file + +int Advance_Block_Arg(Block_Looper *e_parse) +{ _Block_Looper *parse = (_Block_Looper *) e_parse; + + if (Next_Block_Exists(e_parse)) + { parse->next += 1; + return (1); + } + else + return (0); +} + 
+// Return a pointer to the path for the current file + +char *Block_Arg_Path(Block_Looper *e_parse) +{ _Block_Looper *parse = (_Block_Looper *) e_parse; + + return (Strdup(parse->pwd,"Allocating block path")); +} + +// Return a pointer to the root name for the current file + +char *Block_Arg_Root(Block_Looper *e_parse) +{ _Block_Looper *parse = (_Block_Looper *) e_parse; + char *name; + + if (parse->next < 0) + name = parse->root; + else + name = MyNumbered_Suffix(parse->root,parse->next,parse->ppnt); + return (Strdup(name,"Allocating block root")); +} + +// Free the iterator + +void Free_Block_Arg(Block_Looper *e_parse) +{ _Block_Looper *parse = (_Block_Looper *) e_parse; + + free(parse->root); + free(parse->pwd); + free(parse->slice); + free(parse); +} + +char *Next_Block_Slice(Block_Looper *e_parse, int slice) +{ _Block_Looper *parse = (_Block_Looper *) e_parse; + + if (parse->slice == NULL) + { int size = strlen(parse->pwd) + strlen(Block_Arg_Root(parse)) + 30; + parse->slice = (char *) Malloc(size,"Block argument slice"); + if (parse->slice == NULL) + exit (1); + } + + if (parse->next+1 > parse->last) + return (NULL); + if (parse->next+slice > parse->last) + slice = parse->last-parse->next; + + if (parse->first < 0) + sprintf(parse->slice,"%s/%s",parse->pwd,parse->root); + else + sprintf(parse->slice,"%s/%s%c%d-%d%s",parse->pwd,parse->root,BLOCK_SYMBOL,parse->next+1, + parse->next+slice,parse->ppnt); + parse->next += slice; + return (parse->slice); +} + +// Parse the command line argument and return an iterator to move through the +// file names, setting it up to report the first file. 
+ +static Block_Looper *parse_block_arg(char *arg, int isDB) +{ _Block_Looper *parse; + char *pwd, *root; + char *ppnt, *cpnt; + int first, last; + + parse = (_Block_Looper *) Malloc(sizeof(_Block_Looper),"Allocating parse node"); + pwd = PathTo(arg); + if (isDB) + { int len = strlen(arg); + if (strcmp(arg+(len-4),".dam") == 0) + { root = Root(arg,".dam"); + isDB = 2; + } + else + root = Root(arg,".db"); + } + else + root = Root(arg,".las"); + if (parse == NULL || pwd == NULL || root == NULL) + exit (1); + + ppnt = index(root,BLOCK_SYMBOL); + if (ppnt == NULL) + first = last = -1; + else + { if (index(ppnt+1,BLOCK_SYMBOL) != NULL) + { fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n", + Prog_Name,BLOCK_SYMBOL,root); + exit (1); + } + *ppnt++ = '\0'; + first = strtol(ppnt,&cpnt,10); + if (cpnt == ppnt) + { first = 1; + last = INT_MAX; + } + else + { if (first < 1) + { fprintf(stderr, + "%s: Integer following %c-sigan is less than 1 in source name '%s'\n", + Prog_Name,BLOCK_SYMBOL,root); + exit (1); + } + if (*cpnt == '-') + { ppnt = cpnt+1; + last = strtol(ppnt,&cpnt,10); + if (cpnt == ppnt) + { fprintf(stderr,"%s: Second integer must follow - in source name '%s'\n", + Prog_Name,root); + exit (1); + } + if (last < first) + { fprintf(stderr, + "%s: 2nd integer is less than 1st integer in source name '%s'\n", + Prog_Name,root); + exit (1); + } + ppnt = cpnt; + } + else + { last = INT_MAX; + ppnt = cpnt; + } + } + } + + parse->pwd = pwd; + parse->root = root; + parse->ppnt = ppnt; + parse->first = first; + parse->last = last; + parse->next = first-1; + parse->slice = NULL; + parse->isDB = isDB; + + if (isDB && first >= 0 && last == INT_MAX) + { char buffer[2*MAX_NAME+100]; + char *dbname; + FILE *dbfile; + int i, nfiles, nblocks; + + dbname = MyCatenate(pwd,"/",root,"db"); + dbfile = fopen(dbname,"r"); + if (dbfile == NULL) + { dbname = MyCatenate(pwd,"/",root,"dam"); + dbfile = fopen(dbname,"r"); + if (dbfile == NULL) + { fprintf(stderr,"%s: 
Cannot open database %s[db|dam]\n",Prog_Name,root); + exit (1); + } + } + + if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) + SYSTEM_READ_ERROR + for (i = 0; i < nfiles; i++) + if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) + SYSTEM_READ_ERROR + if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) + SYSTEM_READ_ERROR + fclose(dbfile); + + parse->last = nblocks; + } + + return ((Block_Looper *) parse); +} + +Block_Looper *Parse_Block_LAS_Arg(char *arg) +{ return (parse_block_arg(arg, 0)); } + +Block_Looper *Parse_Block_DB_Arg(char *arg) +{ return (parse_block_arg(arg, 1)); } diff --git a/DB.h b/DB.h new file mode 100644 index 0000000..c8d06fe --- /dev/null +++ b/DB.h @@ -0,0 +1,728 @@ +/******************************************************************************************* + * + * Compressed data base module. Auxiliary routines to open and manipulate a data base for + * which the sequence and read information are separated into two separate files, and the + * sequence is compressed into 2-bits for each base. Support for tracks of additional + * information, and trimming according to the current partition. Eventually will also + * support compressed quality information. + * + * Author : Gene Myers + * Date : July 2013 + * Revised: April 2014 + * + ********************************************************************************************/ + +#ifndef _DAZZ_DB + +#define _DAZZ_DB + +#include + +#include "QV.h" + +#define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" + // Undefine if you don't want this + +// For interactive applications where it is inappropriate to simply exit with an error +// message to standard error, define the constant INTERACTIVE. If set, then error +// messages are put in the global variable Ebuffer and the caller of a DB routine +// can decide how to deal with the error. 
+// +// DB, QV, or alignment routines that can encounter errors function as before in +// non-INTERACTIVE mode by exiting after printing an error message to stderr. In +// INTERACTIVE mode the routines place a message at EPLACE and return an error +// value. For such routines that were previously void, they are now int, and +// return 1 if an error occured, 0 otherwise. + +#ifdef INTERACTIVE + +#define EPRINTF sprintf +#define EPLACE Ebuffer +#define EXIT(x) return (x) + +#else // BATCH + +#define EPRINTF fprintf +#define EPLACE stderr +#define EXIT(x) exit (1) + +#endif + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; +typedef unsigned long long uint64; +typedef signed char int8; +typedef signed short int16; +typedef signed int int32; +typedef signed long long int64; +typedef float float32; +typedef double float64; + +#define LAST_READ_SYMBOL '$' +#define BLOCK_SYMBOL '@' + +/******************************************************************************************* + * + * COMMAND LINE INTERPRETATION MACROS + * + ********************************************************************************************/ + +extern char *Prog_Name; // Name of program + +#ifdef INTERACTIVE + +extern char Ebuffer[]; + +#endif + +#define ARG_INIT(name) \ + Prog_Name = Strdup(name,""); \ + for (i = 0; i < 128; i++) \ + flags[i] = 0; + +#define ARG_FLAGS(set) \ + for (k = 1; argv[i][k] != '\0'; k++) \ + { if (index(set,argv[i][k]) == NULL) \ + { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \ + exit (1); \ + } \ + flags[(int) argv[i][k]] = 1; \ + } + +#define ARG_POSITIVE(var,name) \ + var = strtol(argv[i]+2,&eptr,10); \ + if (*eptr != '\0' || argv[i][2] == '\0') \ + { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \ + Prog_Name,argv[i][1],argv[i]+2); \ + exit (1); \ + } \ + if (var <= 0) \ + { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \ + exit (1); \ + } + +#define 
ARG_NON_NEGATIVE(var,name) \ + var = strtol(argv[i]+2,&eptr,10); \ + if (*eptr != '\0' || argv[i][2] == '\0') \ + { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \ + Prog_Name,argv[i][1],argv[i]+2); \ + exit (1); \ + } \ + if (var < 0) \ + { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \ + exit (1); \ + } + +#define ARG_REAL(var) \ + var = strtod(argv[i]+2,&eptr); \ + if (*eptr != '\0' || argv[i][2] == '\0') \ + { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", \ + Prog_Name,argv[i][1],argv[i]+2); \ + exit (1); \ + } + + +/******************************************************************************************* + * + * GUARDED BATCH IO MACROS + * + ********************************************************************************************/ + + // Utilitieis + +int Count_Args(char *arg); + +#define SYSTEM_READ_ERROR \ + { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ + exit (2); \ + } + +#define SYSTEM_WRITE_ERROR \ + { fprintf(stderr,"%s: System error, write failed!\n",Prog_Name); \ + exit (2); \ + } + +#define SYSTEM_CLOSE_ERROR \ + { fprintf(stderr,"%s: System error, file close failed!\n",Prog_Name); \ + exit (2); \ + } + + // Output + +#define FFWRITE(v,s,n,file) \ + { if (fwrite(v,s,n,file) != (size_t) n) \ + SYSTEM_WRITE_ERROR \ + } + +#define FPRINTF(file,...) \ + { if (fprintf(file,__VA_ARGS__) < 0) \ + SYSTEM_WRITE_ERROR \ + } + +#define PRINTF(...) \ + { if (printf(__VA_ARGS__) < 0) \ + SYSTEM_WRITE_ERROR \ + } + +#define FPUTS(x,file) \ + { if (fputs(x,file) == EOF) \ + SYSTEM_WRITE_ERROR \ + } + + // Close + +#define FCLOSE(file) \ + { if (fclose(file) != 0) \ + SYSTEM_CLOSE_ERROR \ + } + + // Input + +#define FFREAD(v,s,n,file) \ + { if (fread(v,s,n,file) != (size_t) n) \ + { if (ferror(file)) \ + SYSTEM_READ_ERROR \ + else \ + { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ + exit (1); \ + } \ + } \ + } + +#define FSCANF(file,...) 
\ + { if (fscanf(file,__VA_ARGS__) != Count_Args(#__VA_ARGS__)-1) \ + { if (ferror(file)) \ + SYSTEM_READ_ERROR \ + else \ + { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ + exit (1); \ + } \ + } \ + } + +#define FGETS(v,n,file) \ + { if (fgets(v,n,file) == NULL) \ + { if (ferror(file)) \ + SYSTEM_READ_ERROR \ + else \ + { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \ + exit (1); \ + } \ + } \ + } + +#define FSEEKO(file,p,d) \ + { if (fseeko(file,p,d) < 0) \ + SYSTEM_READ_ERROR \ + } + +#define FTELLO(val,file) \ + { val = ftello(file); \ + if (val < 0) \ + SYSTEM_READ_ERROR \ + } + +/******************************************************************************************* + * + * UTILITIES + * + ********************************************************************************************/ + +// The following general utilities return NULL if any of their input pointers are NULL, or if they +// could not perform their function (in which case they also print an error to stderr). 
+ +void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc +void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to +char *Strdup(char *string, char *mesg); // stderr if out of memory + +FILE *Fopen(char *path, char *mode); // Open file path for "mode" +char *PathTo(char *path); // Return path portion of file name "path" +char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path" + +// Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer +// Numbered_Suffix returns concatenation of left..right in a *temporary* buffer + +char *Catenate(char *path, char *sep, char *root, char *suffix); +char *Numbered_Suffix(char *left, int num, char *right); + + +// DB-related utilities + +void Print_Number(int64 num, int width, FILE *out); // Print big integer with commas +int Number_Digits(int64 num); // Return # of digits in printed number + +#define COMPRESSED_LEN(len) (((len)+3) >> 2) + +void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form +void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form +void Print_Read(char *s, int width); + +void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt) +void Upper_Read(char *s); // Convert read from numbers to uppercase letters (0-3 to ACGT) +void Number_Read(char *s); // Convert read from letters to numbers +void Change_Read(char *s); // Convert read from one case to the other + +void Letter_Arrow(char *s); // Convert arrow pw's from numbers to uppercase letters (0-3 to 1234) +void Number_Arrow(char *s); // Convert arrow pw string from letters to numbers + + +/******************************************************************************************* + * + * DB IN-CORE DATA STRUCTURES + * + ********************************************************************************************/ + +#define DB_QV 0x03ff // Mask for 3-digit quality 
value +#define DB_CCS 0x0400 // This is the second or later of a group of subreads from a given insert +#define DB_BEST 0x0800 // This is the "best" subread of a given insert (may be the only 1) + +#define DB_ARROW 0x2 // DB is an arrow DB +#define DB_ALL 0x1 // all wells are in the trimmed DB + +// Fields have different interpretations if a .db versus a .dam + +typedef struct + { int origin; // Well # (DB), Contig # (DAM) + int rlen; // Length of the sequence (Last pulse = fpulse + rlen) + int fpulse; // First pulse (DB), left index of contig in scaffold (DAM) + int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of + // uncompressed bases in memory block + int64 coff; // Offset (in bytes) of compressed quiva streams in '.qvs' file (DB), + // Offset (in bytes) of scaffold header string in '.hdr' file (DAM) + // 4 compressed shorts containing snr info if an arrow DB. + int flags; // QV of read + flags above (DB only) + } DAZZ_READ; + +// A track can be of 3 types: +// data == NULL: there are nreads 'anno' records of size 'size'. +// data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1]) +// contains the variable length data +// data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1]) +// contains the variable length data +// if loaded is set then the data is not loaded if present, rather data is an open file pointer +// set for reading. + +typedef struct _track + { struct _track *next; // Link to next track + char *name; // Symbolic name of track + int size; // Size in bytes of anno records + int nreads; // Number of reads in track + void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records + int *alen; // length of track data for read i (if data != NULL) + void *data; // data[anno[i] .. anno[i]+alen[i[) is data for read i (if data != NULL) + int loaded; // Is track data loaded in memory? 
+ int64 dmax; // Largest read data segment in bytes + } DAZZ_TRACK; + +// The tailing part of a .anno track file can contain meta-information produced by the +// command that produced the track. For example, the coverage, or good/bad parameters +// for trimming, or even say a histogram of QV values. Each item is an array of 'nelem' +// 64-bit ints or floats ('vtype' = DB_INT or DB_REAL), has a 'name' string that +// describes it, and an indicator as to whether the values should be equal accross all +// block tracks, or summed accross all block tracks (by Catrack). 'value' points at the +// array of values + +#define DB_INT 0 +#define DB_REAL 1 + +#define DB_EXACT 0 +#define DB_SUM 1 + +typedef struct + { int vtype; // INT64 or FLOAST64 + int nelem; // >= 1 + int accum; // EXACT, SUM + char *name; + void *value; + } DAZZ_EXTRA; + +// The information for accessing QV streams is in a DAZZ_QV record that is a "pseudo-track" +// named ".@qvs" and is always the first track record in the list (if present). Since normal +// track names cannot begin with a . (this is enforced), this pseudo-track is never confused +// with a normal track. + +typedef struct + { struct _track *next; + char *name; + int ncodes; // # of coding tables + QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h) + uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with + // scheme coding[table[i]] + FILE *quiva; // the open file pointer to the .qvs file + } DAZZ_QV; + +// The information for accessing Arrow streams is in a DAZZ_ARW record that is a "pseudo-track" +// named ".@arw" and is always the first track record in the list (if present). +// Since normal track names cannot begin with a . (this is enforced), this pseudo-track +// is never confused with a normal track. 
+ +typedef struct + { struct _track *next; + char *name; + int64 *aoff; // offset in file or memory of arrow vector for read i + void *arrow; // FILE * to the .arw file if not loaded, memory block otherwise + int loaded; // Are arrow vectors loaded in memory? + } DAZZ_ARROW; + +// Every DB is referred to by an ASCII stub file with extension .db or .dam. This file +// contains the information about the SMRT cells in the DB and the current division of +// the DB into blocks for HPC processing. This file can be read into the following +// data structure: + +typedef struct + { int nfiles; // Number of files/SMRT cells in DB + int *nreads; // [0..nfiles) = # of reads from cell + char **fname; // [0..nfiles) = file name of cell + char **prolog; // [0..nfiles) = fasta header prolog for cell + int all; // Keep only best read from each well? + int cutoff; // Trim reads less than cutoff + int64 bsize; // Target size for blocks + int nblocks; // Number of blocks for DB + int *ublocks; // [0..nblcoks] = index of 1st read in block in untrimmed DB + int *tblocks; // [0..nblcoks] = index of 1st read in block in trimmed DB + } DAZZ_STUB; + +// The DB record holds all information about the current state of an active DB including an +// array of DAZZ_READS, one per read, and a linked list of DAZZ_TRACKs the first of which +// is always a DAZZ_QV pseudo-track (if the QVs have been loaded). 
+ +typedef struct + { int ureads; // Total number of reads in untrimmed DB + int treads; // Total number of reads in trimmed DB + int cutoff; // Minimum read length in block (-1 if not yet set) + int allarr; // DB_ALL | DB_ARROW + float freq[4]; // frequency of A, C, G, T, respectively + + // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) + + int maxlen; // length of maximum read (initially over all DB) + int64 totlen; // total # of bases (initially over all DB) + + int nreads; // # of reads in actively loaded portion of DB + int trimmed; // DB has been trimmed by cutoff/all + int part; // DB block (if > 0), total DB (if == 0) + int ufirst; // Index of first read in block (without trimming) + int tfirst; // Index of first read in block (with trimming) + + // In order to avoid forcing users to have to rebuild all thier DBs to accommodate + // the addition of fields for the size of the actively loaded trimmed and untrimmed + // blocks, an additional read record is allocated in "reads" when a DB is loaded into + // memory (reads[-1]) and the two desired fields are crammed into the first two + // integer spaces of the record. + + char *path; // Root name of DB for .bps, .qvs, and tracks + int loaded; // Are reads loaded in memory? + void *bases; // file pointer for bases file (to fetch reads from), + // or memory pointer to uncompressed block of all sequences. 
+ DAZZ_READ *reads; // Array [-1..nreads] of DAZZ_READ + DAZZ_TRACK *tracks; // Linked list of loaded tracks + } DAZZ_DB; + + +/******************************************************************************************* + * + * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock + * + ********************************************************************************************/ + +#define MAX_NAME 10000 // Longest file name or fasta header line + +#define DB_NFILE "files = %9d\n" // number of files +#define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name +#define DB_NBLOCK "blocks = %9d\n" // number of blocks +#define DB_PARAMS "size = %11lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well +#define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed) + + // Read the specified contents of the DB stub file at "path" and return it encoded in a DAZZ_STUB + // structure. This is allocated by the routine. "path" is assumed to be the complete + // name of the file. If all flags are off, then just the scalar parts of the stub + // are returned (i.e. nfiles, all, cutoff, bsize, nblocks). Returns NULL if an error + // occured in INTERACTIVE mode + +#define DB_STUB_NREADS 0x1 +#define DB_STUB_FILES 0x2 +#define DB_STUB_PROLOGS 0x4 +#define DB_STUB_BLOCKS 0x8 + +DAZZ_STUB *Read_DB_Stub(char *path, int what); + + // Read the DB stub file "path" and extract the read index range [*first,*last) + // for block n, for the trimmed DB if trim is set, the untrimmed DB otherwise. + // If n is out of range first and last will be set to -1. Returns 0 unless + // an error occurs in INTERACTIVE mode in which case it returns 1. 
+ +int Fetch_Block_Range(char *path, int trim, int n, int *first, int *last); + + // Free a DAZZ_STUB data structure returned by Read_DB_Stub + +void Free_DB_Stub(DAZZ_STUB *stub); + + +/******************************************************************************************* + * + * DB ROUTINES + * + ********************************************************************************************/ + + // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps, + // .DB.qvs, and files .DB..anno and DB..data where is a track name + // (not containing a . !). + + // A DAM is basically a DB except that: + // 1. there are no QV's, instead .coff points to the '\0' terminated fasta header of the read + // in an additional file: .DB.hdr + // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences + // contain N-separated contigs), and .fpulse the first base of the contig in the + // fasta entry + + // Open the given database or dam, "path", into the supplied DAZZ_DB record "db". If the name has + // a part # in it then just the part is opened. The index array is allocated (for all or + // just the part) and read in. + // Return status of routine: + // -1: The DB could not be opened for a reason reported by the routine to EPLACE + // 0: Open of DB proceeded without mishap + // 1: Open of DAM proceeded without mishap + +int Open_DB(char *path, DAZZ_DB *db); + + // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings + // of the current DB partition. Reallocate smaller memory blocks for the information kept + // for the retained reads. + +void Trim_DB(DAZZ_DB *db); + + // Return the size in bytes of the given DB + +int64 sizeof_DB(DAZZ_DB *db); + + // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. 
all + // those of the form "prefix/[.]root.part" and call actor with the complete path to each file + // pointed at by path, and the suffix of the path by extension. The . proceeds the root + // name if the defined constant HIDE_FILES is set. Always the first call is with the + // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for + // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and + // so this routine gives one a way to know all the tracks associated with a given DB. + // -1 is returned if the path could not be found, and 1 is returned if an error (reported + // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. + +int List_DB_Files(char *path, void actor(char *path, char *extension)); + + // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, + // and any open file pointers. The record pointed at by db however remains (the user + // supplied it and so should free it). + +void Close_DB(DAZZ_DB *db); + + +/******************************************************************************************* + * + * READ ROUTINES + * + ********************************************************************************************/ + + // Allocate and return a buffer big enough for the largest read in 'db'. + // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte + // are needed by the alignment algorithms. If cannot allocate memory then return NULL + // if INTERACTIVE is defined, or print error to stderr and exit otherwise. + +char *New_Read_Buffer(DAZZ_DB *db); + + // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an + // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) + // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimeter + // for traversals in either direction. 
A non-zero value is returned if an error occured + // and INTERACTIVE is defined. + +int Load_Read(DAZZ_DB *db, int i, char *read, int ascii); + + // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the + // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii + // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string + // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to + // the string holding the substring so it has a delimeter for traversals in either direction. + // A NULL pointer is returned if an error occured and INTERACTIVE is defined. + +char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii); + + // Allocate a block big enough for all the uncompressed read sequences and read and uncompress + // the reads into it, reset the 'boff' in each read record to be its in-memory offset, + // and set the bases pointer to point at the block after closing the bases file. Return + // with a zero, except when an error occurs and INTERACTIVE is defined in which + // case return wtih 1. + +int Load_All_Reads(DAZZ_DB *db, int ascii); + + +/******************************************************************************************* + * + * ARROW ROUTINES + * + ********************************************************************************************/ + + // If the Arrow pseudo track is not already in db's track list, then load it and set it up. + // The database reads must not have been loaded with Load_All_Reads yet. + // -1 is returned if a .arw file is not present, and 1 is returned if an error (reported + // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. 
+ +int Open_Arrow(DAZZ_DB *db); + + // Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence, + // and there is only a choice between numeric (0) or ascii (1); + +int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii); + + // Allocate a block big enough for all the uncompressed Arrow vectors, read them into it, + // reset the 'off' in each arrow record to be its in-memory offset, and set the + // arrow pointer to point at the block after closing the arrow file. If ascii is + // non-zero then the arrows are converted to 0123 ascii, otherwise the arrows are left + // as numeric strings over [0-3]. + +int Load_All_Arrows(DAZZ_DB *db, int ascii); + + // Remove the Arrow pseudo track, all space associated with it, and close the .arw file. + +void Close_Arrow(DAZZ_DB *); + + +/******************************************************************************************* + * + * TRACK ROUTINES + * + ********************************************************************************************/ + + // Look up the file and header in the file of the indicated track. Return: + // 1: Track is for trimmed DB + // 0: Track is for untrimmed DB + // -1: Track is not the right size of DB either trimmed or untrimmed + // -2: Could not find the track + // In addition, if opened (0 or 1 returned), then kind points at an integer indicating + // the type of track as follows: + // CUSTOM 0 => a custom track + // MASK 1 => a mask track + +#define CUSTOM_TRACK 0 +#define MASK_TRACK 1 + +int Check_Track(DAZZ_DB *db, char *track, int *kind); + + // If track is not already in the db's track list, then allocate all the storage for the anno + // index, read it in from the appropriate file, add it to the track list, and return a pointer + // to the newly created DAZZ_TRACK record. If the track does not exist or cannot be + // opened for some reason, then NULL is returned if INTERACTIVE is defined. 
Otherwise + // the routine prints an error message to stderr and exits if an error occurs, and returns + // with NULL only if the track does not exist. + +DAZZ_TRACK *Open_Track(DAZZ_DB *db, char *track); + + // Allocate a data buffer large enough to hold the longest read data block that will occur + // in the track. If cannot allocate memory then return NULL if INTERACTIVE is defined, + // or print error to stderr and exit otherwise. + +void *New_Track_Buffer(DAZZ_TRACK *track); + + // Load into 'data' the read data block for read i's "track" data. Return the length of + // the data in bytes, unless an error occurs and INTERACTIVE is defined in which case + // return wtih -1. + +int Load_Track_Data(DAZZ_TRACK *track, int i, void *data); + + // Allocate a block big enough for all the track data and read the data into it, + // reset the 'off' in each anno pointer to be its in-memory offset, and set the + // data pointer to point at the block after closing the data file. Return with a + // zero, except when an error occurs and INTERACTIVE is defined in which + // case return wtih 1. + +int Load_All_Track_Data(DAZZ_TRACK *track); + + // Assumming file pointer for afile is correctly positioned at the start of an extra item, + // and aname is the name of the .anno file, decode the value present and place it in + // extra if extra->nelem == 0, otherwise reduce the value just read into extra according + // according to the directive given by 'accum'. Leave the read pointer at the next + // extra or end-of-file. 
+ // Returns: + // 1 if at the end of file, + // 0 if item was read and folded correctly, + // -1 if there was a system IO or allocation error (if interactive), and + // -2 if the new value could not be reduced into the current value of extra (interactive) + +int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra); + + // Write extra record to end of file afile and advance write pointer + // If interactive, then return non-zero on error, if batch, then print + // and halt if an error + +int Write_Extra(FILE *afile, DAZZ_EXTRA *extra); + + // If track is on the db's track list, then it is removed and all storage associated with it + // is freed. + +void Close_Track(DAZZ_DB *db, DAZZ_TRACK *track); + + +/******************************************************************************************* + * + * QV ROUTINES + * + ********************************************************************************************/ + + // If QV pseudo track is not already in db's track list, then load it and set it up. + // The database must not have been trimmed yet. -1 is returned if a .qvs file is not + // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE + // is defined. Otherwise a 0 is returned. + +int Open_QVs(DAZZ_DB *db); + + // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur + // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined, + // or print error to stderr and exit otherwise. + +#define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer +#define DEL_TAG 1 // The deleted characters +#define INS_QV 2 // The insertion QVs +#define SUB_QV 3 // The substitution QVs +#define MRG_QV 4 // The merge QVs + +char **New_QV_Buffer(DAZZ_DB *db); + + // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters + // are converted to a numeric or upper/lower case ascii string as per ascii. 
Return with + // a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1. + +int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii); + + // Remove the QV pseudo track, all space associated with it, and close the .qvs file. + +void Close_QVs(DAZZ_DB *db); + + +/******************************************************************************************* + * + * @-SIGN EXPANSION ROUTINES + * + ********************************************************************************************/ + + // Take a command line argument and interpret the '@' block number ranges. + // Parse_Block_[LAS,DB]_Arg produces a Block_Looper iterator object that can then + // be invoked multiple times to iterate through all the file names implied by + // the @ pattern/range. Next_Block_Slice returns a string encoing the next + // slice files represented by an @-notation, and advances the iterator by + // that many files. + +typedef void Block_Looper; + +Block_Looper *Parse_Block_LAS_Arg(char *arg); +Block_Looper *Parse_Block_DB_Arg(char *arg); + +int Next_Block_Exists(Block_Looper *e_parse); +FILE *Next_Block_Arg(Block_Looper *e_parse); +void Reset_Block_Arg(Block_Looper *e_parse); // Reset iterator to first file +int Advance_Block_Arg(Block_Looper *e_parse); // Advance iterator to next file, 0 if none +void Free_Block_Arg(Block_Looper *e_parse); // Free the iterator + +char *Next_Block_Slice(Block_Looper *e_parse,int slice); + +char *Block_Arg_Path(Block_Looper *e_parse); // Path of current file, must free +char *Block_Arg_Root(Block_Looper *e_parse); // Root name of current file, must free + +#endif // _DAZZ_DB diff --git a/HPC.daligner.c b/HPC.daligner.c new file mode 100644 index 0000000..b9e5187 --- /dev/null +++ b/HPC.daligner.c @@ -0,0 +1,1159 @@ +/*********************************************************************************************\ + * + * Produce a script to compute overlaps for all block pairs of a DB, and then sort and merge + * them 
into as many .las files as their are blocks. + * + * Author: Gene Myers + * Date : June 1, 2014 + * + *********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" + +#undef LSF // define if want a directly executable LSF script +#undef SLURM // define if want a directly executable SLURM script + +static char *Usage[] = + { "[-vad] [-l] [-s] [-t] [-M]", + " [-P] [-B] [-T] [-f]", + " ( [-k] [-%] [-h] [-e] [-H]", + " [-k] [-%] [-h] [-e] )", + " [-m]+ [[-]]" + }; + + // Command Options + +static int BUNIT; +static int VON, CON, DON; +static int WINT, TINT, HGAP, HINT, KINT, SINT, PINT, LINT, MINT; +static int NTHREADS; +static double EREL; +static int MMAX, MTOP; +static char **MASK; +static char *ONAME; +static char *PDIR; + +#ifdef LSF + +#define HPC + +#define HPC_ALIGN \ + "bsub -q medium -n %d -o DALIGNER.out -e DALIGNER.err -R span[hosts=1] -J align#%d" +#define HPC_MERGE \ + "bsub -q short -n 12 -o MERGE.DAL.out -e MERGE.DAL.err -R span[hosts=1] -J merge#%d" +#define HPC_CHECK \ + "bsub -q short -n 12 -o CHECK.DAL.out -e CHECK.DAL.err -R span[hosts=1] -J check#%d" + +#endif + +#ifdef SLURM + +#define HPC + +#define HPC_ALIGN \ + "srun -p batch -n 1 -c %d --mem_per_cpu=%d -o DALIGNER.out -e DALIGNER.err -J align#%d" +#define HPC_MERGE \ + "srun -p batch -n 1 -c 12 -t 00:05:00 -o MERGE.DAL.out -e MERGE.DAL.err -J merge#%d" +#define HPC_CHECK \ + "srun -p batch -n 1 -c 12 -t 00:05:00 -o CHECK.DAL.out -e CHECK.DAL.err -J check#%d" + +#endif + +void daligner_script(int argc, char *argv[]) +{ int nblocks; + int usepath; + int useblock; + int fblock, lblock; +#ifdef HPC + int jobid; +#endif + + FILE *out; + char name[100]; + char *pwd, *root; + + // Make sure DB exists and is partitioned, get number of blocks in partition + + pwd = PathTo(argv[1]); + if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) + root = 
Root(argv[1],".dam"); + else + root = Root(argv[1],".db"); + + { int i, nfiles; + FILE *dbvis; + + dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r"); + if (dbvis == NULL) + { dbvis = Fopen(Catenate(pwd,"/",root,".db"),"r"); + if (dbvis == NULL) + exit (1); + } + + if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) + SYSTEM_READ_ERROR + for (i = 0; i < nfiles; i++) + { char buffer[30001]; + + if (fgets(buffer,30000,dbvis) == NULL) + SYSTEM_READ_ERROR + } + + useblock = 1; + if (fscanf(dbvis,"blocks = %d\n",&nblocks) != 1 || nblocks == 1) + { useblock = 0; + nblocks = 1; + } + + usepath = (strcmp(pwd,".") != 0); + + fclose(dbvis); + } + + // Set range fblock-lblock checking that DB..las exists & DB..las does not + + { char *eptr, *fptr; + FILE *file; + + if (argc == 3) + { fblock = strtol(argv[2],&eptr,10); + if (*eptr != '\0' && *eptr != '-') + { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", + Prog_Name,argv[2]); + exit (1); + } + useblock = 1; + if (*eptr == '-') + { lblock = strtol(eptr+1,&fptr,10); + if (*fptr != '\0') + { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", + Prog_Name,eptr+1); + exit (1); + } + } + else + lblock = fblock; + if (fblock < 1 || lblock > nblocks || fblock > lblock) + { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); + exit (1); + } + } + else + { fblock = 1; + lblock = nblocks; + } + + if (fblock > 1) + { file = fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",fblock-1,".las")),"r"); + if (file == NULL) + { if (usepath) + fprintf(stderr,"%s: File %s/%s.%d.las should already be present!\n", + Prog_Name,pwd,root,fblock-1); + else + fprintf(stderr,"%s: File %s.%d.las should already be present!\n", + Prog_Name,root,fblock-1); + exit (1); + } + else + fclose(file); + } + if (useblock) + file = fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",fblock,".las")),"r"); + else + file = fopen(Catenate(pwd,"/",root,".las"),"r"); + if (file != NULL) + { if (usepath) + if 
(useblock) + fprintf(stderr,"%s: File %s/%s.%d.las should not yet exist!\n", + Prog_Name,pwd,root,fblock); + else + fprintf(stderr,"%s: File %s/%s.las should not yet exist!\n",Prog_Name,pwd,root); + else + if (useblock) + fprintf(stderr,"%s: File %s.%d.las should not yet exist!\n",Prog_Name,root,fblock); + else + fprintf(stderr,"%s: File %s.las should not yet exist!\n",Prog_Name,root); + exit (1); + } + + DON = (DON && (lblock > 1)); + out = stdout; + } + + { int njobs; + int i, j, k; + + // Create all work subdirectories if DON + + if (DON && lblock > 1) + { if (ONAME != NULL) + { sprintf(name,"%s.00.MKDIR",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Create work subdirectories\n"); + for (i = 1; i <= lblock; i++) + fprintf(out,"mkdir -p work%d\n",i); + + if (ONAME != NULL) + fclose(out); + } + + // Produce all necessary daligner jobs + + if (ONAME != NULL) + { sprintf(name,"%s.01.OVL",ONAME); + out = fopen(name,"w"); + } + + njobs = 0; + for (i = fblock; i <= lblock; i++) + njobs += (i-1)/BUNIT+1; + + fprintf(out,"# Daligner jobs (%d)\n",njobs); + +#ifdef HPC + jobid = 1; +#endif + for (i = fblock; i <= lblock; i++) + { int bits; + int low, hgh; + + bits = (i-1)/BUNIT+1; + low = 1; + for (j = 1; j <= bits; j++) + { +#ifdef LSF + fprintf(out,HPC_ALIGN,NTHREADS,jobid++); + fprintf(out," \""); +#endif +#ifdef SLURM + if (MINT >= 0) + fprintf(out,HPC_ALIGN,NTHREADS,(MINT*1024)/NTHREADS,jobid++); + else + fprintf(out,HPC_ALIGN,NTHREADS,(16*1024)/NTHREADS,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"daligner"); + if (VON) + fprintf(out," -v"); + if (CON) + fprintf(out," -a"); + if (KINT != 16) + fprintf(out," -k%d",KINT); + if (PINT != 28) + fprintf(out," -%%%d",PINT); + if (WINT != 6) + fprintf(out," -w%d",WINT); + if (HINT != 50) + fprintf(out," -h%d",HINT); + if (TINT > 0) + fprintf(out," -t%d",TINT); + if (HGAP > 0) + fprintf(out," -H%d",HGAP); + if (EREL > 0.) 
+ fprintf(out," -e%g",EREL); + if (LINT != 1500) + fprintf(out," -l%d",LINT); + if (SINT != 100) + fprintf(out," -s%d",SINT); + if (MINT >= 0) + fprintf(out," -M%d",MINT); + if (PDIR != NULL) + fprintf(out," -P%s",PDIR); + if (NTHREADS != 4) + fprintf(out," -T%d",NTHREADS); + for (k = 0; k < MTOP; k++) + fprintf(out," -m%s",MASK[k]); + if (useblock) + if (usepath) + fprintf(out," %s/%s.%d",pwd,root,i); + else + fprintf(out," %s.%d",root,i); + else + if (usepath) + fprintf(out," %s/%s",pwd,root); + else + fprintf(out," %s",root); + hgh = (i*j)/bits + 1; + + if (useblock) + if (usepath) + fprintf(out," %s/%s.@%d-%d",pwd,root,low,hgh-1); + else + fprintf(out," %s.@%d-%d",root,low,hgh-1); + else + if (usepath) + fprintf(out," %s/%s",pwd,root); + else + fprintf(out," %s",root); + + if (lblock == 1) // ==> i = 1, [low,hgh) = [1,2) + { fprintf(out," && mv"); + if (useblock) + fprintf(out," %s.1.%s.1.las",root,root); + else + fprintf(out," %s.%s.las",root,root); + if (usepath) + fprintf(out," %s/",pwd); + else + fprintf(out," "); + if (useblock) + fprintf(out,"%s.1.las",root); + else + fprintf(out,"%s.las",root); + } + else if (DON) + { fprintf(out," && mv"); + for (k = low; k < hgh; k++) + fprintf(out," %s.%d.%s.%d.las",root,i,root,k); + fprintf(out," work%d",i); + for (k = low; k < hgh; k++) + if (k != i) + fprintf(out," && mv %s.%d.%s.%d.las work%d",root,k,root,i,k); + } + +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + low = hgh; + } + } + + // Check .las files (optional) + + if (ONAME != NULL) + { fclose(out); + sprintf(name,"%s.02.CHECK.OPT",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Check initial .las files jobs (%d) (optional but recommended)\n",lblock); + +#ifdef HPC + jobid = 1; +#endif + for (i = 1; i <= lblock; i++) + { +#ifdef HPC + fprintf(out,HPC_CHECK,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"LAcheck -v%sS",CON?"a":""); + if (usepath) + fprintf(out," %s/%s",pwd,root); + else + fprintf(out," %s",root); + if (lblock 
== 1) + { if (usepath) + if (useblock) + fprintf(out," %s/%s.1",pwd,root); + else + fprintf(out," %s/%s",pwd,root); + else + if (useblock) + fprintf(out," %s.1",root); + else + fprintf(out," %s",root); + } + else if (i < fblock) + { if (DON) + fprintf(out," work%d/%s.%d.%s.%c%d",i,root,i,root,BLOCK_SYMBOL,fblock); + else + fprintf(out," %s.%d.%s.%c%d",root,i,root,BLOCK_SYMBOL,fblock); + } + else + { if (DON) + fprintf(out," work%d/%s.%d.%s.%c",i,root,i,root,BLOCK_SYMBOL); + else + fprintf(out," %s.%d.%s.%c",root,i,root,BLOCK_SYMBOL); + } +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + if (ONAME != NULL) + fclose(out); + + // Merges required if lblock > 1 + + if (lblock > 1) + { if (ONAME != NULL) + { sprintf(name,"%s.03.MERGE",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Merge jobs (%d)\n",lblock); + + // Incremental update merges + +#ifdef HPC + jobid = 1; +#endif + for (j = 1; j < fblock; j++) + { +#ifdef HPC + fprintf(out,HPC_MERGE,jobid++); + fprintf(out," \""); +#endif + if (DON) + { if (usepath) + fprintf(out,"mv %s/%s.%d.las work%d/_%s.%d.las && ", + pwd,root,j,j,root,j); + else + fprintf(out,"mv %s.%d.las work%d/_%s.%d.las && ",root,j,j,root,j); + } + else + { if (usepath) + fprintf(out,"mv %s/%s.%d.las _%s.%d.las && ",pwd,root,j,root,j); + else + fprintf(out,"mv %s.%d.las _%s.%d.las && ",root,j,root,j); + } + fprintf(out,"LAmerge"); + if (VON) + fprintf(out," -v"); + if (CON) + fprintf(out," -a"); + if (usepath) + fprintf(out," %s/%s.%d",pwd,root,j); + else + fprintf(out," %s.%d",root,j); + if (DON) + fprintf(out," work%d/_%s.%d",j,root,j); + else + fprintf(out," _%s.%d",root,j); + if (DON) + fprintf(out," work%d/%s.%d.%s.%c%d-%d",j,root,j,root,BLOCK_SYMBOL,fblock,lblock); + else + fprintf(out," %s.%d.%s.%c%d-%d",root,j,root,BLOCK_SYMBOL,fblock,lblock); + if (usepath) + fprintf(out," && LAcheck -v%sS %s/%s %s/%s.%d",CON?"a":"",pwd,root,pwd,root,j); + else + fprintf(out," && LAcheck -v%sS %s %s.%d",CON?"a":"",root,root,j); + 
if (DON) + fprintf(out," && rm work%d/_%s.%d.las",j,root,j); + else + fprintf(out," && rm _%s.%d.las",root,j); +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + // New block merges + + for (j = fblock; j <= lblock; j++) + { +#ifdef HPC + fprintf(out,HPC_MERGE,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"LAmerge"); + if (VON) + fprintf(out," -v"); + if (CON) + fprintf(out," -a"); + if (usepath) + fprintf(out," %s/%s.%d",pwd,root,j); + else + fprintf(out," %s.%d",root,j); + if (DON) + fprintf(out," work%d/%s.%d.%s.%c",j,root,j,root,BLOCK_SYMBOL); + else + fprintf(out," %s.%d.%s.%c",root,j,root,BLOCK_SYMBOL); + if (usepath) + fprintf(out," && LAcheck -v%sS %s/%s %s/%s.%d",CON?"a":"",pwd,root,pwd,root,j); + else + fprintf(out," && LAcheck -v%sS %s %s.%d",CON?"a":"",root,root,j); +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + // Cleanup (optional) + + if (ONAME != NULL) + { fclose(out); + sprintf(name,"%s.04.RM.OPT",ONAME); + out = fopen(name,"w"); + } + fprintf(out,"# Remove block .las files (optional)\n"); + + for (i = 1; i <= lblock; i++) + { if (DON) + fprintf(out,"cd work%d; ",i); + fprintf(out,"rm %s.%d.%s.*.las",root,i,root); + if (DON) + fprintf(out,"; cd .."); + fprintf(out,"\n"); + } + + if (ONAME != NULL) + fclose(out); + } + } + + free(root); + free(pwd); +} + +/*********************************************************************************************\ + * + * Produce a script to compute overlaps for all block pairs between two DBs, and then sort + * and merge them into as many .las files as their are blocks of the 1st DB. 
+ * + * Author: Gene Myers + * Date : December 31, 2014 + * + *********************************************************************************************/ + +#ifdef LSF + +#define HPC_MALIGN \ + "bsub -q medium -n %d -o MAP.ALL.out -e MAP.ALL.err -R span[hosts=1] -J malign#%d" +#define HPC_MMERGE \ + "bsub -q short -n 12 -o MERGE.ALL.out -e MERGE.ALL.err -R span[hosts=1] -J mmerge#%d" +#define HPC_MCHECK \ + "bsub -q short -n 12 -o CHECK.ALL.out -e CHECK.ALL.err -R span[hosts=1] -J mcheck#%d" + +#endif + +#ifdef SLURM + +#define HPC_MALIGN \ + "srun -p batch -n 1 -c %d --mem_per_cpu=%d -o MAP.ALL.out -e MAP.ALL.err -J malign#%d" +#define HPC_MMERGE \ + "srun -p batch -n 1 -c 12 -t 00:05:00 -o MERGE.ALL.out -e MERGE.ALL.err -J mmerge#%d" +#define HPC_MCHECK \ + "srun -p batch -n 1 -c 12 -t 00:05:00 -o CHECK.ALL.out -e CHECK.DAL.err -J mcheck#%d" + +#endif + +void mapper_script(int argc, char *argv[]) +{ int nblocks1, nblocks2; + int useblock1, useblock2; + int usepath1, usepath2; + int fblock, lblock; +#ifdef HPC + int jobid; +#endif + + FILE *out; + char name[100]; + char *pwd1, *root1; + char *pwd2, *root2; + + // Make sure DAM and DB exist and the DB is partitioned, get number of blocks in partition + + pwd1 = PathTo(argv[1]); + if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) + root1 = Root(argv[1],".dam"); + else + root1 = Root(argv[1],".db"); + + { int i, nfiles; + FILE *dbvis; + + dbvis = fopen(Catenate(pwd1,"/",root1,".dam"),"r"); + if (dbvis == NULL) + { dbvis = Fopen(Catenate(pwd1,"/",root1,".db"),"r"); + if (dbvis == NULL) + exit (1); + } + + if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) + SYSTEM_READ_ERROR + for (i = 0; i < nfiles; i++) + { char buffer[30001]; + + if (fgets(buffer,30000,dbvis) == NULL) + SYSTEM_READ_ERROR + } + + useblock1 = 1; + if (fscanf(dbvis,"blocks = %d\n",&nblocks1) != 1 || nblocks1 == 1) + { useblock1 = 0; + nblocks1 = 1; + } + + usepath1 = (strcmp(pwd1,".") != 0); + + fclose(dbvis); + } + + pwd2 = PathTo(argv[2]); + if 
(strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) + root2 = Root(argv[2],".dam"); + else + root2 = Root(argv[2],".db"); + + if (strcmp(root2,root1) == 0 && strcmp(pwd1,pwd2) == 0) + { fprintf(stderr,"%s: Comparing the same data base %s/%s against itself, use HPCdaligner\n", + Prog_Name,pwd1,root1); + exit (1); + } + + { int i, nfiles; + FILE *dbvis; + + dbvis = fopen(Catenate(pwd2,"/",root2,".dam"),"r"); + if (dbvis == NULL) + { dbvis = Fopen(Catenate(pwd2,"/",root2,".db"),"r"); + if (dbvis == NULL) + exit (1); + } + + if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) + SYSTEM_READ_ERROR + for (i = 0; i < nfiles; i++) + { char buffer[30001]; + + if (fgets(buffer,30000,dbvis) == NULL) + SYSTEM_READ_ERROR + } + + useblock2 = 1; + if (fscanf(dbvis,"blocks = %d\n",&nblocks2) != 1 || nblocks2 == 1) + { useblock2 = 0; + nblocks2 = 1; + } + + usepath2 = (strcmp(pwd2,".") != 0); + + fclose(dbvis); + } + + // Set range fblock-lblock checking that DB..las exists & DB..las does not + + { char *eptr, *fptr, *src2; + FILE *file; + + if (argc == 4) + { fblock = strtol(argv[3],&eptr,10); + if ((*eptr != '\0' && *eptr != '-') || eptr <= argv[3]) + { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", + Prog_Name,argv[3]); + exit (1); + } + useblock2 = 1; + if (*eptr == '-') + { lblock = strtol(eptr+1,&fptr,10); + if (*fptr != '\0' || fptr <= eptr+1) + { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", + Prog_Name,eptr+1); + exit (1); + } + } + else + lblock = fblock; + if (fblock < 1 || lblock > nblocks2 || fblock > lblock) + { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); + exit (1); + } + } + else + { fblock = 1; + lblock = nblocks2; + } + + if (usepath2) + src2 = Strdup(Catenate(pwd2,"/",root2,""),"Allocating small string!"); + else + src2 = Strdup(root2,"Allocating small string!"); + if (src2 == NULL) + exit (1); + + if (fblock > 1) + { file = 
fopen(Catenate(src2,".",root1,Numbered_Suffix(".",fblock-1,".las")),"r"); + if (file == NULL) + { fprintf(stderr,"%s: File %s.%d.%s.las should already be present!\n", + Prog_Name,src2,fblock-1,root1); + exit (1); + } + else + fclose(file); + } + if (useblock2) + { file = fopen(Catenate(src2,".",root1,Numbered_Suffix(".",fblock,".las")),"r"); + if (file != NULL) + { fprintf(stderr,"%s: File %s.%d.%s.las should not yet exist!\n", + Prog_Name,src2,fblock,root1); + exit (1); + } + } + else + { file = fopen(Catenate(src2,".",root1,".las"),"r"); + if (file != NULL) + { fprintf(stderr,"%s: File %s.%s.las should not yet exist!\n", + Prog_Name,src2,root1); + exit (1); + } + } + + free(src2); + + DON = (DON && (nblocks1 > 1)); + out = stdout; + } + + { int njobs; + int i, j, k; + + // Create all work subdirectories if DON + + if (DON && nblocks1 > 1) + { if (ONAME != NULL) + { sprintf(name,"%s.00.MKDIR",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Create work subdirectories\n"); + for (i = fblock; i <= lblock; i++) + fprintf(out,"mkdir -p work%d\n",i); + + if (ONAME != NULL) + fclose(out); + } + + // Produce all necessary daligner jobs ... 
+ + if (ONAME != NULL) + { sprintf(name,"%s.01.CMP",ONAME); + out = fopen(name,"w"); + } + + njobs = nblocks1 * ( (lblock-fblock)/BUNIT + 1); + + fprintf(out,"# Daligner jobs (%d)\n",njobs); + +#ifdef HPC + jobid = 1; +#endif + for (i = fblock; i <= lblock; i++) + { int bits; + int low, hgh; + + bits = (nblocks1-1)/BUNIT+1; + low = 1; + for (j = 1; j <= bits; j++) + { +#ifdef LSF + fprintf(out,HPC_MALIGN,NTHREADS,jobid++); +#endif +#ifdef SLURM + if (MINT >= 0) + fprintf(out,HPC_MALIGN,NTHREADS,(MINT*1024)/NTHREADS,jobid++); + else + fprintf(out,HPC_MALIGN,NTHREADS,(16*1024)/NTHREADS,jobid++); +#endif +#ifdef HPC + fprintf(out," \""); +#endif + fprintf(out,"daligner -A"); + if (VON) + fprintf(out," -v"); + if (CON) + fprintf(out," -a"); + if (KINT != 20) + fprintf(out," -k%d",KINT); + if (PINT != 50) + fprintf(out," -%%%d",PINT); + if (WINT != 6) + fprintf(out," -w%d",WINT); + if (HINT != 70) + fprintf(out," -h%d",HINT); + if (TINT > 0) + fprintf(out," -t%d",TINT); + if (EREL > 0.) + fprintf(out," -e%g",EREL); + else + fprintf(out," -e.85"); + if (LINT != 1000) + fprintf(out," -l%d",LINT); + if (SINT != 100) + fprintf(out," -s%d",SINT); + if (NTHREADS != 4) + fprintf(out," -T%d",NTHREADS); + if (MINT >= 0) + fprintf(out," -M%d",MINT); + if (PDIR != NULL) + fprintf(out," -P%s",PDIR); + for (k = 0; k < MTOP; k++) + fprintf(out," -m%s",MASK[k]); + + fprintf(out," "); + if (usepath2) + fprintf(out,"%s/",pwd2); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",i); + + hgh = 1 + (nblocks1*j)/bits; + for (k = low; k < hgh; k++) + { fprintf(out," "); + if (usepath1) + fprintf(out,"%s/",pwd1); + fprintf(out,"%s",root1); + if (useblock1) + fprintf(out,".%d",k); + } + + if (nblocks1 == 1) + { if (usepath2) + { fprintf(out," && mv %s",root2); + if (useblock2) + fprintf(out,".%d",i); + fprintf(out,".%s.las %s",root1,pwd2); + } + } + else if (DON) + { fprintf(out," && mv"); + for (k = low; k < hgh; k++) + { fprintf(out," %s",root2); + if (useblock2) + 
fprintf(out,".%d",i); + fprintf(out,".%s.%d.las",root1,k); + } + fprintf(out," work%d",i); + } +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + low = hgh; + } + } + + // Check .las files (optional) + + if (ONAME != NULL) + { fclose(out); + sprintf(name,"%s.02.CHECK.OPT",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Check initial .las files jobs (%d) (optional but recommended)\n", + (lblock-fblock)+1); + +#ifdef HPC + jobid = 1; +#endif + for (j = fblock; j <= lblock; j++) + { +#ifdef HPC + fprintf(out,HPC_MCHECK,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"LAcheck -v%sS",CON?"a":""); + if (usepath2) + fprintf(out," %s/%s",pwd2,root2); + else + fprintf(out," %s",root2); + if (usepath1) + fprintf(out," %s/%s",pwd1,root1); + else + fprintf(out," %s",root1); + fprintf(out," "); + if (nblocks1 == 1) + { if (usepath2) + fprintf(out,"%s/",pwd2); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",j); + fprintf(out,".%s",root1); + } + else + { if (DON) + fprintf(out,"work%d/",j); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",j); + fprintf(out,".%s.%c",root1,BLOCK_SYMBOL); + } +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + if (ONAME != NULL) + fclose(out); + + // Higher level merges (if lblock > 1) + + if (nblocks1 > 1) + { if (ONAME != NULL) + { sprintf(name,"%s.03.MERGE",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Merge jobs (%d)\n",(lblock-fblock)+1); + +#ifdef HPC + jobid = 1; +#endif + for (j = fblock; j <= lblock; j++) + { +#ifdef HPC + fprintf(out,HPC_MMERGE,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"LAmerge "); + if (VON) + fprintf(out,"-v "); + if (CON) + fprintf(out,"-a "); + if (usepath2) + fprintf(out,"%s/",pwd2); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",j); + fprintf(out,".%s",root1); + if (DON) + fprintf(out," work%d/",j); + else + fprintf(out," "); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",j); + 
fprintf(out,".%s.%c",root1,BLOCK_SYMBOL); + +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + // Cleanup (optional) + + if (ONAME != NULL) + { fclose(out); + sprintf(name,"%s.04.RM",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Remove temporary .las files\n"); + + for (j = fblock; j <= lblock; j++) + { if (DON) + fprintf(out,"cd work%d; ",j); + fprintf(out,"rm %s",root2); + if (useblock2) + fprintf(out,".%d",j); + fprintf(out,".%s.*.las",root1); + if (DON) + fprintf(out,"; cd .."); + fprintf(out,"\n"); + } + + if (ONAME != NULL) + fclose(out); + } + } + + free(root2); + free(pwd2); + free(root1); + free(pwd1); + + exit (0); +} + +int main(int argc, char *argv[]) +{ int i, j, k; + int flags[128]; + char *eptr; + int mapper; + + // Process options and decide if its a overlap or mapper script + + ARG_INIT("HPC.daligner") + + KINT = 0; + HINT = 0; + HGAP = 0; + EREL = 0.; + + BUNIT = 4; + TINT = 0; + WINT = 6; + LINT = 1500; + SINT = 100; + MINT = -1; + PINT = -1; + PDIR = NULL; + + MTOP = 0; + MMAX = 10; + MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); + if (MASK == NULL) + exit (1); + ONAME = NULL; + + NTHREADS = 4; + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("vadAI"); + break; + case 'e': + ARG_REAL(EREL) + if (EREL < .7 || EREL >= 1.) + { fprintf(stderr,"%s: Average correlation must be in [.7,1.) 
(%g)\n",Prog_Name,EREL); + exit (1); + } + break; + case 'f': + ONAME = argv[i]+2; + break; + case 'h': + ARG_POSITIVE(HINT,"Hit threshold (in bp.s)") + break; + case 'k': + ARG_POSITIVE(KINT,"K-mer length") + if (KINT > 32) + { fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name); + exit (1); + } + break; + case 'l': + ARG_POSITIVE(LINT,"Minimum ovlerap length") + break; + case 'm': + if (MTOP >= MMAX) + { MMAX = 1.2*MTOP + 10; + MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); + if (MASK == NULL) + exit (1); + } + MASK[MTOP++] = argv[i]+2; + break; + case 's': + ARG_POSITIVE(SINT,"Trace spacing") + break; + case 't': + ARG_POSITIVE(TINT,"Tuple suppression frequency") + break; + case 'w': + ARG_POSITIVE(WINT,"Log of bin width") + break; + case 'B': + ARG_NON_NEGATIVE(BUNIT,"Blocks per command") + break; + case 'H': + ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)") + break; + case 'M': + ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)") + break; + case 'P': + PDIR = argv[i]+2; + break; + case 'T': + ARG_POSITIVE(NTHREADS,"Number of threads") + break; + case '%': + ARG_POSITIVE(PINT,"Modimer percentage") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + VON = flags['v']; + CON = flags['a']; + DON = flags['d']; + + if (argc < 2 || argc > 4) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[4]); + fprintf(stderr,"\n"); + fprintf(stderr," Passed through to daligner.\n"); + fprintf(stderr," -k: k-mer size (must be <= 32).\n"); + fprintf(stderr," -%%: modimer percentage (take %% of the k-mers).\n"); + fprintf(stderr," -w: Look for k-mers in averlapping bands of size 2^-w.\n"); + fprintf(stderr," -h: A seed hit if the k-mers in band cover >= -h bps in 
the"); + fprintf(stderr," targest read.\n"); + fprintf(stderr," -t: Ignore k-mers that occur >= -t times in a block.\n"); + fprintf(stderr," -M: Use only -M GB of memory by ignoring most frequent k-mers.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -e: Look for alignments with -e percent similarity.\n"); + fprintf(stderr," -l: Look for alignments of length >= -l.\n"); + fprintf(stderr," -s: Use -s as the trace point spacing for encoding alignments.\n"); + fprintf(stderr," -H: HGAP option: align only target reads of length >= -H.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -T: Use -T threads.\n"); + fprintf(stderr," -P: Do first level sort and merge in directory -P.\n"); + fprintf(stderr," -m: Soft mask the blocks with the specified mask.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," Script control.\n"); + fprintf(stderr," -v: Run all commands in script in verbose mode.\n"); + fprintf(stderr," -a: Instruct LAsort & LAmerge to sort only on (a,ab).\n"); + fprintf(stderr," -d: Put .las files for each target block in a sub-directory\n"); + fprintf(stderr," -B: # of block compares per daligner job\n"); + fprintf(stderr," -f: Place script bundles in separate files with prefix \n"); + exit (1); + } + + if (argc == 2) + mapper = 0; + else if (argc == 4) + mapper = 1; + else + { (void) strtol(argv[2],&eptr,10); + if ((*eptr == '\0' || *eptr == '-') && eptr > argv[2]) + mapper = 0; + else + mapper = 1; + } + + if (mapper) + { if (HGAP > 0) + { fprintf(stderr,"%s: Cannot use -H option in a comparison script\n",Prog_Name); + exit (1); + } + if (KINT <= 0) + KINT = 20; + if (HINT <= 0) + HINT = 70; + if (EREL <= 0.) 
+ EREL = .85; + if (PINT <= 0) + PINT = 50; + } + else + { if (KINT <= 0) + KINT = 16; + if (HINT <= 0) + HINT = 50; + if (PINT <= 0) + PINT = 28; + } + + if (mapper) + mapper_script(argc,argv); + else + daligner_script(argc,argv); + + exit (0); +} diff --git a/LAa2b.c b/LAa2b.c new file mode 100644 index 0000000..5f68780 --- /dev/null +++ b/LAa2b.c @@ -0,0 +1,104 @@ +#include +#include + +#include "DB.h" +#include "align.h" + +int main(int argc, char *argv[]) +{ char code, which; + int64 total; + int aread, bread; + char orient, chain; + int alen, blen; + int ab, ae, bb, be; + int diffs; + int len; + int tspace, small; + uint8 *tbuffer = NULL; + uint16 *sbuffer = NULL; + + (void) argv; + + // Process arguments + + if (argc > 1) + { fprintf(stderr,"Usage: LAa2b <(ascii) >(binary)\n"); + exit (1); + } + + while (scanf(" %c",&code) == 1) // Header lines + if (code == '@' || code == '+' || code == '%') + { scanf(" %c %lld",&which,&total); + fwrite(&code,sizeof(char),1,stdout); + fwrite(&which,sizeof(char),1,stdout); + fwrite(&total,sizeof(int64),1,stdout); + if (code == '@') + { tbuffer = (uint8 *) malloc(2*total*sizeof(uint16)); + sbuffer = (uint16 *) tbuffer; + } + } + else + { ungetc(code,stdin); + break; + } + small = 0; + if (tbuffer != NULL) + { if (code != 'X') + { fprintf(stderr,"LAa2b: .las dump has traces but no X-line\n"); + exit (1); + } + scanf(" X %d",&tspace); + small = (tspace <= TRACE_XOVR && tspace != 0); + fwrite(&code,sizeof(char),1,stdout); + fwrite(&tspace,sizeof(int),1,stdout); + } + + while (scanf(" %c",&code) == 1) // For each data line do + { fwrite(&code,sizeof(char),1,stdout); + switch (code) + { case 'P': // Alignment pair + scanf(" %d %d %c %c",&aread,&bread,&orient,&chain); + fwrite(&aread,sizeof(int),1,stdout); + fwrite(&bread,sizeof(int),1,stdout); + fwrite(&orient,sizeof(char),1,stdout); + fwrite(&chain,sizeof(char),1,stdout); + break; + case 'L': // Read lengths + scanf(" %d %d",&alen,&blen); + fwrite(&len,sizeof(int),1,stdout); + 
fwrite(&blen,sizeof(int),1,stdout); + break; + case 'C': // Coordinate intervals + scanf(" %d %d %d %d",&ab,&ae,&bb,&be); + fwrite(&ab,sizeof(int),1,stdout); + fwrite(&ae,sizeof(int),1,stdout); + fwrite(&bb,sizeof(int),1,stdout); + fwrite(&be,sizeof(int),1,stdout); + break; + case 'D': // Differences + scanf(" %d",&diffs); + fwrite(&diffs,sizeof(int),1,stdout); + break; + case 'T': // Mask + if (tbuffer == NULL) + { fprintf(stderr,"LAa2b: .las dump has traces but no @ T-line\n"); + exit (1); + } + scanf(" %d",&len); + fwrite(&len,sizeof(int),1,stdout); + len *= 2; + if (small) + { for (int i = 0; i < len; i += 2) + scanf(" %hhd %hhd",tbuffer+i,tbuffer+(i+1)); + fwrite(tbuffer,sizeof(uint8),len,stdout); + } + else + { for (int i = 0; i < len; i += 2) + scanf(" %hd %hd",sbuffer+i,sbuffer+(i+1)); + fwrite(sbuffer,sizeof(uint16),len,stdout); + } + } + } + + exit (0); +} diff --git a/LAb2a.c b/LAb2a.c new file mode 100644 index 0000000..0c0dbf8 --- /dev/null +++ b/LAb2a.c @@ -0,0 +1,107 @@ +#include +#include + +#include "DB.h" +#include "align.h" + +int main(int argc, char *argv[]) +{ char code, which; + int64 total; + int aread, bread; + char orient, chain; + int alen, blen; + int ab, ae, bb, be; + int diffs; + int len; + int tspace, small; + uint8 *tbuffer = NULL; + uint16 *sbuffer = NULL; + + (void) argv; + + // Process arguments + + if (argc > 1) + { fprintf(stderr,"Usage: LAa2b <(ascii) >(binary)\n"); + exit (1); + } + + if (fread(&code,sizeof(char),1,stdin) == 0) + code = 0; + + while (code == '@' || code == '+' || code == '%') + { fread(&which,sizeof(char),1,stdin); + fread(&total,sizeof(int64),1,stdin); + printf("%c %c %lld\n",code,which,total); + if (code == '@') + { tbuffer = (uint8 *) malloc(2*total*sizeof(uint16)); + sbuffer = (uint16 *) tbuffer; + } + + if (fread(&code,sizeof(char),1,stdin) == 0) + code = 0; + } + + small = 0; + if (tbuffer != NULL && code != 0) + { if (code != 'X') + { fprintf(stderr,"LAb2a: .las dump has traces but no X-info\n"); + exit 
(1); + } + fread(&tspace,sizeof(int),1,stdin); + small = (tspace <= TRACE_XOVR && tspace != 0); + printf("X %d\n",tspace); + } + + while (code != 0) // For each data item do + { switch (code) + { case 'P': // Alignment pair + fread(&aread,sizeof(int),1,stdin); + fread(&bread,sizeof(int),1,stdin); + fread(&orient,sizeof(char),1,stdin); + fread(&chain,sizeof(char),1,stdin); + printf("%c %d %d %c %c\n",code,aread,bread,orient,chain); + break; + case 'L': // Read lengths + scanf(" %d %d",&alen,&blen); + fread(&len,sizeof(int),1,stdin); + fread(&blen,sizeof(int),1,stdin); + printf("%c %d %d\n",code,alen,blen); + break; + case 'C': // Coordinate intervals + fread(&ab,sizeof(int),1,stdin); + fread(&ae,sizeof(int),1,stdin); + fread(&bb,sizeof(int),1,stdin); + fread(&be,sizeof(int),1,stdin); + printf("%c %d %d %d %d\n",code,ab,ae,bb,be); + break; + case 'D': // Differences + fread(&diffs,sizeof(int),1,stdin); + printf("%c %d\n",code,diffs); + break; + case 'T': // Mask + if (tbuffer == NULL) + { fprintf(stderr,"LAb2a: .las dump has traces but no @ T-info\n"); + exit (1); + } + fread(&len,sizeof(int),1,stdin); + printf("%c %d\n",code,len); + len *= 2; + if (small) + { fread(tbuffer,sizeof(uint8),len,stdin); + for (int i = 0; i < len; i += 2) + printf(" %d %d\n",tbuffer[i],tbuffer[i+1]); + } + else + { fread(sbuffer,sizeof(uint16),len,stdin); + for (int i = 0; i < len; i += 2) + printf(" %d %d\n",sbuffer[i],sbuffer[i+1]); + } + } + + if (fread(&code,sizeof(char),1,stdin) == 0) + code = 0; + } + + exit (0); +} diff --git a/LAcat.c b/LAcat.c new file mode 100644 index 0000000..fa6b03b --- /dev/null +++ b/LAcat.c @@ -0,0 +1,199 @@ +/******************************************************************************************* + * + * Merge together in index order, overlap files .1.las, .2.las, ... 
into a + * single overlap file and output to the standard output + * + * Author: Gene Myers + * Date : July 2013 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = "[-v] ... > .las"; + +#define MEMORY 1000 // How many megabytes for output buffer + +int main(int argc, char *argv[]) +{ char *iblock, *oblock; + FILE *input; + int64 novl, bsize, ovlsize, ptrsize; + int tspace, tbytes; + int c; + + int VERBOSE; + + // Process options + + { int i, j, k; + int flags[128]; + + ARG_INIT("LAcat") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + { ARG_FLAGS("v") } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + + if (argc <= 1) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," 's may contain a template that is %c-sign optionally\n", + BLOCK_SYMBOL); + fprintf(stderr," followed by an integer or integer range\n"); + exit (1); + } + } + + ptrsize = sizeof(void *); + ovlsize = sizeof(Overlap) - ptrsize; + bsize = MEMORY * 1000000ll; + oblock = (char *) Malloc(bsize,"Allocating output block"); + iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); + if (oblock == NULL || iblock == NULL) + exit (1); + iblock += ptrsize; + + novl = 0; + tspace = -1; + for (c = 1; c < argc; c++) + { Block_Looper *parse; + FILE *input; + + parse = Parse_Block_LAS_Arg(argv[c]); + + while ((input = Next_Block_Arg(parse)) != NULL) + { int64 povl; + int mspace; + + if (fread(&povl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + novl += povl; + if (fread(&mspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + if (tspace < 0) + tspace = mspace; + else if (tspace != mspace) + { fprintf(stderr,"%s: trace-point spacing conflict between %s and earlier files", + Prog_Name,Block_Arg_Root(parse)); + fprintf(stderr," (%d vs 
%d)\n",tspace,mspace); + exit (1); + } + + fclose(input); + } + + Free_Block_Arg(parse); + } + + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + if (fwrite(&novl,sizeof(int64),1,stdout) != 1) + SYSTEM_READ_ERROR + if (fwrite(&tspace,sizeof(int),1,stdout) != 1) + SYSTEM_READ_ERROR + + { Block_Looper *parse; + int c, j; + Overlap *w; + int64 tsize, povl; + int mspace; + char *iptr, *itop; + char *optr, *otop; + + optr = oblock; + otop = oblock + bsize; + + for (c = 1; c < argc; c++) + { parse = Parse_Block_LAS_Arg(argv[c]); + + while ((input = Next_Block_Arg(parse)) != NULL) + { if (fread(&povl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&mspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + + if (VERBOSE) + { fprintf(stderr, + " Concatenating %s: %lld la\'s\n",Block_Arg_Root(parse),povl); + fflush(stderr); + } + + iptr = iblock; + itop = iblock + fread(iblock,1,bsize,input); + + for (j = 0; j < povl; j++) + { if (iptr + ovlsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,input); + } + + w = (Overlap *) (iptr - ptrsize); + tsize = w->path.tlen*tbytes; + + if (optr + ovlsize + tsize > otop) + { if (fwrite(oblock,1,optr-oblock,stdout) != (size_t) (optr-oblock)) + SYSTEM_READ_ERROR + optr = oblock; + } + + memmove(optr,iptr,ovlsize); + optr += ovlsize; + iptr += ovlsize; + + if (iptr + tsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,input); + } + + memmove(optr,iptr,tsize); + optr += tsize; + iptr += tsize; + } + + fclose(input); + } + + Free_Block_Arg(parse); + } + + if (optr > oblock) + { if (fwrite(oblock,1,optr-oblock,stdout) != (size_t) (optr-oblock)) + SYSTEM_READ_ERROR + } + } + + if (VERBOSE) + { fprintf(stderr," Totalling %lld 
la\'s\n",novl); + fflush(stderr); + } + + free(oblock); + free(iblock-ptrsize); + + exit (0); +} diff --git a/LAcheck.c b/LAcheck.c new file mode 100644 index 0000000..f9e545e --- /dev/null +++ b/LAcheck.c @@ -0,0 +1,397 @@ +/******************************************************************************************* + * + * Check the structural integrity of .las files + * + * Author: Gene Myers + * Date : July 2014 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = "[-vaS] [ ] ..."; + +#define MEMORY 1000 // How many megabytes for output buffer + +int main(int argc, char *argv[]) +{ DAZZ_DB _db1, *db1 = &_db1; + DAZZ_DB _db2, *db2 = &_db2; + int VERBOSE; + int MAP_ORDER; + int SORTED; + int ISTWO; + int status; + + // Process options + + { int i, j, k; + int flags[128]; + + ARG_INIT("LAcheck") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("vaS") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + MAP_ORDER = flags['a']; + SORTED = flags['S']; + + if (argc <= 2) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," -v: Verbose mode, output error messages.\n"); + fprintf(stderr," -S: Check that .las is in sorted order.\n"); + fprintf(stderr," -a: If -S, then check sorted by A-read, A-position pairs\n"); + fprintf(stderr," off => check sorted by A,B-read pairs (LA-piles)\n"); + exit (1); + } + } + + // Open trimmed DB + + { Block_Looper *parse; + int status; + + ISTWO = 0; + status = Open_DB(argv[1],db1); + if (status < 0) + exit (1); + if (db1->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); + exit (1); + } + + if (argc <= 3) + db2 = db1; + else + { parse = Parse_Block_LAS_Arg(argv[2]); + if (! 
Next_Block_Exists(parse)) + { ISTWO = 1; + status = Open_DB(argv[2],db2); + if (status < 0) + exit (1); + if (db2->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); + exit (1); + } + Trim_DB(db2); + } + else + db2 = db1; + Free_Block_Arg(parse); + } + Trim_DB(db1); + } + + { char *iblock; + int64 bsize, ovlsize, ptrsize; + int i, j; + DAZZ_READ *reads1 = db1->reads; + int nreads1 = db1->nreads; + DAZZ_READ *reads2 = db2->reads; + int nreads2 = db2->nreads; + + // Setup IO buffers + + ptrsize = sizeof(void *); + ovlsize = sizeof(Overlap) - ptrsize; + bsize = MEMORY * 1000000ll; + iblock = (char *) Malloc(bsize+ptrsize,"Allocating input block"); + if (iblock == NULL) + exit (1); + iblock += ptrsize; + + // For each file do + + status = 0; + for (i = 2+ISTWO; i < argc; i++) + { Block_Looper *parse; + FILE *input; + char *disp; + char *iptr, *itop; + Overlap last, prev; + int64 novl; + int tspace, tbytes; + int has_chains; + + // Establish IO and (novl,tspace) header + + parse = Parse_Block_LAS_Arg(argv[i]); + + while ((input = Next_Block_Arg(parse)) != NULL) + { disp = Block_Arg_Root(parse); + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + if (novl < 0) + { if (VERBOSE) + fprintf(stderr," %s: Number of alignments < 0\n",disp); + goto error; + } + if (tspace < 0) + { if (VERBOSE) + fprintf(stderr," %s: Trace spacing < 0\n",disp); + goto error; + } + + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + iptr = iblock; + itop = iblock + fread(iblock,1,bsize,input); + + // For each record in file do + + has_chains = 0; + last.aread = -1; + last.bread = -1; + last.flags = 0; + last.path.bbpos = last.path.abpos = 0; + last.path.bepos = last.path.aepos = 0; + prev = last; + for (j = 0; j < novl; j++) + { Overlap ovl; + int tsize; + int equal; + + // Fetch next record + + if (iptr + ovlsize > itop) 
+ { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,input); + if (iptr + ovlsize > itop) + { if (VERBOSE) + fprintf(stderr," %s: Too few alignment records\n",disp); + goto error; + } + } + + ovl = *((Overlap *) (iptr - ptrsize)); + iptr += ovlsize; + tsize = ovl.path.tlen*tbytes; + + if (iptr + tsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,input); + if (iptr + tsize > itop) + { if (VERBOSE) + fprintf(stderr," %s: Too few alignment records\n",disp); + goto error; + } + } + ovl.path.trace = iptr; + iptr += tsize; + + // Basic checks + + if (ovl.aread < 0 || ovl.bread < 0) + { if (VERBOSE) + fprintf(stderr," %s: Read indices < 0\n",disp); + goto error; + } + if (ovl.aread >= nreads1 || ovl.bread >= nreads2) + { if (VERBOSE) + fprintf(stderr," %s: Read indices out of range\n",disp); + goto error; + } + + if (ovl.path.abpos >= ovl.path.aepos || ovl.path.aepos > reads1[ovl.aread].rlen || + ovl.path.bbpos >= ovl.path.bepos || ovl.path.bepos > reads2[ovl.bread].rlen || + ovl.path.abpos < 0 || ovl.path.bbpos < 0 ) + { if (VERBOSE) + fprintf(stderr," %s: Non-sense alignment intervals\n",disp); + goto error; + } + + if (ovl.path.diffs < 0 || ovl.path.diffs > reads1[ovl.aread].rlen || + ovl.path.diffs > reads2[ovl.bread].rlen) + { if (VERBOSE) + fprintf(stderr," %s: Non-sense number of differences\n",disp); + goto error; + } + + if (Check_Trace_Points(&ovl,tspace,VERBOSE,disp)) + goto error; + + if (j == 0) + has_chains = ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0); + if (has_chains) + { if (CHAIN_START(ovl.flags) && CHAIN_NEXT(ovl.flags)) + { if (VERBOSE) + fprintf(stderr," %s: LA has both start & next flag set\n",disp); + goto error; + } + if (BEST_CHAIN(ovl.flags) && CHAIN_NEXT(ovl.flags)) + { if (VERBOSE) + 
fprintf(stderr," %s: LA has both best & next flag set\n",disp); + goto error; + } + } + else + { if ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0) + { if (VERBOSE) + fprintf(stderr," %s: LAs should not have chain flags\n",disp); + goto error; + } + } + + // Duplicate check and sort check if -S set + + equal = 0; + if (SORTED) + { if (CHAIN_NEXT(ovl.flags)) + { if (ovl.aread == last.aread && ovl.bread != last.bread && + COMP(ovl.flags) != COMP(last.flags) && + ovl.path.abpos >= last.path.abpos && + ovl.path.bbpos >= last.path.bbpos) + goto dupcheck; + if (VERBOSE) + fprintf(stderr," %s: Chain is not valid (%d vs %d)\n", + disp,ovl.aread+1,ovl.bread+1); + goto error; + } + else if (!has_chains) + { if (ovl.aread > last.aread) goto inorder; + if (ovl.aread == last.aread) + { if (MAP_ORDER) + { if (ovl.path.abpos > prev.path.abpos) goto inorder; + if (ovl.path.abpos == prev.path.abpos) + goto dupcheck; + } + else + { if (ovl.bread > last.bread) goto inorder; + if (ovl.bread == last.bread) + { if (COMP(ovl.flags) > COMP(last.flags)) goto inorder; + if (COMP(ovl.flags) == COMP(last.flags)) + { if (ovl.path.abpos > last.path.abpos) goto inorder; + if (ovl.path.abpos == last.path.abpos) + { equal = 1; + goto inorder; + } + } + } + } + } + if (VERBOSE) + fprintf(stderr," %s: LAs are not sorted (%d vs %d)\n", + disp,ovl.aread+1,ovl.bread+1); + goto error; + } + else // First element of a chain + { if (ovl.aread > prev.aread) goto inorder; + if (ovl.aread == prev.aread) + { if (MAP_ORDER) + { if (ovl.path.abpos > prev.path.abpos) goto inorder; + if (ovl.path.abpos == prev.path.abpos) + goto dupcheck; + } + else + { if (ovl.bread > prev.bread) goto inorder; + if (ovl.bread == prev.bread) + { if (COMP(ovl.flags) > COMP(prev.flags)) goto inorder; + if (COMP(ovl.flags) == COMP(prev.flags)) + { if (ovl.path.abpos > prev.path.abpos) goto inorder; + if (ovl.path.abpos == prev.path.abpos) + { equal = 1; + goto dupcheck; + } + } + } + } + } + if (VERBOSE) + fprintf(stderr," 
%s: Chains are not sorted (%d vs %d)\n", + disp,ovl.aread+1,ovl.bread+1); + goto error; + } + } + dupcheck: + if (ovl.aread == last.aread && ovl.bread == last.bread && + COMP(ovl.flags) == COMP(last.flags) && ovl.path.abpos == last.path.abpos) + equal = 1; + inorder: + if (equal) + { if (ovl.path.aepos == last.path.aepos && + ovl.path.bbpos == last.path.bbpos && + ovl.path.bepos == last.path.bepos) + { if (VERBOSE) + fprintf(stderr," %s: Duplicate LAs (%d vs %d)\n", + disp,ovl.aread+1,ovl.bread+1); + goto error; + } + } + + last = ovl; + if (CHAIN_START(ovl.flags)) + prev = ovl; + } + + // File processing epilog: Check all data read and print OK if -v + + if (iptr < itop) + { if (VERBOSE) + fprintf(stderr," %s: Too many alignment records\n",disp); + goto error; + } + + if (VERBOSE) + { printf(" %s: ",disp); + Print_Number(novl,0,stdout); + printf(" all OK\n"); + fflush(stdout); + } + goto cleanup; + + error: + status = 1; + if (VERBOSE) + { printf(" %s: Not OK, see stderr\n",disp); + fflush(stdout); + } + cleanup: + if (input != NULL) + fclose(input); + } + + Free_Block_Arg(parse); + } + + free(iblock-ptrsize); + } + + Close_DB(db1); + if (ISTWO) + Close_DB(db2); + + exit (status); +} diff --git a/LAdump.c b/LAdump.c new file mode 100644 index 0000000..1cccf76 --- /dev/null +++ b/LAdump.c @@ -0,0 +1,507 @@ +/******************************************************************************************* + * + * Utility for displaying the information in the overlaps of a .las file in a very + * simple to parse format. 
+ * + * Author: Gene Myers + * Creation: July 2013 + * Last Mod: Jan 2015 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = + "[-cdtlo] [] [ | ...]"; + +static int ORDER(const void *l, const void *r) +{ int x = *((int *) l); + int y = *((int *) r); + return (x-y); +} + +int main(int argc, char *argv[]) +{ DAZZ_DB _db1, *db1 = &_db1; + DAZZ_DB _db2, *db2 = &_db2; + Overlap _ovl, *ovl = &_ovl; + + FILE *input; + int64 novl; + int tspace, tbytes, small; + int trmax; + int reps, *pts; + int input_pts; + + int OVERLAP; + int DOCOORDS, DODIFFS, DOTRACE, DOLENS; + int ISTWO; + + // Process options + + { int i, j, k; + int flags[128]; + + ARG_INIT("LAdump") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("ocdtl") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + OVERLAP = flags['o']; + DOCOORDS = flags['c']; + DODIFFS = flags['d']; + DOTRACE = flags['t']; + DOLENS = flags['l']; + + if (DOTRACE) + DOCOORDS = 1; + + if (argc <= 2) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," P #a #b #o #c -"); + fprintf(stderr," (#a,#b^#o) have an LA between them where #o is 'n' or 'c' and\n"); + fprintf(stderr," "); + fprintf(stderr," #c is '>' (start of best chain), '+' (start of alternate chain),\n"); + fprintf(stderr," "); + fprintf(stderr," '-' (continuation of chain), or '.' 
(no chains in file).\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -c: C #ab #ae #bb #be - #a[#ab,#ae] aligns with #b^#o[#bb,#be]\n"); + fprintf(stderr," -d: D # - there are # differences in the LA\n"); + fprintf(stderr," -t: T #n -"); + fprintf(stderr," there are #n trace point intervals for the LA\n"); + fprintf(stderr," (#d #y )^#n -"); + fprintf(stderr," there are #d difference aligning the #y bp's of B with the\n"); + fprintf(stderr," next fixed-size interval of A\n"); + fprintf(stderr," -l: L #la #lb -"); + fprintf(stderr," #la is the length of the a-read and #lb that of the b-read\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -o: Output proper overlaps only\n"); + + exit (1); + } + } + + // Open trimmed DB or DB pair + + { int status; + char *pwd, *root; + FILE *input; + + ISTWO = 0; + status = Open_DB(argv[1],db1); + if (status < 0) + exit (1); + if (db1->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); + exit (1); + } + + if (argc > 3) + { pwd = PathTo(argv[3]); + root = Root(argv[3],".las"); + if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) + { ISTWO = 1; + fclose(input); + status = Open_DB(argv[2],db2); + if (status < 0) + exit (1); + if (db2->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); + exit (1); + } + Trim_DB(db2); + } + else + db2 = db1; + free(root); + free(pwd); + } + else + db2 = db1; + Trim_DB(db1); + } + + // Process read index arguments into a sorted list of read ranges + + input_pts = 0; + if (argc == ISTWO+4) + { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') + { char *eptr, *fptr; + int b, e; + + b = strtol(argv[ISTWO+3],&eptr,10); + if (eptr > argv[ISTWO+3] && b > 0) + { if (*eptr == '-') + { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') + { e = strtol(eptr+1,&fptr,10); + input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); + } + } + else + input_pts = (*eptr != '\0'); + } + else + input_pts = 1; + } + } 
+ + if (input_pts) + { int v, x; + FILE *input; + + input = Fopen(argv[ISTWO+3],"r"); + if (input == NULL) + exit (1); + + reps = 0; + while ((v = fscanf(input," %d",&x)) != EOF) + if (v == 0) + { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", + Prog_Name,reps+1,argv[2]); + exit (1); + } + else + reps += 1; + + reps *= 2; + pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); + if (pts == NULL) + exit (1); + + rewind(input); + for (v = 0; v < reps; v += 2) + { fscanf(input," %d",&x); + pts[v] = pts[v+1] = x; + } + + fclose(input); + } + + else + { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); + if (pts == NULL) + exit (1); + + reps = 0; + if (argc > 3+ISTWO) + { int c, b, e; + char *eptr, *fptr; + + for (c = 3+ISTWO; c < argc; c++) + { if (argv[c][0] == LAST_READ_SYMBOL) + { b = db1->nreads; + eptr = argv[c]+1; + } + else + b = strtol(argv[c],&eptr,10); + if (eptr > argv[c]) + { if (b <= 0) + { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); + exit (1); + } + if (*eptr == '\0') + { pts[reps++] = b; + pts[reps++] = b; + continue; + } + else if (*eptr == '-') + { if (eptr[1] == LAST_READ_SYMBOL) + { e = INT32_MAX; + fptr = eptr+2; + } + else + e = strtol(eptr+1,&fptr,10); + if (fptr > eptr+1 && *fptr == 0 && e > 0) + { pts[reps++] = b; + pts[reps++] = e; + if (b > e) + { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); + exit (1); + } + continue; + } + } + } + fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); + exit (1); + } + + qsort(pts,reps/2,sizeof(int64),ORDER); + + b = 0; + for (c = 0; c < reps; c += 2) + if (b > 0 && pts[b-1] >= pts[c]-1) + { if (pts[c+1] > pts[b-1]) + pts[b-1] = pts[c+1]; + } + else + { pts[b++] = pts[c]; + pts[b++] = pts[c+1]; + } + pts[b++] = INT32_MAX; + reps = b; + } + else + { pts[reps++] = 1; + pts[reps++] = INT32_MAX; + } + } + + // Initiate file reading and read header + + { char *over, *pwd, *root; + + pwd = 
PathTo(argv[2+ISTWO]); + root = Root(argv[2+ISTWO],".las"); + over = Catenate(pwd,"/",root,".las"); + input = Fopen(over,"r"); + if (input == NULL) + exit (1); + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + + if (tspace <= TRACE_XOVR && tspace != 0) + { small = 1; + tbytes = sizeof(uint8); + } + else + { small = 0; + tbytes = sizeof(uint16); + } + + free(pwd); + free(root); + } + + // Scan to count sizes of things + + { int j, al, tlen; + int in, npt, idx, ar; + int64 novls, odeg, omax, sdeg, smax, tmax, ttot; + + in = 0; + npt = pts[0]; + idx = 1; + + // For each record do + + trmax = 0; + novls = omax = smax = ttot = tmax = 0; + sdeg = odeg = 0; + + al = 0; + for (j = 0; j < novl; j++) + + // Read it in + + { Read_Overlap(input,ovl); + tlen = ovl->path.tlen; + fseeko(input,tlen*tbytes,SEEK_CUR); + if (tlen > trmax) + trmax = tlen; + + // Determine if it should be displayed + + ar = ovl->aread+1; + if (in) + { while (ar > npt) + { npt = pts[idx++]; + if (ar < npt) + { in = 0; + break; + } + npt = pts[idx++]; + } + } + else + { while (ar >= npt) + { npt = pts[idx++]; + if (ar <= npt) + { in = 1; + break; + } + npt = pts[idx++]; + } + } + if (!in) + continue; + + // If -o check display only overlaps + + if (OVERLAP) + { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) + continue; + if (ovl->path.aepos != db1->reads[ovl->aread].rlen && + ovl->path.bepos != db2->reads[ovl->bread].rlen) + continue; + } + + if (ar != al) + { if (sdeg > smax) + smax = sdeg; + if (odeg > omax) + omax = odeg; + sdeg = odeg = 0; + al = ar; + } + + novls += 1; + odeg += 1; + sdeg += tlen; + ttot += tlen; + if (tlen > tmax) + tmax = tlen; + } + + if (sdeg > smax) + smax = sdeg; + if (odeg > omax) + omax = odeg; + + printf("+ P %lld\n",novls); + printf("%% P %lld\n",omax); + if (DOTRACE) + { printf("+ T %lld\n",ttot); + printf("%% T %lld\n",smax); + printf("@ T %lld\n",tmax); + printf("X %d\n",tspace); 
+ } + } + + // Read the file and display selected records + + { int j, k; + uint16 *trace; + int in, npt, idx, ar; + DAZZ_READ *read1, *read2; + + rewind(input); + fread(&novl,sizeof(int64),1,input); + fread(&tspace,sizeof(int),1,input); + + trace = (uint16 *) Malloc(sizeof(uint16)*trmax,"Allocating trace vector"); + if (trace == NULL) + exit (1); + + read1 = db1->reads; + read2 = db2->reads; + + in = 0; + npt = pts[0]; + idx = 1; + + // For each record do + + for (j = 0; j < novl; j++) + + // Read it in + + { Read_Overlap(input,ovl); + ovl->path.trace = (void *) trace; + Read_Trace(input,ovl,tbytes); + + // Determine if it should be displayed + + ar = ovl->aread+1; + if (in) + { while (ar > npt) + { npt = pts[idx++]; + if (ar < npt) + { in = 0; + break; + } + npt = pts[idx++]; + } + } + else + { while (ar >= npt) + { npt = pts[idx++]; + if (ar <= npt) + { in = 1; + break; + } + npt = pts[idx++]; + } + } + if (!in) + continue; + + // If -o check display only overlaps + + if (OVERLAP) + { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) + continue; + if (ovl->path.aepos != db1->reads[ovl->aread].rlen && + ovl->path.bepos != db2->reads[ovl->bread].rlen) + continue; + } + + // Display it + + printf("P %d %d",ovl->aread+1,ovl->bread+1); + if (COMP(ovl->flags)) + printf(" c"); + else + printf(" n"); + if (CHAIN_NEXT(ovl->flags)) + printf(" -"); + else if (BEST_CHAIN(ovl->flags)) + printf(" >"); + else if (CHAIN_START(ovl->flags)) + printf(" +"); + else + printf(" ."); + printf("\n"); + + if (DOLENS) + printf("L %d %d\n",read1[ovl->aread].rlen,read2[ovl->bread].rlen); + + if (DOCOORDS) + printf("C %d %d %d %d\n",ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos); + + if (DODIFFS) + printf("D %d\n",ovl->path.diffs); + + if (DOTRACE) + { uint16 *trace = (uint16 *) ovl->path.trace; + int tlen = ovl->path.tlen; + + if (small) + Decompress_TraceTo16(ovl); + printf("T %d\n",tlen>>1); + for (k = 0; k < tlen; k += 2) + printf(" %d %d\n",trace[k],trace[k+1]); + } + 
} + + free(trace); + } + + Close_DB(db1); + if (ISTWO) + Close_DB(db2); + + exit (0); +} diff --git a/LAmerge.c b/LAmerge.c new file mode 100644 index 0000000..2d72545 --- /dev/null +++ b/LAmerge.c @@ -0,0 +1,524 @@ +/******************************************************************************************* + * + * Given a list of sorted .las files, merge them into a single sorted .las file. + * + * Author: Gene Myers + * Date : July 2013 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +#undef DEBUG + +static char *Usage = "[-va] [-P] ..."; + +#define MEMORY 4000 // in Mb + +#define MAX_FILES 250 + + // Heap sort of records according to (aread,bread,COMP(flags),abpos) order + +#define COMPARE(lp,rp) \ + if (lp->aread > rp->aread) \ + bigger = 1; \ + else if (lp->aread < rp->aread) \ + bigger = 0; \ + else if (lp->bread > rp->bread) \ + bigger = 1; \ + else if (lp->bread < rp->bread) \ + bigger = 0; \ + else if (COMP(lp->flags) > COMP(rp->flags)) \ + bigger = 1; \ + else if (COMP(lp->flags) < COMP(rp->flags)) \ + bigger = 0; \ + else if (lp->path.abpos > rp->path.abpos) \ + bigger = 1; \ + else if (lp->path.abpos < rp->path.abpos) \ + bigger = 0; \ + else if (lp > rp) \ + bigger = 1; \ + else \ + bigger = 0; + +static void reheap(int s, Overlap **heap, int hsize) +{ int c, l, r; + int bigger; + Overlap *hs, *hr, *hl; + + c = s; + hs = heap[s]; + while ((l = 2*c) <= hsize) + { r = l+1; + hl = heap[l]; + if (r > hsize) + bigger = 1; + else + { hr = heap[r]; + COMPARE(hr,hl) + } + if (bigger) + { COMPARE(hs,hl) + if (bigger) + { heap[c] = hl; + c = l; + } + else + break; + } + else + { COMPARE(hs,hr) + if (bigger) + { heap[c] = hr; + c = r; + } + else + break; + } + } + if (c != s) + heap[c] = hs; +} + + // Heap sort of records according to (aread,abpos) order + +#define MAPARE(lp,rp) \ 
+ if (lp->aread > rp->aread) \ + bigger = 1; \ + else if (lp->aread < rp->aread) \ + bigger = 0; \ + else if (lp->path.abpos > rp->path.abpos) \ + bigger = 1; \ + else if (lp->path.abpos < rp->path.abpos) \ + bigger = 0; \ + else if (lp > rp) \ + bigger = 1; \ + else \ + bigger = 0; + +static void maheap(int s, Overlap **heap, int hsize) +{ int c, l, r; + int bigger; + Overlap *hs, *hr, *hl; + + c = s; + hs = heap[s]; + while ((l = 2*c) <= hsize) + { r = l+1; + hl = heap[l]; + if (r > hsize) + bigger = 1; + else + { hr = heap[r]; + MAPARE(hr,hl) + } + if (bigger) + { MAPARE(hs,hl) + if (bigger) + { heap[c] = hl; + c = l; + } + else + break; + } + else + { MAPARE(hs,hr) + if (bigger) + { heap[c] = hr; + c = r; + } + else + break; + } + } + if (c != s) + heap[c] = hs; +} + +#ifdef DEBUG + +static void showheap(Overlap **heap, int hsize) +{ int i; + printf("\n"); + for (i = 1; i <= hsize; i++) + printf(" %3d: %5d, %5d\n",i,heap[i]->aread,heap[i]->bread); +} + +#endif + + // Input block data structure and block fetcher + +typedef struct + { FILE *stream; + char *block; + char *ptr; + char *top; + int64 count; + } IO_block; + +static void ovl_reload(IO_block *in, int64 bsize) +{ int64 remains; + + remains = in->top - in->ptr; + if (remains > 0) + memmove(in->block, in->ptr, remains); + in->ptr = in->block; + in->top = in->block + remains; + in->top += fread(in->top,1,bsize-remains,in->stream); +} + + // The program + +int main(int argc, char *argv[]) +{ IO_block *in; + int64 bsize, osize, psize; + char *block, *oblock; + int i, c, fway, clen, nfile[argc]; + Overlap **heap; + int hsize; + Overlap *ovls; + int64 totl; + int tspace, tbytes; + FILE *output; + char *optr, *otop; + + int VERBOSE; + int MAP_SORT; + char *TEMP_PATH; + + // Process command line + + { int j, k; + int flags[128]; + DIR *dirp; + + ARG_INIT("LAmerge") + + TEMP_PATH = "/tmp"; + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("va") + break; 
+ case 'P': + TEMP_PATH = argv[i]+2; + if ((dirp = opendir(TEMP_PATH)) == NULL) + { fprintf(stderr,"%s: -P option: cannot open directory %s\n",Prog_Name,TEMP_PATH); + exit (1); + } + closedir(dirp); + break; + } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + MAP_SORT = flags['a']; + + if (argc < 3) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); + fprintf(stderr," -a: sort .las by A-read,A-position pairs for map usecase\n"); + fprintf(stderr," off => sort .las by A,B-read pairs for overlap piles\n"); + fprintf(stderr," -P: Do any intermediate merging in directory -P.\n"); + exit (1); + } + } + + // Determine the number of files and check they are all mergeable + + clen = 2*strlen(TEMP_PATH) + 50; + fway = 0; + totl = 0; + tspace = -1; + for (c = 2; c < argc; c++) + { Block_Looper *parse; + FILE *input; + char *root, *path; + + parse = Parse_Block_LAS_Arg(argv[c]); + + path = Block_Arg_Path(parse); + root = Block_Arg_Root(parse); + + clen += strlen(path) + strlen(root) + 30; + + free(root); + free(path); + + nfile[c] = 0; + while ((input = Next_Block_Arg(parse)) != NULL) + { int64 povl; + int mspace; + + if (fread(&povl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + totl += povl; + if (fread(&mspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + if (tspace < 0) + tspace = mspace; + else if (tspace != mspace) + { fprintf(stderr,"%s: trace-point spacing conflict between %s and earlier files", + Prog_Name,Block_Arg_Root(parse)); + fprintf(stderr," (%d vs %d)\n",tspace,mspace); + exit (1); + } + + fclose(input); + nfile[c] += 1; + } + + Free_Block_Arg(parse); + fway += nfile[c]; + } + + if (VERBOSE) + { printf(" Merging %d files totaling ",fway); + Print_Number(totl,0,stdout); + printf(" records\n"); + fflush(stdout); + } + + // Must recursively merge, emit sub-merges, then merge their results + + if (fway > MAX_FILES) + { Block_Looper 
*parse; + int mul, dim, fsum, cut; + char command[clen], *com; + int pid; + + mul = 1; + for (c = 0; mul < fway; c++) + mul *= MAX_FILES; + dim = pow(1.*fway,1./c)+1; + + fsum = 0; + c = 2; + + parse = Parse_Block_LAS_Arg(argv[c]); + + pid = getpid(); + for (i = 1; i <= dim; i++) + { com = command; + com += sprintf(com,"LAmerge"); + if (MAP_SORT) + com += sprintf(com," -a"); + if (mul > 2) + com += sprintf(com," -P%s",TEMP_PATH); + com += sprintf(com," %s/LM%d.P%d",TEMP_PATH,pid,i); + + cut = (fway * i) / dim; + while (fsum + nfile[c] <= cut) + { com += sprintf(com," %s",Next_Block_Slice(parse,nfile[c])); + fsum += nfile[c]; + + c += 1; + if (c >= argc) + break; + + Free_Block_Arg(parse); + + parse = Parse_Block_LAS_Arg(argv[c]); + } + if (c < argc && fsum < cut) + { int n = cut-fsum; + com += sprintf(com," %s",Next_Block_Slice(parse,n)); + nfile[c] -= n; + fsum += n; + } + system(command); + } + + Free_Block_Arg(parse); + + com = command; + com += sprintf(com,"LAmerge"); + if (MAP_SORT) + com += sprintf(com," -a"); + com += sprintf(com," %s %s/LM%d.P%c",argv[1],TEMP_PATH,pid,BLOCK_SYMBOL); + system(command); + + sprintf(command,"rm %s/LM%d.P*.las",TEMP_PATH,pid); + system(command); + + exit (0); + } + + // Base level merge: Open all the input files and initialize their buffers + + psize = sizeof(void *); + osize = sizeof(Overlap) - psize; + bsize = (MEMORY*1000000ll)/(fway + 1); + block = (char *) Malloc(bsize*(fway+1)+psize,"Allocating LAmerge blocks"); + in = (IO_block *) Malloc(sizeof(IO_block)*fway,"Allocating LAmerge IO-reacords"); + if (block == NULL || in == NULL) + exit (1); + block += psize; + + fway = 0; + for (c = 2; c < argc; c++) + { Block_Looper *parse; + FILE *input; + + parse = Parse_Block_LAS_Arg(argv[c]); + + while ((input = Next_Block_Arg(parse)) != NULL) + { int64 novl; + int mspace; + char *iblock; + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&mspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + + 
in[fway].stream = input; + in[fway].block = iblock = block+fway*bsize; + in[fway].ptr = iblock; + in[fway].top = iblock + fread(in[fway].block,1,bsize,input); + in[fway].count = 0; + fway += 1; + } + + Free_Block_Arg(parse); + } + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + // Open the output file buffer and write (novl,tspace) header + + { char *pwd, *root; + + pwd = PathTo(argv[1]); + root = Root(argv[1],".las"); + output = Fopen(Catenate(pwd,"/",root,".las"),"w"); + if (output == NULL) + exit (1); + free(pwd); + free(root); + + if (fwrite(&totl,sizeof(int64),1,output) != 1) + SYSTEM_READ_ERROR + if (fwrite(&tspace,sizeof(int),1,output) != 1) + SYSTEM_READ_ERROR + + oblock = block+fway*bsize; + optr = oblock; + otop = oblock + bsize; + } + + // Initialize the heap + + heap = (Overlap **) Malloc(sizeof(Overlap *)*(fway+1),"Allocating heap"); + ovls = (Overlap *) Malloc(sizeof(Overlap)*fway,"Allocating heap"); + if (heap == NULL || ovls == NULL) + exit (1); + + hsize = 0; + for (i = 0; i < fway; i++) + { if (in[i].ptr < in[i].top) + { ovls[i] = *((Overlap *) (in[i].ptr - psize)); + in[i].ptr += osize; + hsize += 1; + heap[hsize] = ovls + i; + } + } + + if (hsize > 3) + { if (MAP_SORT) + for (i = hsize/2; i > 1; i--) + maheap(i,heap,hsize); + else + for (i = hsize/2; i > 1; i--) + reheap(i,heap,hsize); + } + + // While the heap is not empty do + + while (hsize > 0) + { Overlap *ov; + IO_block *src; + int64 tsize, span; + + if (MAP_SORT) + maheap(1,heap,hsize); + else + reheap(1,heap,hsize); + + ov = heap[1]; + src = in + (ov - ovls); + + do + { src->count += 1; + + tsize = ov->path.tlen*tbytes; + span = osize + tsize; + if (src->ptr + span > src->top) + ovl_reload(src,bsize); + if (optr + span > otop) + { if (fwrite(oblock,1,optr-oblock,output) != (size_t) (optr-oblock)) + SYSTEM_READ_ERROR + optr = oblock; + } + + memmove(optr,((char *) ov) + psize,osize); + optr += osize; + memmove(optr,src->ptr,tsize); 
+ optr += tsize; + + src->ptr += tsize; + if (src->ptr >= src->top) + { heap[1] = heap[hsize]; + hsize -= 1; + break; + } + *ov = *((Overlap *) (src->ptr - psize)); + src->ptr += osize; + } + while (CHAIN_NEXT(ov->flags)); + } + + // Flush output buffer and wind up + + if (optr > oblock) + { if (fwrite(oblock,1,optr-oblock,output) != (size_t) (optr-oblock)) + SYSTEM_READ_ERROR + } + fclose(output); + + for (i = 0; i < fway; i++) + fclose(in[i].stream); + + for (i = 0; i < fway; i++) + totl -= in[i].count; + if (totl != 0) + { fprintf(stderr,"%s: Did not write all records to %s (%lld)\n",argv[0],argv[1],totl); + exit (1); + } + + free(ovls); + free(heap); + free(in); + free(block-psize); + + exit (0); +} diff --git a/LAshow.c b/LAshow.c new file mode 100644 index 0000000..ede907f --- /dev/null +++ b/LAshow.c @@ -0,0 +1,650 @@ +/******************************************************************************************* + * + * Utility for displaying the overlaps in a .las file in a variety of ways including + * a minimal listing of intervals, a cartoon, and a full out alignment. + * + * Author: Gene Myers + * Creation: July 2013 + * Last Mod: Jan 2015 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage[] = + { "[-caroUF] [-i] [-w] [-b] ", + " [ ] [ | ... 
]" + }; + +static int ORDER(const void *l, const void *r) +{ int x = *((int *) l); + int y = *((int *) r); + return (x-y); +} + +int main(int argc, char *argv[]) +{ DAZZ_DB _db1, *db1 = &_db1; + DAZZ_DB _db2, *db2 = &_db2; + Overlap _ovl, *ovl = &_ovl; + Alignment _aln, *aln = &_aln; + + FILE *input; + int sameDB; + int64 novl; + int tspace, tbytes, small; + int reps, *pts; + int input_pts; + + int ALIGN, CARTOON, REFERENCE, OVERLAP; + int FLIP, MAP; + int INDENT, WIDTH, BORDER, UPPERCASE; + int ISTWO; + + // Process options + + { int i, j, k; + int flags[128]; + char *eptr; + + ARG_INIT("LAshow") + + INDENT = 4; + WIDTH = 100; + BORDER = 10; + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("caroUFM") + break; + case 'i': + ARG_NON_NEGATIVE(INDENT,"Indent") + break; + case 'w': + ARG_POSITIVE(WIDTH,"Alignment width") + break; + case 'b': + ARG_NON_NEGATIVE(BORDER,"Alignment border") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + CARTOON = flags['c']; + ALIGN = flags['a']; + REFERENCE = flags['r']; + OVERLAP = flags['o']; + UPPERCASE = flags['U']; + FLIP = flags['F']; + MAP = flags['M']; + + if (argc <= 2) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); + fprintf(stderr,"\n"); + fprintf(stderr," -c: Show a cartoon of the LA between reads.\n"); + fprintf(stderr," -a: Show the alignment of each LA.\n"); + fprintf(stderr," -r: Show the alignment of each LA with -w bp's of A in each row.\n"); + fprintf(stderr," -o: Show only proper overlaps.\n"); + fprintf(stderr," -F: Switch the roles of A- and B-reads.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -U: Show alignments in upper case.\n"); + fprintf(stderr," -i: Indent alignments and cartoons by -i.\n"); + fprintf(stderr," -w: Width of each row of alignment in symbols (-a) or bps (-r).\n"); + fprintf(stderr," -b: # of border bp.s to show on each side of LA.\n"); + exit 
(1); + } + } + + // Open trimmed DB or DB pair + + { int status; + char *pwd, *root; + FILE *input; + struct stat stat1, stat2; + + ISTWO = 0; + status = Open_DB(argv[1],db1); + if (status < 0) + exit (1); + if (db1->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); + exit (1); + } + + sameDB = 1; + if (argc > 3) + { pwd = PathTo(argv[3]); + root = Root(argv[3],".las"); + if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) + { ISTWO = 1; + fclose(input); + status = Open_DB(argv[2],db2); + if (status < 0) + exit (1); + if (db2->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); + exit (1); + } + stat(Catenate(db1->path,"","",".idx"),&stat1); + stat(Catenate(db2->path,"","",".idx"),&stat2); + if (stat1.st_ino != stat2.st_ino) + sameDB = 0; + Trim_DB(db2); + } + else + db2 = db1; + free(root); + free(pwd); + } + else + db2 = db1; + Trim_DB(db1); + } + + // Process read index arguments into a sorted list of read ranges + + input_pts = 0; + if (argc == ISTWO+4) + { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') + { char *eptr, *fptr; + int b, e; + + b = strtol(argv[ISTWO+3],&eptr,10); + if (eptr > argv[ISTWO+3] && b > 0) + { if (*eptr == '-') + { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') + { e = strtol(eptr+1,&fptr,10); + input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); + } + } + else + input_pts = (*eptr != '\0'); + } + else + input_pts = 1; + } + } + + if (input_pts) + { int v, x; + FILE *input; + + input = Fopen(argv[ISTWO+3],"r"); + if (input == NULL) + exit (1); + + reps = 0; + while ((v = fscanf(input," %d",&x)) != EOF) + if (v == 0) + { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", + Prog_Name,reps+1,argv[2]); + exit (1); + } + else + reps += 1; + + reps *= 2; + pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); + if (pts == NULL) + exit (1); + + rewind(input); + for (v = 0; v < reps; v += 2) + 
{ fscanf(input," %d",&x); + pts[v] = pts[v+1] = x; + } + + fclose(input); + } + + else + { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); + if (pts == NULL) + exit (1); + + reps = 0; + if (argc > 3+ISTWO) + { int c, b, e; + char *eptr, *fptr; + + for (c = 3+ISTWO; c < argc; c++) + { if (argv[c][0] == LAST_READ_SYMBOL) + { b = db1->nreads; + eptr = argv[c]+1; + } + else + b = strtol(argv[c],&eptr,10); + if (eptr > argv[c]) + { if (b <= 0) + { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); + exit (1); + } + if (*eptr == '\0') + { pts[reps++] = b; + pts[reps++] = b; + continue; + } + else if (*eptr == '-') + { if (eptr[1] == LAST_READ_SYMBOL) + { e = INT32_MAX; + fptr = eptr+2; + } + else + e = strtol(eptr+1,&fptr,10); + if (fptr > eptr+1 && *fptr == 0 && e > 0) + { pts[reps++] = b; + pts[reps++] = e; + if (b > e) + { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); + exit (1); + } + continue; + } + } + } + fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); + exit (1); + } + + qsort(pts,reps/2,sizeof(int64),ORDER); + + b = 0; + for (c = 0; c < reps; c += 2) + if (b > 0 && pts[b-1] >= pts[c]-1) + { if (pts[c+1] > pts[b-1]) + pts[b-1] = pts[c+1]; + } + else + { pts[b++] = pts[c]; + pts[b++] = pts[c+1]; + } + pts[b++] = INT32_MAX; + reps = b; + } + else + { pts[reps++] = 1; + pts[reps++] = INT32_MAX; + } + } + + // Initiate file reading and read (novl, tspace) header + + { char *over, *pwd, *root; + + pwd = PathTo(argv[2+ISTWO]); + root = Root(argv[2+ISTWO],".las"); + over = Catenate(pwd,"/",root,".las"); + input = Fopen(over,"r"); + if (input == NULL) + exit (1); + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + if (tspace < 0) + { fprintf(stderr,"%s: Garbage .las file, trace spacing < 0 !\n",Prog_Name); + exit (1); + } + + if (tspace <= TRACE_XOVR && tspace != 0) + { small = 1; + tbytes = sizeof(uint8); + 
} + else + { small = 0; + tbytes = sizeof(uint16); + } + + printf("\n%s: ",root); + Print_Number(novl,0,stdout); + printf(" records\n"); + + free(pwd); + free(root); + } + + // Read the file and display selected records + + { int j; + uint16 *trace; + Work_Data *work; + int tmax; + int in, npt, idx, ar; + int64 tps; + + char *abuffer, *bbuffer; + int ar_wide, br_wide; + int ai_wide, bi_wide; + int mn_wide, mx_wide; + int tp_wide; + int blast, match, seen, lhalf, rhalf; + + aln->path = &(ovl->path); + if (ALIGN || REFERENCE) + { work = New_Work_Data(); + abuffer = New_Read_Buffer(db1); + bbuffer = New_Read_Buffer(db2); + } + else + { abuffer = NULL; + bbuffer = NULL; + work = NULL; + } + + tmax = 1000; + trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); + if (trace == NULL) + exit (1); + + in = 0; + npt = pts[0]; + idx = 1; + + ar_wide = Number_Digits((int64) db1->nreads); + br_wide = Number_Digits((int64) db2->nreads); + ai_wide = Number_Digits((int64) db1->maxlen); + bi_wide = Number_Digits((int64) db2->maxlen); + if (db1->maxlen < db2->maxlen) + { mn_wide = ai_wide; + mx_wide = bi_wide; + if (tspace > 0) + tp_wide = Number_Digits((int64) db1->maxlen/tspace+2); + else + tp_wide = 0; + } + else + { mn_wide = bi_wide; + mx_wide = ai_wide; + if (tspace > 0) + tp_wide = Number_Digits((int64) db2->maxlen/tspace+2); + else + tp_wide = 0; + } + ar_wide += (ar_wide-1)/3; + br_wide += (br_wide-1)/3; + ai_wide += (ai_wide-1)/3; + bi_wide += (bi_wide-1)/3; + mn_wide += (mn_wide-1)/3; + tp_wide += (tp_wide-1)/3; + + if (FLIP) + { int x; + x = ar_wide; ar_wide = br_wide; br_wide = x; + x = ai_wide; ai_wide = bi_wide; bi_wide = x; + } + + // For each record do + + blast = -1; + match = 0; + seen = 0; + lhalf = rhalf = 0; + for (j = 0; j < novl; j++) + + // Read it in + + { Read_Overlap(input,ovl); + if (ovl->path.tlen > tmax) + { tmax = ((int) 1.2*ovl->path.tlen) + 100; + trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); + 
if (trace == NULL) + exit (1); + } + ovl->path.trace = (void *) trace; + Read_Trace(input,ovl,tbytes); + + if (ovl->aread >= db1->nreads) + { fprintf(stderr,"%s: A-read is out-of-range of DB %s\n",Prog_Name,argv[1]); + exit (1); + } + if (ovl->bread >= db2->nreads) + { fprintf(stderr,"%s: B-read is out-of-range of DB %s\n",Prog_Name,argv[1+ISTWO]); + exit (1); + } + + // Determine if it should be displayed + + ar = ovl->aread+1; + if (in) + { while (ar > npt) + { npt = pts[idx++]; + if (ar < npt) + { in = 0; + break; + } + npt = pts[idx++]; + } + } + else + { while (ar >= npt) + { npt = pts[idx++]; + if (ar <= npt) + { in = 1; + break; + } + npt = pts[idx++]; + } + } + if (!in) + continue; + + // If -o check display only overlaps + + aln->alen = db1->reads[ovl->aread].rlen; + aln->blen = db2->reads[ovl->bread].rlen; + aln->flags = ovl->flags; + tps = ovl->path.tlen/2; + + if (OVERLAP) + { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) + continue; + if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) + continue; + } + + // If -M option then check the completeness of the implied mapping + + if (MAP) + { while (ovl->bread != blast) + { if (!match && seen && !(lhalf && rhalf)) + { printf("Missing "); + Print_Number((int64) blast+1,br_wide+1,stdout); + printf(" %d ->%lld\n",db2->reads[blast].rlen,db2->reads[blast].coff); + } + match = 0; + seen = 0; + lhalf = rhalf = 0; + blast += 1; + } + seen = 1; + if (ovl->path.abpos == 0) + rhalf = 1; + if (ovl->path.aepos == aln->alen) + lhalf = 1; + if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) + continue; + match = 1; + } + + // Display it + + if (ALIGN || CARTOON || REFERENCE) + printf("\n"); + + if (BEST_CHAIN(ovl->flags)) + printf("> "); + else if (CHAIN_START(ovl->flags)) + printf("+ "); + else if (CHAIN_NEXT(ovl->flags)) + printf(" -"); + + if (FLIP) + { Flip_Alignment(aln,0); + Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); + printf(" "); + Print_Number((int64) 
ovl->aread+1,br_wide+1,stdout); + } + else + { Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); + printf(" "); + Print_Number((int64) ovl->bread+1,br_wide+1,stdout); + } + if (COMP(ovl->flags)) + printf(" c"); + else + printf(" n"); + if (ovl->path.abpos == 0) + printf(" <"); + else + printf(" ["); + Print_Number((int64) ovl->path.abpos,ai_wide,stdout); + printf(".."); + Print_Number((int64) ovl->path.aepos,ai_wide,stdout); + if (ovl->path.aepos == aln->alen) + printf("> x "); + else + printf("] x "); + if (ovl->path.bbpos == 0) + printf("<"); + else + printf("["); + if (COMP(ovl->flags)) + { Print_Number((int64) (aln->blen - ovl->path.bbpos),bi_wide,stdout); + printf(".."); + Print_Number((int64) (aln->blen - ovl->path.bepos),bi_wide,stdout); + } + else + { Print_Number((int64) ovl->path.bbpos,bi_wide,stdout); + printf(".."); + Print_Number((int64) ovl->path.bepos,bi_wide,stdout); + } + if (ovl->path.bepos == aln->blen) + printf(">"); + else + printf("]"); + + if (!CARTOON) + printf(" ~ %5.2f%% ",(200.*ovl->path.diffs) / + ((ovl->path.aepos - ovl->path.abpos) + (ovl->path.bepos - ovl->path.bbpos)) ); + printf(" ("); + Print_Number(aln->alen,ai_wide,stdout); + printf(" x "); + Print_Number(aln->blen,bi_wide,stdout); + printf(" bps,"); + if (CARTOON) + { Print_Number(tps,tp_wide,stdout); + printf(" trace pts)\n\n"); + } + else + { Print_Number((int64) ovl->path.diffs,mn_wide,stdout); + printf(" diffs, "); + Print_Number(tps,tp_wide,stdout); + printf(" trace pts)\n"); + } + + if (ALIGN || CARTOON || REFERENCE) + { if (ALIGN || REFERENCE) + { char *aseq, *bseq; + int amin, amax; + int bmin, bmax; + int self; + + if (FLIP) + Flip_Alignment(aln,0); + if (small) + Decompress_TraceTo16(ovl); + + self = sameDB && (ovl->aread == ovl->bread) && !COMP(ovl->flags); + + amin = ovl->path.abpos - BORDER; + if (amin < 0) amin = 0; + amax = ovl->path.aepos + BORDER; + if (amax > aln->alen) amax = aln->alen; + if (COMP(aln->flags)) + { bmin = (aln->blen-ovl->path.bepos) - 
BORDER; + if (bmin < 0) bmin = 0; + bmax = (aln->blen-ovl->path.bbpos) + BORDER; + if (bmax > aln->blen) bmax = aln->blen; + } + else + { bmin = ovl->path.bbpos - BORDER; + if (bmin < 0) bmin = 0; + bmax = ovl->path.bepos + BORDER; + if (bmax > aln->blen) bmax = aln->blen; + if (self) + { if (bmin < amin) + amin = bmin; + if (bmax > amax) + amax = bmax; + } + } + + aseq = Load_Subread(db1,ovl->aread,amin,amax,abuffer,0); + if (!self) + bseq = Load_Subread(db2,ovl->bread,bmin,bmax,bbuffer,0); + else + bseq = aseq; + + aln->aseq = aseq - amin; + if (COMP(aln->flags)) + { Complement_Seq(bseq,bmax-bmin); + aln->bseq = bseq - (aln->blen - bmax); + } + else if (self) + aln->bseq = aln->aseq; + else + aln->bseq = bseq - bmin; + + if (tspace == 0) + Compute_Trace_IRR(aln,work,GREEDIEST); + else + Compute_Trace_PTS(aln,work,tspace,GREEDIEST); + + if (FLIP) + { if (COMP(aln->flags)) + { Complement_Seq(aseq,amax-amin); + Complement_Seq(bseq,bmax-bmin); + aln->aseq = aseq - (aln->alen - amax); + aln->bseq = bseq - bmin; + } + Flip_Alignment(aln,1); + } + } + if (CARTOON) + Alignment_Cartoon(stdout,aln,INDENT,mx_wide); + if (REFERENCE) + Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); + if (ALIGN) + Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); + } + } + + free(trace); + if (ALIGN) + { free(bbuffer-1); + free(abuffer-1); + Free_Work_Data(work); + } + } + + Close_DB(db1); + if (ISTWO) + Close_DB(db2); + + exit (0); +} diff --git a/LAsort.c b/LAsort.c new file mode 100644 index 0000000..2a7e11f --- /dev/null +++ b/LAsort.c @@ -0,0 +1,413 @@ +/******************************************************************************************* + * + * Load a file U.las of overlaps into memory, sort them all by A,B index, + * and then output the result to U.S.las + * + * Author: Gene Myers + * Date : July 2013 + * + *******************************************************************************************/ + +#include +#include +#include 
+#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = "[-va] ..."; + +#define MEMORY 1000 // How many megabytes for output buffer + +static char *IBLOCK; + +static int SORT_OVL(const void *x, const void *y) +{ int64 l = *((int64 *) x); + int64 r = *((int64 *) y); + + Overlap *ol, *or; + int al, ar; + int bl, br; + int cl, cr; + int pl, pr; + + ol = (Overlap *) (IBLOCK+l); + or = (Overlap *) (IBLOCK+r); + + al = ol->aread; + ar = or->aread; + if (al != ar) + return (al-ar); + + bl = ol->bread; + br = or->bread; + if (bl != br) + return (bl-br); + + cl = COMP(ol->flags); + cr = COMP(or->flags); + if (cl != cr) + return (cl-cr); + + pl = ol->path.abpos; + pr = or->path.abpos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.aepos; + pr = or->path.aepos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.bbpos; + pr = or->path.bbpos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.bepos; + pr = or->path.bepos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.diffs; + pr = or->path.diffs; + if (pl != pr) + return (pl-pr); + + if (ol < or) + return (-1); + else if (ol > or) + return (1); + else + return (0); +} + +static int SORT_MAP(const void *x, const void *y) +{ int64 l = *((int64 *) x); + int64 r = *((int64 *) y); + + Overlap *ol, *or; + int al, ar; + int bl, br; + int cl, cr; + int pl, pr; + + ol = (Overlap *) (IBLOCK+l); + or = (Overlap *) (IBLOCK+r); + + al = ol->aread; + ar = or->aread; + if (al != ar) + return (al-ar); + + pl = ol->path.abpos; + pr = or->path.abpos; + if (pl != pr) + return (pl-pr); + + bl = ol->bread; + br = or->bread; + if (bl != br) + return (bl-br); + + cl = COMP(ol->flags); + cr = COMP(or->flags); + if (cl != cr) + return (cl-cr); + + pl = ol->path.aepos; + pr = or->path.aepos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.bbpos; + pr = or->path.bbpos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.bepos; + pr = or->path.bepos; + if (pl != pr) + return (pl-pr); + + pl = 
ol->path.diffs; + pr = or->path.diffs; + if (pl != pr) + return (pl-pr); + + if (ol < or) + return (-1); + else if (ol > or) + return (1); + else + return (0); +} + +static int EQUAL(Overlap *ol, Overlap *or) +{ int al, ar; + int bl, br; + int cl, cr; + int pl, pr; + + al = ol->aread; + ar = or->aread; + if (al != ar) + return (0); + + bl = ol->bread; + br = or->bread; + if (bl != br) + return (0); + + cl = COMP(ol->flags); + cr = COMP(or->flags); + if (cl != cr) + return (0); + + pl = ol->path.abpos; + pr = or->path.abpos; + if (pl != pr) + return (0); + + pl = ol->path.aepos; + pr = or->path.aepos; + if (pl != pr) + return (0); + + pl = ol->path.bbpos; + pr = or->path.bbpos; + if (pl != pr) + return (0); + + pl = ol->path.bepos; + pr = or->path.bepos; + if (pl != pr) + return (0); + + return (1); +} + +int main(int argc, char *argv[]) +{ char *iblock, *fblock, *iend; + int64 isize, osize; + int64 ovlsize, ptrsize; + int tspace, tbytes; + int i; + + int VERBOSE; + int MAP_ORDER; + + // Process options + + { int j, k; + int flags[128]; + + ARG_INIT("LAsort") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + { ARG_FLAGS("va") } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + MAP_ORDER = flags['a']; + + if (argc <= 1) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); + fprintf(stderr," -a: sort .las by A-read,A-position pairs for map usecase\n"); + fprintf(stderr," off => sort .las by A,B-read pairs for overlap piles\n"); + exit (1); + } + } + + // For each file do + + ptrsize = sizeof(void *); + ovlsize = sizeof(Overlap) - ptrsize; + isize = 0; + iblock = NULL; + osize = MEMORY * 1000000ll; + fblock = Malloc(osize,"Allocating LAsort output block"); + + for (i = 1; i < argc; i++) + { int64 *perm; + FILE *input, *foutput; + int64 novl, sov; + Block_Looper *parse; + + parse = Parse_Block_LAS_Arg(argv[i]); + + while ((input = 
Next_Block_Arg(parse)) != NULL) + { + // Read in the entire file and output header + + { int64 size; + struct stat info; + char *root, *path; + + path = Block_Arg_Path(parse); + root = Block_Arg_Root(parse); + + stat(Catenate(path,"/",root,".las"),&info); + size = info.st_size; + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + if (VERBOSE) + { printf(" %s: ",root); + Print_Number(novl,0,stdout); + printf(" records "); + Print_Number(size-novl*ovlsize,0,stdout); + printf(" trace bytes\n"); + fflush(stdout); + } + + foutput = Fopen(Catenate(path,"/",root,".S.las"),"w"); + if (foutput == NULL) + exit (1); + + if (fwrite(&novl,sizeof(int64),1,foutput) != 1) + SYSTEM_WRITE_ERROR + if (fwrite(&tspace,sizeof(int),1,foutput) != 1) + SYSTEM_WRITE_ERROR + + if (size > isize) + { if (iblock == NULL) + iblock = Malloc(size+ptrsize,"Allocating LAsort input block"); + else + iblock = Realloc(iblock-ptrsize,size+ptrsize,"Allocating LAsort input block"); + if (iblock == NULL) + exit (1); + iblock += ptrsize; + isize = size; + } + size -= (sizeof(int64) + sizeof(int)); + if (size > 0) + { if (fread(iblock,size,1,input) != 1) + SYSTEM_READ_ERROR + } + fclose(input); + iend = iblock + (size - ptrsize); + + free(root); + free(path); + } + + if (novl == 0) + { fclose(foutput); + continue; + } + + // Set up unsorted permutation array + + perm = (int64 *) Malloc(sizeof(int64)*novl,"Allocating LAsort permutation vector"); + if (perm == NULL) + exit (1); + + { int64 off; + int j; + + if (CHAIN_START(((Overlap *) (iblock-ptrsize))->flags)) + { sov = 0; + off = -ptrsize; + for (j = 0; j < novl; j++) + { if (CHAIN_START(((Overlap *) (iblock+off))->flags)) + perm[sov++] = off; + off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes; + } + } + else + { off = -ptrsize; + for (j = 0; j < novl; 
j++) + { perm[j] = off; + off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes; + } + sov = novl; + } + } + + // Sort permutation array of ptrs to records + + IBLOCK = iblock; + if (MAP_ORDER) + qsort(perm,sov,sizeof(int64),SORT_MAP); + else + qsort(perm,sov,sizeof(int64),SORT_OVL); + + // Output the records in sorted order + + { int j, equal; + Overlap *w, *x, y; + int64 tsize, span; + char *fptr, *ftop, *wo; + + y.aread = ((Overlap *) (iblock+perm[0]))->aread+1; + x = &y; + + fptr = fblock; + ftop = fblock + osize; + for (j = 0; j < sov; j++) + { w = (Overlap *) (wo = iblock+perm[j]); + do + { equal = EQUAL(w,x); + tsize = w->path.tlen*tbytes; + span = ovlsize + tsize; + if (fptr + span > ftop) + { if (fwrite(fblock,1,fptr-fblock,foutput) != (size_t) (fptr-fblock)) + SYSTEM_WRITE_ERROR + fptr = fblock; + } + if (equal) + { fptr += (ovlsize + tsize); + novl -= 1; + } + else + { memmove(fptr,((char *) w)+ptrsize,ovlsize); + fptr += ovlsize; + memmove(fptr,(char *) (w+1),tsize); + fptr += tsize; + } + x = w; + w = (Overlap *) (wo += span); + } + while (wo < iend && CHAIN_NEXT(w->flags)); + } + if (fptr > fblock) + { if (fwrite(fblock,1,fptr-fblock,foutput) != (size_t) (fptr-fblock)) + SYSTEM_WRITE_ERROR + } + } + + rewind(foutput); + if (fwrite(&novl,sizeof(int64),1,foutput) != 1) + SYSTEM_WRITE_ERROR + + free(perm); + fclose(foutput); + } + Free_Block_Arg(parse); + } + + if (iblock != NULL) + free(iblock - ptrsize); + free(fblock); + + exit (0); +} diff --git a/LAsplit.c b/LAsplit.c new file mode 100644 index 0000000..966a0ff --- /dev/null +++ b/LAsplit.c @@ -0,0 +1,229 @@ +/******************************************************************************************* + * + * Split an OVL file arriving from the standard input into 'parts' equal sized .las-files + * .1.las, .2.las ... 
or according to a current partitioning of + * + * Author: Gene Myers + * Date : June 2014 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = "-v ( | ) < .las"; + +#define MEMORY 1000 // How many megabytes for output buffer + +int main(int argc, char *argv[]) +{ char *iblock, *oblock; + FILE *output; + DAZZ_STUB *stub; + int64 novl, bsize, ovlsize, ptrsize; + int parts, tspace, tbytes; + char *pwd, *root, *root2; + + int VERBOSE; + + // Process options + + { int i, j, k; + int flags[128]; + + ARG_INIT("LAsplit") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + { ARG_FLAGS("v") } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + + if (argc != 3) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," is a template that must have a single %c-sign in it\n", + BLOCK_SYMBOL); + fprintf(stderr," This symbol is replaced by numbers 1 to n = the number of parts\n"); + exit (1); + } + } + + { char *eptr; + + parts = strtol(argv[2],&eptr,10); + if (*eptr != '\0') + { pwd = PathTo(argv[2]); + if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) + { root = Root(argv[2],".dam"); + stub = Read_DB_Stub(Catenate(pwd,"/",root,".dam"),DB_STUB_BLOCKS); + parts = stub->nblocks; + } + else + { root = Root(argv[2],".db"); + stub = Read_DB_Stub(Catenate(pwd,"/",root,".db"),DB_STUB_BLOCKS); + parts = stub->nblocks; + } + free(pwd); + free(root); + } + else + { stub = NULL; + if (parts <= 0) + { fprintf(stderr,"%s: Number of parts is not positive\n",Prog_Name); + exit (1); + } + } + } + + ptrsize = sizeof(void *); + ovlsize = sizeof(Overlap) - ptrsize; + bsize = MEMORY * 1000000ll; + oblock = (char *) Malloc(bsize,"Allocating output block"); + iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); + if (oblock == NULL || 
iblock == NULL) + exit (1); + iblock += ptrsize; + + pwd = PathTo(argv[1]); + root = Root(argv[1],".las"); + + root2 = index(root,BLOCK_SYMBOL); + if (root2 == NULL) + { fprintf(stderr,"%s: No %c-sign in source name '%s'\n",Prog_Name,BLOCK_SYMBOL,root); + exit (1); + } + if (index(root2+1,BLOCK_SYMBOL) != NULL) + { fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n", + Prog_Name,BLOCK_SYMBOL,root); + exit (1); + } + *root2++ = '\0'; + + if (fread(&novl,sizeof(int64),1,stdin) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,stdin) != 1) + SYSTEM_READ_ERROR + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + if (VERBOSE) + { printf(" Distributing %lld la\'s\n",novl); + fflush(stdout); + } + + { int i; + Overlap *w; + int64 j, low, hgh, last; + int64 tsize, povl; + char *iptr, *itop; + char *optr, *otop; + + iptr = iblock; + itop = iblock + fread(iblock,1,bsize,stdin); + + hgh = 0; + for (i = 0; i < parts; i++) + { output = Fopen(Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las"),"w"); + if (output == NULL) + exit (1); + + low = hgh; + if (stub != NULL) + { last = stub->tblocks[i+1]; + hgh = 0; + } + else + { last = 0; + hgh = (novl*(i+1))/parts; + } + + povl = 0; + fwrite(&povl,sizeof(int64),1,output); + fwrite(&tspace,sizeof(int),1,output); + + optr = oblock; + otop = oblock + bsize; + + for (j = low; j < novl; j++) + { if (iptr + ovlsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,stdin); + } + + w = (Overlap *) (iptr-ptrsize); + if (stub == NULL) + { if (j >= hgh && w->aread > last) + break; + last = w->aread; + } + else + { if (w->aread >= last) + break; + } + + tsize = w->path.tlen*tbytes; + if (optr + ovlsize + tsize > otop) + { fwrite(oblock,1,optr-oblock,output); + optr = oblock; + } + + memmove(optr,iptr,ovlsize); + optr += ovlsize; + 
iptr += ovlsize; + + if (iptr + tsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,stdin); + } + memmove(optr,iptr,tsize); + optr += tsize; + iptr += tsize; + } + hgh = j; + + if (optr > oblock) + fwrite(oblock,1,optr-oblock,output); + + rewind(output); + povl = hgh-low; + fwrite(&povl,sizeof(int64),1,output); + + if (VERBOSE) + { printf(" Split off %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl); + fflush(stdout); + } + + fclose(output); + } + } + + free(pwd); + free(root); + Free_DB_Stub(stub); + free(iblock-ptrsize); + free(oblock); + + exit (0); +} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9aa819c --- /dev/null +++ b/LICENSE @@ -0,0 +1,34 @@ + + Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + · Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + · Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + + · The name of EWM may not be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + For any issues regarding this software and its use, contact EWM at: + + Eugene W. Myers Jr. + Bautzner Str. 122e + 01099 Dresden + GERMANY + Email: gene.myers@gmail.com + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..fc3d8d4 --- /dev/null +++ b/Makefile @@ -0,0 +1,58 @@ +DEST_DIR = ~/bin + +# CFLAGS = -O0 -g -Wall -Wextra -Wno-unused-result -fno-strict-aliasing -fsanitize=address -fsanitize=undefined +# Above is for debug out of bound addresses, must compile with -lASAN -lUBSAN if gcc instead of clang + +CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing + +ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAa2b LAb2a dumpLA + +all: $(ALL) + +daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm + +HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm + +LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm + +LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm + +LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm + +LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm + +LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h + gcc 
$(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm + +LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm + +LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm + +LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm + +LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm + +dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm + +clean: + rm -f $(ALL) + rm -fr *.dSYM + rm -f daligner.tar.gz + +install: + cp $(ALL) $(DEST_DIR) + +package: + make clean + tar -zcf daligner.tar.gz README.md Makefile *.h *.c diff --git a/QV.c b/QV.c new file mode 100644 index 0000000..d7d7263 --- /dev/null +++ b/QV.c @@ -0,0 +1,1481 @@ +/******************************************************************************************* + * + * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on + * the histogram of values occuring in a given file. The two low complexity streams + * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant + * character. + * + * Author: Gene Myers + * Date: Jan 18, 2014 + * Modified: July 25, 2014 + * + ********************************************************************************************/ + +#include +#include +#include +#include +#include + +#include "DB.h" + +#undef DEBUG + +#define MIN_BUFFER 1000 + +#define HUFF_CUTOFF 16 // This cannot be larger than 16 ! + + +/******************************************************************************************* + * + * Endian flipping routines + * + ********************************************************************************************/ + +static int LittleEndian; // Little-endian machine ? 
+ // Referred by: Decode & Decode_Run +static int Flip; // Flip endian of all coded shorts and ints + // Referred by: Decode & Decode_Run & Read_Scheme + +static void Set_Endian(int flip) +{ uint32 x = 3; + uint8 *b = (uint8 *) (&x); + + Flip = flip; + LittleEndian = (b[0] == 3); +} + +static void Flip_Long(void *w) +{ uint8 *v = (uint8 *) w; + uint8 x; + + x = v[0]; + v[0] = v[3]; + v[3] = x; + x = v[1]; + v[1] = v[2]; + v[2] = x; +} + +static void Flip_Short(void *w) +{ uint8 *v = (uint8 *) w; + uint8 x; + + x = v[0]; + v[0] = v[1]; + v[1] = x; +} + + +/******************************************************************************************* + * + * Routines for computing a Huffman Encoding Scheme + * + ********************************************************************************************/ + +typedef struct + { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated + uint32 codebits[256]; // If type = 2, then code 255 is the special code for + int codelens[256]; // non-Huffman exceptions + int lookup[0x10000]; // Lookup table (just for decoding) + } HScheme; + +typedef struct _HTree + { struct _HTree *lft, *rgt; + uint64 count; + } HTree; + + // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) + // assuming s is the only perturbation in the tree. + +static void Reheap(int s, HTree **heap, int hsize) +{ int c, l, r; + HTree *hs, *hr, *hl; + + c = s; + hs = heap[s]; + while ((l = 2*c) <= hsize) + { r = l+1; + hl = heap[l]; + hr = heap[r]; + if (r > hsize || hr->count > hl->count) + { if (hs->count > hl->count) + { heap[c] = hl; + c = l; + } + else + break; + } + else + { if (hs->count > hr->count) + { heap[c] = hr; + c = r; + } + else + break; + } + } + if (c != s) + heap[c] = hs; +} + + // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits + // of codebits[s] contain the code for symbol s. 
+ +static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) +{ if (node->rgt == NULL) + { uint64 symbol = (uint64) (node->lft); + codebits[symbol] = code; + codelens[symbol] = len; + } + else + { code <<= 1; + len += 1; + Build_Table(node->lft,code,len,codebits,codelens); + Build_Table(node->rgt,code+1,len,codebits,codelens); + } +} + + // For the non-zero symbols in hist, compute a huffman tree over them, and then + // build a table of the codes. If inscheme is not NULL, then place all symbols + // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme + // as a single united entity, whose code signals that the value of these symbols + // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. + // All the symbols in this class will have the same entry in the code table and + // 255 is always in this class. + +static HScheme *Huffman(uint64 *hist, HScheme *inscheme) +{ HScheme *scheme; + HTree *heap[259]; + HTree node[512]; + int hsize; + HTree *lft, *rgt; + int value, range; + int i; + + scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); + if (scheme == NULL) + return (NULL); + + hsize = 0; // Load heap + value = 0; + if (inscheme != NULL) + { node[0].count = 0; + node[0].lft = (HTree *) (uint64) 255; + node[0].rgt = NULL; + heap[++hsize] = node+(value++); + } + for (i = 0; i < 256; i++) + if (hist[i] > 0) + { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255)) + node[0].count += hist[i]; + else + { node[value].count = hist[i]; + node[value].lft = (HTree *) (uint64) i; + node[value].rgt = NULL; + heap[++hsize] = node+(value++); + } + } + + for (i = hsize/2; i >= 1; i--) // Establish heap property + Reheap(i,heap,hsize); + + range = value; // Merge pairs with smallest count until have a tree + for (i = 1; i < value; i++) + { lft = heap[1]; + heap[1] = heap[hsize--]; + Reheap(1,heap,hsize); + rgt = heap[1]; + node[range].lft = lft; + 
node[range].rgt = rgt; + node[range].count = lft->count + rgt->count; + heap[1] = node+(range++); + Reheap(1,heap,hsize); + } + + for (i = 0; i < 256; i++) // Build the code table + { scheme->codebits[i] = 0; + scheme->codelens[i] = 0; + } + + Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); + + if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes + { scheme->type = 2; // to code and length for 255 + for (i = 0; i < 255; i++) + if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF) + { scheme->codelens[i] = scheme->codelens[255]; + scheme->codebits[i] = scheme->codebits[255]; + } + } + else + { scheme->type = 0; + for (i = 0; i < 256; i++) + { if (scheme->codelens[i] > HUFF_CUTOFF) + scheme->type = 1; + } + } + + return (scheme); +} + +#ifdef DEBUG + + // For debug, show the coding table + +static void Print_Table(HScheme *scheme, uint64 *hist, int infosize) +{ uint64 total_bits; + uint32 specval, mask, code, *bits; + int speclen, clen, *lens; + int i, k; + + total_bits = 0; + bits = scheme->codebits; + lens = scheme->codelens; + if (scheme->type == 2) + { specval = bits[255]; + speclen = lens[255]; + } + else + specval = speclen = 0x7fffffff; + + printf("\nCode Table:\n"); + for (i = 0; i < 256; i++) + if (lens[i] > 0) + { clen = lens[i]; + mask = (1 << clen); + code = bits[i]; + printf(" %3d: %2d ",i,clen); + for (k = 0; k < clen; k++) + { mask >>= 1; + if (code & mask) + printf("1"); + else + printf("0"); + } + if (code == specval && clen == speclen) + { printf(" ***"); + if (hist != NULL) + total_bits += (clen+infosize)*hist[i]; + } + else if (hist != NULL) + total_bits += clen*hist[i]; + printf("\n"); + } + if (hist != NULL) + printf("\nTotal Bytes = %lld\n",(total_bits-1)/8+1); +} + + // For debug, show the histogram + +static void Print_Histogram(uint64 *hist) +{ int i, low, hgh; + uint64 count; + + for (hgh = 255; hgh >= 0; hgh--) + if (hist[hgh] != 0) + break; + for (low = 0; low < 256; 
low++) + if (hist[low] != 0) + break; + count = 0; + for (i = low; i <= hgh; i++) + count += hist[i]; + + for (i = hgh; i >= low; i--) + printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); +} + +#endif + + +/******************************************************************************************* + * + * Read and Write Huffman Schemes + * + ********************************************************************************************/ + + // Write the code table to out. + +static void Write_Scheme(HScheme *scheme, FILE *out) +{ int i; + uint8 x; + uint32 *bits; + int *lens; + + lens = scheme->codelens; + bits = scheme->codebits; + + x = (uint8) (scheme->type); + fwrite(&x,1,1,out); + + for (i = 0; i < 256; i++) + { x = (uint8) (lens[i]); + fwrite(&x,1,1,out); + if (x > 0) + fwrite(bits+i,sizeof(uint32),1,out); + } +} + + // Allocate and read a code table from in, and return a pointer to it. + +static HScheme *Read_Scheme(FILE *in) +{ HScheme *scheme; + int *look, *lens; + uint32 *bits, base; + int i, j, powr; + uint8 x; + + scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); + if (scheme == NULL) + return (NULL); + + lens = scheme->codelens; + bits = scheme->codebits; + look = scheme->lookup; + + if (fread(&x,1,1,in) != 1) + { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); + free(scheme); + return (NULL); + } + scheme->type = x; + for (i = 0; i < 256; i++) + { if (fread(&x,1,1,in) != 1) + { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); + return (NULL); + } + lens[i] = x; + if (x > 0) + { if (fread(bits+i,sizeof(uint32),1,in) != 1) + { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); + free(scheme); + return (NULL); + } + } + else + bits[i] = 0; + } + + if (Flip) + { for (i = 0; i < 256; i++) + Flip_Long(bits+i); + } + + for (i = 0; i < 256; i++) + { if (lens[i] > 0) + { base = (bits[i] << (16-lens[i])); + powr = (1 << (16-lens[i])); + for (j = 0; j 
< powr; j++) + look[base+j] = i; + } + } + + return (scheme); +} + + +/******************************************************************************************* + * + * Encoders and Decoders + * + ********************************************************************************************/ + + // Encode read[0..rlen-1] according to scheme and write to out + +static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) +{ uint32 x, c, ocode; + int n, k, olen, llen; + int *nlens; + uint32 *nbits; + uint32 nspec; + int nslen; + + nlens = scheme->codelens; + nbits = scheme->codebits; + + if (scheme->type == 2) + { nspec = nbits[255]; + nslen = nlens[255]; + } + else + nspec = nslen = 0x7fffffff; + +#define OCODE(L,C) \ +{ int len = olen + (L); \ + uint32 code = (C); \ + \ + llen = olen; \ + if (len >= 32) \ + { olen = len-32; \ + ocode |= (code >> olen); \ + fwrite(&ocode,sizeof(uint32),1,out); \ + if (olen > 0) \ + ocode = (code << (32-olen)); \ + else \ + ocode = 0; \ + } \ + else \ + { olen = len; \ + ocode |= (code << (32-olen));; \ + } \ +} + + llen = 0; + olen = 0; + ocode = 0; + for (k = 0; k < rlen; k++) + { x = read[k]; + n = nlens[x]; + c = nbits[x]; + OCODE(n,c); + if (c == nspec && n == nslen) + OCODE(8,x); + } + + if (olen > 0) // Tricky: must pad so decoder does not read past + { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. + if (llen > 16 && olen > llen) + fwrite(&ocode,sizeof(uint32),1,out); + } + else if (llen > 16) + fwrite(&ocode,sizeof(uint32),1,out); +} + + // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for + // runs of rchar characters. Write to out. 
+ +static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar) +{ uint32 x, c, ocode; + int n, h, k, olen, llen; + int *nlens, *rlens; + uint32 *nbits, *rbits; + uint32 nspec, rspec; + int nslen, rslen; + + nlens = neme->codelens; + nbits = neme->codebits; + rlens = reme->codelens; + rbits = reme->codebits; + + if (neme->type == 2) + { nspec = nbits[255]; + nslen = nlens[255]; + } + else + nspec = nslen = 0x7fffffff; + + rspec = rbits[255]; + rslen = rlens[255]; + + llen = 0; + olen = 0; + ocode = 0; + k = 0; + while (k < rlen) + { h = k; + while (k < rlen && read[k] == rchar) + k += 1; + if (k-h >= 255) + x = 255; + else + x = k-h; + n = rlens[x]; + c = rbits[x]; + OCODE(n,c); + if (c == rspec && n == rslen) + OCODE(16,k-h); + if (k < rlen) + { x = read[k]; + n = nlens[x]; + c = nbits[x]; + OCODE(n,c); + if (c == nspec && n == nslen) + OCODE(8,x); + k += 1; + } + } + + if (olen > 0) + { fwrite(&ocode,sizeof(uint32),1,out); + if (llen > 16 && olen > llen) + fwrite(&ocode,sizeof(uint32),1,out); + } + else if (llen > 16) + fwrite(&ocode,sizeof(uint32),1,out); +} + + // Read and decode from in, the next rlen symbols into read according to scheme + +static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) +{ int *look, *lens; + int signal, ilen; + uint64 icode; + uint32 *ipart; + uint16 *xpart; + uint8 *cpart; + int j, n, c; + + if (LittleEndian) + { ipart = ((uint32 *) (&icode)); + xpart = ((uint16 *) (&icode)) + 2; + cpart = ((uint8 *) (&icode)) + 5; + } + else + { ipart = ((uint32 *) (&icode)) + 1; + xpart = ((uint16 *) (&icode)) + 1; + cpart = ((uint8 *) (&icode)) + 2; + } + + if (scheme->type == 2) + signal = 255; + else + signal = 256; + lens = scheme->codelens; + look = scheme->lookup; + +#define GET \ + if (n > ilen) \ + { icode <<= ilen; \ + if (fread(ipart,sizeof(uint32),1,in) != 1) \ + { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ + return (1); \ + } \ + ilen = n-ilen; \ + icode <<= ilen; \ + ilen 
= 32-ilen; \ + } \ + else \ + { icode <<= n; \ + ilen -= n; \ + } + +#define GETFLIP \ + if (n > ilen) \ + { icode <<= ilen; \ + if (fread(ipart,sizeof(uint32),1,in) != 1) \ + { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ + return (1); \ + } \ + Flip_Long(ipart); \ + ilen = n-ilen; \ + icode <<= ilen; \ + ilen = 32-ilen; \ + } \ + else \ + { icode <<= n; \ + ilen -= n; \ + } + + n = 16; + ilen = 0; + icode = 0; + if (Flip) + for (j = 0; j < rlen; j++) + { GETFLIP + c = look[*xpart]; + n = lens[c]; + if (c == signal) + { GETFLIP + c = *cpart; + n = 8; + } + read[j] = (char) c; + } + else + for (j = 0; j < rlen; j++) + { GET + c = look[*xpart]; + n = lens[c]; + if (c == signal) + { GET + c = *cpart; + n = 8; + } + read[j] = (char) c; + } + + return (0); +} + + // Read and decode from in, the next rlen symbols into read according to non-rchar scheme + // neme, and the rchar runlength shceme reme + +static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, + int rlen, int rchar) +{ int *nlook, *nlens; + int *rlook, *rlens; + int nsignal, ilen; + uint64 icode; + uint32 *ipart; + uint16 *xpart; + uint8 *cpart; + int j, n, c, k; + + if (LittleEndian) + { ipart = ((uint32 *) (&icode)); + xpart = ((uint16 *) (&icode)) + 2; + cpart = ((uint8 *) (&icode)) + 5; + } + else + { ipart = ((uint32 *) (&icode)) + 1; + xpart = ((uint16 *) (&icode)) + 1; + cpart = ((uint8 *) (&icode)) + 2; + } + + if (neme->type == 2) + nsignal = 255; + else + nsignal = 256; + nlens = neme->codelens; + nlook = neme->lookup; + + rlens = reme->codelens; + rlook = reme->lookup; + + n = 16; + ilen = 0; + icode = 0; + if (Flip) + for (j = 0; j < rlen; j++) + { GETFLIP + c = rlook[*xpart]; + n = rlens[c]; + if (c == 255) + { GETFLIP + c = *xpart; + n = 16; + } + for (k = 0; k < c; k++) + read[j++] = (char) rchar; + + if (j < rlen) + { GETFLIP + c = nlook[*xpart]; + n = nlens[c]; + if (c == nsignal) + { GETFLIP + c = *cpart; + n = 8; + } + read[j] = (char) c; + } + } + else + for 
(j = 0; j < rlen; j++) + { GET + c = rlook[*xpart]; + n = rlens[c]; + if (c == 255) + { GET + c = *xpart; + n = 16; + } + for (k = 0; k < c; k++) + read[j++] = (char) rchar; + + if (j < rlen) + { GET + c = nlook[*xpart]; + n = nlens[c]; + if (c == nsignal) + { GET + c = *cpart; + n = 8; + } + read[j] = (char) c; + } + } + + return (0); +} + + +/******************************************************************************************* + * + * Histogrammers + * + ********************************************************************************************/ + +// Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. + +static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) +{ int k; + + for (k = 0; k < rlen; k++) + hist[stream[k]] += 1; +} + +static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) +{ int k, h; + + k = 0; + while (k < rlen) + { h = k; + while (k < rlen && stream[k] == runChar) + k += 1; + if (k-h >= 256) + run[255] += 1; + else + run[k-h] += 1; + if (k < rlen) + k += 1; + } +} + + +/******************************************************************************************* + * + * Reader + * + ********************************************************************************************/ + +static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, +static int Rmax = -1; // Compress_Next_QVentry + +static int Nline; // Referred by: QVcoding_Scan + +char *QVentry() +{ return (Read); } + +void Set_QV_Line(int line) +{ Nline = line; } + +int Get_QV_Line() +{ return (Nline); } + +// If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines +// for a sequence. Place line j at Read+j*Rmax and the length of every line is returned +// unless eof occurs in which case return -1. If any error occurs return -2. 
+ +int Read_Lines(FILE *input, int nlines) +{ int i, rlen; + int tmax; + char *tread; + char *other; + + if (Read == NULL) + { tmax = MIN_BUFFER; + tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); + if (tread == NULL) + EXIT(-2); + Rmax = tmax; + Read = tread; + } + + Nline += 1; + if (fgets(Read,Rmax,input) == NULL) + return (-1); + + rlen = strlen(Read); + while (Read[rlen-1] != '\n') + { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; + tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); + if (tread == NULL) + EXIT(-2); + Rmax = tmax; + Read = tread; + if (fgets(Read+rlen,Rmax-rlen,input) == NULL) + { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); + EXIT(-2); + } + rlen += strlen(Read+rlen); + } + other = Read; + for (i = 1; i < nlines; i++) + { other += Rmax; + Nline += 1; + if (fgets(other,Rmax,input) == NULL) + { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT(-2); + } + if (rlen != (int) strlen(other)) + { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); + EXIT(-2); + } + } + return (rlen-1); +} + + +/******************************************************************************************* + * + * Tag compression and decompression routines + * + ********************************************************************************************/ + +// Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and +// return the # of symbols kept. 
+ +static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) +{ int j, k; + + j = 0; + for (k = 0; k < rlen; k++) + if (qvs[k] != rchar) + tags[j++] = tags[k]; + tags[j] = '\0'; + return (j); +} + + // Count the # of non-rchar symbols in qvs[0..rlen-1] + +static int Packed_Length(char *qvs, int rlen, int rchar) +{ int k, clen; + + clen = 0; + for (k = 0; k < rlen; k++) + if (qvs[k] != rchar) + clen += 1; + return (clen); +} + + // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar + // symbol in qvs. All other chars are set to rchar. rlen is the length of qvs and + // the unpacked result, clen is the initial length of tags. + +static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) +{ int j, k; + + j = clen-1; + for (k = rlen-1; k >= 0; k--) + { if (qvs[k] == rchar) + tags[k] = 'n'; + else + tags[k] = tags[j--]; + } +} + + +/******************************************************************************************* + * + * Statistics Scan and Scheme creation and write + * + ********************************************************************************************/ + + // Read up to the next num entries or until eof from the .quiva file on input and record + // frequency statistics. Copy these entries to the temporary file temp if != NULL. + // If there is an error then -1 is returned, otherwise the number of entries read. 
+ +static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; +static uint64 totChar; +static int delChar, subChar; + + // Referred by: QVcoding_Scan, Create_QVcoding + +void QVcoding_Scan1(int rlen, char *delQV, char *delTag, char *insQV, char *mergeQV, char *subQV) +{ + if (rlen == 0) // Initialization call + { int i; + + // Zero histograms + + bzero(delHist,sizeof(uint64)*256); + bzero(mrgHist,sizeof(uint64)*256); + bzero(insHist,sizeof(uint64)*256); + bzero(subHist,sizeof(uint64)*256); + + for (i = 0; i < 256; i++) + delRun[i] = subRun[i] = 1; + + totChar = 0; + delChar = -1; + subChar = -1; + return; + } + + // Add streams to accumulating histograms and figure out the run chars + // for the deletion and substition streams + + Histogram_Seqs(delHist,(uint8 *) delQV,rlen); + Histogram_Seqs(insHist,(uint8 *) insQV,rlen); + Histogram_Seqs(mrgHist,(uint8 *) mergeQV,rlen); + Histogram_Seqs(subHist,(uint8 *) subQV,rlen); + + if (delChar < 0) + { int k; + + for (k = 0; k < rlen; k++) + if (delTag[k] == 'n' || delTag[k] == 'N') + { delChar = delQV[k]; + break; + } + } + if (delChar >= 0) + Histogram_Runs( delRun,(uint8 *) delQV,rlen,delChar); + totChar += rlen; + if (subChar < 0) + { if (totChar >= 100000) + { int k; + + subChar = 0; + for (k = 1; k < 256; k++) + if (subHist[k] > subHist[subChar]) + subChar = k; + } + } + if (subChar >= 0) + Histogram_Runs( subRun,(uint8 *) subQV,rlen,subChar); + return; +} + +int QVcoding_Scan(FILE *input, int num, FILE *temp) +{ char *slash; + int rlen; + int i, r; + + // Zero histograms + + bzero(delHist,sizeof(uint64)*256); + bzero(mrgHist,sizeof(uint64)*256); + bzero(insHist,sizeof(uint64)*256); + bzero(subHist,sizeof(uint64)*256); + + for (i = 0; i < 256; i++) + delRun[i] = subRun[i] = 1; + + totChar = 0; + delChar = -1; + subChar = -1; + + // Make a sweep through the .quiva entries, histogramming the relevant things + // and figuring out the run chars for the deletion and substition streams + 
+ r = 0; + for (i = 0; i < num; i++) + { int well, beg, end, qv; + + rlen = Read_Lines(input,1); + if (rlen == -2) + EXIT(-1); + if (rlen < 0) + break; + + if (rlen == 0 || Read[0] != '@') + { EPRINTF(EPLACE,"Line %d: Header in quiva file is missing\n",Nline); + EXIT(-1); + } + slash = index(Read+1,'/'); + if (slash == NULL) + { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", + Prog_Name,Nline); + EXIT(-1); + } + if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) + { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", + Prog_Name,Nline); + EXIT(-1); + } + + if (temp != NULL) + fputs(Read,temp); + + rlen = Read_Lines(input,5); + if (rlen < 0) + { if (rlen == -1) + EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT(-1); + } + + if (temp != NULL) + { fputs(Read,temp); + fputs(Read+Rmax,temp); + fputs(Read+2*Rmax,temp); + fputs(Read+3*Rmax,temp); + fputs(Read+4*Rmax,temp); + } + + Histogram_Seqs(delHist,(uint8 *) (Read),rlen); + Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); + Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); + Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); + + if (delChar < 0) + { int k; + char *del = Read+Rmax; + + for (k = 0; k < rlen; k++) + if (del[k] == 'n' || del[k] == 'N') + { delChar = Read[k]; + break; + } + } + if (delChar >= 0) + Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); + totChar += rlen; + if (subChar < 0) + { if (totChar >= 100000) + { int k; + + subChar = 0; + for (k = 1; k < 256; k++) + if (subHist[k] > subHist[subChar]) + subChar = k; + } + } + if (subChar >= 0) + Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); + + r += 1; + } + + return (r); +} + + // Using the statistics in the global stat tables, create the Huffman schemes and write + // them to output. If lossy is set, then create a lossy table for the insertion and merge + // QVs. 
+ +QVcoding *Create_QVcoding(int lossy) +{ static QVcoding coding; + + HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; + HScheme *dRunScheme, *sRunScheme; + + delScheme = NULL; + dRunScheme = NULL; + insScheme = NULL; + mrgScheme = NULL; + subScheme = NULL; + sRunScheme = NULL; + + // Check whether using a subtitution run char is a win + + if (totChar < 200000 || subHist[subChar] < .5*totChar) + subChar = -1; + + // If lossy encryption is enabled then scale insertions and merge QVs. + + if (lossy) + { int k; + + for (k = 0; k < 256; k += 2) + { insHist[k] += insHist[k+1]; + insHist[k+1] = 0; + } + + for (k = 0; k < 256; k += 4) + { mrgHist[k] += mrgHist[k+1]; + mrgHist[k] += mrgHist[k+2]; + mrgHist[k] += mrgHist[k+3]; + mrgHist[k+1] = 0; + mrgHist[k+2] = 0; + mrgHist[k+3] = 0; + } + } + + // Build a Huffman scheme for each stream entity from the histograms + +#define SCHEME_MACRO(meme,hist,label,bits) \ + scheme = Huffman( (hist), NULL); \ + if (scheme == NULL) \ + goto error; \ + if (scheme->type) \ + { (meme) = Huffman( (hist), scheme); \ + free(scheme); \ + } \ + else \ + (meme) = scheme; + +#ifdef DEBUG + +#define MAKE_SCHEME(meme,hist,label,bits) \ + SCHEME_MACRO(meme,hist,label,bits) \ + printf("\n%s\n", (label) ); \ + Print_Histogram( (hist)); \ + Print_Table( (meme), (hist), (bits)); + +#else + +#define MAKE_SCHEME(meme,hist,label,bits) \ + SCHEME_MACRO(meme,hist,label,bits) + +#endif + + { HScheme *scheme; + + if (delChar < 0) + { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); + dRunScheme = NULL; + } + else + { delHist[delChar] = 0; + MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); + MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); +#ifdef DEBUG + printf("\nRun char is '%c'\n",delChar); +#endif + } + +#ifdef DEBUG + { int k; + uint64 count; + + count = 0; + for (k = 0; k < 256; k++) + count += delHist[k]; + printf("\nDelTag will require %lld bytes\n",count/4); + } +#endif + + 
MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); + MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); + + if (subChar < 0) + { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); + sRunScheme = NULL; + } + else + { subHist[subChar] = 0; + MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); + MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); +#ifdef DEBUG + printf("\nRun char is '%c'\n",subChar); +#endif + } + } + + // Setup endian handling + + Set_Endian(0); + + coding.delScheme = delScheme; + coding.insScheme = insScheme; + coding.mrgScheme = mrgScheme; + coding.subScheme = subScheme; + coding.dRunScheme = dRunScheme; + coding.sRunScheme = sRunScheme; + coding.delChar = delChar; + coding.subChar = subChar; + coding.prefix = NULL; + coding.flip = 0; + + return (&coding); + +error: + if (delScheme != NULL) + free(delScheme); + if (dRunScheme != NULL) + free(dRunScheme); + if (insScheme != NULL) + free(insScheme); + if (mrgScheme != NULL) + free(mrgScheme); + if (subScheme != NULL) + free(subScheme); + if (sRunScheme != NULL) + free(sRunScheme); + EXIT(NULL); +} + + // Write the encoding scheme 'coding' to 'output' + +void Write_QVcoding(FILE *output, QVcoding *coding) +{ + // Write out the endian key, run chars, and prefix (if not NULL) + + { uint16 half; + int len; + + half = 0x33cc; + fwrite(&half,sizeof(uint16),1,output); + + if (coding->delChar < 0) + half = 256; + else + half = (uint16) (coding->delChar); + fwrite(&half,sizeof(uint16),1,output); + + if (coding->subChar < 0) + half = 256; + else + half = (uint16) (coding->subChar); + fwrite(&half,sizeof(uint16),1,output); + + len = strlen(coding->prefix); + fwrite(&len,sizeof(int),1,output); + fwrite(coding->prefix,1,len,output); + } + + // Write out the scheme tables + + Write_Scheme(coding->delScheme,output); + if (coding->delChar >= 0) + Write_Scheme(coding->dRunScheme,output); + 
Write_Scheme(coding->insScheme,output); + Write_Scheme(coding->mrgScheme,output); + Write_Scheme(coding->subScheme,output); + if (coding->subChar >= 0) + Write_Scheme(coding->sRunScheme,output); +} + + // Read the encoding scheme 'coding' to 'output' + +QVcoding *Read_QVcoding(FILE *input) +{ static QVcoding coding; + + // Read endian key, run chars, and short name common to all headers + + { uint16 half; + int len; + + if (fread(&half,sizeof(uint16),1,input) != 1) + { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); + EXIT(NULL); + } + coding.flip = (half != 0x33cc); + + if (fread(&half,sizeof(uint16),1,input) != 1) + { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); + EXIT(NULL); + } + if (coding.flip) + Flip_Short(&half); + coding.delChar = half; + if (coding.delChar >= 256) + coding.delChar = -1; + + if (fread(&half,sizeof(uint16),1,input) != 1) + { EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); + EXIT(NULL); + } + if (coding.flip) + Flip_Short(&half); + coding.subChar = half; + if (coding.subChar >= 256) + coding.subChar = -1; + + // Read the short name common to all headers + + if (fread(&len,sizeof(int),1,input) != 1) + { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); + EXIT(NULL); + } + if (coding.flip) + Flip_Long(&len); + coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); + if (coding.prefix == NULL) + EXIT(NULL); + if (len > 0) + { if (fread(coding.prefix,len,1,input) != 1) + { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); + EXIT(NULL); + } + } + coding.prefix[len] = '\0'; + } + + // Setup endian handling + + Set_Endian(coding.flip); + + // Read the Huffman schemes used to compress the data + + coding.delScheme = NULL; + coding.dRunScheme = NULL; + coding.insScheme = NULL; + coding.mrgScheme = NULL; + coding.subScheme = NULL; + coding.sRunScheme = NULL; + + coding.delScheme = Read_Scheme(input); + if (coding.delScheme == NULL) + goto error; + 
if (coding.delChar >= 0) + { coding.dRunScheme = Read_Scheme(input); + if (coding.dRunScheme == NULL) + goto error; + } + coding.insScheme = Read_Scheme(input); + if (coding.insScheme == NULL) + goto error; + coding.mrgScheme = Read_Scheme(input); + if (coding.mrgScheme == NULL) + goto error; + coding.subScheme = Read_Scheme(input); + if (coding.subScheme == NULL) + goto error; + if (coding.subChar >= 0) + { coding.sRunScheme = Read_Scheme(input); + if (coding.sRunScheme == NULL) + goto error; + } + + return (&coding); + +error: + if (coding.delScheme != NULL) + free(coding.delScheme); + if (coding.dRunScheme != NULL) + free(coding.dRunScheme); + if (coding.insScheme != NULL) + free(coding.insScheme); + if (coding.mrgScheme != NULL) + free(coding.mrgScheme); + if (coding.subScheme != NULL) + free(coding.subScheme); + if (coding.sRunScheme != NULL) + free(coding.sRunScheme); + EXIT(NULL); +} + + // Free all the auxilliary storage associated with the encoding argument + +void Free_QVcoding(QVcoding *coding) +{ if (coding->subChar >= 0) + free(coding->sRunScheme); + free(coding->subScheme); + free(coding->mrgScheme); + free(coding->insScheme); + if (coding->delChar >= 0) + free(coding->dRunScheme); + free(coding->delScheme); + free(coding->prefix); +} + + +/******************************************************************************************* + * + * Encode/Decode (w.r.t. 
coding) next entry from input and write to output + * + ********************************************************************************************/ + +void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, + FILE *output, QVcoding *coding, int lossy) +{ int clen; + + if (coding->delChar < 0) + { Encode(coding->delScheme, output, (uint8 *) del, rlen); + clen = rlen; + } + else + { Encode_Run(coding->delScheme, coding->dRunScheme, output, + (uint8 *) del, rlen, coding->delChar); + clen = Pack_Tag(tag,del,rlen,coding->delChar); + } + Number_Read(tag); + Compress_Read(clen,tag); + fwrite(tag,1,COMPRESSED_LEN(clen),output); + + if (lossy) + { uint8 *insert = (uint8 *) ins; + uint8 *merge = (uint8 *) mrg; + int k; + + for (k = 0; k < rlen; k++) + { insert[k] = (uint8) ((insert[k] >> 1) << 1); + merge[k] = (uint8) (( merge[k] >> 2) << 2); + } + } + + Encode(coding->insScheme, output, (uint8 *) ins, rlen); + Encode(coding->mrgScheme, output, (uint8 *) mrg, rlen); + if (coding->subChar < 0) + Encode(coding->subScheme, output, (uint8 *) sub, rlen); + else + Encode_Run(coding->subScheme, coding->sRunScheme, output, + (uint8 *) sub, rlen, coding->subChar); + return; +} + +int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) +{ int rlen, clen; + + // Get all 5 streams, compress each with its scheme, and output + + rlen = Read_Lines(input,5); + if (rlen < 0) + { if (rlen == -1) + EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT (-1); + } + + if (coding->delChar < 0) + { Encode(coding->delScheme, output, (uint8 *) Read, rlen); + clen = rlen; + } + else + { Encode_Run(coding->delScheme, coding->dRunScheme, output, + (uint8 *) Read, rlen, coding->delChar); + clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); + } + Number_Read(Read+Rmax); + Compress_Read(clen,Read+Rmax); + fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); + + if (lossy) + { uint8 *insert = (uint8 *) (Read+2*Rmax); + 
uint8 *merge = (uint8 *) (Read+3*Rmax); + int k; + + for (k = 0; k < rlen; k++) + { insert[k] = (uint8) ((insert[k] >> 1) << 1); + merge[k] = (uint8) (( merge[k] >> 2) << 2); + } + } + + Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); + Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); + if (coding->subChar < 0) + Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); + else + Encode_Run(coding->subScheme, coding->sRunScheme, output, + (uint8 *) (Read+4*Rmax), rlen, coding->subChar); + + return (rlen); +} + +int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) +{ int clen, tlen; + + // Decode each stream and write to output + + if (coding->delChar < 0) + { if (Decode(coding->delScheme, input, entry[0], rlen)) + EXIT(1); + clen = rlen; + tlen = COMPRESSED_LEN(clen); + if (tlen > 0) + { if (fread(entry[1],tlen,1,input) != 1) + { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); + EXIT(1); + } + } + Uncompress_Read(clen,entry[1]); + Lower_Read(entry[1]); + } + else + { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, + entry[0], rlen, coding->delChar)) + EXIT(1); + clen = Packed_Length(entry[0],rlen,coding->delChar); + tlen = COMPRESSED_LEN(clen); + if (tlen > 0) + { if (fread(entry[1],tlen,1,input) != 1) + { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); + EXIT(1); + } + } + Uncompress_Read(clen,entry[1]); + Lower_Read(entry[1]); + Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); + } + + if (Decode(coding->insScheme, input, entry[2], rlen)) + EXIT(1); + + if (Decode(coding->mrgScheme, input, entry[3], rlen)) + EXIT(1); + + if (coding->subChar < 0) + { if (Decode(coding->subScheme, input, entry[4], rlen)) + EXIT(1); + } + else + { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, + entry[4], rlen, coding->subChar)) + EXIT(1); + } + + return (0); +} diff --git a/QV.h b/QV.h new file mode 100644 index 
0000000..e5c9485 --- /dev/null +++ b/QV.h @@ -0,0 +1,99 @@ +/******************************************************************************************* + * + * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on + * the histogram of values occurring in a given file. The two low complexity streams + * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevalent + * character. + * + * Author: Gene Myers + * Date: Jan 18, 2014 + * Modified: July 25, 2014 + * + ********************************************************************************************/ + +#ifndef _QV_COMPRESSOR + +#include + +#define _QV_COMPRESSOR + + // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or + // batch version of the routines in this library are compiled. In batch mode, routines + // print an error message and exit. In interactive mode, the routines place the error + // message in EPLACE (also defined in DB.h) and return an error value, typically NULL + // if the routine returns a pointer, and an unusual integer value if the routine returns + // an integer. + // Below when an error return is described, one should understand that this value is returned + // only if the routine was compiled in INTERACTIVE mode. 
+ + // A PacBio compression scheme + +typedef struct + { void *delScheme; // Huffman scheme for deletion QVs + void *insScheme; // Huffman scheme for insertion QVs + void *mrgScheme; // Huffman scheme for merge QVs + void *subScheme; // Huffman scheme for substitution QVs + void *dRunScheme; // Huffman scheme for deletion run lengths (if delChar >= 0) + void *sRunScheme; // Huffman scheme for substitution run lengths (if subChar >= 0) + int delChar; // If >= 0, run-encoded deletion value (negative => deletion QVs are not run coded) + int subChar; // If >= 0, run-encoded substitution value (negative => substitution QVs are not run coded) + int flip; // Need to flip multi-byte integers + char *prefix; // Header line prefix + } QVcoding; + + // Read the next nlines of input, and QVentry returns a pointer to the first line if needed. + // If end-of-input is encountered before any further input, -1 is returned. If there is + // an error then -2 is returned. Otherwise the length of the line(s) read is returned. + +int Read_Lines(FILE *input, int nlines); +char *QVentry(); + + // Get and set the line counter for error reporting + +void Set_QV_Line(int line); +int Get_QV_Line(); + + // Read up to the next num entries or until eof from the .quiva file on input and record + // frequency statistics. Copy these entries to the temporary file temp if != NULL. + // If there is an error then -1 is returned, otherwise the number of entries read. + +int QVcoding_Scan(FILE *input, int num, FILE *temp); +void QVcoding_Scan1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub); + + // Given QVcoding_Scan has been called at least once, create an encoding scheme based on + // the accumulated statistics and return a pointer to it. The returned encoding object + // is *statically* allocated within the routine. If lossy is set then use a lossy scaling + // for the insertion and merge streams. If there is an error, then NULL is returned. + +QVcoding *Create_QVcoding(int lossy); + + // Read/write a coding scheme to input/output. 
The encoding object returned by the reader + // is *statically* allocated within the routine. If an error occurs while reading then + // NULL is returned. + +QVcoding *Read_QVcoding(FILE *input); +void Write_QVcoding(FILE *output, QVcoding *coding); + + // Free all the auxiliary storage associated with coding (but not the object itself!) + +void Free_QVcoding(QVcoding *coding); + + // Assuming the file pointer is positioned just beyond an entry header line, read the + // next set of 5 QV lines, compress them according to 'coding', and output. If lossy + // is set then the scheme is a lossy one. A negative value is returned if an error + // occurred, and the sequence length otherwise. + +int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); +void Compress_Next_QVentry1(int rlen, char *del, char *tag, char *ins, char *mrg, char *sub, + FILE *output, QVcoding *coding, int lossy); + + // Assuming the input is positioned just beyond the compressed encoding of an entry header, + // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, + // and place their decompressed values into entry which is a 5 element array of character + // pointers. The parameter rlen computed from the preceding header line, critically + // provides the length of each of the 5 vectors. A non-zero value is returned only if an + // error occurred. + +int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); + +#endif // _QV_COMPRESSOR diff --git a/README.md b/README.md new file mode 100644 index 0000000..93b703f --- /dev/null +++ b/README.md @@ -0,0 +1,536 @@ +# Daligner: The Dazzler "Overlap" Module + +## _Author: Gene Myers_ +## _First: April 10, 2016_ +## _Current: April 19, 2019_ + +For typeset documentation, examples of use, and design philosophy please go to +my [blog](https://dazzlerblog.wordpress.com/command-guides/daligner-command-reference-guide). 
+ +### Version Numbers + +v1.0 has been released, but if you need to refer to a later revision +from the stable master branch, please use ``v1.0.yyyymmdd`` where +``yyyy-mm-dd`` is the date of the commit used. This is important for +method details in scientific papers, and for software packaging +(e.g. Conda, HomeBrew, or Linux distribution packages). + +The commands below permit one to find all significant local alignments between reads +encoded in Dazzler database. The assumption is that the reads are from a PACBIO RS II +long read sequencer. That is the reads are long and noisy, up to 15% on average. + +Recall that a database has a current partition that divides it into blocks of a size +that can conveniently be handled by calling the **daligner** overlapper on all the pairs of +blocks producing a collection of .las local alignment files that can then be sorted and +merged into an ordered sequence of sorted files containing all alignments between reads +in the data set. The alignment records are parsimonious in that they do not record an +alignment but simply a set of trace points, typically every 100bp or so, that allow the +efficient reconstruction of alignments on demand. + +All programs add suffixes (e.g. .db, .las) as needed. +For the commands that take multiple .db or .las file blocks as arguments, i.e. **daligner**, **LAsort**, **LAmerge**, **LAcat**, +and **LAcheck**, one can place a @-sign in the name, which is then interpreted as the sequence of files +obtained by replacing the @-sign by 1, 2, 3, ... in sequence until a number is reached for +which no file matches. One can also place a @-sign followed by an integer, say, i, in which +case the sequence starts at i. Lastly, one can also use @i-j where i and j are integers, in +which case the sequence is from i to j, inclusive. + +The formal UNIX command line +descriptions and options for the DALIGNER module commands are as follows: + +``` +1. 
daligner [-vaAI] + [-k] [-%] [-h] [-w] [-t] [-M] + [-e] [-H] + [-T] [-P] [-m]+ + ... +``` + +Compare sequences in the trimmed *\<subject\>* block against those in the list of *\<target\>* +blocks searching for local alignments involving at least -l base pairs (default 1000) +or more, that have an average correlation rate of -e (default 70%). The local +alignments found will be output in a sparse encoding where a trace point on the +alignment is recorded every -s base pairs of the a-read (default 100bp). Reads are +compared in both orientations and local alignments meeting the criteria are output to +one of several created files described below. The -v option turns on a verbose +reporting mode that gives statistics on each major step of the computation. The +program runs with 4 threads by default, but this may be set to any positive value with +the -T option. + +The options -k, -%, -h, and -w control the initial filtration search for possible matches +between reads. Specifically, our search code looks for a pair of diagonal bands of +width 2<sup>w</sup> (default 2<sup>6</sup> = 64) that contain a collection of matching k-mers +(default 16) in the lowest %-percentile between the two reads, such that the total number of bases covered by the k-mer hits is h (default 50). k cannot be larger than 32 in the current implementation. *These parameters will shortly be superseded with a more intuitive interface.* + +If there are one or more interval tracks specified with the -m option, then the reads +of the DB or DB's to which the mask applies are soft masked with the union of the +intervals of all the interval tracks that apply, that is any k-mers that contain any +bases in any of the masked intervals are ignored for the purposes of seeding a match. +An interval track is a track, such as the "dust" track created by DBdust, that encodes +a set of intervals over either the untrimmed or trimmed DB. + +Invariably, some k-mers are significantly over-represented (e.g. homopolymer runs). 
+These k-mers create an excessive number of matching k-mer pairs and left unaddressed +would cause daligner to overflow the available physical memory. One way to deal with +this is to explicitly set the -t parameter which suppresses the use of any k-mer that +occurs more than t times in either the subject or target block. However, a better way +to handle the situation is to let the program automatically select a value of t that +meets a given memory usage limit specified (in Gb) by the -M parameter. By default +daligner will use the amount of physical memory as the choice for -M. If you want to +use less, say only 8Gb on a 24Gb HPC cluster node because you want to run 3 daligner +jobs on the node, then specify -M8. Specifying -M0 basically indicates that you do not +want daligner to self adjust k-mer suppression to fit within a given amount of memory. + +Each found alignment is recorded as -- a[ab,ae] x bo[bb,be] -- where a and b are the +indices (in the trimmed DB) of the reads that overlap, o indicates whether the b-read +is from the same or opposite strand, and [ab,ae] and [bb,be] are the intervals of a +and bo, respectively, that align. For each subject, target pair of blocks, say X and Y, +the program reports alignments where the a-read is in X and the b-read is in Y, or +vice versa. However, if the -A option is set ("A" for "asymmetric") then just overlaps +where the a-read is in X and the b-read is in Y are reported, and if X = Y then it +further reports only those overlaps where the a-read index is less than the b-read index. +In either case, if the -I option is set ("I" for "identity") then when X = Y, overlaps +between different portions of the same read will also be found and reported. In summary, +the command `daligner -A X Y` produces a single file `X.Y.las` and `daligner X Y` produces +2 files `X.Y.las` and `Y.X.las` (unless X=Y in which case only a single file, `X.X.las`, is +produced). 
The overlap records in one of these files are sorted as described for LAsort. +The -a option to daligner is passed directly through to LAsort which is actually called +as a sub-process to produce the sorted file. +In order to produce the aforementioned .las file, several temporary .las files, two for +each thread, are produced in the sub-directory /tmp by default. You can override this +location by specifying the directory you would like this activity to take place in with +the -P option. + +By default daligner compares all overlaps between reads in the database that are +greater than the minimum cutoff set when the DB or DBs were split, typically 1 or +2 Kbp. However, the HGAP assembly pipeline only wants to correct large reads, say +8Kbp or over, and so needs only the overlaps where the a-read is one of the large +reads. By setting the -H parameter to say N, one alters daligner so that it only +reports overlaps where the a-read is over N base-pairs long. + +While the default parameter settings are good for raw Pacbio data, daligner can be used +for efficiently finding alignments in corrected reads or other less noisy reads. For +example, for mapping applications against .dams we run `daligner -k20 -h60 -e.85` and +on corrected reads, we typically run `daligner -k25 -w5 -h60 -e.95 -s500` and at +these settings it is very fast. + +``` +2. LAsort [-va] ... +``` + +Sort each .las alignment file specified on the command line. For each file it reads in +all the overlaps in the file and sorts them in lexicographical order of (a,b,o,ab) +assuming each alignment is recorded as a[ab,ae] x bo[bb,be]. It then writes them all +to a file named \.S.las (assuming that the input file was \.las). With the +-v option set then the program reports the number of records read and written. If the +-a option is set then it sorts LAs in lexicographical order of (a,ab) alone, which is +desired when sorting a mapping of reads to a reference. 
+ +If the .las file was produced by damapper the local alignments are organized into +chains where the LA segments of a chain are consecutive and ordered in the file. +LAsort can detect that it has been passed such a file and if so treats the chains as +a unit and sorts them on the basis of the first LA in the chain. + +``` +3. LAmerge [-va] [-P] ... +``` + +Merge the .las files \ into a single sorted file \, where it is assumed +that the input \ files are sorted. There are no limits to how many files can be +merged, but if there are more than 252, a typical UNIX OS limit on the number of simultaneously +open files, then the program recursively spawns sub-processes and creates temporary files +in the directory specified by the -P option, /tmp by default. +With the -v option set the program reports the number of +records read and written. The -a option indicates the sort is as described for LAsort +above. + +If the .las file was produced by damapper the local alignments are organized into +chains where the LA segments of a chain are consecutive and ordered in the file. When +merging such files, LAmerge treats the chains as a unit and orders them on the basis +of the first LA in the chain. + +Used correctly, LAmerge and LAsort together allow one to perform an "external" sort +that produces a collection of sorted files containing in aggregate all the local +alignments found by the daligner, such that their concatenation is sorted in order of +(a,b,o,ab) (or (a,ab) if the -a option is set). In particular, this means that all the +alignments for a given a-read will be found consecutively in one of the files. So +computations that need to look at all the alignments for a given read can operate in +simple sequential scans of these sorted files. + +``` +4. LAshow [-caroUF] [-i] [-w] [-b] + [ ] + [ | ... 
] +``` + +LAshow produces a printed listing of the local alignments contained in the specified +.las file, where the a- and b-reads come from src1 or from src1 and src2, respectively. +If a file or list of read ranges is given then only the overlaps for which the a-read +is in the set specified by the file or list are displayed. See DBshow for an explanation +of how the file and list of read ranges are interpreted. If the -F option is set then +the roles of the a- and b- reads are reversed in the display. + +If the -c option is given then a cartoon rendering is displayed, and if -a or -r option +is set then an alignment of the local alignment is displayed. The -a option puts +exactly -w columns per segment of the display, whereas the -r option puts exactly -w +a-read symbols in each segment of the display. The -r display mode is useful when one +wants to visually compare two alignments involving the same a-read. If a combination of +the -c, -a, and -r flags is set, then the cartoon comes first, then the -a alignment, +and lastly the -r alignment. The -i option sets the indent for the cartoon and/or +alignment displays, if they are requested. The -b option sets the number of symbols on +either side of the aligned segments in an alignment display, and -U specifies that +uppercase should be used for DNA sequence instead of the default lowercase. If the +-o option is set then only alignments that are proper overlaps (a sequence end occurs +at each end of the alignment) are displayed. If the -F option is given then the +roles of the A- and B-reads are flipped. + +When examining LAshow output it is important to keep in mind that the coordinates +describing an interval of a read are referring conceptually to positions between bases +starting at 0 for the position to the left of the first base. That is, a coordinate c +refers to the position between the c-1'st and c'th base, and the interval [b,e] captures +the e-b bases from the b'th to the e-1'st, inclusive. 
We give an example with a cartoon +and (part of an) alignment for which we will explain several additional +important points: + +``` + 1 1,865 c [18,479..20,216] x [ 1,707..0> (24,451 x 7,283 bps, 19 trace pts) + + 18479 4235 + A ========+----------+======> dif/(len1+len2) = 478/(1737+1707) = 27.76% + B <======+----------- + 5576 + + 18469 agccgcctag[tgcctcgcaaacgc-t-cggggcggcgt-gaaagcgg-- + ::::::::::[||||||||||||||*|*|||*|||*|||*||||||||** + 1717 ctcttcttta[tgcctcgcaaacgccttcggcgcg-cgttgaaagcggtt 17.9% + + 18513 -ccggtgggtc--agtggcgagttctggcagtgcgctggg-ctgcgaaat + *||||||*|||**|||||*||||*|*|*|||**|||||||*||*|||||| + 1669 gccggtgcgtcgcagtgg-gagt-c-gtcag--cgctggggcttcgaaat 24.0% + + . . . +``` + +The display of an LA always begins with a line giving the A-read, then the B-read, then +an indication of orientation (i.e. 'n' for same strand, and 'c' for the opposite strand) +followed by the A-interval and B-interval that are aligned and in parentheses +the lengths of the two reads and the number of tracepoints in the alignment between them. +In particular, +note carefully that when the B-read is in the complement orientation (c), then the +B-interval gives the higher coordinate first, the idea being that one will align from +the highest base down to the lowest base in the descending direction on B, complement +the characters as you go. Further note that in the alignment display the coordinates at +the start of each line follow this orientation convention and give the coordinate of the +"tick mark" just left of the first character in each line. It is useful to know if an +interval reaches the end of read, and to signal this we use an angle-bracket \<\> instead +of a square bracket [], e.g. in the example the B-segment starts at the beginning of the +read. Finally, observe that in the cartoon the numbers are not coordinates but rather +indicate the lengths of the unaligned bits left and right of the two aligned intervals. 
+Finally, observe that in the cartoon the numbers are not coordinates but rather indicate +the lengths of the unaligned bits left and right of the two aligned intervals. + +With the introduction of damapper, .las files can now contain chains. If LAshow detects +that it has been passed a file with chain information then it displays marks at the left +that reveal the chain structure, e.g.: + +``` + > 117 37,630 c [ 253.. 7,980] x [ 331,430.. 324,027] ~ 10.5% + + 117 37,628 n [ 253.. 7,983] x [21,493,673..21,501,079] ~ 10.6% + + 117 57 c [ 253.. 1,086] x [ 2,008,164.. 2,007,369] ~ 9.8% + - 117 57 c [ 1,300.. 7,982] x [ 2,007,351.. 2,000,945] ~ 10.7% + > 117 15 c [ 7,992.. 8,716] x [ 242,529.. 241,822] ~ 7.8% + - 117 15 c [ 8,752..14,299] x [ 241,824.. 236,425] ~ 10.7% + - 117 15 c [14,133..14,832] x [ 236,630.. 235,953] ~ 12.1% + + 117 37,628 n [ 7,992.. 8,716] x [19,202,357..19,203,064] ~ 7.7% + - 117 37,628 n [ 8,752..14,832] x [19,203,062..19,208,974] ~ 10.9% +``` + +A chain begins with either a > or + character, where > indicates this is the highest +scoring chain and + indicates an alternate near optimal chain (controlled by the +-n parameter to damapper). Each additional LA of a chain is marked with a - character. + +``` +5a. LAdump [-cdtlo] [ ] + [ | ... ] + +5b. dumpLA +``` + +Like LAshow, LAdump allows one to display the local alignments (LAs) of a subset of the +piles in an .las file and select which information to show about them. The difference +is that the information is written in a very simple "1-code" ASCII format that makes it +easy for one to read and parse the information for further use. For each LA the pair of +reads is output on a line. -c requests that one further output the coordinates of the +LA segments be output. The -d option requests that the number of difference in the LA +be output, -t requests that the tracepoint information be output, and -l requests the +length of the two reads be output. 
Finally, -o requests that only LAs that are proper +overlaps be output. + +The format is very simple. Each requested piece of information occurs on a line. The +first character of every line is a "1-code" character that tells you what information +to expect on the line. The rest of the line contains information where each item is +separated by a single blank space. The trace point line gives the number of trace +point intervals in the LA and is immediately followed by that many lines containing +a pair of integers giving the number of differences and b-displacement in each successive +trace point interval. + +``` + P #a #b #o #c - (#a,#b^#o) have an LA between them where #o is 'n' or 'c' and + #c is '>' (start of best chain), '+' (start of alternate chain), + '-' (continuation of chain), or '.' (no chains in file). + L #la #lb - #la is the length of the a-read and #lb that of the b-read + C #ab #ae #bb #be - #a[#ab,#ae] aligns with #b^#o[#bb,#be] + D # - there are # differences in the LA + T #n - there are #n trace point intervals for the LA + (#d #y )^#n - there are #d difference aligning the #y bp's of B with the + next fixed-size interval of A + + X # - Total amount of X (X = P or T) + % X # - Maximum amount of X in any pile (X = P or T) + @ T # - Maximum number of trace points in any trace +``` + +1-code lines that begin with +, %, or @ are always the first lines in the output. +They give size information about what is contained in the output. Specifically, +'+ X #' gives the total number of LAs (X=P), or the total number of trace point +intervals (X=T) in the file . '% X #' gives the maximum number of LAs (X=P) or +the maximum number of trace point intervals (X=T) in a given *pile* (collection of +LAs all with the same a-read (applies only to sorted .las files). A final line: '@ T #', +gives the maximum # of trace point intervals in any trace within the file. 
+After these lines and before the start of the lines describing alignment records is a +single line of the form 'X #' where the number is the trace point spacing for all +alignments. + +The command dumpLA reads a 1-code file from the standard input and if possible produces a .las +file for it. The 1-code file is any legitimate coding of alignments as might be produced by LAdump. +The 1-code file must contain the P-, C-, and T-lines as well as the X-line and the header lines +beginning with +, %, or @. So for example, a 1-code file produced by LAdump with the -c and -t +options is invertible. + +``` +6a. LAa2b +6b. LAb2a +``` + +Pipes (stdin to stdout) that convert an ASCII output produced by LAdump into a compressed +binary representation (LAa2b) and vice versa (LAb2a). The idea is to save disk space by +keeping the dumps in a more compressed format. + +``` +7. LAcat [-v] ... > .las +``` + +The sequence of \ files (that can contain @-sign block ranges) are +concatenated in order +into a single .las file and the result is piped to the standard output. The -v +option reports the files concatenated and the number of la's within them to +standard error (as the standard output receives the concatenated file). + +``` +8. LAsplit [-v] ( | ) < .las +``` + +If the second argument is an integer n, then divide the alignment file \, piped +in through the standard input, as evenly as possible into n alignment files with the +names specified by template \, subject to the restriction that all alignment +records for a given a-read are in the same file. The name of the n files is the +string \ where the single @-sign that occurs somewhere in it is replaced +by i for i in [1,n] and a .las extension is added if necessary. + +If the second argument refers to a database \.db that has been partitioned, then +divide the input alignment file into block .las files where all records whose a-read is +in \.i.db are in the i'th file generated from the template \. 
The -v +option reports the files produced and the number of la's within them to standard error. + +``` +9. LAcheck [-vaS] [ ] ... +``` + +LAcheck checks each .las file for structural integrity, where the a- and b-sequences +come from src1 or from src1 and src2, respectively. That is, it makes sure each file +makes sense as a plausible .las file, e.g. values are not out of bounds, the number of +records is correct, the number of trace points for a record is correct, and so on. If +the -S option is set then it further checks that the alignments are in sorted order, +by default pile order, but if -a is also set, then map order. +If the -v option is set then a line is output for each .las file saying either the +file is OK or reporting the first error. If the -v option is not set then the program +runs silently. The exit status is 0 if every file is deemed good, and 1 if at least +one of the files looks corrupted. + +With the introduction of damapper, LAcheck checks to see if a file has chain +information, and if it does, then it checks the validity of chains and checks the +sorting order of chains as a unit according to the -a option. + +``` +10. HPC.daligner [-vad] [-t] [-w] [-l] + [-P] [-B] [-T] [-f] + ( [-k] [-h] [-e] + [-k] [-h] [-e ) + [-m]+ [[-]] +``` + +HPC.daligner writes a UNIX shell script to the standard output or to a series of files +beginning with the prefix \ if the -f option is set, that either performs an +"overlap" computation on all the blocks in a single database, or a "comparison" +computation on all pairs of blocks between two databases, depending on whether it is +given one or two DB's as arguments (\ and \). We describe the overlap +script and its effect first and then later the comparison script. 
+ +An Overlap Script: consists of a sequence of commands that effectively run daligner on +all pairs of blocks of a split database and then externally sorts and merges them using +LAsort and LAmerge into a collection of alignment files with names \.#.las where # +ranges from 1 to the number of blocks the data base is split into. These sorted files +if concatenated by say LAcat would contain all the alignments in sorted order (of +a-read, then b-read, ...). Moreover, all overlaps for a given a-read are guaranteed +to not be split across files, so one can run artifact analyzers or error correction on +each sorted file in parallel. + +The data base must have been previously split by DBsplit and all the parameters, except +-a, -d, -f, -B, and -D, are passed through to the calls to daligner. The defaults for +these parameters are as for daligner. The -v and -a flags are passed to all calls to +LAsort and LAmerge. All other options are described later. For a database divided into +N sub-blocks, the calls to daligner will produce in total N<sup>2</sup> .las files, +one per block pair. +These are then merged so that there is 1 file per row of +the N x N block matrix. So at the end one has N sorted .las files, one per block of +A-reads, that when +concatenated would give a single large sorted overlap file. + +The -B option (default 4) gives the desired number of block comparisons per call to +daligner. Some must contain B-1 comparisons, and the first B-2 block comparisons +even less, but the HPCdaligner "planner" does the best it can to give an average load +of -B block comparisons per command. + +If the integers \ and \ are missing then the script produced is for every +block in the database. 
If \ is present then HPCdaligner produces an incremental +script that compares blocks \ through \ (\ = \ if not present) +against each other and all previous blocks 1 through \-1, and then incrementally +updates the .las files for blocks 1 through \-1, and creates the .las files for +blocks \ through \. + +A Comparison Script: consists of a sequence of commands that effectively maps every +read in the DB \ against a reference set of sequences in the DB \, recording +all the found local alignments in the sequence of files \.1.\.las, +\.2.\.las, ... where \.\.k.las contains the alignments between all +of \ and the k'th block of \. The parameters are exactly the same as for the +overlap script save that the -k, -h, and -e defaults are set more stringently for +mapping, and the -A, -I , and -H options make no sense as \ and \ are +expected to be distinct data sets. If the integers \ and \ are missing then +the script produced is for every block in the database \. If \ is present +then HPC.daligner produces a script that compares blocks \ through \ (\ += \ if not present) of \ against DAM \. + +The command scripts output by HPC.daligner and other HPC.\ programs consists of +command blocks each of which begins with a comment line (begins with #) followed by a +potentially long list of lines each containing a shell command. Command blocks whose +comment mentions "jobs" and gives the number of said in parenthesis, we call parallel +blocks because each command line in the block can be sent to a node in a cluster for +independent execution, i.e. none of the commands in a block depend on another in the +block. The remaining command blocks we call house-keeping blocks because they can be +executed by the shell on the launch/server node and the commands are either checking +the integrity of .las files with LAcheck, or removing intermediate files with rm. Each +block should be performed in the order given and should complete before the next block +is performed. 
+ +If the -f option is set, then each command block is written to a file with a name of +the form \.#.\ where \ is specified by the user in the -f option +argument, # gives the order in which the command block in the given file is to be +performed in relation to other command block files, and \ is a (very) +short symbolic reminder of what the block is doing. For example, "HPC.daligner -fJOBS +DB" would produce the files: + +``` + JOBS.01.OVL + JOBS.02.CHECK.OPT + JOBS.03.MERGE + JOBS.04.RM.OPT +``` + +There are always 4 command blocks. The files with the suffix .OPT are +optional and need not be executed albeit we highly recommend that one run the +CHECK block. One should *not* run the RM block if one wants to later use +DASrealign after scrubbing. + +A new -d option requests scripts that organize files into a collection of +sub-directories so as not to overwhelm the underlying OS for large genomes. Recall +that for a DB divided into N blocks, the daligner will produce N<sup>2</sup> .las-files. +With the -d option set, N sub-directories (with respect to the directory HPC.daligner is +called in) of the form "work\<i\>" for i from 1 to N are created in an initial command +block, and then all work files are placed in those sub-directories, with a maximum +of 2N files appearing in any sub-directory at any given point in the process. 
+ +Example: + +``` +// Recall G.db from the example in DAZZ_DB/README + +> cat G.db +files = 1 + 1862 G Sim +blocks = 2 +size = 11 cutoff = 0 all = 0 + 0 0 + 1024 1024 + 1862 1862 +> HPCdaligner -mdust -t5 G | csh -v // Run the HPCdaligner script + +# Dazzler jobs (2) +dazzler -d -t5 -mdust G.1 G.1 +dazzler -d -t5 -mdust G.2 G.1 G.2 +# Initial sort jobs (4) +LAsort G.1.G.1.*.las && LAmerge G.L1.1.1 G.1.G.1.*.S.las && rm G.1.G.1.*.S.las +LAsort G.1.G.2.*.las && LAmerge G.L1.1.2 G.1.G.2.*.S.las && rm G.1.G.2.*.S.las +LAsort G.2.G.1.*.las && LAmerge G.L1.2.1 G.2.G.1.*.S.las && rm G.2.G.1.*.S.las +LAsort G.2.G.2.*.las && LAmerge G.L1.2.2 G.2.G.2.*.S.las && rm G.2.G.2.*.S.las +# Level 1 jobs (2) +LAmerge G.1 G.L1.1.1 G.L1.1.2 && rm G.L1.1.1.las G.L1.1.2.las +LAmerge G.2 G.L1.2.1 G.L1.2.2 && rm G.L1.2.1.las G.L1.2.2.las + +> LAshow -c -a:G -w50 G.1 | more // Take a look at the result ! + +G.1: 34,510 records + + 1 9 c [ 0.. 1,876] x [ 9,017..10,825] ( 18 trace pts) + + 12645 + A ---------+====> dif/(len1+len2) = 398/(1876+1808) = 21.61% + B <====+--------- + 9017 + + 1 ..........gtg-cggt--caggggtgcctgc-t-t-atcgcaatgtta + |||*||||**||||||||*||||*|*|*||**|*|*|||| + 9008 gagaggccaagtggcggtggcaggggtg-ctgcgtcttatatccaggtta 27.5% + + 35 ta-ctgggtggttaaacttagccaggaaacctgttgaaataa-acggtgg + ||*|||||||||||||*|**|*||*|*||||||*|**|||||*|*||||| + 9057 tagctgggtggttaaa-tctg-ca-g-aacctg-t--aataacatggtgg 24.0% + + 83 -ctagtggcttgccgtttacccaacagaagcataatgaaa-tttgaaagt + *||||||||*||||||||*||**||||*|||**|||||||*||||*|||| + 9100 gctagtggc-tgccgttt-ccgcacag-agc--aatgaaaatttg-aagt 20.0% + + 131 ggtaggttcctgctgtct-acatacagaacgacggagcgaaaaggtaccg + ||*|||||||||||||*|*||||*|*|*||||||||||*||||||||||* + 9144 gg-aggttcctgctgt-tcacat-c-ggacgacggagc-aaaaggtacc- 16.0% + +... + +> LAcat G >G.las // Combine G.1.las & G.2.las into a single .las file +> LAshow G G | more // Take another look, now at G.las + +G: 62,654 records + 1 9 c [ 0.. 
1,876] x [ 9,017..10,825] : < 398 diffs ( 18 trace pts) + 1 38 c [ 0.. 7,107] x [ 5,381..12,330] : < 1,614 diffs ( 71 trace pts) + 1 49 n [ 5,493..14,521] x [ 0.. 9,065] : < 2,028 diffs ( 91 trace pts) + 1 68 n [12,809..14,521] x [ 0.. 1,758] : < 373 diffs ( 17 trace pts) + 1 147 c [ 0..13,352] x [ 854..14,069] : < 2,993 diffs (133 trace pts) + 1 231 n [10,892..14,521] x [ 0.. 3,735] : < 816 diffs ( 37 trace pts) + 1 292 c [ 3,835..14,521] x [ 0..10,702] : < 2,353 diffs (107 trace pts) + 1 335 n [ 7,569..14,521] x [ 0.. 7,033] : < 1,544 diffs ( 70 trace pts) + 1 377 c [ 9,602..14,521] x [ 0.. 5,009] : < 1,104 diffs ( 49 trace pts) + 1 414 c [ 6,804..14,521] x [ 0.. 7,812] : < 1,745 diffs ( 77 trace pts) + 1 415 c [ 0.. 3,613] x [ 7,685..11,224] : < 840 diffs ( 36 trace pts) + 1 445 c [ 9,828..14,521] x [ 0.. 4,789] : < 1,036 diffs ( 47 trace pts) + 1 464 n [ 0.. 1,942] x [12,416..14,281] : < 411 diffs ( 19 trace pts) + +... +``` diff --git a/align.c b/align.c new file mode 100644 index 0000000..c323ce2 --- /dev/null +++ b/align.c @@ -0,0 +1,5453 @@ +/******************************************************************************************* + * + * Fast alignment discovery and trace generation along with utilites for displaying alignments + * Based on previously unpublished ideas from 2005, subsequently refined in 2013-14. Basic + * idea is to keep a dynamically selected interval of the f.r. waves from my 1986 O(nd) paper. + * A recent cool idea is to not record all the details of an alignment while discovering it + * but simply record trace points through which the optimal alignment passes every 100bp, + * allowing rapid recomputation of the alignment details between trace points. 
+ *
+ *  Author :  Gene Myers
+ *  First  :  June 2013
+ *  Current:  June 1, 2014
+ *
+ ********************************************************************************************/
+
+  //  NOTE(review): the system header names were missing from this copy of the patch (seven
+  //    bare "#include" lines).  The list below is the customary one for this module and
+  //    should be confirmed against the upstream file.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <math.h>
+#include <limits.h>
+
+#include "DB.h"
+#include "align.h"
+
+#undef     DEBUG_PASSES     //  Show forward / backward extension termini for Local_Alignment
+#undef     DEBUG_POINTS     //  Show trace points
+#undef     DEBUG_WAVE       //  Show waves of Local_Alignment
+#undef     SHOW_MATCH_WAVE  //  For waves of Local_Alignment also show # of matches
+#undef     SHOW_TRAIL       //  Show trace at the end of forward and reverse passes
+#undef     SHOW_TPS         //  Show trace points as they are encountered in a wave
+
+#undef     DEBUG_EXTEND     //  Show waves of Extend_Until_Overlap
+
+#undef     DEBUG_ALIGN      //  Show division points of Compute_Trace
+#undef     DEBUG_TRACE      //  Show trace additions for Compute_Trace
+#undef     DEBUG_SCRIPT     //  Show script additions for Compute_Trace
+#undef     DEBUG_AWAVE      //  Show F/R waves of Compute_Trace
+
+#undef     SHOW_TRACE       //  Show full trace for Print_Alignment
+
+#undef     WAVE_STATS
+
+
+/****************************************************************************************\
+*                                                                                        *
+*  Working Storage Abstraction                                                           *
+*                                                                                        *
+\****************************************************************************************/
+
+typedef struct            //  Hidden from the user, working space for each thread
+  { int     vecmax;       //  Bytes allocated for vector
+    void   *vector;       //  Wave vectors carved up by forward_wave / reverse_wave
+    int     celmax;       //  Number of Pebble records allocated for cells
+    void   *cells;        //  Trace-point cells built by forward_wave / reverse_wave
+    int     pntmax;       //  Bytes allocated for points
+    void   *points;
+    int     tramax;       //  Bytes allocated for trace
+    void   *trace;
+    int     alnmax;       //  Bytes allocated for alnpts
+    void   *alnpts;
+  } _Work_Data;
+
+/* Allocate a work data block with every buffer empty (NULL pointer, zero capacity).
+   Returns the block cast to the public Work_Data type; on allocation failure Malloc
+   has already reported the error and EXIT(NULL) is taken.                            */
+
+Work_Data *New_Work_Data()
+{ _Work_Data *work;
+
+  work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block");
+  if (work == NULL)
+    EXIT(NULL);
+  work->vecmax = 0;
+  work->vector = NULL;
+  work->pntmax = 0;
+  work->points = NULL;
+  work->tramax = 0;
+  work->trace  = NULL;
+  work->alnmax = 0;
+  work->alnpts = NULL;
+  work->celmax = 0;
+  work->cells  = NULL;
+  return ((Work_Data *) work);
+}
+
+/* Shared growth policy for the byte buffers of a work block: resize *buf to
+   1.2*newmax + 10000 bytes and record the new capacity in *cap.  The size is computed
+   in 64 bits and clamped to INT32_MAX, because converting an out-of-range double to
+   int is undefined and the capacity fields are ints.  On failure *buf and *cap are
+   left unchanged (Realloc has reported the error) and EXIT(1) is taken; returns 0 on
+   success.                                                                           */
+
+static int enlarge_buffer(void **buf, int *cap, int newmax, char *mesg)
+{ void *vec;
+  int64 max;
+
+  max = ((int64) (newmax*1.2)) + 10000;
+  if (max > INT32_MAX)
+    max = INT32_MAX;
+  vec = Realloc(*buf,max,mesg);
+  if (vec == NULL)
+    EXIT(1);
+  *cap = (int) max;
+  *buf = vec;
+  return (0);
+}
+
+  //  One thin wrapper per buffer so that callers keep their original entry points.
+
+static int enlarge_vector(_Work_Data *work, int newmax)
+{ return (enlarge_buffer(&(work->vector),&(work->vecmax),newmax,"Enlarging DP vector")); }
+
+static int enlarge_points(_Work_Data *work, int newmax)
+{ return (enlarge_buffer(&(work->points),&(work->pntmax),newmax,"Enlarging point vector")); }
+
+static int enlarge_alnpts(_Work_Data *work, int newmax)
+{ return (enlarge_buffer(&(work->alnpts),&(work->alnmax),newmax,"Enlarging point vector")); }
+
+static int enlarge_trace(_Work_Data *work, int newmax)
+{ return (enlarge_buffer(&(work->trace),&(work->tramax),newmax,"Enlarging trace vector")); }
+
+/* Release a work block and every buffer it owns.  A NULL argument is ignored; the
+   individual buffers need no guard because free(NULL) is a no-op.                    */
+
+void Free_Work_Data(Work_Data *ework)
+{ _Work_Data *work = (_Work_Data *) ework;
+
+  if (work == NULL)
+    return;
+  free(work->vector);
+  free(work->cells);
+  free(work->trace);
+  free(work->points);
+  free(work->alnpts);
+  free(work);
+}
+
+
+/****************************************************************************************\
+*                                                                                        *
+*  ADAPTIVE PATH FINDING                                                                 *
+*                                                                                        *
+\****************************************************************************************/
+
+  //  Absolute/Fixed Parameters
+
+#define BVEC  uint64   //  Can be uint32 if PATH_LEN <= 32
+
+#define TRIM_LEN    15   //  Report as the tip, the last wave maximum for which the last
+                         //     2*TRIM_LEN edits are prefix-positive at rate ave_corr*f(bias)
+                         //     (max value is 20)
+
+#define PATH_LEN    60   //  Follow the last PATH_LEN columns/edges (max value is
63)
+
+  //  Derivative fixed parameters
+
+#define PATH_TOP  0x1000000000000000ll   //  Must be 1 << PATH_LEN
+#define PATH_INT  0x0fffffffffffffffll   //  Must be PATH_TOP-1
+#define TRIM_MASK 0x7fff                 //  Must be (1 << TRIM_LEN) - 1
+#define TRIM_MLAG 250                    //  How far can last trim point be behind best point
+#define WAVE_LAG   30                    //  How far can worst point be behind the best point
+
+  //  Scale factor selected by the bias index that New_Align_Spec computes from the base
+  //    frequencies; there it multiplies (1 - ave_corr) when deriving ave_path and the
+  //    match score.
+
+static double Bias_Factor[10] = { .690, .690, .690, .690, .780,
+                                  .850, .900, .933, .966, 1.000 };
+
+  //  Adjustable parameters
+
+typedef struct
+  { double ave_corr;      //  Average correlation given to New_Align_Spec
+    int    trace_space;   //  Trace point spacing given to New_Align_Spec
+    int    reach;         //  Flag given to New_Align_Spec (see Overlap_If_Possible)
+    float  freq[4];       //  Copy of the 4 base frequencies given to New_Align_Spec
+    int    ave_path;      //  PATH_LEN * (1 - Bias_Factor[bias] * (1 - ave_corr)), truncated
+    int16 *score;         //  Start of the table allocation: final score of each bit pattern
+    int16 *table;         //  score + (TRIM_MASK+1): final score minus best earlier prefix score
+  } _Align_Spec;
+
+/* Fill in bit table: TABLE[x] = 1 iff the alignment modeled by x (1 = match, 0 = mismatch)
+     has a non-negative score for every suffix of the alignment under the scoring scheme
+     where match = MATCH and mismatch = -1.  MATCH is set so that an alignment with TRIM_PCT
+     matches has zero score ( (1-TRIM_PCT) / TRIM_PCT ).
+
+   NOTE(review): set_table stores signed values rather than bits: TABLE[x] is the final
+     score minus the largest earlier prefix score, so the property above corresponds to
+     TABLE[x] >= 0.  The names MATCH and TRIM_PCT do not occur in the code; the scores
+     actually used are the mscore/dscore fields computed in New_Align_Spec.                */
+
+#define FRACTION 1000  //  Implicit fractional part of scores, i.e.
score = x/FRACTION
+
+typedef struct
+  { int    mscore;    //  Amount added to the score for a 1-bit (match)
+    int    dscore;    //  Amount subtracted from the score for a 0-bit
+    int16 *table;     //  Receives final score minus best earlier prefix score
+    int16 *score;     //  Receives final score
+  } Table_Bits;
+
+/* Recursively enumerate every TRIM_LEN-bit pattern.  'prefix' holds the 'bit' bits chosen
+   so far (earliest choice in the highest position), 'score' is their running score and
+   'max' the largest running score seen before the current bit.  At depth TRIM_LEN the
+   two tables are filled in for the completed pattern.                                    */
+
+static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms)
+{ if (bit >= TRIM_LEN)
+    { parms->table[prefix] = (int16) (score-max);
+      parms->score[prefix] = (int16) score;
+    }
+  else
+    { if (score > max)
+        max = score;
+      set_table(bit+1,(prefix<<1),score - parms->dscore,max,parms);
+      set_table(bit+1,(prefix<<1) | 1,score + parms->mscore,max,parms);
+    }
+}
+
+/* Create an alignment specification record including path tip tables & values.
+
+   ave_corr, trace_space and reach are stored as given and freq[0..3] is copied.  A bias
+   index into Bias_Factor is derived from freq[0]+freq[3] (folded to at most .5); below .2
+   a warning is printed and the index is fixed at 3.  One block holding both the score
+   and table arrays (TRIM_MASK+1 entries each) is allocated and filled by set_table.
+
+   Returns the record cast to Align_Spec.  If either allocation fails, anything already
+   allocated is released and EXIT(NULL) is taken.                                         */
+
+Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach)
+{ _Align_Spec *spec;
+  Table_Bits   parms;
+  double       match;
+  int          bias;
+
+  spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification");
+  if (spec == NULL)
+    EXIT(NULL);
+
+  spec->ave_corr    = ave_corr;
+  spec->trace_space = trace_space;
+  spec->reach       = reach;
+  spec->freq[0]     = freq[0];
+  spec->freq[1]     = freq[1];
+  spec->freq[2]     = freq[2];
+  spec->freq[3]     = freq[3];
+
+  match = freq[0] + freq[3];
+  if (match > .5)
+    match = 1.-match;
+  bias = (int) ((match+.025)*20.-1.);
+  if (match < .2)
+    { fprintf(stderr,"Warning: Base bias worse than 80/20%% ! (New_Align_Spec)\n");
+      fprintf(stderr," Capping bias at this ratio.\n");
+      bias = 3;
+    }
+
+  spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr)));
+  parms.mscore   = (int) (FRACTION * Bias_Factor[bias] * (1. - ave_corr));
+  parms.dscore   = FRACTION - parms.mscore;
+
+  parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table");
+  if (parms.score == NULL)
+    { free(spec);
+      EXIT(NULL);
+    }
+  parms.table = parms.score + (TRIM_MASK+1);    //  Second half of the same allocation
+
+  set_table(0,0,0,0,&parms);
+
+  spec->table = parms.table;
+  spec->score = parms.score;
+
+  return ((Align_Spec *) spec);
+}
+
+/* Release a specification made by New_Align_Spec.  Only 'score' is freed for the tables
+   because 'table' points into the same allocation.  A NULL argument is ignored.          */
+
+void Free_Align_Spec(Align_Spec *espec)
+{ _Align_Spec *spec = (_Align_Spec *) espec;
+
+  if (spec == NULL)
+    return;
+  free(spec->score);
+  free(spec);
+}
+
+  //  Read-only accessors for the fields of a specification
+
+double Average_Correlation(Align_Spec *espec)
+{ return (((_Align_Spec *) espec)->ave_corr); }
+
+int Trace_Spacing(Align_Spec *espec)
+{ return (((_Align_Spec *) espec)->trace_space); }
+
+float *Base_Frequencies(Align_Spec *espec)
+{ return (((_Align_Spec *) espec)->freq); }
+
+int Overlap_If_Possible(Align_Spec *espec)
+{ return (((_Align_Spec *) espec)->reach); }
+
+
+/****************************************************************************************\
+*                                                                                        *
+*  LOCAL ALIGNMENT FINDER: forward_/reverse_wave and Local_Alignment                     *
+*                                                                                        *
+\****************************************************************************************/
+
+
+#ifdef WAVE_STATS
+
+  //  Wave width statistics: widest wave, total width, number of waves, and restarts
+
+static int64 MAX, TOT, NWV;
+static int64 RESTARTS;
+
+void Init_Stats()
+{ MAX = TOT = NWV = 0;
+  RESTARTS = 0;
+}
+
+void Print_Stats()
+{ double ave;
+
+  if (NWV > 0)                 //  Report 0.0 rather than NaN when no wave was recorded
+    ave = (1.*TOT)/NWV;
+  else
+    ave = 0.;
+  printf("\nMax = %lld Ave = %.1f # = %lld\n",MAX,ave,NWV);
+  printf("\nRestarts = %lld\n",RESTARTS);
+}
+
+#endif
+
+
+#ifdef DEBUG_WAVE
+
+/* Print, for each diagonal k in [low,hgh], how far V[k] lags behind besta, followed by
+   besta and the coordinate pair implied by the diagonal holding it.  M is printed as
+   well when SHOW_MATCH_WAVE is defined.                                                  */
+
+static void print_wave(int *V, int *M, int low, int hgh, int besta)
+{ int k, bestk;
+
+  (void) M;
+  bestk = low;     //  Defined fallback: besta may sit on a diagonal outside [low,hgh]
+  printf(" [%6d,%6d]: ",low,hgh);
+  for (k = low; k <= hgh; k++)
+    { if (besta == V[k])
+        bestk = k;
+      // printf(" %3d",(V[k]+k)/2);
+      printf(" %3d",besta-V[k]);
+    }
+  printf(" : %d (%d,%d)\n",besta,(besta+bestk)/2,(besta-bestk)/2);
+#ifdef SHOW_MATCH_WAVE
+  printf(" ");
+  for (k = low; k <= hgh; k++)
+    printf(" %3d",M[k]);
+  printf("\n");
+#endif
+  fflush(stdout);
+}
+
+#endif
+
+/* At each furthest reaching point, keep
a-coordinate of point (V), bitvector + recording the last TRIM_LEN columns of the implied alignment (T), and the + # of matches (1-bits) in the bitvector (M). */ + +typedef struct + { int ptr; + int diag; + int diff; + int mark; + } Pebble; + +static int VectorEl = 6*sizeof(int) + sizeof(BVEC); + +static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, + int *mind, int maxd, int mida, int minp, int maxp, int aoff, int boff) +{ char *aseq = align->aseq; + char *bseq = align->bseq; + Path *apath = align->path; + + int hgh, low, dif; + int vlen, vmin, vmax; + int *V, *M; + int *_V, *_M; + BVEC *T; + BVEC *_T; + + int *HA, *HB; + int *_HA, *_HB; + int *NA, *NB; + int *_NA, *_NB; + Pebble *cells; + int avail, cmax; + + int TRACE_SPACE = spec->trace_space; + int PATH_AVE = spec->ave_path; + int REACH = spec->reach; + int16 *SCORE = spec->score; + int16 *TABLE = spec->table; + + int besta, besty; + int trima, trimy, trimd; + int trimha, trimhb; + int morea, morey, mored; + int moreha, morehb; + int more, morem, lasta; + int aclip, bclip; + + hgh = maxd; + low = *mind; + dif = 0; + + { int span, wing; + + span = (hgh-low)+1; + vlen = work->vecmax/VectorEl; + wing = (vlen - span)/2; + vmin = low - wing; + vmax = hgh + wing; + + _V = ((int *) work->vector); + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; + + cells = (Pebble *) (work->cells); + cmax = work->celmax; + avail = 0; + } + + /* Compute 0-wave starting from mid-line */ + + more = 1; + aclip = INT32_MAX; + bclip = -INT32_MAX; + + besta = trima = morea = lasta = mida; + besty = trimy = morey = (mida-hgh) >> 1; + trimd = mored = 0; + trimha = moreha = 0; + trimhb = morehb = 1; + morem = -1; + + { int k; + char *a; + + a = aseq + hgh; + for (k = hgh; k >= low; k--) + { int y, c, d; + int ha, 
hb; + int na, nb; + Pebble *pb; + + y = (mida-k) >> 1; + + if (avail >= cmax-1) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } + + na = (((y+k)+(TRACE_SPACE-aoff))/TRACE_SPACE-1)*TRACE_SPACE+aoff; +#ifdef SHOW_TPS + printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = -1; + pb->diag = k; + pb->diff = 0; + pb->mark = na; + ha = avail++; + na += TRACE_SPACE; + + nb = ((y+(TRACE_SPACE-boff))/TRACE_SPACE-1)*TRACE_SPACE+boff; +#ifdef SHOW_TPS + printf(" B %d: %d,%d,0,%d\n",avail,-1,k,nb); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = -1; + pb->diag = k; + pb->diff = 0; + pb->mark = nb; + hb = avail++; + nb += TRACE_SPACE; + + while (1) + { c = bseq[y]; + if (c == 4) + { more = 0; + if (bclip < k) + bclip = k; + break; + } + d = a[y]; + if (c != d) + { if (d == 4) + { more = 0; + aclip = k; + } + break; + } + y += 1; + } + c = (y << 1) + k; + + while (y+k >= na) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = ha; + pb->diag = k; + pb->diff = 0; + pb->mark = na; + ha = avail++; + na += TRACE_SPACE; + } + while (y >= nb) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = hb; + pb->diag = k; + pb->diff = 0; + pb->mark = nb; + hb = avail++; + nb += 
TRACE_SPACE; + } + + if (c > besta) + { besta = trima = lasta = c; + besty = trimy = y; + trimha = ha; + trimhb = hb; + } + + V[k] = c; + T[k] = PATH_INT; + M[k] = PATH_LEN; + HA[k] = ha; + HB[k] = hb; + NA[k] = na; + NB[k] = nb; + + a -= 1; + } + } + + if (more == 0) + { if (bseq[besty] != 4 && aseq[besta - besty] != 4) + more = 1; + if (hgh >= aclip) + { hgh = aclip-1; + if (morem <= M[aclip]) + { morem = M[aclip]; + morea = V[aclip]; + morey = (morea - aclip)/2; + moreha = HA[aclip]; + morehb = HB[aclip]; + } + } + if (low <= bclip) + { low = bclip+1; + if (morem <= M[bclip]) + { morem = M[bclip]; + morea = V[bclip]; + morey = (morea - bclip)/2; + moreha = HA[bclip]; + morehb = HB[bclip]; + } + } + aclip = INT32_MAX; + bclip = -INT32_MAX; + } + +#ifdef DEBUG_WAVE + printf("\nFORWARD WAVE:\n"); + print_wave(V,M,low,hgh,besta); +#endif + + /* Compute successive waves until no furthest reaching points remain */ + + while (more && lasta >= besta - TRIM_MLAG) + { int k, n; + int ua, ub; + BVEC t; + int am, ac, ap; + char *a; + + low -= 1; + hgh += 1; + + if (low <= vmin || hgh >= vmax) + { int span, wing; + int64 move; + int64 vd, md, had, hbd, nad, nbd, td; + + span = (hgh-low)+1; + if (.8*vlen < span) + { if (enlarge_vector(work,vlen*VectorEl)) + EXIT(1); + + move = ((void *) _V) - work->vector; + vlen = work->vecmax/VectorEl; + + _V = (int *) work->vector; + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + } + else + move = 0; + + wing = (vlen - span)/2; + + vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); + md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); + had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); + hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); + nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); + nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); + td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); + + if 
(vd < 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + if (md < 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (had < 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (hbd < 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (nad < 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (nbd < 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (td < 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + + if (td > 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + if (nbd > 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (nad > 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (hbd > 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (had > 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (md > 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (vd > 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + + vmin = low-wing; + vmax = hgh+wing; + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; + } + + if (low >= minp) + { NA[low] = NA[low+1]; + NB[low] = NB[low+1]; + V[low] = -1; + } + else + low += 1; + + if (hgh <= maxp) + { NA[hgh] = NA[hgh-1]; + NB[hgh] = NB[hgh-1]; + V[hgh] = am = -1; + } + else + am = V[--hgh]; + + dif += 1; + + ac = V[hgh+1] = V[low-1] = -1; + a = aseq + hgh; + t = PATH_INT; + n = PATH_LEN; + ua = ub = -1; + for (k = hgh; k >= low; k--) + { int y, m; + int ha, hb; + int c, d; + BVEC b; + Pebble *pb; + + ap = ac; + ac = am; + am = V[d = k-1]; + + if (ac < am) + if (am < ap) + { c = ap+1; + m = n; + b = t; + ha = ua; + hb = ub; + } + else + { c = am+1; + m = M[d]; + b = T[d]; + ha = HA[d]; + hb = HB[d]; + } + else + if (ac < ap) + { c = ap+1; + m = n; + b = t; + 
ha = ua; + hb = ub; + } + else + { c = ac+2; + m = M[k]; + b = T[k]; + ha = HA[k]; + hb = HB[k]; + } + + if ((b & PATH_TOP) != 0) + m -= 1; + b <<= 1; + + y = (c-k) >> 1; + while (1) + { c = bseq[y]; + if (c == 4) + { more = 0; + if (bclip < k) + bclip = k; + break; + } + d = a[y]; + if (c != d) + { if (d == 4) + { more = 0; + aclip = k; + } + break; + } + y += 1; + if ((b & PATH_TOP) == 0) + m += 1; + b = (b << 1) | 1; + } + c = (y << 1) + k; + + while (y+k >= NA[k]) + { if (cells[ha].mark < NA[k]) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), + "Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = ha; + pb->diag = k; + pb->diff = dif; + pb->mark = NA[k]; + ha = avail++; + } + NA[k] += TRACE_SPACE; + } + + while (y >= NB[k]) + { if (cells[hb].mark < NB[k]) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), + "Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = hb; + pb->diag = k; + pb->diff = dif; + pb->mark = NB[k]; + hb = avail++; + } + NB[k] += TRACE_SPACE; + } + + if (c > besta) + { besta = c; + besty = y; + if (m >= PATH_AVE) + { lasta = c; + if (TABLE[b & TRIM_MASK] >= 0) + if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) + { trima = c; + trimy = y; + trimd = dif; + trimha = ha; + trimhb = hb; + } + } + } + + t = T[k]; + n = M[k]; + ua = HA[k]; + ub = HB[k]; + V[k] = c; + T[k] = b; + M[k] = m; + HA[k] = ha; + HB[k] = hb; + + a -= 1; + } + + if (more == 0) + { if (bseq[besty] != 4 && aseq[besta-besty] != 4) + more = 1; + 
if (hgh >= aclip) + { hgh = aclip-1; + if (morem <= M[aclip]) + { morem = M[aclip]; + morea = V[aclip]; + morey = (morea - aclip)/2; + mored = dif; + moreha = HA[aclip]; + morehb = HB[aclip]; + } + } + if (low <= bclip) + { low = bclip+1; + if (morem <= M[bclip]) + { morem = M[bclip]; + morea = V[bclip]; + morey = (morea - bclip)/2; + mored = dif; + moreha = HA[bclip]; + morehb = HB[bclip]; + } + } + aclip = INT32_MAX; + bclip = -INT32_MAX; + } + + n = besta - WAVE_LAG; + while (hgh >= low) + if (V[hgh] < n) + hgh -= 1; + else + { while (V[low] < n) + low += 1; + break; + } + +#ifdef WAVE_STATS + k = (hgh-low)+1; + if (k > MAX) + MAX = k; + TOT += k; + NWV += 1; +#endif + +#ifdef DEBUG_WAVE + print_wave(V,M,low,hgh,besta); +#endif + } + + { uint16 *atrace = (uint16 *) apath->trace; + uint16 *btrace = (uint16 *) bpath->trace; + int atlen, btlen; + int trimx; + int a, b, k, h; + int d, e; + + if (morem >= 0 && REACH) + { trimx = morea-morey; + trimy = morey; + trimd = mored; + trimha = moreha; + trimhb = morehb; + } + else + trimx = trima-trimy; + + atlen = btlen = 0; + + a = -1; + for (h = trimha; h >= 0; h = b) + { b = cells[h].ptr; + cells[h].ptr = a; + a = h; + } + h = a; + + k = cells[h].diag; + b = (mida-k)/2; + e = 0; +#ifdef SHOW_TRAIL + printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout); +#endif + for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) + { k = cells[h].diag; + a = cells[h].mark - k; + d = cells[h].diff; + atrace[atlen++] = (uint16) (d-e); + atrace[atlen++] = (uint16) (a-b); +#ifdef SHOW_TRAIL + printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); +#endif + b = a; + e = d; + } + if (b+k != trimx) + { atrace[atlen++] = (uint16) (trimd-e); + atrace[atlen++] = (uint16) (trimy-b); +#ifdef SHOW_TRAIL + printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); +#endif + } + else if (b != trimy) + { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); + atrace[atlen-2] = (uint16) (atrace[atlen-2] + 
(trimd-e)); +#ifdef SHOW_TRAIL + printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); +#endif + } + + a = -1; + for (h = trimhb; h >= 0; h = b) + { b = cells[h].ptr; + cells[h].ptr = a; + a = h; + } + h = a; + + k = cells[h].diag; + b = (mida+k)/2; + e = 0; + low = k; +#ifdef SHOW_TRAIL + printf(" B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout); +#endif + for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) + { k = cells[h].diag; + a = cells[h].mark + k; + d = cells[h].diff; + btrace[btlen++] = (uint16) (d-e); + btrace[btlen++] = (uint16) (a-b); +#ifdef SHOW_TRAIL + printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,a-b); fflush(stdout); +#endif + b = a; + e = d; + } + if (b-k != trimy) + { btrace[btlen++] = (uint16) (trimd-e); + btrace[btlen++] = (uint16) (trimx-b); +#ifdef SHOW_TRAIL + printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout); +#endif + } + else if (b != trimx) + { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b)); + btrace[btlen-2] = (uint16) (btrace[btlen-2] + (trimd-e)); +#ifdef SHOW_TRAIL + printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout); +#endif + } + + apath->aepos = trimx; + apath->bepos = trimy; + apath->diffs = trimd; + apath->tlen = atlen; + bpath->tlen = btlen; + } + + *mind = low; + return (0); +} + +/*** Reverse Wave ***/ + +static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, + int mind, int maxd, int mida, int minp, int maxp, int aoff, int boff) +{ char *aseq = align->aseq - 1; + char *bseq = align->bseq - 1; + Path *apath = align->path; + + int hgh, low, dif; + int vlen, vmin, vmax; + int *V, *M; + int *_V, *_M; + BVEC *T; + BVEC *_T; + + int *HA, *HB; + int *_HA, *_HB; + int *NA, *NB; + int *_NA, *_NB; + Pebble *cells; + int avail, cmax; + + int TRACE_SPACE = spec->trace_space; + int PATH_AVE = spec->ave_path; + int REACH = spec->reach; + int16 *SCORE = spec->score; + int16 *TABLE = spec->table; + + int 
besta, besty; + int trima, trimy, trimd; + int trimha, trimhb; + int morea, morey, mored; + int moreha, morehb; + int more, morem, lasta; + int aclip, bclip; + + hgh = maxd; + low = mind; + dif = 0; + + { int span, wing; + + span = (hgh-low)+1; + vlen = work->vecmax/VectorEl; + wing = (vlen - span)/2; + vmin = low - wing; + vmax = hgh + wing; + + _V = ((int *) work->vector); + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; + + cells = (Pebble *) (work->cells); + cmax = work->celmax; + avail = 0; + } + + more = 1; + aclip = -INT32_MAX; + bclip = INT32_MAX; + + besta = trima = morea = lasta = mida; + besty = trimy = morey = (mida-hgh) >> 1; + trimd = mored = 0; + trimha = moreha = 0; + trimhb = morehb = 1; + morem = -1; + + { int k; + char *a; + + a = aseq + low; + for (k = low; k <= hgh; k++) + { int y, c, d; + int ha, hb; + int na, nb; + Pebble *pb; + + y = (mida-k) >> 1; + + if (avail >= cmax-1) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } + + na = (((y+k)+(TRACE_SPACE-aoff)-1)/TRACE_SPACE-1)*TRACE_SPACE+aoff; +#ifdef SHOW_TPS + printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = -1; + pb->diag = k; + pb->diff = 0; + pb->mark = y+k; + ha = avail++; + + nb = ((y+(TRACE_SPACE-boff)-1)/TRACE_SPACE-1)*TRACE_SPACE+boff; +#ifdef SHOW_TPS + printf(" B %d: -1,%d,0,%d\n",avail,k,nb+TRACE_SPACE); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = -1; + pb->diag = k; + pb->diff = 0; + pb->mark = y; + hb = avail++; + + while (1) + { c = bseq[y]; + if (c == 4) + { more = 0; + if (bclip > k) + bclip = k; + break; + } + d = a[y]; + if (c != d) + { if (d == 4) 
+ { more = 0; + aclip = k; + } + break; + } + y -= 1; + } + c = (y << 1) + k; + + while (y+k <= na) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = ha; + pb->diag = k; + pb->diff = 0; + pb->mark = na; + ha = avail++; + na -= TRACE_SPACE; + } + while (y <= nb) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = hb; + pb->diag = k; + pb->diff = 0; + pb->mark = nb; + hb = avail++; + nb -= TRACE_SPACE; + } + + if (c < besta) + { besta = trima = lasta = c; + besty = trimy = y; + trimha = ha; + trimhb = hb; + } + + V[k] = c; + T[k] = PATH_INT; + M[k] = PATH_LEN; + HA[k] = ha; + HB[k] = hb; + NA[k] = na; + NB[k] = nb; + + a += 1; + } + } + + if (more == 0) + { if (bseq[besty] != 4 && aseq[besta - besty] != 4) + more = 1; + if (low <= aclip) + { low = aclip+1; + if (morem <= M[aclip]) + { morem = M[aclip]; + morea = V[aclip]; + morey = (morea - aclip)/2; + moreha = HA[aclip]; + morehb = HB[aclip]; + } + } + if (hgh >= bclip) + { hgh = bclip-1; + if (morem <= M[bclip]) + { morem = M[bclip]; + morea = V[bclip]; + morey = (morea - bclip)/2; + moreha = HA[bclip]; + morehb = HB[bclip]; + } + } + aclip = -INT32_MAX; + bclip = INT32_MAX; + } + +#ifdef DEBUG_WAVE + printf("\nREVERSE WAVE:\n"); + print_wave(V,M,low,hgh,besta); +#endif + + while (more && lasta <= besta + TRIM_MLAG) + { int k, n; + int ua, ub; + BVEC t; + int am, ac, ap; + char *a; + + low -= 1; + hgh += 1; + + 
if (low <= vmin || hgh >= vmax) + { int span, wing; + int64 move, vd, md, had, hbd, nad, nbd, td; + + span = (hgh-low)+1; + if (.8*vlen < span) + { if (enlarge_vector(work,vlen*VectorEl)) + EXIT(1); + + move = ((void *) _V) - work->vector; + vlen = work->vecmax/VectorEl; + + _V = (int *) work->vector; + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + } + else + move = 0; + + wing = (vlen - span)/2; + + vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); + md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); + had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); + hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); + nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); + nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); + td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); + + if (vd < 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + if (md < 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (had < 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (hbd < 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (nad < 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (nbd < 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (td < 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + + if (td > 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + if (nbd > 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (nad > 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (hbd > 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (had > 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (md > 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (vd > 0) + 
memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + + vmin = low-wing; + vmax = hgh+wing; + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; + } + + if (low >= minp) + { NA[low] = NA[low+1]; + NB[low] = NB[low+1]; + V[low] = ap = INT32_MAX; + } + else + ap = V[++low]; + + if (hgh <= maxp) + { NA[hgh] = NA[hgh-1]; + NB[hgh] = NB[hgh-1]; + V[hgh] = INT32_MAX; + } + else + hgh -= 1; + + dif += 1; + + ac = V[hgh+1] = V[low-1] = INT32_MAX; + a = aseq + low; + t = PATH_INT; + n = PATH_LEN; + ua = ub = -1; + for (k = low; k <= hgh; k++) + { int y, m; + int ha, hb; + int c, d; + BVEC b; + Pebble *pb; + + am = ac; + ac = ap; + ap = V[d = k+1]; + + if (ac > ap) + if (ap > am) + { c = am-1; + m = n; + b = t; + ha = ua; + hb = ub; + } + else + { c = ap-1; + m = M[d]; + b = T[d]; + ha = HA[d]; + hb = HB[d]; + } + else + if (ac > am) + { c = am-1; + m = n; + b = t; + ha = ua; + hb = ub; + } + else + { c = ac-2; + m = M[k]; + b = T[k]; + ha = HA[k]; + hb = HB[k]; + } + + if ((b & PATH_TOP) != 0) + m -= 1; + b <<= 1; + + y = (c-k) >> 1; + while (1) + { c = bseq[y]; + if (c == 4) + { more = 0; + if (bclip > k) + bclip = k; + break; + } + d = a[y]; + if (c != d) + { if (d == 4) + { more = 0; + aclip = k; + } + break; + } + y -= 1; + if ((b & PATH_TOP) == 0) + m += 1; + b = (b << 1) | 1; + } + c = (y << 1) + k; + + while (y+k <= NA[k]) + { if (cells[ha].mark > NA[k]) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), + "Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = ha; + pb->diag = k; + pb->diff = dif; + pb->mark = NA[k]; + ha = avail++; + } + NA[k] -= TRACE_SPACE; + } + while (y <= NB[k]) + { if (cells[hb].mark > NB[k]) + { if (avail >= 
cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), + "Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = hb; + pb->diag = k; + pb->diff = dif; + pb->mark = NB[k]; + hb = avail++; + } + NB[k] -= TRACE_SPACE; + } + + if (c < besta) + { besta = c; + besty = y; + if (m >= PATH_AVE) + { lasta = c; + if (TABLE[b & TRIM_MASK] >= 0) + if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) + { trima = c; + trimy = y; + trimd = dif; + trimha = ha; + trimhb = hb; + } + } + } + + t = T[k]; + n = M[k]; + ua = HA[k]; + ub = HB[k]; + V[k] = c; + T[k] = b; + M[k] = m; + HA[k] = ha; + HB[k] = hb; + + a += 1; + } + + if (more == 0) + { if (bseq[besty] != 4 && aseq[besta - besty] != 4) + more = 1; + if (low <= aclip) + { low = aclip+1; + if (morem <= M[aclip]) + { morem = M[aclip]; + morea = V[aclip]; + morey = (morea - aclip)/2; + mored = dif; + moreha = HA[aclip]; + morehb = HB[aclip]; + } + } + if (hgh >= bclip) + { hgh = bclip-1; + if (morem <= M[bclip]) + { morem = M[bclip]; + morea = V[bclip]; + morey = (morea - bclip)/2; + mored = dif; + moreha = HA[bclip]; + morehb = HB[bclip]; + } + } + aclip = -INT32_MAX; + bclip = INT32_MAX; + } + + n = besta + WAVE_LAG; + while (hgh >= low) + if (V[hgh] > n) + hgh -= 1; + else + { while (V[low] > n) + low += 1; + break; + } + +#ifdef WAVE_STATS + k = (hgh-low)+1; + if (k > MAX) + MAX = k; + TOT += k; + NWV += 1; +#endif + +#ifdef DEBUG_WAVE + print_wave(V,M,low,hgh,besta); +#endif + } + + { uint16 *atrace = (uint16 *) apath->trace; + uint16 *btrace = (uint16 *) bpath->trace; + int atlen, btlen; + int trimx; + int a, b, k, h; + int d, e; + + if (morem >= 0 && REACH) + { trimx = morea-morey; + trimy = morey; + trimd = mored; + trimha = moreha; + trimhb = morehb; + } + else + trimx = 
trima-trimy; + + atlen = btlen = 0; + + a = -1; + for (h = trimha; h >= 0; h = b) + { b = cells[h].ptr; + cells[h].ptr = a; + a = h; + } + h = a; + + k = cells[h].diag; + b = cells[h].mark - k; + e = 0; +#ifdef SHOW_TRAIL + printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); +#endif + if ((b+k)%TRACE_SPACE != aoff) + { h = cells[h].ptr; + if (h < 0) + { a = trimy; + d = trimd; + } + else + { k = cells[h].diag; + a = cells[h].mark - k; + d = cells[h].diff; + } +#ifdef SHOW_TRAIL + printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); +#endif + if (apath->tlen == 0) + { atrace[--atlen] = (uint16) (b-a); + atrace[--atlen] = (uint16) (d-e); + } + else + { atrace[1] = (uint16) (atrace[1] + (b-a)); + atrace[0] = (uint16) (atrace[0] + (d-e)); + } + b = a; + e = d; + } + if (h >= 0) + { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) + { k = cells[h].diag; + a = cells[h].mark - k; + atrace[--atlen] = (uint16) (b-a); + d = cells[h].diff; + atrace[--atlen] = (uint16) (d-e); +#ifdef SHOW_TRAIL + printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); +#endif + b = a; + e = d; + } + if (b+k != trimx) + { atrace[--atlen] = (uint16) (b-trimy); + atrace[--atlen] = (uint16) (trimd-e); +#ifdef SHOW_TRAIL + printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); +#endif + } + else if (b != trimy) + { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); + atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); +#ifdef SHOW_TRAIL + printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); +#endif + } + } + + a = -1; + for (h = trimhb; h >= 0; h = b) + { b = cells[h].ptr; + cells[h].ptr = a; + a = h; + } + h = a; + + k = cells[h].diag; + b = cells[h].mark + k; + e = 0; +#ifdef SHOW_TRAIL + printf(" B path = (%5d,%5d)\n",b,b-k); fflush(stdout); +#endif + if ((b-k)%TRACE_SPACE != boff) + { h = cells[h].ptr; + if (h < 0) + { a = trimx; + d = trimd; + } + else + { k = cells[h].diag; + a = cells[h].mark 
+ k; + d = cells[h].diff; + } +#ifdef SHOW_TRAIL + printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); +#endif + if (bpath->tlen == 0) + { btrace[--btlen] = (uint16) (b-a); + btrace[--btlen] = (uint16) (b-a); + } + else + { btrace[1] = (uint16) (btrace[1] + (b-a)); + btrace[0] = (uint16) (btrace[0] + (d-e)); + } + b = a; + e = d; + } + + if (h >= 0) + { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) + { k = cells[h].diag; + a = cells[h].mark + k; + btrace[--btlen] = (uint16) (b-a); + d = cells[h].diff; + btrace[--btlen] = (uint16) (d-e); +#ifdef SHOW_TRAIL + printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); +#endif + b = a; + e = d; + } + if (b-k != trimy) + { btrace[--btlen] = (uint16) (b-trimx); + btrace[--btlen] = (uint16) (trimd-e); +#ifdef SHOW_TRAIL + printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); +#endif + } + else if (b != trimx) + { btrace[btlen+1] = (uint16) (btrace[btlen+1] + (b-trimx)); + btrace[btlen] = (uint16) (btrace[btlen] + (trimd-e)); +#ifdef SHOW_TRAIL + printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); +#endif + } + } + + apath->abpos = trimx; + apath->bbpos = trimy; + apath->diffs = apath->diffs + trimd; + apath->tlen = apath->tlen - atlen; + apath->trace = atrace + atlen; + bpath->tlen = bpath->tlen - btlen; + bpath->trace = btrace + btlen; + } + + return (0); +} + + +/* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) + See associated .h file for the precise definition of the interface. 
+*/ + +/* Local_Alignment: grow the wave vector and trace-point buffer in work as needed, place apath->trace and bpath->trace inside work->points, turn lbord/hbord into the diagonal limits minp/maxp, run forward_wave followed by reverse_wave, and then fill bpath's coordinates from apath (reversing one of the two traces when a complement flag is set). Returns bpath, or EXIT(NULL) if a helper reports failure. */ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, + int low, int hgh, int anti, int lbord, int hbord) +{ _Work_Data *work = ( _Work_Data *) ework; + _Align_Spec *spec = (_Align_Spec *) espec; + + Path *apath, *bpath; + int aoff, boff; + int minp, maxp; + int selfie; + + { int alen, blen; + int maxtp, wsize; + + alen = align->alen; + blen = align->blen; + + if (hgh-low >= 7500) + wsize = VectorEl*(hgh-low+1); + else + wsize = VectorEl*10000; + if (wsize >= work->vecmax) + if (enlarge_vector(work,wsize)) + EXIT(NULL); + + if (alen < blen) + maxtp = 2*(blen/spec->trace_space+2); + else + maxtp = 2*(alen/spec->trace_space+2); + wsize = 4*maxtp*sizeof(uint16) + sizeof(Path); + if (wsize > work->pntmax) + if (enlarge_points(work,wsize)) + EXIT(NULL); + + apath = align->path; + bpath = (Path *) work->points; + + apath->trace = ((uint16 *) (bpath+1)) + maxtp; + bpath->trace = ((uint16 *) apath->trace) + 2*maxtp; + } + +#ifdef DEBUG_PASSES + printf("\n"); +#endif + + /* A negative border leaves that side of the band unbounded, except that when aseq and bseq are the same buffer the band is kept off diagonal 0 (minp = 1 if low >= 0, maxp = -1 if hgh <= 0). */ selfie = (align->aseq == align->bseq); + + if (lbord < 0) + { if (selfie && low >= 0) + minp = 1; + else + minp = -INT32_MAX; + } + else + minp = low-lbord; + if (hbord < 0) + { if (selfie && hgh <= 0) + maxp = -1; + else + maxp = INT32_MAX; + } + else + maxp = hgh+hbord; + + if (ACOMP(align->flags)) + { aoff = align->alen % spec->trace_space; + boff = 0; + } + else if (COMP(align->flags)) + { aoff = 0; + boff = align->blen % spec->trace_space; + } + else + { aoff = 0; + boff = 0; + } + + if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp,aoff,boff)) + EXIT(NULL); + +#ifdef DEBUG_PASSES + printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n", + (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low, + apath->aepos,apath->bepos,apath->diffs); +#endif + + if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp,aoff,boff)) + EXIT(NULL); + +#ifdef DEBUG_PASSES + printf("R1 (%d,%d) => (%d,%d) %d\n", + (anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs); +#endif + + 
bpath->diffs = apath->diffs; + if (ACOMP(align->flags)) + { uint16 *trace = (uint16 *) apath->trace; + uint16 p; + int i, j; + + bpath->aepos = apath->bepos; + bpath->bepos = apath->aepos; + bpath->abpos = apath->bbpos; + bpath->bbpos = apath->abpos; + + apath->abpos = align->alen - bpath->bepos; + apath->bbpos = align->blen - bpath->aepos; + apath->aepos = align->alen - bpath->bbpos; + apath->bepos = align->blen - bpath->abpos; + i = apath->tlen-2; + j = 0; + while (j < i) + { p = trace[i]; + trace[i] = trace[j]; + trace[j] = p; + p = trace[i+1]; + trace[i+1] = trace[j+1]; + trace[j+1] = p; + i -= 2; + j += 2; + } + } + else if (COMP(align->flags)) + { uint16 *trace = (uint16 *) bpath->trace; + uint16 p; + int i, j; + + bpath->abpos = align->blen - apath->bepos; + bpath->bbpos = align->alen - apath->aepos; + bpath->aepos = align->blen - apath->bbpos; + bpath->bepos = align->alen - apath->abpos; + i = bpath->tlen-2; + j = 0; + while (j < i) + { p = trace[i]; + trace[i] = trace[j]; + trace[j] = p; + p = trace[i+1]; + trace[i+1] = trace[j+1]; + trace[j+1] = p; + i -= 2; + j += 2; + } + } + else + { bpath->aepos = apath->bepos; + bpath->bepos = apath->aepos; + bpath->abpos = apath->bbpos; + bpath->bbpos = apath->abpos; + } + +#ifdef DEBUG_POINTS + { uint16 *trace = (uint16 *) apath->trace; + int a, h; + + printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos); + printf(" %c\n",((COMP(align->flags) || ACOMP(align->flags)) ? 'c' : 'n')); + a = apath->bbpos; + for (h = 1; h < apath->tlen; h += 2) + { int dif = trace[h-1]; + int del = trace[h]; + a += del; + printf(" %d / %d (%d)\n",dif,del,a); + } + } + + { uint16 *trace = (uint16 *) bpath->trace; + int a, h; + + printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos); + printf(" %c [%d,%d]\n",((COMP(align->flags) || ACOMP(align->flags)) ? 
'c' : 'n'), + align->blen,align->alen); + a = bpath->bbpos; + for (h = 1; h < bpath->tlen; h += 2) + { int dif = trace[h-1]; + int del = trace[h]; + a += del; + printf(" %d / %d (%d)\n",dif,del,a); + } + } +#endif + + return (bpath); +} + + +/****************************************************************************************\ +* * +* EXTENSION VERSION OF LOCAL ALIGNMENT * +* * +\****************************************************************************************/ + +/* Bytes of wave storage per diagonal for the extension routines: four int arrays (V, M, HA, NA) plus one BVEC array (T), matching the layout carved out of work->vector below. */ static int VectorEn = 4*sizeof(int) + sizeof(BVEC); + +/* forward_extend: wave extension that starts on the single diagonal midd at anti-diagonal mida and advances toward larger anti-diagonals, widening the band by one diagonal per wave only while it stays inside minp..maxp. On return apath->aepos, bepos, diffs and tlen are set and tlen trace values have been written starting at apath->trace[0]. Returns 0, or EXIT(1) if a buffer cannot be enlarged. */ static int forward_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, + int midd, int mida, int minp, int maxp) +{ char *aseq = align->aseq; + char *bseq = align->bseq; + Path *apath = align->path; + + int hgh, low, dif; + int vlen, vmin, vmax; + int *V, *M; + int *_V, *_M; + BVEC *T; + BVEC *_T; + + int *HA, *NA; + int *_HA, *_NA; + Pebble *cells; + int avail, cmax; + + int TRACE_SPACE = spec->trace_space; + int PATH_AVE = spec->ave_path; + int16 *SCORE = spec->score; + int16 *TABLE = spec->table; + + int besta, besty; + int trima, trimy, trimd; + int trimha; + int morea, morey, mored; + int moreha; + int more, morem, lasta; + int aclip, bclip; + + hgh = midd; + low = midd; + dif = 0; + + { int span, wing; + + span = (hgh-low)+1; + vlen = work->vecmax/VectorEn; + wing = (vlen - span)/2; + vmin = low - wing; + vmax = hgh + wing; + + _V = ((int *) work->vector); + _M = _V + vlen; + _HA = _M + vlen; + _NA = _HA + vlen; + _T = ((BVEC *) (_NA + vlen)); + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + NA = _NA-vmin; + T = _T-vmin; + + cells = (Pebble *) (work->cells); + cmax = work->celmax; + avail = 0; + } + + /* Compute 0-wave starting from mid-line */ + + more = 1; + aclip = INT32_MAX; + bclip = -INT32_MAX; + + besta = trima = morea = lasta = mida; + besty = trimy = morey = (mida-hgh) >> 1; + trimd = mored = 0; + trimha = moreha = 0; + morem = -1; + + { int k; + char *a; + + a = aseq + hgh; + for (k = hgh; k >= low; k--) + { int 
y, c, d; + int ha, na; + Pebble *pb; + + y = (mida-k) >> 1; + + if (avail >= cmax-1) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } + + na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; +#ifdef SHOW_TPS + printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = -1; + pb->diag = k; + pb->diff = 0; + pb->mark = na; + ha = avail++; + na += TRACE_SPACE; + + while (1) + { c = bseq[y]; + if (c == 4) + { more = 0; + if (bclip < k) + bclip = k; + break; + } + d = a[y]; + if (c != d) + { if (d == 4) + { more = 0; + aclip = k; + } + break; + } + y += 1; + } + c = (y << 1) + k; + + while (y+k >= na) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = ha; + pb->diag = k; + pb->diff = 0; + pb->mark = na; + ha = avail++; + na += TRACE_SPACE; + } + + if (c > besta) + { besta = trima = lasta = c; + besty = trimy = y; + trimha = ha; + } + + V[k] = c; + T[k] = PATH_INT; + M[k] = PATH_LEN; + HA[k] = ha; + NA[k] = na; + + a -= 1; + } + } + + if (more == 0) + { if (bseq[besty] != 4 && aseq[besta - besty] != 4) + more = 1; + if (hgh >= aclip) + { hgh = aclip-1; + if (morem <= M[aclip]) + { morem = M[aclip]; + morea = V[aclip]; + morey = (morea - aclip)/2; + moreha = HA[aclip]; + } + } + if (low <= bclip) + { low = bclip+1; + if (morem <= M[bclip]) + { morem = M[bclip]; + morea = V[bclip]; + morey = (morea - bclip)/2; + moreha = HA[bclip]; + } + } + aclip = INT32_MAX; + bclip = -INT32_MAX; + } + +#ifdef DEBUG_WAVE + printf("\nFORWARD WAVE:\n"); + print_wave(V,M,low,hgh,besta); +#endif + + /* 
Compute successive waves until no furthest reaching points remain */ + + while (more && lasta >= besta - TRIM_MLAG) + { int k, n; + int ua; + BVEC t; + int am, ac, ap; + char *a; + + if (low <= vmin || hgh >= vmax) + { int span, wing; + int64 move; + int64 vd, md, had, nad, td; + + span = (hgh-low)+1; + if (.8*vlen < span) + { if (enlarge_vector(work,vlen*VectorEn)) + EXIT(1); + + move = ((void *) _V) - work->vector; + vlen = work->vecmax/VectorEn; + + _V = (int *) work->vector; + _M = _V + vlen; + _HA = _M + vlen; + _NA = _HA + vlen; + _T = ((BVEC *) (_NA + vlen)); + } + else + move = 0; + + wing = (vlen - span)/2; + + vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); + md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); + had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); + nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); + td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); + + if (vd < 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + if (md < 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (had < 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (nad < 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (td < 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + + if (td > 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + if (nad > 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (had > 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (md > 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (vd > 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + + vmin = low-wing; + vmax = hgh+wing; + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + NA = _NA-vmin; + T = _T-vmin; + } + + if (low > minp) + { low -= 1; + NA[low] = NA[low+1]; + V[low] = -1; + } + if (hgh < maxp) + { hgh += 1; + NA[hgh] = 
NA[hgh-1]; + V[hgh] = am = -1; + } + else + am = V[hgh]; + dif += 1; + + ac = V[hgh+1] = V[low-1] = -1; + a = aseq + hgh; + t = PATH_INT; + n = PATH_LEN; + ua = -1; + for (k = hgh; k >= low; k--) + { int y, m; + int ha; + int c, d; + BVEC b; + Pebble *pb; + + ap = ac; + ac = am; + am = V[d = k-1]; + + if (ac < am) + if (am < ap) + { c = ap+1; + m = n; + b = t; + ha = ua; + } + else + { c = am+1; + m = M[d]; + b = T[d]; + ha = HA[d]; + } + else + if (ac < ap) + { c = ap+1; + m = n; + b = t; + ha = ua; + } + else + { c = ac+2; + m = M[k]; + b = T[k]; + ha = HA[k]; + } + + if ((b & PATH_TOP) != 0) + m -= 1; + b <<= 1; + + y = (c-k) >> 1; + while (1) + { c = bseq[y]; + if (c == 4) + { more = 0; + if (bclip < k) + bclip = k; + break; + } + d = a[y]; + if (c != d) + { if (d == 4) + { more = 0; + aclip = k; + } + break; + } + y += 1; + if ((b & PATH_TOP) == 0) + m += 1; + b = (b << 1) | 1; + } + c = (y << 1) + k; + + while (y+k >= NA[k]) + { if (cells[ha].mark < NA[k]) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), + "Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = ha; + pb->diag = k; + pb->diff = dif; + pb->mark = NA[k]; + ha = avail++; + } + NA[k] += TRACE_SPACE; + } + + if (c > besta) + { besta = c; + besty = y; + if (m >= PATH_AVE) + { lasta = c; + if (TABLE[b & TRIM_MASK] >= 0) + if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) + { trima = c; + trimy = y; + trimd = dif; + trimha = ha; + } + } + } + + t = T[k]; + n = M[k]; + ua = HA[k]; + V[k] = c; + T[k] = b; + M[k] = m; + HA[k] = ha; + + a -= 1; + } + + if (more == 0) + { if (bseq[besty] != 4 && aseq[besta-besty] != 4) + more = 1; + if (hgh >= aclip) + { hgh = aclip-1; + if (morem <= M[aclip]) + { morem = M[aclip]; + morea = 
V[aclip]; + morey = (morea - aclip)/2; + mored = dif; + moreha = HA[aclip]; + } + } + if (low <= bclip) + { low = bclip+1; + if (morem <= M[bclip]) + { morem = M[bclip]; + morea = V[bclip]; + morey = (morea - bclip)/2; + mored = dif; + moreha = HA[bclip]; + } + } + aclip = INT32_MAX; + bclip = -INT32_MAX; + } + + n = besta - WAVE_LAG; + while (hgh >= low) + if (V[hgh] < n) + hgh -= 1; + else + { while (V[low] < n) + low += 1; + break; + } + +#ifdef WAVE_STATS + k = (hgh-low)+1; + if (k > MAX) + MAX = k; + TOT += k; + NWV += 1; +#endif + +#ifdef DEBUG_WAVE + print_wave(V,M,low,hgh,besta); +#endif + } + + /* Trace-out: reverse the ptr links of the pebble chain ending at trimha so it can be walked from its start, then append one (diff delta, position delta) pair per pebble to atrace. */ { uint16 *atrace = (uint16 *) apath->trace; + int atlen; + int trimx; + int a, b, k, h; + int d, e; + + if (morem >= 0) + { trimx = morea-morey; + trimy = morey; + trimd = mored; + trimha = moreha; + } + else + trimx = trima-trimy; + + atlen = 0; + + a = -1; + for (h = trimha; h >= 0; h = b) + { b = cells[h].ptr; + cells[h].ptr = a; + a = h; + } + h = a; + + k = cells[h].diag; + b = (mida-k)/2; + e = 0; +#ifdef SHOW_TRAIL + printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout); +#endif + for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) + { k = cells[h].diag; + a = cells[h].mark - k; + d = cells[h].diff; + atrace[atlen++] = (uint16) (d-e); + atrace[atlen++] = (uint16) (a-b); +#ifdef SHOW_TRAIL + printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); +#endif + b = a; + e = d; + } + if (b+k != trimx) + { atrace[atlen++] = (uint16) (trimd-e); + atrace[atlen++] = (uint16) (trimy-b); +#ifdef SHOW_TRAIL + printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); +#endif + } + else if (b != trimy) + { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); + atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e)); +#ifdef SHOW_TRAIL + printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); +#endif + } + + apath->aepos = trimx; + apath->bepos = trimy; + apath->diffs = trimd; + apath->tlen = atlen; + } + 
+ return (0); +} + +/* reverse_extend: mirror image of forward_extend that advances toward smaller anti-diagonals; aseq and bseq are offset by -1 so index y reads the base stored at position y-1. Sets apath->abpos, bbpos, diffs and tlen, and moves apath->trace down to the first value written (values are stored at negative offsets from the incoming trace pointer). Returns 0, or EXIT(1) if a buffer cannot be enlarged. */ static int reverse_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, + int midd, int mida, int minp, int maxp) +{ char *aseq = align->aseq - 1; + char *bseq = align->bseq - 1; + Path *apath = align->path; + + int hgh, low, dif; + int vlen, vmin, vmax; + int *V, *M; + int *_V, *_M; + BVEC *T; + BVEC *_T; + + int *HA, *NA; + int *_HA, *_NA; + Pebble *cells; + int avail, cmax; + + int TRACE_SPACE = spec->trace_space; + int PATH_AVE = spec->ave_path; + int16 *SCORE = spec->score; + int16 *TABLE = spec->table; + + int besta, besty; + int trima, trimy, trimd; + int trimha; + int morea, morey, mored; + int moreha; + int more, morem, lasta; + int aclip, bclip; + + hgh = midd; + low = midd; + dif = 0; + + { int span, wing; + + span = (hgh-low)+1; + vlen = work->vecmax/VectorEn; + wing = (vlen - span)/2; + vmin = low - wing; + vmax = hgh + wing; + + _V = ((int *) work->vector); + _M = _V + vlen; + _HA = _M + vlen; + _NA = _HA + vlen; + _T = ((BVEC *) (_NA + vlen)); + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + NA = _NA-vmin; + T = _T-vmin; + + cells = (Pebble *) (work->cells); + cmax = work->celmax; + avail = 0; + } + + more = 1; + aclip = -INT32_MAX; + bclip = INT32_MAX; + + besta = trima = morea = lasta = mida; + besty = trimy = morey = (mida-hgh) >> 1; + trimd = mored = 0; + trimha = moreha = 0; + morem = -1; + + { int k; + char *a; + + a = aseq + low; + for (k = low; k <= hgh; k++) + { int y, c, d; + int ha, na; + Pebble *pb; + + y = (mida-k) >> 1; + + if (avail >= cmax-1) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } + + na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; +#ifdef SHOW_TPS + printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = -1; + pb->diag = k; + pb->diff = 0; + pb->mark = y+k; + ha = 
avail++; + + while (1) + { c = bseq[y]; + if (c == 4) + { more = 0; + if (bclip > k) + bclip = k; + break; + } + d = a[y]; + if (c != d) + { if (d == 4) + { more = 0; + aclip = k; + } + break; + } + y -= 1; + } + c = (y << 1) + k; + + while (y+k <= na) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = ha; + pb->diag = k; + pb->diff = 0; + pb->mark = na; + ha = avail++; + na -= TRACE_SPACE; + } + + if (c < besta) + { besta = trima = lasta = c; + besty = trimy = y; + trimha = ha; + } + + V[k] = c; + T[k] = PATH_INT; + M[k] = PATH_LEN; + HA[k] = ha; + NA[k] = na; + + a += 1; + } + } + + if (more == 0) + { if (bseq[besty] != 4 && aseq[besta - besty] != 4) + more = 1; + if (low <= aclip) + { low = aclip+1; + if (morem <= M[aclip]) + { morem = M[aclip]; + morea = V[aclip]; + morey = (morea - aclip)/2; + moreha = HA[aclip]; + } + } + if (hgh >= bclip) + { hgh = bclip-1; + if (morem <= M[bclip]) + { morem = M[bclip]; + morea = V[bclip]; + morey = (morea - bclip)/2; + moreha = HA[bclip]; + } + } + aclip = -INT32_MAX; + bclip = INT32_MAX; + } + +#ifdef DEBUG_WAVE + printf("\nREVERSE WAVE:\n"); + print_wave(V,M,low,hgh,besta); +#endif + + while (more && lasta <= besta + TRIM_MLAG) + { int k, n; + int ua; + BVEC t; + int am, ac, ap; + char *a; + + if (low <= vmin || hgh >= vmax) + { int span, wing; + int64 move, vd, md, had, nad, td; + + span = (hgh-low)+1; + if (.8*vlen < span) + { if (enlarge_vector(work,vlen*VectorEn)) + EXIT(1); + + move = ((void *) _V) - work->vector; + vlen = work->vecmax/VectorEn; + + _V = (int *) work->vector; + _M = _V + vlen; + _HA = _M + vlen; + _NA = _HA + vlen; + _T = ((BVEC *) (_NA + vlen)); + } + else + move = 0; + + wing = (vlen - span)/2; + 
+ vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); + md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); + had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); + nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); + td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); + + if (vd < 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + if (md < 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (had < 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (nad < 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (td < 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + + if (td > 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + if (nad > 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (had > 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (md > 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (vd > 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + + vmin = low-wing; + vmax = hgh+wing; + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + NA = _NA-vmin; + T = _T-vmin; + } + + if (low > minp) + { low -= 1; + NA[low] = NA[low+1]; + V[low] = ap = INT32_MAX; + } + else + ap = V[low]; + if (hgh < maxp) + { hgh += 1; + NA[hgh] = NA[hgh-1]; + V[hgh] = INT32_MAX; + } + dif += 1; + + ac = V[hgh+1] = V[low-1] = INT32_MAX; + a = aseq + low; + t = PATH_INT; + n = PATH_LEN; + ua = -1; + for (k = low; k <= hgh; k++) + { int y, m; + int ha; + int c, d; + BVEC b; + Pebble *pb; + + am = ac; + ac = ap; + ap = V[d = k+1]; + + if (ac > ap) + if (ap > am) + { c = am-1; + m = n; + b = t; + ha = ua; + } + else + { c = ap-1; + m = M[d]; + b = T[d]; + ha = HA[d]; + } + else + if (ac > am) + { c = am-1; + m = n; + b = t; + ha = ua; + } + else + { c = ac-2; + m = M[k]; + b = T[k]; + ha = HA[k]; + } + + if ((b & PATH_TOP) != 0) + m 
-= 1; + b <<= 1; + + y = (c-k) >> 1; + while (1) + { c = bseq[y]; + if (c == 4) + { more = 0; + if (bclip > k) + bclip = k; + break; + } + d = a[y]; + if (c != d) + { if (d == 4) + { more = 0; + aclip = k; + } + break; + } + y -= 1; + if ((b & PATH_TOP) == 0) + m += 1; + b = (b << 1) | 1; + } + c = (y << 1) + k; + + while (y+k <= NA[k]) + { if (cells[ha].mark > NA[k]) + { if (avail >= cmax) + { cmax = ((int) (avail*1.2)) + 10000; + cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), + "Reallocating trace cells"); + if (cells == NULL) + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; + } +#ifdef SHOW_TPS + printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); +#endif + pb = cells+avail; + pb->ptr = ha; + pb->diag = k; + pb->diff = dif; + pb->mark = NA[k]; + ha = avail++; + } + NA[k] -= TRACE_SPACE; + } + + if (c < besta) + { besta = c; + besty = y; + if (m >= PATH_AVE) + { lasta = c; + if (TABLE[b & TRIM_MASK] >= 0) + if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) + { trima = c; + trimy = y; + trimd = dif; + trimha = ha; + } + } + } + + t = T[k]; + n = M[k]; + ua = HA[k]; + V[k] = c; + T[k] = b; + M[k] = m; + HA[k] = ha; + + a += 1; + } + + if (more == 0) + { if (bseq[besty] != 4 && aseq[besta - besty] != 4) + more = 1; + if (low <= aclip) + { low = aclip+1; + if (morem <= M[aclip]) + { morem = M[aclip]; + morea = V[aclip]; + morey = (morea - aclip)/2; + mored = dif; + moreha = HA[aclip]; + } + } + if (hgh >= bclip) + { hgh = bclip-1; + if (morem <= M[bclip]) + { morem = M[bclip]; + morea = V[bclip]; + morey = (morea - bclip)/2; + mored = dif; + moreha = HA[bclip]; + } + } + aclip = -INT32_MAX; + bclip = INT32_MAX; + } + + n = besta + WAVE_LAG; + while (hgh >= low) + if (V[hgh] > n) + hgh -= 1; + else + { while (V[low] > n) + low += 1; + break; + } + +#ifdef WAVE_STATS + k = (hgh-low)+1; + if (k > MAX) + MAX = k; + TOT += k; + NWV += 1; +#endif + +#ifdef DEBUG_WAVE + print_wave(V,M,low,hgh,besta); +#endif + } + + 
{ uint16 *atrace = (uint16 *) apath->trace; + int atlen; + int trimx; + int a, b, k, h; + int d, e; + + if (morem >= 0) + { trimx = morea-morey; + trimy = morey; + trimd = mored; + trimha = moreha; + } + else + trimx = trima-trimy; + + atlen = 0; + + a = -1; + for (h = trimha; h >= 0; h = b) + { b = cells[h].ptr; + cells[h].ptr = a; + a = h; + } + h = a; + + k = cells[h].diag; + b = cells[h].mark - k; + e = 0; +#ifdef SHOW_TRAIL + printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); +#endif + if ((b+k)%TRACE_SPACE != 0) + { h = cells[h].ptr; + if (h < 0) + { a = trimy; + d = trimd; + } + else + { k = cells[h].diag; + a = cells[h].mark - k; + d = cells[h].diff; + } +#ifdef SHOW_TRAIL + printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); +#endif + atrace[--atlen] = (uint16) (b-a); + atrace[--atlen] = (uint16) (d-e); + b = a; + e = d; + } + if (h >= 0) + { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) + { k = cells[h].diag; + a = cells[h].mark - k; + atrace[--atlen] = (uint16) (b-a); + d = cells[h].diff; + atrace[--atlen] = (uint16) (d-e); +#ifdef SHOW_TRAIL + printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); +#endif + b = a; + e = d; + } + if (b+k != trimx) + { atrace[--atlen] = (uint16) (b-trimy); + atrace[--atlen] = (uint16) (trimd-e); +#ifdef SHOW_TRAIL + printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); +#endif + } + else if (b != trimy) + { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); + atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); +#ifdef SHOW_TRAIL + printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); +#endif + } + } + + apath->abpos = trimx; + apath->bbpos = trimy; + apath->diffs = trimd; + apath->tlen = - atlen; + apath->trace = atrace + atlen; + } + + return (0); +} + + +/* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) + See associated .h file for the precise definition of the interface. 
+*/ + +int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec, + int diag, int anti, int lbord, int hbord, int prefix) +{ _Work_Data *work = ( _Work_Data *) ework; + _Align_Spec *spec = (_Align_Spec *) espec; + + Path *apath; + int minp, maxp; + + { int alen, blen; + int maxtp, wsize; + + alen = align->alen; + blen = align->blen; + + wsize = VectorEn*10000; + if (wsize >= work->vecmax) + if (enlarge_vector(work,wsize)) + EXIT(1); + + if (alen < blen) + maxtp = 2*(blen/spec->trace_space+2); + else + maxtp = 2*(alen/spec->trace_space+2); + wsize = 2*maxtp*sizeof(uint16); + if (wsize > work->pntmax) + if (enlarge_points(work,wsize)) + EXIT(1); + + apath = align->path; + apath->trace = ((uint16 *) work->points) + maxtp; + } + +#ifdef DEBUG_PASSES + printf("\n"); +#endif + + if (lbord < 0) + minp = -INT32_MAX; + else + minp = diag-lbord; + if (hbord < 0) + maxp = INT32_MAX; + else + maxp = diag+hbord; + + if (prefix) + { if (reverse_extend(work,spec,align,diag,anti,minp,maxp)) + EXIT(1); + apath->aepos = (anti+diag)/2; + apath->bepos = (anti-diag)/2; +#ifdef DEBUG_PASSES + printf("E1 (%d,%d) => (%d,%d) %d\n", + (anti+diag)/2,(anti-diag)/2,apath->abpos,apath->bbpos,apath->diffs); +#endif + } + else + { if (forward_extend(work,spec,align,diag,anti,minp,maxp)) + EXIT(1); + apath->abpos = (anti+diag)/2; + apath->bbpos = (anti-diag)/2; +#ifdef DEBUG_PASSES + printf("F1 (%d,%d) => (%d,%d) %d\n", + (anti+diag)/2,(anti-diag)/2,apath->aepos,apath->bepos,apath->diffs); +#endif + } + +#ifdef DEBUG_POINTS + { uint16 *trace = (uint16 *) apath->trace; + int a, h; + + printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos); + printf(" %c\n",(COMP(align->flags) ? 
'c' : 'n')); + a = apath->bbpos; + for (h = 1; h < apath->tlen; h += 2) + { int dif = trace[h-1]; + int del = trace[h]; + a += del; + printf(" %d / %d (%d)\n",dif,del,a); + } + } +#endif + + return (0); +} + + +/****************************************************************************************\ +* * +* OVERLAP MANIPULATION * +* * +\****************************************************************************************/ + +static int64 PtrSize = sizeof(void *); +static int64 OvlIOSize = sizeof(Overlap) - sizeof(void *); + +int Read_Overlap(FILE *input, Overlap *ovl) +{ if (fread( ((char *) ovl) + PtrSize, OvlIOSize, 1, input) != 1) + return (1); + return (0); +} + +int Read_Trace(FILE *input, Overlap *ovl, int tbytes) +{ if (tbytes > 0 && ovl->path.tlen > 0) + { if (fread(ovl->path.trace, tbytes*ovl->path.tlen, 1, input) != 1) + return (1); + } + return (0); +} + +int Write_Overlap(FILE *output, Overlap *ovl, int tbytes) +{ if (fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output) != 1) + return (1); + if (ovl->path.trace != NULL) + if (fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output) != (size_t) ovl->path.tlen) + return (1); + return (0); +} + +int Compress_TraceTo8(Overlap *ovl, int check) +{ uint16 *t16 = (uint16 *) ovl->path.trace; + uint8 *t8 = (uint8 *) ovl->path.trace; + int j, x; + + if (check) + for (j = 0; j < ovl->path.tlen; j++) + { x = t16[j]; + if (x > 255) + { fprintf(stderr,"%s: Compression of trace to bytes fails, value too big\n",Prog_Name); + EXIT(1); + } + t8[j] = (uint8) x; + } + else + for (j = 0; j < ovl->path.tlen; j++) + t8[j] = (uint8) (t16[j]); + return (0); +} + +void Decompress_TraceTo16(Overlap *ovl) +{ uint16 *t16 = (uint16 *) ovl->path.trace; + uint8 *t8 = (uint8 *) ovl->path.trace; + int j; + + for (j = ovl->path.tlen-1; j >= 0; j--) + t16[j] = t8[j]; +} + +void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent) +{ int i; + + fprintf(output,"%*s%d vs. 
",indent,"",ovl->aread); + if (COMP(ovl->flags)) + fprintf(output,"c(%d)\n",ovl->bread); + else + fprintf(output,"%d\n",ovl->bread); + fprintf(output,"%*s [%d,%d] vs [%d,%d] w. %d diffs\n",indent,"", + ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos,ovl->path.diffs); + + if (tbytes == 1) + { uint8 *trace = (uint8 *) (ovl->path.trace); + if (trace != NULL) + { int p = ovl->path.bbpos + trace[1]; + fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); + for (i = 3; i < ovl->path.tlen; i += 2) + { if (i%10 == 0) + fprintf(output,"\n%*s",indent+6,""); + p += trace[i]; + fprintf(output," %3d/%5d",trace[i-1],p); + } + fprintf(output,"\n"); + } + } + else + { uint16 *trace = (uint16 *) (ovl->path.trace); + if (trace != NULL) + { int p = ovl->path.bbpos + trace[1]; + fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); + for (i = 3; i < ovl->path.tlen; i += 2) + { if (i%10 == 0) + fprintf(output,"\n%*s",indent+6,""); + p += trace[i]; + fprintf(output," %3d/%5d",trace[i-1],p); + } + fprintf(output,"\n"); + } + } +} + +int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) +{ int i, p, q; + + if (tspace != 0) + { if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) + { if (verbose) + EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname); + return (1); + } + p = ovl->path.bbpos; + if (tspace <= TRACE_XOVR) + { uint8 *trace8 = (uint8 *) ovl->path.trace; + for (i = 1; i < ovl->path.tlen; i += 2) + p += trace8[i]; + } + else + { uint16 *trace16 = (uint16 *) ovl->path.trace; + for (i = 1; i < ovl->path.tlen; i += 2) + p += trace16[i]; + } + if (p != ovl->path.bepos) + { if (verbose) + EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); + return (1); + } + } + else + { uint16 *trace16 = (uint16 *) ovl->path.trace; + + p = ovl->path.bbpos; + q = ovl->path.abpos; + for (i = 1; i < ovl->path.tlen; i += 2) + { p += trace16[i]; + q += trace16[i-1]; + } + if (p != ovl->path.bepos || q != 
ovl->path.aepos) + { if (verbose) + EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); + return (1); + } + } + return (0); +} + + +void Flip_Alignment(Alignment *align, int full) +{ char *aseq = align->aseq; + char *bseq = align->bseq; + int alen = align->alen; + int blen = align->blen; + Path *path = align->path; + int comp = COMP(align->flags); + + int *trace = (int *) path->trace; + int tlen = path->tlen; + + int i, j, p; + + if (comp) + { p = path->abpos; + path->abpos = blen - path->bepos; + path->bepos = alen - p; + p = path->aepos; + path->aepos = blen - path->bbpos; + path->bbpos = alen - p; + + if (full) + { alen += 2; + blen += 2; + + for (i = 0; i < tlen; i++) + if ((p = trace[i]) < 0) + trace[i] = alen + p; + else + trace[i] = p - blen; + + i = tlen-1; + j = 0; + while (j < i) + { p = trace[i]; + trace[i] = trace[j]; + trace[j] = p; + i -= 1; + j += 1; + } + + alen -= 2; + blen -= 2; + } + } + else + { p = path->abpos; + path->abpos = path->bbpos; + path->bbpos = p; + p = path->aepos; + path->aepos = path->bepos; + path->bepos = p; + + if (full) + for (i = 0; i < tlen; i++) + trace[i] = - (trace[i]); + } + + align->aseq = bseq; + align->bseq = aseq; + align->alen = blen; + align->blen = alen; +} + + +/****************************************************************************************\ +* * +* ALIGNMENT PRINTING * +* * +\****************************************************************************************/ + +/* Complement the sequence in fragment aseq. The operation does the + complementation/reversal in place. Calling it a second time on a + given fragment restores it to its original state. */ + +void Complement_Seq(char *aseq, int len) +{ char *s, *t; + int c; + + s = aseq; + t = aseq + (len-1); + while (s < t) + { c = 3 - *s; + *s++ = (char) (3 - *t); + *t-- = (char) c; + } + if (s == t) + *s = (char) (3 - *s); +} + + +/* Print an alignment to file between a and b given in trace (unpacked). 
+ Prefix gives the length of the initial prefix of a that is unaligned. */ + +static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' }; +static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' }; + +int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, + int indent, int width, int border, int upper, int coord) +{ _Work_Data *work = (_Work_Data *) ework; + int *trace = align->path->trace; + int tlen = align->path->tlen; + + char *Abuf, *Bbuf, *Dbuf; + int i, j, o; + char *a, *b; + char mtag, dtag; + int prefa, prefb; + int aend, bend; + int comp, blen; + int sa, sb; + int match, diff; + char *N2A; + + if (trace == NULL) return (0); + +#ifdef SHOW_TRACE + fprintf(file,"\nTrace:\n"); + for (i = 0; i < tlen; i++) + fprintf(file," %3d\n",trace[i]); +#endif + + o = sizeof(char)*3*(width+1); + if (o > work->vecmax) + if (enlarge_vector(work,o)) + EXIT(1); + + if (upper) + N2A = ToU; + else + N2A = ToL; + + Abuf = (char *) work->vector; + Bbuf = Abuf + (width+1); + Dbuf = Bbuf + (width+1); + + aend = align->path->aepos; + bend = align->path->bepos; + + comp = COMP(align->flags); + blen = align->blen; + + Abuf[width] = Bbuf[width] = Dbuf[width] = '\0'; + /* buffer/output next column */ +#define COLUMN(x,y) \ +{ int u, v; \ + if (o >= width) \ + { fprintf(file,"\n"); \ + fprintf(file,"%*s",indent,""); \ + if (coord > 0) \ + { if (sa < aend) \ + fprintf(file," %*d",coord,sa); \ + else \ + fprintf(file," %*s",coord,""); \ + fprintf(file," %s\n",Abuf); \ + fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf); \ + fprintf(file,"%*s",indent,""); \ + if (sb < bend) \ + if (comp) \ + fprintf(file," %*d",coord,blen-sb); \ + else \ + fprintf(file," %*d",coord,sb); \ + else \ + fprintf(file," %*s",coord,""); \ + fprintf(file," %s",Bbuf); \ + } \ + else \ + { fprintf(file," %s\n",Abuf); \ + fprintf(file,"%*s %s\n",indent,"",Dbuf); \ + fprintf(file,"%*s %s",indent,"",Bbuf); \ + } \ + fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ + o = 0; \ + sa = i-1; 
\ + sb = j-1; \ + match = diff = 0; \ + } \ + u = (x); \ + v = (y); \ + if (u == 4 || v == 4) \ + Dbuf[o] = ' '; \ + else if (u == v) \ + Dbuf[o] = mtag; \ + else \ + Dbuf[o] = dtag; \ + Abuf[o] = N2A[u]; \ + Bbuf[o] = N2A[v]; \ + o += 1; \ +} + + a = align->aseq - 1; + b = align->bseq - 1; + + o = 0; + i = j = 1; + + prefa = align->path->abpos; + prefb = align->path->bbpos; + + if (prefa > border) + { i = prefa-(border-1); + prefa = border; + } + if (prefb > border) + { j = prefb-(border-1); + prefb = border; + } + + sa = i-1; + sb = j-1; + mtag = ':'; + dtag = ':'; + + while (prefa > prefb) + { COLUMN(a[i],4) + i += 1; + prefa -= 1; + } + while (prefb > prefa) + { COLUMN(4,b[j]) + j += 1; + prefb -= 1; + } + while (prefa > 0) + { COLUMN(a[i],b[j]) + i += 1; + j += 1; + prefa -= 1; + } + + mtag = '['; + if (prefb > 0) + COLUMN(5,5) + + mtag = '|'; + dtag = '*'; + + match = diff = 0; + + { int p, c; /* Output columns of alignment til reach trace end */ + + for (c = 0; c < tlen; c++) + if ((p = trace[c]) < 0) + { p = -p; + while (i != p) + { COLUMN(a[i],b[j]) + if (a[i] == b[j]) + match += 1; + else + diff += 1; + i += 1; + j += 1; + } + COLUMN(7,b[j]) + j += 1; + diff += 1; + } + else + { while (j != p) + { COLUMN(a[i],b[j]) + if (a[i] == b[j]) + match += 1; + else + diff += 1; + i += 1; + j += 1; + } + COLUMN(a[i],7) + i += 1; + diff += 1; + } + p = align->path->aepos; + while (i <= p) + { COLUMN(a[i],b[j]) + if (a[i] == b[j]) + match += 1; + else + diff += 1; + i += 1; + j += 1; + } + } + + { int c; /* Output remaining column including unaligned suffix */ + + mtag = ']'; + if (a[i] != 4 && b[j] != 4 && border > 0) + COLUMN(6,6) + + mtag = ':'; + dtag = ':'; + + c = 0; + while (c < border && (a[i] != 4 || b[j] != 4)) + { if (a[i] != 4) + if (b[j] != 4) + { COLUMN(a[i],b[j]) + i += 1; + j += 1; + } + else + { COLUMN(a[i],4) + i += 1; + } + else + { COLUMN(4,b[j]) + j += 1; + } + c += 1; + } + } + + /* Print remainder of buffered col.s */ + + fprintf(file,"\n"); + 
fprintf(file,"%*s",indent,""); + if (coord > 0) + { if (sa < aend) + fprintf(file," %*d",coord,sa); + else + fprintf(file," %*s",coord,""); + fprintf(file," %.*s\n",o,Abuf); + fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); + fprintf(file,"%*s",indent,""); + if (sb < bend) + if (comp) + fprintf(file," %*d",coord,blen-sb); + else + fprintf(file," %*d",coord,sb); + else + fprintf(file," %*s",coord,""); + fprintf(file," %.*s",o,Bbuf); + } + else + { fprintf(file," %.*s\n",o,Abuf); + fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); + fprintf(file,"%*s %.*s",indent,"",o,Bbuf); + } + if (diff+match > 0) + fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); + else + fprintf(file,"\n"); + + fflush(file); + return (0); +} + +int Print_Reference(FILE *file, Alignment *align, Work_Data *ework, + int indent, int block, int border, int upper, int coord) +{ _Work_Data *work = (_Work_Data *) ework; + int *trace = align->path->trace; + int tlen = align->path->tlen; + + char *Abuf, *Bbuf, *Dbuf; + int i, j, o; + char *a, *b; + char mtag, dtag; + int prefa, prefb; + int aend, bend; + int comp, blen; + int sa, sb, s0; + int match, diff; + char *N2A; + int vmax; + + if (trace == NULL) return (0); + +#ifdef SHOW_TRACE + fprintf(file,"\nTrace:\n"); + for (i = 0; i < tlen; i++) + fprintf(file," %3d\n",trace[i]); +#endif + + vmax = work->vecmax/3; + o = sizeof(char)*6*(block+1); + if (o > vmax) + { if (enlarge_vector(work,3*o)) + EXIT(1); + vmax = work->vecmax/3; + } + + Abuf = (char *) work->vector; + Bbuf = Abuf + vmax; + Dbuf = Bbuf + vmax; + + if (upper) + N2A = ToU; + else + N2A = ToL; + + aend = align->path->aepos; + bend = align->path->bepos; + + comp = COMP(align->flags); + blen = align->blen; + +#define BLOCK(x,y) \ +{ int u, v; \ + if (i%block == 1 && i != s0 && x < 4 && o > 0) \ + { fprintf(file,"\n"); \ + fprintf(file,"%*s",indent,""); \ + if (coord > 0) \ + { if (sa < aend) \ + fprintf(file," %*d",coord,sa); \ + else \ + fprintf(file," %*s",coord,""); \ + 
fprintf(file," %.*s\n",o,Abuf); \ + fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); \ + fprintf(file,"%*s",indent,""); \ + if (sb < bend) \ + if (comp) \ + fprintf(file," %*d",coord,blen-sb); \ + else \ + fprintf(file," %*d",coord,sb); \ + else \ + fprintf(file," %*s",coord,""); \ + fprintf(file," %.*s",o,Bbuf); \ + } \ + else \ + { fprintf(file," %.*s\n",o,Abuf); \ + fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); \ + fprintf(file,"%*s %.*s",indent,"",o,Bbuf); \ + } \ + fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ + o = 0; \ + sa = i-1; \ + sb = j-1; \ + match = diff = 0; \ + } \ + u = (x); \ + v = (y); \ + if (u == 4 || v == 4) \ + Dbuf[o] = ' '; \ + else if (u == v) \ + Dbuf[o] = mtag; \ + else \ + Dbuf[o] = dtag; \ + Abuf[o] = N2A[u]; \ + Bbuf[o] = N2A[v]; \ + o += 1; \ + if (o >= vmax) \ + { if (enlarge_vector(work,3*o)) \ + EXIT(1); \ + vmax = work->vecmax/3; \ + memmove(work->vector+2*vmax,Dbuf,o); \ + memmove(work->vector+vmax,Bbuf,o); \ + memmove(work->vector,Abuf,o); \ + Abuf = (char *) work->vector; \ + Bbuf = Abuf + vmax; \ + Dbuf = Bbuf + vmax; \ + } \ +} + + a = align->aseq - 1; + b = align->bseq - 1; + + o = 0; + i = j = 1; + + prefa = align->path->abpos; + prefb = align->path->bbpos; + + if (prefa > border) + { i = prefa-(border-1); + prefa = border; + } + if (prefb > border) + { j = prefb-(border-1); + prefb = border; + } + + s0 = i; + sa = i-1; + sb = j-1; + mtag = ':'; + dtag = ':'; + + while (prefa > prefb) + { BLOCK(a[i],4) + i += 1; + prefa -= 1; + } + while (prefb > prefa) + { BLOCK(4,b[j]) + j += 1; + prefb -= 1; + } + while (prefa > 0) + { BLOCK(a[i],b[j]) + i += 1; + j += 1; + prefa -= 1; + } + + mtag = '['; + if (prefb > 0) + BLOCK(5,5) + + mtag = '|'; + dtag = '*'; + + match = diff = 0; + + { int p, c; /* Output columns of alignment til reach trace end */ + + for (c = 0; c < tlen; c++) + if ((p = trace[c]) < 0) + { p = -p; + while (i != p) + { BLOCK(a[i],b[j]) + if (a[i] == b[j]) + match += 1; + else + diff += 1; + i += 
1; + j += 1; + } + BLOCK(7,b[j]) + j += 1; + diff += 1; + } + else + { while (j != p) + { BLOCK(a[i],b[j]) + if (a[i] == b[j]) + match += 1; + else + diff += 1; + i += 1; + j += 1; + } + BLOCK(a[i],7) + i += 1; + diff += 1; + } + p = align->path->aepos; + while (i <= p) + { BLOCK(a[i],b[j]) + if (a[i] == b[j]) + match += 1; + else + diff += 1; + i += 1; + j += 1; + } + } + + { int c; /* Output remaining column including unaligned suffix */ + + mtag = ']'; + if (a[i] != 4 && b[j] != 4 && border > 0) + BLOCK(6,6) + + mtag = ':'; + dtag = ':'; + + c = 0; + while (c < border && (a[i] != 4 || b[j] != 4)) + { if (a[i] != 4) + if (b[j] != 4) + { BLOCK(a[i],b[j]) + i += 1; + j += 1; + } + else + { BLOCK(a[i],4) + i += 1; + } + else + { BLOCK(4,b[j]) + j += 1; + } + c += 1; + } + } + + /* Print remainder of buffered col.s */ + + fprintf(file,"\n"); + fprintf(file,"%*s",indent,""); + if (coord > 0) + { if (sa < aend) + fprintf(file," %*d",coord,sa); + else + fprintf(file," %*s",coord,""); + fprintf(file," %.*s\n",o,Abuf); + fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); + fprintf(file,"%*s",indent,""); + if (sb < bend) + if (comp) + fprintf(file," %*d",coord,blen-sb); + else + fprintf(file," %*d",coord,sb); + else + fprintf(file," %*s",coord,""); + fprintf(file," %.*s",o,Bbuf); + } + else + { fprintf(file," %.*s\n",o,Abuf); + fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); + fprintf(file,"%*s %.*s",indent,"",o,Bbuf); + } + if (diff+match > 0) + fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); + else + fprintf(file,"\n"); + + fflush(file); + return (0); +} + +/* Print an ASCII representation of the overlap in align between fragments + a and b to given file. 
*/

/* Write 'symbol' to 'file' 'rep' times (nothing if rep <= 0). */

static inline void repchar(FILE *file, int symbol, int rep)
{ while (rep-- > 0)
    fputc(symbol,file);
}

/* Draws four lines: the unaligned prefix/suffix lengths of A, a bar for A
   followed by the diff statistics, a bar for B (with '<' heads when the
   alignment is complemented), and the unaligned prefix/suffix lengths of B.
   Every line is indented by 'indent'; 'coord' is the field width reserved
   for coordinates.                                                           */

void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord)
{ int   alen = align->alen;
  int   blen = align->blen;
  Path *path = align->path;
  int   comp = COMP(align->flags);
  int   w;

  fprintf(file,"%*s",indent,"");
  if (path->abpos > 0)
    fprintf(file," %*d ",coord,path->abpos);
  else
    fprintf(file,"%*s",coord+5,"");
  if (path->aepos < alen)
    fprintf(file,"%*s%d",coord+8,"",alen-path->aepos);
  fprintf(file,"\n");

  fprintf(file,"%*s",indent,"");
  if (path->abpos > 0)
    { fprintf(file,"A ");
      w = Number_Digits((int64) path->abpos);
      repchar(file,' ',coord-w);
      repchar(file,'=',w+3);
      fputc('+',file);
      repchar(file,'-',coord+5);
    }
  else
    { fprintf(file,"A %*s",coord+4,"");
      repchar(file,'-',coord+5);
    }

  if (path->aepos < alen)
    { fputc('+',file);
      w = Number_Digits((int64) (alen-path->aepos));
      repchar(file,'=',w+2);
      fputc('>',file);
      repchar(file,' ',w);
    }
  else
    { fputc('>',file);
      repchar(file,' ',coord+3);
    }

  { int asub, bsub;

    asub = path->aepos - path->abpos;
    bsub = path->bepos - path->bbpos;
    /* NOTE(review): prints nan/inf if both aligned intervals are empty */
    fprintf(file," dif/(len1+len2) = %d/(%d+%d) = %5.2f%%\n",
                 path->diffs,asub,bsub,(200.*path->diffs)/(asub+bsub));
  }

  { int sym1e, sym2e;
    int sym1p, sym2p;

    if (comp > 0)
      { sym1p = '<'; sym2p = '-'; sym1e = '<'; sym2e = '='; }
    else
      { sym1p = '-'; sym2p = '>'; sym1e = '='; sym2e = '>'; }

    fprintf(file,"%*s",indent,"");
    if (path->bbpos > 0)
      { fprintf(file,"B ");
        w = Number_Digits((int64) path->bbpos);
        repchar(file,' ',coord-w);
        fputc(sym1e,file);
        repchar(file,'=',w+2);
        fputc('+',file);
        repchar(file,'-',coord+5);
      }
    else
      { fprintf(file,"B ");
        repchar(file,' ',coord+3);
        fputc(sym1p,file);
        repchar(file,'-',coord+5);
      }
    if (path->bepos < blen)
      { fprintf(file,"+");
        w = Number_Digits((int64) (blen-path->bepos));
        repchar(file,'=',w+2);
        fprintf(file,"%c\n",sym2e);
      }
    else
      fprintf(file,"%c\n",sym2p);
  }

  fprintf(file,"%*s",indent,"");
  if (path->bbpos > 0)
    fprintf(file," %*d ",coord,path->bbpos);
  else
    fprintf(file,"%*s",coord+5,"");
  if (path->bepos < blen)
    fprintf(file,"%*s%d",coord+8,"",blen-path->bepos);
  fprintf(file,"\n");

  fflush(file);
}


/****************************************************************************************\
*                                                                                        *
*  O(ND) trace algorithm                                                                 *
*                                                                                        *
\****************************************************************************************/


#ifdef DEBUG_AWAVE

/* Debug aid: print the wave values V[low..hgh] on stdout. */

static void print_awave(int *V, int low, int hgh)
{ int k;

  printf(" [%6d,%6d]: ",low,hgh);
  for (k = low; k <= hgh; k++)
    printf(" %3d",V[k]);
  printf("\n");
  fflush(stdout);
}

#endif

#ifdef DEBUG_ALIGN

static int depth = 0;     //  Indentation of debug output, tracks recursion depth

#endif

/* Shared state of the alignment routines below */

typedef struct
  { int    *Stop;          //  Ongoing stack of alignment indels
    uint16 *Trace;         //  Base of Trace Vector
    char   *Aabs, *Babs;   //  Absolute base of A and B sequences

    int   **PVF, **PHF;    //  List of waves for iterative np algorithms
    int     mida, midb;    //  mid point division for mid-point algorithms

    int    *VF, *VB;       //  Forward/Reverse waves for nd algorithms
  } Trace_Waves;

/* Bidirectional search for a split point of the alignment of A[0..M) with
   B[0..N).  A forward wave (wave->VF) and a reverse wave (wave->VB) are
   advanced alternately, one difference at a time, until they meet on some
   diagonal k.  The meeting point is returned in (*px,*py) with *px = k + *py,
   and the return value is the number of differences: 2D-1 if the meeting is
   detected while advancing the forward wave, 2D if while advancing the
   reverse wave.  Returns 0 with *px = *py = M when M == N and the two
   sequences are identical.                                                   */

static int split_nd(char *A, int M, char *B, int N, Trace_Waves *wave, int *px, int *py)
{ int x, y;
  int D;

  int  *VF = wave->VF;
  int  *VB = wave->VB;
  int   flow;  //  fhgh == D !
  int   blow, bhgh;
  char *a;

  /* Initial forward snake along diagonal 0 */

  y = 0;
  if (N < M)
    while (y < N && B[y] == A[y])
      y += 1;
  else
    { while (y < M && B[y] == A[y])
        y += 1;
      if (y >= M && N == M)
        { *px = *py = M;
          return (0);
        }
    }

  flow   = 0;
  VF[0]  = y;
  VF[-1] = -2;

  /* Initial reverse snake along diagonal M-N (index -x in VB) */

  x = N-M;
  a = A-x;
  y = N-1;
  if (N > M)
    while (y >= x && B[y] == a[y])
      y -= 1;
  else
    while (y >= 0 && B[y] == a[y])
      y -= 1;

  blow = bhgh = -x;
  VB += x;
  VB[blow]   = y;
  VB[blow-1] = N+1;

  for (D = 1; 1; D += 1)
    { int   k, r;
      int   am, ac, ap;

      //  Forward wave

      flow -= 1;
      am = ac = VF[flow-1] = -2;

      a = A + D;
      x = M - D;
      for (k = D; k >= flow; k--)
        { ap = ac;
          ac = am+1;
          am = VF[k-1];

          if (ac < am)
            if (ap < am)
              y  = am;
            else
              y = ap;
          else
            if (ap < ac)
              y = ac;
            else
              y = ap;

          if (blow <= k && k <= bhgh)
            { r = VB[k];
              if (y > r)
                { D = (D<<1)-1;
                  if (ap > r)
                    y = ap;
                  else if (ac > r)
                    y = ac;
                  else
                    y = r+1;
                  x = k+y;
                  *px = x;
                  *py = y;
                  return (D);
                }
            }

          if (N < x)
            while (y < N && B[y] == a[y])
              y += 1;
          else
            while (y < x && B[y] == a[y])
              y += 1;

          VF[k] = y;
          a -= 1;
          x += 1;
        }

#ifdef DEBUG_AWAVE
      print_awave(VF,flow,D);
#endif

      //  Reverse Wave

      bhgh += 1;
      blow -= 1;
      am = ac = VB[blow-1] = N+1;

      a = A + bhgh;
      x = -bhgh;
      for (k = bhgh; k >= blow; k--)
        { ap = ac+1;
          ac = am;
          am = VB[k-1];

          if (ac > am)
            if (ap > am)
              y  = am;
            else
              y = ap;
          else
            if (ap > ac)
              y = ac;
            else
              y = ap;

          if (flow <= k && k <= D)
            { r = VF[k];
              if (y <= r)
                { D = (D << 1);
                  if (ap <= r)
                    y = ap;
                  else if (ac <= r)
                    y = ac;
                  else
                    y = r;
                  x = k+y;
                  *px = x;
                  *py = y;
                  return (D);
                }
            }

          y -= 1;
          if (x > 0)
            while (y >= x && B[y] == a[y])
              y -= 1;
          else
            while (y >= 0 && B[y] == a[y])
              y -= 1;

          VB[k] = y;
          a -= 1;
          x += 1;
        }

#ifdef DEBUG_AWAVE
      print_awave(VB,blow,bhgh);
#endif
    }
}

/* Recursive divide and conquer on split_nd that accumulates a trace-point
   encoding rather than an edit script.  wave->Trace is viewed as pairs
   indexed by the tspace-panel of the absolute A-position (A - wave->Aabs):
   element [2p] accumulates differences and element [2p+1] accumulates the
   number of B symbols consumed in panel p.  A sub-problem whose A-extent lies
   inside a single panel is charged to that panel directly without recursing.
   Returns the number of differences of the (sub)alignment.                   */

static int trace_nd(char *A, int M, char *B, int N, Trace_Waves *wave, int tspace)
{ int x, y;
  int D, s;

#ifdef DEBUG_ALIGN
  printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
  fflush(stdout);
#endif

  if (M <= 0)                   //  A is empty: N insertions, all in one panel
    { y = (((A-wave->Aabs)/tspace) << 1);
      wave->Trace[y] += N;
      wave->Trace[y+1] += N;
#ifdef DEBUG_TRACE
      printf("%*s Adding1 (%d,%d) to tp %d(%d,%d)\n",depth,"",N,N,y>>1,
             wave->Trace[y+1],wave->Trace[y]);
      fflush(stdout);
#endif
      return (N);
    }

  if (N <= 0)                   //  B is empty: M deletions spread over the panels of A
    { x = A - wave->Aabs;
      y = x/tspace;
      x = (y+1)*tspace - x;
      y <<= 1;
      for (s = M; s > 0; s -= x, x = tspace)
        { if (x > s)
            x = s;
          wave->Trace[y] += x;
#ifdef DEBUG_TRACE
          printf("%*s Adding2 (0,%d) to tp %d(%d,%d)\n",depth,"",x,y>>1,
                 wave->Trace[y+1],wave->Trace[y]);
          fflush(stdout);
#endif
          y += 2;
        }
      return (M);
    }

  D = split_nd(A,M,B,N,wave,&x,&y);

  if (D > 1)
    {
#ifdef DEBUG_ALIGN
      printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D);
      fflush(stdout);
      depth += 2;
#endif

      /* Left half: charge directly if it fits in the current panel */

      s = A-wave->Aabs;
      if ((s/tspace+1)*tspace - s >= x)
        { s = ((s/tspace)<<1);
          wave->Trace[s] += (D+1)/2;
          wave->Trace[s+1] += y;
#ifdef DEBUG_TRACE
          printf("%*s Adding3 (%d,%d) to tp %d(%d,%d)\n",depth,"",y,(D+1)/2,s>>1,
                 wave->Trace[s+1],wave->Trace[s]);
          fflush(stdout);
#endif
        }
      else
        trace_nd(A,x,B,y,wave,tspace);

      /* Right half: likewise */

      s = (A+x)-wave->Aabs;
      if ((s/tspace+1)*tspace - s >= M-x)
        { s = ((s/tspace)<<1);
          wave->Trace[s] += D/2;
          wave->Trace[s+1] += N-y;
#ifdef DEBUG_TRACE
          printf("%*s Adding4 (%d,%d)) to tp %d(%d,%d)\n",depth,"",N-y,D/2,s>>1,
                 wave->Trace[s+1],wave->Trace[s]);
          fflush(stdout);
#endif
        }
      else
        trace_nd(A+x,M-x,B+y,N-y,wave,tspace);

#ifdef DEBUG_ALIGN
      depth -= 2;
#endif
    }

  else                          //  D is 0 or 1: matches around at most one difference
    { int u, v;

      if (D == 0 || M < N)
        s = x;
      else
        s = x-1;
      if (s > 0)
        { u = A - wave->Aabs;
          v = u/tspace;
          u = (v+1)*tspace - u;
          for (v <<= 1; s > 0; s -= u, u = tspace)
            { if (u > s)
                u = s;
              wave->Trace[v+1] += u;
#ifdef DEBUG_TRACE
              printf("%*s Adding5 (%d,0)) to tp %d(%d,%d)\n",depth,"",u,v>>1,
                     wave->Trace[v+1],wave->Trace[v]);
              fflush(stdout);
#endif
              v += 2;
            }
        }

      if (D == 0)
        return (D);

      if (M < N)
        y = ((((A+x)-wave->Aabs)/tspace)<<1);
      else
        y = ((((A+(x-1))-wave->Aabs)/tspace)<<1);
      wave->Trace[y] += 1;
      if (M <= N)
        wave->Trace[y+1] += 1;
#ifdef DEBUG_TRACE
      printf("%*s Adding5 (%d,1)) to tp %d(%d,%d)\n",depth,"",N>=M,y>>1,
             wave->Trace[y+1],wave->Trace[y]);
      fflush(stdout);
#endif

      s = M-x;
      if (s > 0)
        { u = (A+x) - wave->Aabs;
          v = u/tspace;
          u = (v+1)*tspace - u;
          for (v <<= 1; s > 0; s -= u, u = tspace)
            { if (u > s)
                u = s;
              wave->Trace[v+1] += u;
#ifdef DEBUG_TRACE
              printf("%*s Adding5 (%d,0)) to tp %d(%d,%d)\n",depth,"",u,v>>1,
                     wave->Trace[v+1],wave->Trace[v]);
              fflush(stdout);
#endif
              v += 2;
            }
        }
    }

  return (D);
}

/* Recursive divide and conquer on split_nd that appends the indels of the
   alignment of A[0..M) with B[0..N) to the stack wave->Stop.  Positions are
   absolute (relative to wave->Aabs / wave->Babs): a positive entry is a
   B-position recorded for a deleted A symbol, a negative entry is minus an
   A-position recorded for an inserted B symbol.  Substitutions add nothing
   to the stack.  Returns the number of differences.                          */

static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave)
{ int x, y;
  int D;

#ifdef DEBUG_ALIGN
  printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N);
#endif

  if (M <= 0)
    { x = (wave->Aabs-A)-1;
      for (y = 1; y <= N; y++)
        { *wave->Stop++ = x;
#ifdef DEBUG_SCRIPT
          printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1);
#endif
        }
      return (N);
    }

  if (N <= 0)
    { y = (B-wave->Babs)+1;
      for (x = 1; x <= M; x++)
        { *wave->Stop++ = y;
#ifdef DEBUG_SCRIPT
          printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1);
#endif
        }
      return (M);
    }

  D = split_nd(A,M,B,N,wave,&x,&y);

  if (D > 1)
    {
#ifdef DEBUG_ALIGN
      printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D);
      fflush(stdout);
      depth += 2;
#endif

      dandc_nd(A,x,B,y,wave);
      dandc_nd(A+x,M-x,B+y,N-y,wave);

#ifdef DEBUG_ALIGN
      depth -= 2;
#endif
    }

  else if (D == 1)

    { if (M > N)
        { *wave->Stop++ = (B-wave->Babs)+y+1;
#ifdef DEBUG_SCRIPT
          printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1);
#endif
        }

      else if (M < N)
        { *wave->Stop++ = (wave->Aabs-A)-x-1;
#ifdef DEBUG_SCRIPT
          printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1);
#endif
        }

#ifdef DEBUG_SCRIPT
      else
        printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y);
#endif
    }

  return (D);
}

/* Align the intervals [abpos,aepos] x [bbpos,bepos] of align->path with the
   O(ND) routines above, using buffers of the work packet.
     task == DIFF_ONLY:  only path->diffs is set; trace is NULL and tlen -1.
     task == DIFF_ALIGN / PLUS_ALIGN:  path->trace becomes an int indel stack
                         (see dandc_nd) and tlen its length.
     otherwise (DIFF_TRACE / PLUS_TRACE):  path->trace becomes uint16
                         trace-point pairs for spacing 'tspace' (see trace_nd).
   Returns 0; a failed buffer enlargement leaves through EXIT(1).             */

int Compute_Alignment(Alignment *align, Work_Data *ework, int task, int tspace)
{ _Work_Data *work = (_Work_Data *) ework;
  Trace_Waves wave;

  int     L, D;
  int     asub, bsub;
  char   *aseq, *bseq;
  Path   *path;
  int    *trace;
  uint16 *strace;

  path = align->path;
  asub = path->aepos-path->abpos;
  bsub = path->bepos-path->bbpos;
  aseq = align->aseq+path->abpos;
  bseq = align->bseq+path->bbpos;

  /* L = bytes needed for the result: trace-point pairs or one int per symbol */

  L = 0;
  if (task != DIFF_ONLY)
    { if (task == DIFF_TRACE || task == PLUS_TRACE)
        L = 2*(((path->aepos + (tspace-1))/tspace - path->abpos/tspace) + 1)*sizeof(uint16);
      else if (asub < bsub)
        L = bsub*sizeof(int);
      else
        L = asub*sizeof(int);
      if (L > work->alnmax)
        if (enlarge_alnpts(work,L))
          EXIT(1);
    }

  trace  = ((int *) work->alnpts);
  strace = ((uint16 *) work->alnpts);

  /* D = bytes needed for the forward and reverse waves */

  if (asub > bsub)
    D = (4*asub+6)*sizeof(int);
  else
    D = (4*bsub+6)*sizeof(int);
  if (D > work->vecmax)
    if (enlarge_vector(work,D))
      EXIT(1);

  if (asub > bsub)
    { wave.VF = ((int *) work->vector) + (asub+1);
      wave.VB = wave.VF + (2*asub+3);
    }
  else
    { wave.VF = ((int *) work->vector) + (bsub+1);
      wave.VB = wave.VF + (2*bsub+3);
    }

  wave.Aabs = align->aseq;
  wave.Babs = align->bseq;

  if (task == DIFF_ONLY)
    { wave.mida = -1;
      if (asub <= 0)
        path->diffs = bsub;
      else if (bsub <= 0)
        path->diffs = asub;
      else
        path->diffs = split_nd(aseq,asub,bseq,bsub,&wave,&wave.mida,&wave.midb);
      path->trace = NULL;
      path->tlen  = -1;
      return (0);
    }

  /* NOTE(review): 'wave' is a local and wave.mida/midb are assigned only in
     the DIFF_ONLY branch above, which returns.  On this path wave.mida is
     therefore read uninitialized -- presumably the mid-point of an earlier
     DIFF_ONLY call was meant to be reused; confirm intent before relying on
     the PLUS_ tasks.                                                         */

  else if (task < DIFF_ONLY && wave.mida >= 0)
    { int x = wave.mida;
      int y = wave.midb;

      if (task == PLUS_ALIGN)
        { wave.Stop = trace;
          dandc_nd(aseq,x,bseq,y,&wave);
          dandc_nd(aseq+x,asub-x,bseq+y,bsub-y,&wave);
          path->tlen = wave.Stop - trace;
        }
      else
        { int i, n;

          wave.Trace = strace - 2*(path->abpos/tspace);
          n = L/sizeof(uint16);
          for (i = 0; i < n; i++)
            strace[i] = 0;

          trace_nd(aseq,x,bseq,y,&wave,tspace);
          trace_nd(aseq+x,asub-x,bseq+y,bsub-y,&wave,tspace);

          if (strace[n-1] != 0)  //  Last element is to capture all inserts on TP boundary
            { strace[n-3] += strace[n-1];
              strace[n-4] += strace[n-2];
            }
          path->tlen = n-2;

#ifdef DEBUG_SCRIPT
          printf(" Trace:\n");
          for (i = 0; i < path->tlen; i += 2)
            printf(" %3d %3d\n",strace[i],strace[i+1]);
          fflush(stdout);
#endif
        }
    }

  else
    { if (task == DIFF_ALIGN)
        { wave.Stop = trace;
          path->diffs = dandc_nd(aseq,asub,bseq,bsub,&wave);
          path->tlen = wave.Stop - trace;
        }
      else
        { int i, n;

          wave.Trace = strace - 2*(path->abpos/tspace);
          n = L/sizeof(uint16);
          for (i = 0; i < n; i++)
            strace[i] = 0;
          path->diffs = trace_nd(aseq,asub,bseq,bsub,&wave,tspace);

          if (strace[n-1] != 0)  //  Last element is to capture all inserts on TP boundary
            { strace[n-3] += strace[n-1];
              strace[n-4] += strace[n-2];
            }
          path->tlen = n-2;

#ifdef DEBUG_SCRIPT
          printf(" Trace:\n");
          for (i = 0; i < path->tlen; i += 2)
            printf(" %3d %3d\n",strace[i],strace[i+1]);
          fflush(stdout);
#endif
        }
    }

  path->trace = trace;
  return (0);
}


/****************************************************************************************\
*                                                                                        *
*  O(NP) tracing algorithms                                                              *
*                                                                                        *
\****************************************************************************************/

/* Iterative O(np) algorithm for finding the alignment between two substrings (specified
   by a Path record).  The variation includes handling substitutions and guarantees
   to find left-most alignments so that low complexity runs are always aligned in
   the same way.
+*/ + +#ifdef DEBUG_ALIGN + +static int ToA[4] = { 'a', 'c', 'g', 't' }; + +#endif + +static char *TP_Align = + "Bad alignment between trace points (Compute_Trace), source DB likely incorrect"; + +static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax) +{ int **PVF = wave->PVF; + int **PHF = wave->PHF; + int D; + int del = M-N; + + { int *F0, *F1, *F2; + int *HF; + int low, hgh; + int posl, posh; + +#ifdef DEBUG_ALIGN + printf("\n BASE %ld,%ld: %d vs %d\n",A-wave->Aabs,B-wave->Babs,M,N); + printf(" A = "); + for (D = 0; D < M; D++) + printf("%c",ToA[(int) A[D]]); + printf("\n"); + printf(" B = "); + for (D = 0; D < N; D++) + printf("%c",ToA[(int) B[D]]); + printf("\n"); +#endif + + if (del >= 0) + { low = 0; + hgh = del; + } + else + { low = del; + hgh = 0; + } + + posl = -dmax; + posh = dmax; + if (wave->Aabs == wave->Babs) + { if (B == A) + { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name); + EXIT(-1); + } + else if (B < A) + { if ((B-A)+1 > posl) + posl = (B-A)+1; + } + else + { if ((B-A)-1 < posh) + posh = (B-A)-1; + } + } + + F1 = PVF[-2]; + F0 = PVF[-1]; + + for (D = low-1; D <= hgh+1; D++) + F1[D] = F0[D] = -2; + F0[0] = -1; + + low += 1; + hgh -= 1; + + for (D = 0; 1; D += 1) + { int k, i, j; + int am, ac, ap; + char *a; + + if (D > dmax) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align); + EXIT(-1); + } + + F2 = F1; + F1 = F0; + F0 = PVF[D]; + HF = PHF[D]; + + if ((D & 0x1) == 0) + { if (low > posl) + low -= 1; + if (hgh < posh) + hgh += 1; + } + F0[hgh+1] = F0[low-1] = -2; + +#define FS_MOVE(mdir,pdir) \ + ac = F1[k]+1; \ + if (ac < am) \ + if (ap < am) \ + { HF[k] = mdir; \ + j = am; \ + } \ + else \ + { HF[k] = pdir; \ + j = ap; \ + } \ + else \ + if (ap < ac) \ + { HF[k] = 0; \ + j = ac; \ + } \ + else \ + { HF[k] = pdir; \ + j = ap; \ + } \ + \ + if (N < i) \ + while (j < N && B[j] == a[j]) \ + j += 1; \ + else \ + while (j < i && B[j] == a[j]) \ + j += 1; \ + F0[k] = j; + + j = 
-2; + a = A + hgh; + i = M - hgh; + for (k = hgh; k > del; k--) + { ap = j+1; + am = F2[k-1]; + FS_MOVE(-1,4) + a -= 1; + i += 1; + } + + j = -2; + a = A + low; + i = M - low; + for (k = low; k < del; k++) + { ap = F2[k+1]+1; + am = j; + FS_MOVE(2,1) + a += 1; + i -= 1; + } + + ap = F0[del+1]+1; + am = j; + FS_MOVE(2,4) + +#ifdef DEBUG_AWAVE + print_awave(F0,low,hgh); + print_awave(HF,low,hgh); +#endif + + if (F0[del] >= N) + break; + } + } + + { int k, h, m, e, c; + int ap = (wave->Aabs-A)-1; + int bp = (B-wave->Babs)+1; + + PHF[0][0] = 3; + + c = N; + k = del; + e = PHF[D][k]; + PHF[D][k] = 3; + + if (mode == UPPERMOST) + + while (e != 3) + { h = k+e; + if (e > 1) + h -= 3; + else if (e == 0) + D -= 1; + else + D -= 2; + + if (h < k) // => e = -1 or 2, UPPERMOST + { char *a; + + a = A + k; + if (k < 0) + m = -k; + else + m = 0; + if (PVF[D][h] <= c) + c = PVF[D][h]-1; + while (c >= m && a[c] == B[c]) + c -= 1; + if (e == -1) // => edge is 2, others are 1, and 0 + { if (c <= PVF[D+2][k+1]) + { e = 4; + h = k+1; + D = D+2; + } + else if (c == PVF[D+1][k]) + { e = 0; + h = k; + D = D+1; + } + else + PVF[D][h] = c+1; + } + else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) + { if (k == del) + m = D; + else + m = D-2; + if (c <= PVF[m][k+1]) + { if (k == del) + e = 4; + else + e = 1; + h = k+1; + D = m; + } + else if (c == PVF[D-1][k]) + { e = 0; + h = k; + D = D-1; + } + else + PVF[D][h] = c+1; + } + } + + m = PHF[D][h]; + PHF[D][h] = e; + e = m; + k = h; + } + + else if (mode == LOWERMOST) + + while (e != 3) + { h = k+e; + if (e > 1) + h -= 3; + else if (e == 0) + D -= 1; + else + D -= 2; + + if (h > k) // => e = 1 or 4, LOWERMOST + { char *a; + + a = A + k; + if (k < 0) + m = -k; + else + m = 0; + if (PVF[D][h] < c) + c = PVF[D][h]; + while (c >= m && a[c] == B[c]) + c -= 1; + if (e == 1) // => edge is 2, others are 1, and 0 + { if (c < PVF[D+2][k-1]) + { e = 2; + h = k-1; + D = D+2; + } + else if (c == PVF[D+1][k]) + { e = 0; + h = k; + D = 
D+1; + } + else + PVF[D][h] = c--; + } + else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) + { if (k == del) + m = D; + else + m = D-2; + if (c < PVF[m][k-1]) + { if (k == del) + e = 2; + else + e = -1; + h = k-1; + D = m; + } + else if (c == PVF[D-1][k]) + { e = 0; + h = k; + D = D-1; + } + else + PVF[D][h] = c--; + } + } + + m = PHF[D][h]; + PHF[D][h] = e; + e = m; + k = h; + } + + else // mode == GREEDIEST + + while (e != 3) + { h = k+e; + if (e > 1) + h -= 3; + else if (e == 0) + D -= 1; + else + D -= 2; + + m = PHF[D][h]; + PHF[D][h] = e; + e = m; + k = h; + } + + k = D = 0; + e = PHF[D][k]; + while (e != 3) + { h = k-e; + c = PVF[D][k]; + if (e > 1) + h += 3; + else if (e == 0) + D += 1; + else + D += 2; +#ifdef DEBUG_SCRIPT + if (h > k) + printf(" D %d(%d)\n",(c-k)-(ap-1),c+bp); + else if (h < k) + printf(" I %d(%d)\n",c+(bp-1),(c+k)-ap); + else + printf(" %d S %d\n",(c+k)-(ap+1),c+(bp-1)); +#endif + if (h > k) + *wave->Stop++ = bp+c; + else if (h < k) + *wave->Stop++ = ap-(c+k); + k = h; + e = PHF[D][h]; + } + } + + return (D + abs(del)); +} + +static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode, int dmax) +{ int **PVF = wave->PVF; + int **PHF = wave->PHF; + int D; + int del = M-N; + + { int *F0, *F1, *F2; + int *HF; + int low, hgh; + int posl, posh; + +#ifdef DEBUG_ALIGN + printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); + printf("%*s A = ",depth,""); + for (D = 0; D < M; D++) + printf("%c",ToA[(int) A[D]]); + printf("\n"); + printf("%*s B = ",depth,""); + for (D = 0; D < N; D++) + printf("%c",ToA[(int) B[D]]); + printf("\n"); +#endif + + if (del >= 0) + { low = 0; + hgh = del; + } + else + { low = del; + hgh = 0; + } + + posl = -dmax; + posh = dmax; + if (wave->Aabs == wave->Babs) + { if (B == A) + { EPRINTF(EPLACE,"%s: self comparison starts on diagonal 0 (Compute_Trace)\n",Prog_Name); + EXIT(1); + } + else if (B < A) + { if ((B-A)+1 > posl) + posl = (B-A)+1; + } + else + { 
if ((B-A)-1 < posh) + posh = (B-A)-1; + } + } + + F1 = PVF[-2]; + F0 = PVF[-1]; + + for (D = low-1; D <= hgh+1; D++) + F1[D] = F0[D] = -2; + F0[0] = -1; + + low += 1; + hgh -= 1; + + for (D = 0; 1; D += 1) + { int k, i, j; + int am, ac, ap; + char *a; + + if (D > dmax) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Align); + EXIT(-1); + } + + F2 = F1; + F1 = F0; + F0 = PVF[D]; + HF = PHF[D]; + + if ((D & 0x1) == 0) + { if (low > posl) + low -= 1; + if (hgh < posh) + hgh += 1; + } + F0[hgh+1] = F0[low-1] = -2; + + j = -2; + a = A + hgh; + i = M - hgh; + for (k = hgh; k > del; k--) + { ap = j+1; + am = F2[k-1]; + FS_MOVE(-1,4) + a -= 1; + i += 1; + } + + j = -2; + a = A + low; + i = M - low; + for (k = low; k < del; k++) + { ap = F2[k+1]+1; + am = j; + FS_MOVE(2,1) + a += 1; + i -= 1; + } + + ap = F0[del+1]+1; + am = j; + FS_MOVE(2,4) + +#ifdef DEBUG_AWAVE + print_awave(F0,low,hgh); + print_awave(HF,low,hgh); +#endif + + if (F0[del] >= N) + break; + } + } + + { int k, h, m, e, c; + int d, f; + + d = D + abs(del); + c = N; + k = del; + + if (mode == UPPERMOST) + + for (f = d/2; d > f; d--) + { e = PHF[D][k]; + h = k+e; + if (e > 1) + h -= 3; + else if (e == 0) + D -= 1; + else + D -= 2; + + if (h < k) // => e = -1 or 2, UPPERMOST + { char *a; + + a = A + k; + if (k < 0) + m = -k; + else + m = 0; + if (PVF[D][h] <= c) + c = PVF[D][h]-1; + while (c >= m && a[c] == B[c]) + c -= 1; + if (e == -1) // => edge is 2, others are 1, and 0 + { if (c <= PVF[D+2][k+1]) + { e = 4; + h = k+1; + D = D+2; + } + else if (c == PVF[D+1][k]) + { e = 0; + h = k; + D = D+1; + } + else + PVF[D][h] = c+1; + } + else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) + { if (k == del) + m = D; + else + m = D-2; + if (c <= PVF[m][k+1]) + { if (k == del) + e = 4; + else + e = 1; + h = k+1; + D = m; + } + else if (c == PVF[D-1][k]) + { e = 0; + h = k; + D = D-1; + } + else + PVF[D][h] = c+1; + } + } + + k = h; + } + + else if (mode == LOWERMOST) + + for (f = d/2; d > f; d--) + { e = 
PHF[D][k]; + h = k+e; + if (e > 1) + h -= 3; + else if (e == 0) + D -= 1; + else + D -= 2; + + if (h > k) // => e = 1 or 4, LOWERMOST + { char *a; + + a = A + k; + if (k < 0) + m = -k; + else + m = 0; + if (PVF[D][h] < c) + c = PVF[D][h]; + while (c >= m && a[c] == B[c]) + c -= 1; + if (e == 1) // => edge is 2, others are 1, and 0 + { if (c < PVF[D+2][k-1]) + { e = 2; + h = k-1; + D = D+2; + } + else if (c == PVF[D+1][k]) + { e = 0; + h = k; + D = D+1; + } + else + PVF[D][h] = c--; + } + else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) + { if (k == del) + m = D; + else + m = D-2; + if (c < PVF[m][k-1]) + { if (k == del) + e = 2; + else + e = -1; + h = k-1; + D = m; + } + else if (c == PVF[D-1][k]) + { e = 0; + h = k; + D = D-1; + } + else + PVF[D][h] = c--; + } + } + + k = h; + } + + else // mode == GREEDIEST + + for (f = d/2; d > f; d--) + { e = PHF[D][k]; + h = k+e; + if (e > 1) + h -= 3; + else if (e == 0) + D -= 1; + else + D -= 2; + k = h; + } + + wave->midb = (B-wave->Babs) + PVF[D][k]; + wave->mida = (A-wave->Aabs) + k + PVF[D][k]; + } + + return (0); +} + + +/****************************************************************************************\ +* * +* COMPUTE_TRACE FLAVORS * +* * +\****************************************************************************************/ + +static char *TP_Error = "Trace point out of bounds (Compute_Trace), source DB likely incorrect"; + +int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode) +{ _Work_Data *work = (_Work_Data *) ework; + Trace_Waves wave; + + Path *path; + char *aseq, *bseq; + int alen, blen; + uint16 *points; + int tlen; + int ab, bb; + int ae, be; + int diffs, dmax; + + alen = align->alen; + blen = align->blen; + path = align->path; + aseq = align->aseq; + bseq = align->bseq; + tlen = path->tlen; + points = (uint16 *) path->trace; + + { int64 s; + int d; + int M, N; + int nmax; + int **PVF, **PHF; + + M = path->aepos-path->abpos; + N = 
path->bepos-path->bbpos; + if (M < N) + s = N*sizeof(int); + else + s = M*sizeof(int); + if (s > work->tramax) + if (enlarge_trace(work,s)) + EXIT(1); + + nmax = 0; + dmax = 0; + for (d = 1; d < tlen; d += 2) + { if (points[d-1] > dmax) + dmax = points[d-1]; + if (points[d] > nmax) + nmax = points[d]; + } + if (tlen <= 1) + nmax = N; + + s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); + + if (s > work->vecmax) + if (enlarge_vector(work,s)) + EXIT(1); + + wave.PVF = PVF = ((int **) (work->vector)) + 2; + wave.PHF = PHF = PVF + (dmax+3); + + s = trace_spacing+nmax+3; + PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); + for (d = -1; d <= dmax; d++) + PVF[d] = PVF[d-1] + s; + PHF[-2] = PVF[dmax] + s; + for (d = -1; d <= dmax; d++) + PHF[d] = PHF[d-1] + s; + } + + wave.Stop = (int *) (work->trace); + wave.Aabs = aseq; + wave.Babs = bseq; + + { int i, d; + + diffs = 0; + ab = path->abpos; + ae = (ab/trace_spacing)*trace_spacing; + bb = path->bbpos; + tlen -= 2; + for (i = 1; i < tlen; i += 2) + { ae = ae + trace_spacing; + be = bb + points[i]; + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); + if (d < 0) + EXIT(1); + diffs += d; + ab = ae; + bb = be; + } + ae = path->aepos; + be = path->bepos; + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); + if (d < 0) + EXIT(1); + diffs += d; + } + + path->trace = work->trace; + path->tlen = wave.Stop - ((int *) path->trace); + path->diffs = diffs; + + return (0); +} + +int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int mode) +{ _Work_Data *work = (_Work_Data *) ework; + Trace_Waves wave; + + Path *path; + char *aseq, *bseq; + int alen, blen; + uint16 *points; + int tlen; + int ab, bb; + int ae, be; + int diffs, dmax; + + alen = align->alen; + blen = align->blen; + path = 
align->path; + aseq = align->aseq; + bseq = align->bseq; + tlen = path->tlen; + points = (uint16 *) path->trace; + + { int64 s; + int d; + int M, N; + int nmax; + int **PVF, **PHF; + + M = path->aepos-path->abpos; + N = path->bepos-path->bbpos; + if (M < N) + s = N*sizeof(int); + else + s = M*sizeof(int); + if (s > work->tramax) + if (enlarge_trace(work,s)) + EXIT(1); + + nmax = 0; + dmax = 0; + for (d = 1; d < tlen; d += 2) + { if (points[d-1] > dmax) + dmax = points[d-1]; + if (points[d] > nmax) + nmax = points[d]; + } + if (tlen <= 1) + nmax = N; + + s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); + + if (s > work->vecmax) + if (enlarge_vector(work,s)) + EXIT(1); + + wave.PVF = PVF = ((int **) (work->vector)) + 2; + wave.PHF = PHF = PVF + (dmax+3); + + s = trace_spacing+nmax+3; + PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); + for (d = -1; d <= dmax; d++) + PVF[d] = PVF[d-1] + s; + PHF[-2] = PVF[dmax] + s; + for (d = -1; d <= dmax; d++) + PHF[d] = PHF[d-1] + s; + } + + wave.Stop = ((int *) work->trace); + wave.Aabs = aseq; + wave.Babs = bseq; + + { int i, d; + int as, bs; + int af, bf; + + diffs = 0; + ab = as = af = path->abpos; + ae = (ab/trace_spacing)*trace_spacing; + bb = bs = bf = path->bbpos; + tlen -= 2; + for (i = 1; i < tlen; i += 2) + { ae = ae + trace_spacing; + be = bb + points[i]; + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax)) + EXIT(1); + af = wave.mida; + bf = wave.midb; + d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax); + if (d < 0) + EXIT(1); + diffs += d; + ab = ae; + bb = be; + as = af; + bs = bf; + } + + ae = path->aepos; + be = path->bepos; + + if (ae > alen || be > blen) + { EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax)) + EXIT(1); + af = wave.mida; + bf = wave.midb; + d = 
iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode,dmax); + if (d < 0) + EXIT(1); + diffs += d; + as = af; + bs = bf; + + d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode,dmax); + if (d < 0) + EXIT(1); + diffs += d; + } + + path->trace = work->trace; + path->tlen = wave.Stop - ((int *) path->trace); + path->diffs = diffs; + + return (0); +} + +int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) +{ _Work_Data *work = (_Work_Data *) ework; + Trace_Waves wave; + + Path *path; + char *aseq, *bseq; + int alen, blen; + uint16 *points; + int tlen; + int ab, bb; + int ae, be; + int diffs, dmax; + + alen = align->alen; + blen = align->blen; + path = align->path; + aseq = align->aseq; + bseq = align->bseq; + tlen = path->tlen; + points = (uint16 *) path->trace; + + { int64 s; + int d; + int M, N; + int mmax, nmax; + int **PVF, **PHF; + + M = path->aepos-path->abpos; + N = path->bepos-path->bbpos; + if (M < N) + s = N*sizeof(int); + else + s = M*sizeof(int); + if (s > work->tramax) + if (enlarge_trace(work,s)) + EXIT(1); + + nmax = mmax = 0; + for (d = 0; d < tlen; d += 2) + { if (points[d] > mmax) + mmax = points[d]; + if (points[d+1] > nmax) + nmax = points[d+1]; + } + if (tlen <= 1) + { mmax = M; + nmax = N; + } + if (mmax > nmax) + dmax = nmax; + else + dmax = mmax; + + s = (dmax+3)*2*((mmax+nmax+3)*sizeof(int) + sizeof(int *)); + + if (s > work->vecmax) + if (enlarge_vector(work,s)) + EXIT(1); + + wave.PVF = PVF = ((int **) (work->vector)) + 2; + wave.PHF = PHF = PVF + (dmax+3); + + s = mmax+nmax+3; + PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); + for (d = -1; d <= dmax; d++) + PVF[d] = PVF[d-1] + s; + PHF[-2] = PVF[dmax] + s; + for (d = -1; d <= dmax; d++) + PHF[d] = PHF[d-1] + s; + } + + wave.Stop = (int *) (work->trace); + wave.Aabs = aseq; + wave.Babs = bseq; + + { int i, d; + + diffs = 0; + ab = path->abpos; + bb = path->bbpos; + for (i = 0; i < tlen; i += 2) + { ae = ab + points[i]; + be = bb + points[i+1]; + if (ae > alen || be > blen) + { 
EPRINTF(EPLACE,"%s: %s\n",Prog_Name,TP_Error); + EXIT(1); + } + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode,dmax); + if (d < 0) + EXIT(1); + diffs += d; + ab = ae; + bb = be; + } + } + + path->trace = work->trace; + path->tlen = wave.Stop - ((int *) path->trace); + path->diffs = diffs; + + return (0); +} diff --git a/align.h b/align.h new file mode 100644 index 0000000..8563f3d --- /dev/null +++ b/align.h @@ -0,0 +1,377 @@ +/******************************************************************************************* + * + * Local alignment module. Routines for finding local alignments given a seed position, + * representing such an l.a. with its interval and a set of pass-thru points, so that + * a detailed alignment can be efficiently computed on demand. + * + * All routines work on a numeric representation of DNA sequences, i.e. 0 for A, 1 for C, + * 2 for G, and 3 for T. + * + * Author: Gene Myers + * Date : July 2013 + * + ********************************************************************************************/ + +#ifndef _A_MODULE + +#define _A_MODULE + +#include "DB.h" + +#define TRACE_XOVR 125 // If the trace spacing is not more than this value, then can + // and do compress traces pts to 8-bit unsigned ints + +/*** INTERACTIVE vs BATCH version + + The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or + batch version of the routines in this library are compiled. In batch mode, routines + print an error message and exit. In interactive mode, the routines place the error + message in EPLACE (also defined in DB.h) and return an error value, typically NULL + if the routine returns a pointer, and an unusual integer value if the routine returns + an integer. + Below when an error return is described, one should understand that this value is returned + only if the routine was compiled in INTERACTIVE mode. 
+ +***/ + + +/*** PATH ABSTRACTION: + + Coordinates are *between* characters where 0 is the tick just before the first char, + 1 is the tick between the first and second character, and so on. Our data structure + is called a Path referring to its conceptualization in an edit graph. + + A local alignment is specified by the point '(abpos,bbpos)' at which its path in + the underlying edit graph starts, and the point '(aepos,bepos)' at which it ends. + In other words A[abpos+1..aepos] is aligned to B[bbpos+1..bepos] (assuming X[1] is + the *first* character of X). + + There are 'diffs' differences in an optimal local alignment between the beginning and + end points of the alignment (if computed by Compute_Trace), or nearly so (if computed + by Local_Alignment). + + Optionally, a Path can have additional information about the exact nature of the + aligned substrings if the field 'trace' is not NULL. Trace points to either an + array of integers (if computed by a Compute_Trace routine), or an array of unsigned + short integers (if computed by Local_Alignment). + + If computed by Local_Alignment 'trace' points at a list of 'tlen' (always even) short + values: + + d_0, b_0, d_1, b_1, ... d_n-1, b_n-1, d_n, b_n + + to be interpreted as follows. The alignment from (abpos,bbpos) to (aepos,bepos) + passes through the n trace points for i in [1,n]: + + (a_i,b_i) where a_i = floor(abpos/TS)*TS + i*TS + and b_i = bbpos + (b_0 + b_1 + ... + b_i-1) + + where also let a_0,b_0 = abpos,bbpos and a_(n+1),b_(n+1) = aepos,bepos. That is, the + interior (i.e. i != 0 and i != n+1) trace points pass through every TS'th position of + the aread where TS is the "trace spacing" employed when finding the alignment (see + New_Align_Spec). Typically TS is 100. Then d_i is the number of differences in the + portion of the alignment between (a_i,b_i) and (a_(i+1),b_(i+1)). 
These trace points allow + the Compute_Trace routines to efficiently compute the exact alignment between the two + reads by efficiently computing exact alignments between consecutive pairs of trace points. + Moreover, the diff values give one an idea of the quality of the alignment along every + segment of TS symbols of the aread. + + If computed by a Compute_Trace routine, 'trace' points at a list of 'tlen' integers + < i1, i2, ... in > that encodes an exact alignment as follows. A negative number j + indicates that a dash should be placed before A[-j] and a positive number k indicates + that a dash should be placed before B[k], where A and B are the two sequences of the + overlap. The indels occur in the trace in the order in which they occur along the + alignment. For a good example of how to "decode" a trace into an alignment, see the + code for the routine Print_Alignment. + +***/ + +typedef struct + { void *trace; + int tlen; + int diffs; + int abpos, bbpos; + int aepos, bepos; + } Path; + + +/*** ALIGNMENT ABSTRACTION: + + An alignment is modeled by an Alignment record, which in addition to a *pointer* to a + 'path', gives pointers to the A and B sequences, their lengths, and indicates whether + the B-sequence needs to be complemented ('comp' non-zero if so). The 'trace' pointer + of the 'path' subrecord can be either NULL, a list of pass-through points, or an exact + trace depending on what routines have been called on the record. + + One can (1) compute a trace, with Compute_Trace, either from scratch if 'path.trace' = NULL, + or using the sequence of pass-through points in trace, (2) print an ASCII representation + of an alignment, or (3) reverse the roles of A and B, and (4) complement a sequence + (which is a reversible process). 
+ + If the alignment record shows the B sequence as complemented, *** THEN IT IS THE + RESPONSIBILITY OF THE CALLER *** to make sure that bseq points at a complement of + the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements + the sequence a of length n. The operation does the complementation/reversal in place. + Calling it a second time on a given fragment restores it to its original state. + + With the introduction of the DAMAPPER, we need to code chains of alignments between a + pair of sequences. The alignments of a chain are expected to be found in order either on + a file or in memory, where the START_FLAG marks the first alignment and the NEXT_FLAG all + subsequent alignments in a chain. A chain of a single LA is marked with the START_FLAG. + The BEST_FLAG marks one of the best chains for a pair of sequences. The convention is + that either every record has a START- or NEXT-flag, or none of them do (e.g. as + produced by daligner), so one can always check the flags of the first alignment to see + whether or not the chain concept applies to a given collection. +***/ + +#define COMP_FLAG 0x1 +#define ACOMP_FLAG 0x2 // A-sequence is complemented, not B ! Only Local_Alignment notices + +#define COMP(x) ((x) & COMP_FLAG) +#define ACOMP(x) ((x) & ACOMP_FLAG) + +#define START_FLAG 0x4 // LA is the first of a chain of 1 or more la's +#define NEXT_FLAG 0x8 // LA is the next segment of a chain. 
+#define BEST_FLAG 0x10 // This is the start of the best chain + +#define CHAIN_START(x) ((x) & START_FLAG) +#define CHAIN_NEXT(x) ((x) & NEXT_FLAG) +#define BEST_CHAIN(x) ((x) & BEST_FLAG) + +#define ELIM_FLAG 0x20 // This LA should be ignored + +#define ELIM(x) ((x) & ELIM_FLAG) + +typedef struct + { Path *path; + uint32 flags; /* Pipeline status and complementation flags */ + char *aseq; /* Pointer to A sequence */ + char *bseq; /* Pointer to B sequence */ + int alen; /* Length of A sequence */ + int blen; /* Length of B sequence */ + } Alignment; + +void Complement_Seq(char *a, int n); + + /* Many routines like Local_Alignment, Compute_Trace, and Print_Alignment need working + storage that is more efficiently reused with each call, rather than being allocated anew + with each call. Each *thread* can create a Work_Data object with New_Work_Data and this + object holds and retains the working storage for routines of this module between calls + to the routines. If enough memory for a Work_Data is not available then NULL is returned. + Free_Work_Data frees a Work_Data object and all working storage held by it. + */ + + typedef void Work_Data; + + Work_Data *New_Work_Data(); + + void Free_Work_Data(Work_Data *work); + + /* Local_Alignment seeks local alignments of a quality determined by a number of parameters. + These are coded in an Align_Spec object that can be created with New_Align_Spec and + freed with Free_Align_Spec when no longer needed. There are 4 essential parameters: + + ave_corr: the average correlation (1 - 2*error_rate) for the sought alignments. For Pacbio + data we set this to .70 assuming an average of 15% error in each read. + trace_space: the spacing interval for keeping trace points and segment differences (see + description of 'trace' for Paths above) + freq[4]: a 4-element vector where afreq[0] = frequency of A, f(A), freq[1] = f(C), + freq[2] = f(G), and freq[3] = f(T). This vector is part of the header + of every DAZZ database (see db.h). 
+ reach: a boolean, if set, alignments extend to the boundary when reasonable, otherwise + they terminate only at suffix-positive points. + + If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. + overlap), then the last/first 30 columns of the alignment are guaranteed to be + suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically + measured function that increases from 1 as the entropy of freq decreases. If memory is + unavailable or the freq distribution is too skewed then NULL is returned. + + You can get back the original parameters used to create an Align_Spec with the simple + utility functions below. + */ + + typedef void Align_Spec; + + Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq, int reach); + + void Free_Align_Spec(Align_Spec *spec); + + int Trace_Spacing (Align_Spec *spec); + double Average_Correlation(Align_Spec *spec); + float *Base_Frequencies (Align_Spec *spec); + int Overlap_If_Possible(Align_Spec *spec); + + /* Local_Alignment finds the longest significant local alignment between the sequences in + 'align' subject to: + + (a) the alignment criterion given by the Align_Spec 'spec', + (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within + the underlying dynamic programming matrix (i.e. the points on diagonals low to hgh + on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)), + (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and + (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord. + + The path record of 'align' has its 'trace' filled from the point of view of an overlap + between the aread and the bread. In addition a Path record from the point of view of the + bread versus the aread is returned by the function, with this Path's 'trace' filled in + appropriately. 
The space for the returned path and the two 'trace's are in the working + storage supplied by the Work_Data packet and this space is reused with each call, so if + one wants to retain the bread-path and the two trace point sequences, then they must be + copied to user-allocated storage before calling the routine again. NULL is returned in + the event of an error. + + Find_Extension is a variant of Local_Alignment that simply finds a local alignment that + either ends (if prefix is non-zero) or begins (if prefix is zero) at the point + (anti+diag)/2,(anti-diag)/2). All other parameters are as before. It returns a non-zero + value only when INTERACTIVE is on and it cannot allocate the memory it needs. + Only the path and trace with respect to the aread is returned. This routine is experimental + and may not persist in later versions of the code. + */ + + Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec, + int low, int hgh, int anti, int lbord, int hbord); + + int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !! + int diag, int anti, int lbord, int hbord, int prefix); + + /* Given a legitimate Alignment object and associated trace point vector in 'align->path.trace', + Compute_Trace_X, computes an exact trace for the alignment and resets 'align->path.trace' + to point at an integer array within the storage of the Work_Data packet encoding an + exact optimal trace from the start to end points. If the trace is needed beyond the + next call to a routine that sets it, then it should be copied to an array allocated + and managed by the caller. + + Compute_Trace_PTS computes a trace by computing the trace between successive trace points. + It is much, much faster than Compute_Alignment below but at the tradeoff of not necessarily + being optimal as pass-through points are not all perfect. 
Compute_Trace_MID computes a trace + by computing the trace between the mid-points of alignments between two adjacent pairs of trace + points. It is generally twice as slow as Compute_Trace_PTS, but it produces nearer optimal + alignments. Both these routines return 1 if an error occurred and 0 otherwise. + */ + +#define LOWERMOST -1 // Possible modes for "mode" parameter below +#define GREEDIEST 0 +#define UPPERMOST 1 + + int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode); + int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode); + + /* Compute_Trace_IRR (IRR for IRRegular) computes a trace for the given alignment where + it assumes the spacing between trace points between both the A and B read varies, and + further assumes that the A-spacing is given in the short integers normally occupied by + the differences in the alignment between the trace points. This routine is experimental + and may not persist in later versions of the code. + */ + + int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !! + + /* Compute_Alignment determines the best alignment between the substrings specified by align. + If the task is DIFF_ONLY, then only the difference of this alignment is computed and placed + in the "diffs" field of align's path. If the task is PLUS_TRACE or DIFF_TRACE, then + 'path.trace' is set to point at an integer array within the storage of the Work_Data packet + encoding a trace point sequence for an optimal alignment, whereas if the task is PLUS_ALIGN + or DIFF_ALIGN, then it points to an optimal trace of an optimal alignment. The PLUS + tasks can only be called if the immediately preceding call was a DIFF_ONLY on the same + alignment record and sequences, in which case a little efficiency is gained by avoiding + the repetition of the top level search for an optimal mid-point. 
+ */ + +#define PLUS_ALIGN 0 +#define PLUS_TRACE 1 +#define DIFF_ONLY 2 +#define DIFF_ALIGN 3 +#define DIFF_TRACE 4 + + int Compute_Alignment(Alignment *align, Work_Data *work, int task, int trace_spacing); + + /* Alignment_Cartoon prints an ASCII representation of the overlap relationship between the + two reads of 'align' to the given 'file' indented by 'indent' spaces. Coord controls + the display width of numbers, it must be not less than the width of any number to be + displayed. + + If the alignment trace is an exact trace, then one can ask Print_Alignment to print an + ASCII representation of the alignment 'align' to the file 'file'. Indent the display + by "indent" spaces and put "width" columns per line in the display. Show "border" + characters of sequence on each side of the aligned region. If upper is non-zero then + display bases in upper case. If coord is greater than 0, then the positions of the + first character in A and B in the given row are displayed with a field width given by + coord's value. + + Print_Reference is like Print_Alignment but rather than printing exactly "width" columns + per segment, it prints "block" characters of the A sequence in each segment. This results + in segments of different lengths, but is convenient when looking at two alignments involving + A as segments are guaranteed to cover the same interval of A in a segment. + + Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise. + + Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then + the trace is ignored, otherwise the trace must be to a full alignment trace and this trace + is also appropriately inverted. 
+ */ + + void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord); + + int Print_Alignment(FILE *file, Alignment *align, Work_Data *work, + int indent, int width, int border, int upper, int coord); + + int Print_Reference(FILE *file, Alignment *align, Work_Data *work, + int indent, int block, int border, int upper, int coord); + + void Flip_Alignment(Alignment *align, int full); + + +/*** OVERLAP ABSTRACTION: + + Externally, between modules an Alignment is modeled by an "Overlap" record, which + (a) replaces the pointers to the two sequences with their ID's in the DAZZ data bases, + (b) does not contain the length of the 2 sequences (must fetch from DB), and + (c) contains its path as a subrecord rather than as a pointer (indeed, typically the + corresponding Alignment record points at the Overlap's path sub-record). The trace pointer + is always to a sequence of trace points and can be either compressed (uint8) or + uncompressed (uint16). One can read and write binary records of an "Overlap". +***/ + +typedef struct { + Path path; /* Path: begin- and end-point of alignment + diffs */ + uint32 flags; /* Pipeline status and complementation flags */ + int aread; /* Id # of A sequence */ + int bread; /* Id # of B sequence */ +} Overlap; + + + /* Read_Overlap reads the next Overlap record from stream 'input', not including the trace + (if any), and without modifying 'ovl's trace pointer. Read_Trace reads the ensuing trace + into the memory pointed at by the trace field of 'ovl'. It is assumed to be big enough to + accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16). + + Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that + occupies 'tbytes' bytes per value. It returns non-zero if there was an error writing. 
+ + Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output' + where the trace occupes 'tbytes' per value and the print out is indented from the left + margin by 'indent' spaces. + + Compress_TraceTo8 converts a trace fo 16-bit values to 8-bit values in place, and + Decompress_TraceTo16 does the reverse conversion. If check is set in a call to Compress + then it checks whether the values fit in 8-bits, and if not returns a non-zero result + in interactive mode, or exits with an error message in batch mode. + + Check_Trace_Points checks that the number of trace points is correct and that the sum + of the b-read displacements equals the b-read alignment interval, assuming the trace + spacing is 'tspace'. It reports an error message if there is a problem and 'verbose' + is non-zero. The 'ovl' came from the file names 'fname'. + */ + + int Read_Overlap(FILE *input, Overlap *ovl); + int Read_Trace(FILE *innput, Overlap *ovl, int tbytes); + + int Write_Overlap(FILE *output, Overlap *ovl, int tbytes); + void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent); + + int Compress_TraceTo8(Overlap *ovl, int check); + void Decompress_TraceTo16(Overlap *ovl); + + int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname); + +#endif // _A_MODULE diff --git a/daligner.c b/daligner.c new file mode 100644 index 0000000..67fc16c --- /dev/null +++ b/daligner.c @@ -0,0 +1,758 @@ + /******************************************************************************************** + * + * Find all local alignment between long, noisy DNA reads: + * Compare sequences in 'subject' database against those in the list of 'target' databases + * searching for local alignments of 1000bp or more (defined constant MIN_OVERLAP in + * filter.c). Subject is compared in both orientations againt each target. 
An output + * stream of 'Overlap' records (see align.h) is written in binary to the standard output, + * each encoding a given found local alignment between two of the sequences. The -v + * option turns on a verbose reporting mode that gives statistics on each major stage. + * + * The filter operates by looking for a pair of diagonal bands of width 2^'s' that contain + * a collection of exact matching 'k'-mers between the two sequences, such that the total + * number of bases covered by 'k'-mer hits is 'h'. k cannot be larger than 32 in the + * current implementation. + * + * Some k-mers are significantly over-represented (e.g. homopolymer runs). These are + * suppressed as seed hits, with the parameter 't' -- any k-mer that occurs more than + * 't' times in either the subject or target is not counted as a seed hit. If the -t + * option is absent then no k-mer is suppressed. Alternatively, the option -M specifies + * that 't' is dynamically set to the largest value such that less than -M memory is + * used. + * + * For each subject, target pair, say XXX and YYY, the program outputs a file containing + * overlaps of the form XXX.YYY.[C|N]#.las where C implies that the reads in XXX were + * complemented and N implies they were not (both comparisons are performed), and # is + * the thread that detected and wrote out the collection of overlaps. For example, if + * NTHREAD in the program is 4, then 8 files are output for each subject, target pair. 
+ * + * Author: Gene Myers + * Date : June 1, 2014 + * + *********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#if defined(BSD) +#include +#endif + +#include "DB.h" +#include "lsd.sort.h" +#include "filter.h" + +static char *Usage[] = + { "[-vaABI] [-k] [-%] [-h] [-w] [-t]", + " [-M] [-e] [-s] [-H]", + " [-T] [-P] [-m]+", + " ...", + }; + +int VERBOSE; // Globally visible to filter.c +int MINOVER; +int HGAP_MIN; +int SYMMETRIC; +int IDENTITY; +int BRIDGE; +char *SORT_PATH; + +uint64 MEM_LIMIT; +uint64 MEM_PHYSICAL; + +/* Adapted from code by David Robert Nadeau (http://NadeauSoftware.com) licensed under + * "Creative Commons Attribution 3.0 Unported License" + * (http://creativecommons.org/licenses/by/3.0/deed.en_US) + * + * I removed Windows options, reformated, and return int64 instead of size_t + */ + +static int64 getMemorySize( ) +{ +#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) + + // OSX, NetBSD, OpenBSD + + int mib[2]; + size_t size = 0; + size_t len = sizeof( size ); + + mib[0] = CTL_HW; +#if defined(HW_MEMSIZE) + mib[1] = HW_MEMSIZE; // OSX +#elif defined(HW_PHYSMEM64) + mib[1] = HW_PHYSMEM64; // NetBSD, OpenBSD +#endif + if (sysctl(mib,2,&size,&len,NULL,0) == 0) + return ((size_t) size); + return (0); + +#elif defined(_SC_AIX_REALMEM) + + // AIX + + return ((size_t) sysconf( _SC_AIX_REALMEM ) * ((size_t) 1024L)); + +#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) + + // FreeBSD, Linux, OpenBSD, & Solaris + + size_t size = 0; + + size = (size_t) sysconf(_SC_PHYS_PAGES); + return (size * ((size_t) sysconf(_SC_PAGESIZE))); + +#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE) + + // ? Legacy ? 
+ + size_t size = 0; + + size = (size_t) sysconf(_SC_PHYS_PAGES); + return (size * ((size_t) sysconf(_SC_PAGE_SIZE))); + +#elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) + + // DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX + + int mib[2]; + unsigned int size = 0; + size_t len = sizeof( size ); + + mib[0] = CTL_HW; +#if defined(HW_REALMEM) + mib[1] = HW_REALMEM; // FreeBSD +#elif defined(HW_PYSMEM) + mib[1] = HW_PHYSMEM; // Others +#endif + if (sysctl(mib,2,&size,&len,NULL,0) == 0) + return (size_t)size; + return (0); + +#else + + return (0); + +#endif +} + +typedef struct + { int *ano; + int *end; + int idx; + int out; + } Event; + +static void reheap(int s, Event **heap, int hsize) +{ int c, l, r; + Event *hs, *hr, *hl; + + c = s; + hs = heap[s]; + while ((l = 2*c) <= hsize) + { r = l+1; + hl = heap[l]; + hr = heap[r]; + if (hr->idx > hl->idx) + { if (hs->idx > hl->idx) + { heap[c] = hl; + c = l; + } + else + break; + } + else + { if (hs->idx > hr->idx) + { heap[c] = hr; + c = r; + } + else + break; + } + } + if (c != s) + heap[c] = hs; +} + +static int64 merge_size(DAZZ_DB *block, int mtop) +{ Event ev[mtop+1]; + Event *heap[mtop+2]; + int r, mhalf; + int64 nsize; + + { DAZZ_TRACK *track; + int i; + + track = block->tracks; + for (i = 0; i < mtop; i++) + { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0]; + ev[i].out = 1; + heap[i+1] = ev+i; + track = track->next; + } + ev[mtop].idx = INT32_MAX; + heap[mtop+1] = ev+mtop; + } + + mhalf = mtop/2; + + nsize = 0; + for (r = 0; r < block->nreads; r++) + { int i, level, hsize; + DAZZ_TRACK *track; + + track = block->tracks; + for (i = 0; i < mtop; i++) + { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1]; + if (ev[i].ano < ev[i].end) + ev[i].idx = *(ev[i].ano); + else + ev[i].idx = INT32_MAX; + track = track->next; + } + hsize = mtop; + + for (i = mhalf; i > 1; i--) + reheap(i,heap,hsize); + + level = 0; + while (1) + { Event *p; + + reheap(1,heap,hsize); + 
+ p = heap[1]; + if (p->idx == INT32_MAX) break; + + p->out = 1-p->out; + if (p->out) + { level -= 1; + if (level == 0) + nsize += 1; + } + else + { if (level == 0) + nsize += 1; + level += 1; + } + p->ano += 1; + if (p->ano >= p->end) + p->idx = INT32_MAX; + else + p->idx = *(p->ano); + } + } + + return (nsize); +} + +static DAZZ_TRACK *merge_tracks(DAZZ_DB *block, int mtop, int64 nsize) +{ DAZZ_TRACK *ntrack; + Event ev[mtop+1]; + Event *heap[mtop+2]; + int r, mhalf; + int64 *anno; + int *alen; + int *data; + + ntrack = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating merged track"); + if (ntrack == NULL) + Clean_Exit(1); + ntrack->name = Strdup("merge","Allocating merged track"); + ntrack->anno = anno = (int64 *) Malloc(sizeof(int64)*(block->nreads+1),"Allocating merged track"); + ntrack->alen = alen = (int *) Malloc(sizeof(int)*block->nreads,"Allocating merged track"); + ntrack->data = data = (int *) Malloc(sizeof(int)*nsize,"Allocating merged track"); + ntrack->size = sizeof(int); + ntrack->next = NULL; + ntrack->loaded = 1; + if (anno == NULL || alen == NULL || data == NULL || ntrack->name == NULL) + Clean_Exit(1); + + { DAZZ_TRACK *track; + int i; + + track = block->tracks; + for (i = 0; i < mtop; i++) + { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0]; + ev[i].out = 1; + heap[i+1] = ev+i; + track = track->next; + } + ev[mtop].idx = INT32_MAX; + heap[mtop+1] = ev+mtop; + } + + mhalf = mtop/2; + + nsize = 0; + for (r = 0; r < block->nreads; r++) + { int i, level, hsize; + DAZZ_TRACK *track; + + anno[r] = nsize; + + track = block->tracks; + for (i = 0; i < mtop; i++) + { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1]; + if (ev[i].ano < ev[i].end) + ev[i].idx = *(ev[i].ano); + else + ev[i].idx = INT32_MAX; + track = track->next; + } + hsize = mtop; + + for (i = mhalf; i > 1; i--) + reheap(i,heap,hsize); + + level = 0; + while (1) + { Event *p; + + reheap(1,heap,hsize); + + p = heap[1]; + if (p->idx == INT32_MAX) 
break; + + p->out = 1-p->out; + if (p->out) + { level -= 1; + if (level == 0) + data[nsize++] = p->idx; + } + else + { if (level == 0) + data[nsize++] = p->idx; + level += 1; + } + p->ano += 1; + if (p->ano >= p->end) + p->idx = INT32_MAX; + else + p->idx = *(p->ano); + } + alen[r] = (int) (nsize - anno[r]); + } + anno[r] = nsize; + + return (ntrack); +} + +static int read_DB(DAZZ_DB *block, char *name, char **mask, int *mstat, int mtop, int kmer) +{ int i, isdam, status, kind, stop; + + isdam = Open_DB(name,block); + if (isdam < 0) + Clean_Exit(1); + + for (i = 0; i < mtop; i++) + { status = Check_Track(block,mask[i],&kind); + if (status >= 0) + { if (kind != MASK_TRACK) + { fprintf(stderr,"%s: %s track is not a mask track.\n",Prog_Name,mask[i]); + exit (1); + } + if (status == 0) + Open_Track(block,mask[i]); + mstat[i] = 1; + } + else if (status == -1) + { printf("%s: Warning: %s track not sync'd with db %s, ignored.\n", + Prog_Name,mask[i],name); + } + } + + Trim_DB(block); + + stop = 0; + for (i = 0; i < mtop; i++) + { DAZZ_TRACK *track; + int64 *anno; + int j; + + status = Check_Track(block,mask[i],&kind); + if (status < 0) + continue; + + stop += 1; + track = Open_Track(block,mask[i]); + Load_All_Track_Data(track); + + anno = (int64 *) (track->anno); + for (j = 0; j <= block->nreads; j++) + anno[j] /= sizeof(int); + } + + if (stop > 1) + { int64 nsize; + DAZZ_TRACK *track; + + nsize = merge_size(block,stop); + track = merge_tracks(block,stop,nsize); + + while (block->tracks != NULL) + Close_Track(block,block->tracks); + + block->tracks = track; + } + + if (block->cutoff < kmer) + { for (i = 0; i < block->nreads; i++) + if (block->reads[i].rlen < kmer) + { fprintf(stderr,"%s: Block %s contains reads < %dbp long ! 
Run DBsplit -x%d\n", + Prog_Name,name,kmer,kmer); + Clean_Exit(1); + } + } + + Load_All_Reads(block,0); + + return (isdam); +} + +static char *CommandBuffer(char *aname, char *bname, char *spath) +{ static char *cat = NULL; + static int max = -1; + int len; + + len = 2*(strlen(aname) + strlen(bname) + strlen(spath)) + 200; + if (len > max) + { max = ((int) (1.2*len)) + 100; + if ((cat = (char *) realloc(cat,max+1)) == NULL) + { fprintf(stderr,"%s: Out of memory (Making path name)\n",Prog_Name); + Clean_Exit(1); + } + } + return (cat); +} + +void Clean_Exit(int val) +{ char *command; + + command = CommandBuffer("","",SORT_PATH); + sprintf(command,"rm -r %s",SORT_PATH); + if (system(command) != 0) + { fprintf(stderr,"%s: Command Failed:\n%*s %s\n", + Prog_Name,(int) strlen(Prog_Name),"",command); + exit (1); + } + exit (val); +} + +int main(int argc, char *argv[]) +{ DAZZ_DB _ablock, _bblock; + DAZZ_DB *ablock = &_ablock, *bblock = &_bblock; + char *afile, *bfile; + char *apath, *bpath; + char *aroot, *broot; + void *aindex, *bindex; + int alen, blen; + Align_Spec *asettings; + int isdam; + int MMAX, MTOP, *MSTAT; + char **MASK; + + int KMER_LEN; + int MOD_THR; + int BIN_SHIFT; + int MAX_REPS; + int HIT_MIN; + double AVE_ERROR; + int SPACING; + int NTHREADS; + int MAP_ORDER; + +#ifdef PROFILE + struct rusage stime, etime; + + getrusage(RUSAGE_SELF, &stime); +#endif + + { int i, j, k; + int flags[128]; + char *eptr; + DIR *dirp; + + ARG_INIT("daligner2.0") + + KMER_LEN = 16; + MOD_THR = 28; + HIT_MIN = 50; + BIN_SHIFT = 6; + MAX_REPS = 0; + HGAP_MIN = 0; + AVE_ERROR = .75; + SPACING = 100; + MINOVER = 1500; // Globally visible to filter.c + NTHREADS = 4; + SORT_PATH = "/tmp"; + + MEM_PHYSICAL = getMemorySize(); + MEM_LIMIT = MEM_PHYSICAL; + if (MEM_PHYSICAL == 0) + { fprintf(stderr,"\nWarning: Could not get physical memory size\n"); + fflush(stderr); + } + + MTOP = 0; + MMAX = 10; + MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); + MSTAT = 
(int *) Malloc(MMAX*sizeof(int),"Allocating mask status array"); + if (MASK == NULL || MSTAT == NULL) + exit (1); + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("vaABI") + break; + case 'k': + ARG_POSITIVE(KMER_LEN,"K-mer length") + if (KMER_LEN > 32) + { fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name); + exit (1); + } + break; + case 'w': + ARG_POSITIVE(BIN_SHIFT,"Log of bin width") + break; + case 'h': + ARG_POSITIVE(HIT_MIN,"Hit threshold (in bp.s)") + break; + case 't': + ARG_POSITIVE(MAX_REPS,"Tuple supression frequency") + break; + case 'H': + ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)") + break; + case 'e': + ARG_REAL(AVE_ERROR) + if (AVE_ERROR < .7 || AVE_ERROR >= 1.) + { fprintf(stderr,"%s: Average correlation must be in [.7,1.) (%g)\n", + Prog_Name,AVE_ERROR); + exit (1); + } + break; + case 'l': + ARG_POSITIVE(MINOVER,"Minimum alignment length") + break; + case 's': + ARG_POSITIVE(SPACING,"Trace spacing") + break; + case 'M': + { int limit; + + ARG_NON_NEGATIVE(limit,"Memory allocation (in Gb)") + MEM_LIMIT = limit * 0x40000000ll; + break; + } + case 'm': + if (MTOP >= MMAX) + { MMAX = 1.2*MTOP + 10; + MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); + MSTAT = (int *) Realloc(MSTAT,MMAX*sizeof(int),"Reallocating mask status array"); + if (MASK == NULL || MSTAT == NULL) + exit (1); + } + MSTAT[MTOP] = 0; + MASK[MTOP++] = argv[i]+2; + break; + case 'P': + SORT_PATH = argv[i]+2; + if ((dirp = opendir(SORT_PATH)) == NULL) + { fprintf(stderr,"%s: -P option: cannot open directory %s\n",Prog_Name,SORT_PATH); + exit (1); + } + closedir(dirp); + break; + case 'T': + ARG_POSITIVE(NTHREADS,"Number of threads") + break; + case '%': + ARG_POSITIVE(MOD_THR,"Modimer percentage") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; // Globally declared in filter.h + SYMMETRIC = 1-flags['A']; + IDENTITY = flags['I']; + 
BRIDGE = flags['B']; + MAP_ORDER = flags['a']; + + if (argc <= 2) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]); + fprintf(stderr,"\n"); + fprintf(stderr," -k: k-mer size (must be <= 32).\n"); + fprintf(stderr," -%%: modimer percentage (take %% of the k-mers).\n"); + fprintf(stderr," -w: Look for k-mers in averlapping bands of size 2^-w.\n"); + fprintf(stderr," -h: A seed hit if the k-mers in band cover >= -h bps in the"); + fprintf(stderr," targest read.\n"); + fprintf(stderr," -t: Ignore k-mers that occur >= -t times in a block.\n"); + fprintf(stderr," -M: Use only -M GB of memory by ignoring most frequent k-mers.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -e: Look for alignments with -e percent similarity.\n"); + fprintf(stderr," -l: Look for alignments of length >= -l.\n"); + fprintf(stderr," -s: The trace point spacing for encoding alignments.\n"); + fprintf(stderr," -B: Bridge consecutive aligned segments into one if possible\n"); + fprintf(stderr," -H: HGAP option: align only target reads of length >= -H.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -T: Use -T threads.\n"); + fprintf(stderr," -P: Do block level sorts and merges in directory -P.\n"); + fprintf(stderr," -m: Soft mask the blocks with the specified mask.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); + fprintf(stderr," -a: sort .las by A-read,A-position pairs for map usecase\n"); + fprintf(stderr," off => sort .las by A,B-read pairs for overlap piles\n"); + fprintf(stderr," -A: Compare subjet to target, but not vice versa.\n"); + fprintf(stderr," -I: Compare reads to themselves\n"); + exit (1); + } + } + + MINOVER *= 2; + Set_Filter_Params(KMER_LEN,MOD_THR,BIN_SHIFT,MAX_REPS,HIT_MIN,NTHREADS); + 
Set_LSD_Params(NTHREADS,VERBOSE); + + // Create directory in SORT_PATH for file operations + + { char *newpath; + + newpath = (char *) Malloc(strlen(SORT_PATH)+30,"Allocating sort path"); + if (newpath == NULL) + exit (1); + sprintf(newpath,"%s/daligner.%d",SORT_PATH,getpid()); + if (mkdir(newpath,S_IRWXU) != 0) + { fprintf(stderr,"%s: Could not create directory %s\n",Prog_Name,newpath); + exit (1); + } + SORT_PATH = newpath; + } + + // Read in the reads in A + + afile = argv[1]; + isdam = read_DB(ablock,afile,MASK,MSTAT,MTOP,KMER_LEN); + if (isdam) + aroot = Root(afile,".dam"); + else + aroot = Root(afile,".db"); + apath = PathTo(afile); + + asettings = New_Align_Spec( AVE_ERROR, SPACING, ablock->freq, 1); + + if (VERBOSE) + printf("\nBuilding index for %s\n",aroot); + aindex = Sort_Kmers(ablock,&alen); + + // Compare against reads in B in both orientations + + { int i, j; + Block_Looper *parse; + char *command; + + for (i = 2; i < argc; i++) + { parse = Parse_Block_DB_Arg(argv[i]); + + while (Advance_Block_Arg(parse)) + { broot = Block_Arg_Root(parse); + bpath = Block_Arg_Path(parse); + + if (strcmp(broot,aroot) != 0 || strcmp(bpath,apath) != 0) + { bfile = Strdup(Catenate(bpath,"/",broot,""),"Allocating path"); + read_DB(bblock,bfile,MASK,MSTAT,MTOP,KMER_LEN); + free(bfile); + + if (VERBOSE) + printf("\nBuilding index for %s\n",broot); + bindex = Sort_Kmers(bblock,&blen); + Match_Filter(aroot,ablock,broot,bblock,aindex,alen,bindex,blen,asettings); + Close_DB(bblock); + } + else + Match_Filter(aroot,ablock,aroot,ablock,aindex,alen,aindex,alen,asettings); + +#define SYSTEM_CHECK(command) \ + if (VERBOSE) \ + printf("\n%s\n",command); \ + if (system(command) != 0) \ + { fprintf(stderr,"\n%s: Command Failed:\n%*s %s\n", \ + Prog_Name,(int) strlen(Prog_Name),"",command); \ + Clean_Exit(1); \ + } + + command = CommandBuffer(aroot,broot,SORT_PATH); + + sprintf(command,"LAsort %s %s %s/%s.%s.N%c",VERBOSE?"-v":"", + MAP_ORDER?"-a":"",SORT_PATH,aroot,broot,BLOCK_SYMBOL); 
+ SYSTEM_CHECK(command) + + sprintf(command,"LAmerge %s %s %s.%s.las %s/%s.%s.N%c.S",VERBOSE?"-v":"", + MAP_ORDER?"-a":"",aroot,broot,SORT_PATH,aroot,broot,BLOCK_SYMBOL); + SYSTEM_CHECK(command) + + if (strcmp(broot,aroot) != 0 || strcmp(bpath,apath) != 0) + { if (SYMMETRIC) + { sprintf(command,"LAsort %s %s %s/%s.%s.N%c",VERBOSE?"-v":"", + MAP_ORDER?"-a":"",SORT_PATH,broot,aroot,BLOCK_SYMBOL); + SYSTEM_CHECK(command) + + sprintf(command,"LAmerge %s %s %s.%s.las %s/%s.%s.N%c.S",VERBOSE?"-v":"", + MAP_ORDER?"-a":"",broot,aroot,SORT_PATH,broot,aroot,BLOCK_SYMBOL); + SYSTEM_CHECK(command) + } + } + + free(bpath); + free(broot); + } + + Free_Block_Arg(parse); + } + + for (j = 0; j < MTOP; j++) + if (MSTAT[j] == 0) + printf("%s: Warning: Track %s given but never used.\n", Prog_Name,MASK[j]); + } + + free(aindex); + Close_DB(ablock); + free(apath); + free(aroot); + +#ifdef PROFILE + { int64 secs, mics; + + getrusage(RUSAGE_SELF, &etime); + + secs = etime.ru_utime.tv_sec - stime.ru_utime.tv_sec; + mics = etime.ru_utime.tv_usec - stime.ru_utime.tv_usec; + mics += 1000000*secs; + + secs = etime.ru_stime.tv_sec - stime.ru_stime.tv_sec; + mics += etime.ru_stime.tv_usec - stime.ru_stime.tv_usec; + mics += 1000000*secs; + + printf("T %lld\n",mics/1000); + } +#endif + + Clean_Exit(0); + exit (0); +} diff --git a/dumpLA.c b/dumpLA.c new file mode 100644 index 0000000..89e52eb --- /dev/null +++ b/dumpLA.c @@ -0,0 +1,177 @@ +#include +#include + +#include "DB.h" +#include "align.h" + +int main(int argc, char *argv[]) +{ char code, which; + int64 total; + int aread, bread; + char orient, chain; + int alen, blen; + int ab, ae, bb, be; + int diffs; + int len; + int tspace, small, tbytes; + uint8 *tbuffer = NULL; + uint16 *sbuffer = NULL; + int hasNext, haveC, haveT, haveD; + int nline; + int i; + int64 novls; + Overlap _ovl, *ovl = &_ovl; + Path *path = & (ovl->path); + FILE *output; + char *pwd, *root; + + // Process arguments + + if (argc != 2) + { fprintf(stderr,"Usage: dumpLA < 
(ascii dump)\n"); + exit (1); + } + + pwd = PathTo(argv[1]); + root = Root(argv[1],".las"); + if ((output = fopen(Catenate(pwd,"/",root,".las"),"w")) == NULL) + { fprintf(stderr,"DumpLA: Cannot open %s for writing\n",argv[1]); + exit (1); + } + free(root); + free(pwd); + + nline = 1; + small = 0; + tbytes = 2; + hasNext = 0; + while (scanf(" %c",&code) == 1) // Header lines + if (code == '@' || code == '+' || code == '%') + { scanf(" %c %lld",&which,&total); + if (code == '@' && which == 'T') + { tbuffer = (uint8 *) malloc(2*total*sizeof(uint16)); + sbuffer = (uint16 *) tbuffer; + } + nline += 1; + } + else + { if (tbuffer == NULL) + { fprintf(stderr,"DumpLA: Line %d: .las dump must contain trace header lines\n",nline); + exit (1); + } + if (code != 'X') + { fprintf(stderr,"DumpLA: Line %d: .las dump must have an X-line after header\n",nline); + exit (1); + } + scanf(" %d",&tspace); + if (tspace <= TRACE_XOVR && tspace != 0) + { small = 1; + tbytes = 1; + } + else + { small = 0; + tbytes = 2; + } + nline += 1; + if (scanf(" %c",&code) == 1) + { if (code != 'P') + { fprintf(stderr,"DumpLA: Line %d: .las dump data must being with a P-line\n",nline); + exit (1); + } + hasNext = 1; + } + break; + } + + novls = 0; + fwrite(&novls,sizeof(int64),1,output); + fwrite(&tspace,sizeof(int),1,output); + + while (hasNext) // For each data line do + { scanf(" %d %d %c %c",&aread,&bread,&orient,&chain); + nline += 1; + haveC = haveT = haveD = 0; + hasNext = 0; + while (scanf(" %c",&code) == 1) // For each data line do + if (code == 'P') + { hasNext = 1; + break; + } + else + switch (code) + { case 'L': // Read lengths + scanf(" %d %d",&alen,&blen); + nline += 1; + break; + case 'C': // Coordinate intervals + scanf(" %d %d %d %d",&ab,&ae,&bb,&be); + nline += 1; + haveC = 1; + break; + case 'D': // Differences + scanf(" %d",&diffs); + nline += 1; + haveD = 1; + break; + case 'T': // Mask + haveT = 1; + scanf(" %d",&len); + nline += (len+1); + len *= 2; + if (small) + { for (int i = 
0; i < len; i += 2) + scanf(" %hhd %hhd",tbuffer+i,tbuffer+(i+1)); + } + else + { for (int i = 0; i < len; i += 2) + scanf(" %hd %hd",sbuffer+i,sbuffer+(i+1)); + } + break; + default: + fprintf(stderr,"DumpLA: Line %d: Unrecognized line type '%c'\n",nline,code); + exit (1); + } + if (!haveC) + { fprintf(stderr,"DumpLA: Line %d: Alignment record does not have a C-line\n",nline); + exit (1); + } + if (!haveT) + { fprintf(stderr,"DumpLA: Line %d: Alignment record does not have a T-line\n",nline); + exit (1); + } + if (!haveD) + { diffs = 0; + for (i = 0; i < len; i += 2) + diffs += tbuffer[i]; + } + + novls += 1; + ovl->aread = aread-1; + ovl->bread = bread-1; + ovl->flags = 0; + if (orient == 'c') + ovl->flags |= COMP_FLAG; + if (chain == '-') + ovl->flags |= NEXT_FLAG; + else if (chain == '>') + ovl->flags |= BEST_FLAG; + else if (chain == '+') + ovl->flags |= START_FLAG; + path->abpos = ab; + path->aepos = ae; + path->bbpos = bb; + path->bepos = be; + path->diffs = diffs; + path->tlen = len; + path->trace = (void *) tbuffer; + + Write_Overlap(output,ovl,tbytes); + } + + rewind(output); + fwrite(&novls,sizeof(int64),1,output); + + fclose(output); + + exit (0); +} diff --git a/filter.c b/filter.c new file mode 100644 index 0000000..982c9e9 --- /dev/null +++ b/filter.c @@ -0,0 +1,2677 @@ +/******************************************************************************************** + * + * Fast local alignment filter for long, noisy reads based on "dumbing down" of my RECOMB 2005 + * filter with Jens Stoye, and a "smarting up" of the k-mer matching by turning it into + * a threaded sort and merge paradigm using a super cache coherent radix sort. Local + * alignment is accomplised with dynamically-banded O(nd) algorithm that terminates when + * it fails to find a e-matching patch for a significant distance, and polishes the match + * to the last e-prefix-positive 32-mer. 
 *
 *  Author :  Gene Myers
 *  First  :  June 2013
 *  Current:  June 1, 2014
 *
 ********************************************************************************************/

//  A complete threaded code for the filter

//  NOTE(review): the header names on the bare #include lines below are blank in this copy
//    of the file -- restore them from the upstream source before building.

#include
#include
#include
#include
#include
#include
#include

#include "DB.h"
#include "lsd.sort.h"
#include "filter.h"
#include "align.h"

                      //  When running sensitivity trials, compute histogram of
#define MAXHIT 1000   //    false & true positive hit scores

    //  Debug Controls

#undef  TEST_KSORT
#undef  TEST_PAIRS
#undef  TEST_CSORT
#define    HOW_MANY   3000   //  Print first HOW_MANY items for each of the TEST options above

#define DO_ALIGNMENT

#undef  TEST_GATHER
#undef  TEST_CONTAIN
#undef  TEST_BRIDGE
#undef  SHOW_OVERLAP          //  Show the cartoon
#undef  SHOW_ALIGNMENT        //  Show the alignment

#define   ALIGN_WIDTH    80   //     Parameters for alignment
#define   ALIGN_INDENT   20
#define   ALIGN_BORDER   10

    //  The three debug displays below force NOTHREAD to be defined

#ifdef SHOW_OVERLAP
#define NOTHREAD
#endif

#ifdef TEST_GATHER
#define NOTHREAD
#endif

#ifdef TEST_CONTAIN
#define NOTHREAD
#endif

  //  Algorithm constants & global data types

#define THREAD  pthread_t

#define MAX_CODE_16  0xffffu
#define MAX_CODE_32  0xffffffffu
#define MAX_CODE_64  0xffffffffffffffffllu

#define SIGN_BIT     0x1u          //  low bit of KmerPos.read
#define LONG_BIT     0x80000000u   //  high bit of KmerPos.rpos
#define POST_MASK    0x7fffffffu   //  KmerPos.rpos with LONG_BIT removed

#define MAXGRAM 10000  //  Cap on k-mer count histogram (in count_thread, merge_thread)

#define PANEL_SIZE     50000   //  Size to break up very long A-reads
#define PANEL_OVERLAP  10000   //  Overlap of A-panels

#define MATCH_CHUNK    100     //  Max initial number of hits between two reads
#define TRACE_CHUNK  20000     //  Max initial trace points in hits between two reads

  //  One selected k-mer occurrence, as filled in by tuple_thread:
  //    rpos = position in the read just past the k-mer, with LONG_BIT or'd in for some
  //             entries (mask with POST_MASK to recover the position)
  //    read = (read index << 1), with SIGN_BIT set when the stored code is the u/v value
  //             rather than the c/d value (TEST_KSORT prints the bit as 'c' / 'n';
  //             presumably complement vs. normal -- TODO confirm)
  //    code = the k-mer code the list is sorted on

typedef struct
  { uint32 rpos;
    uint32 read;
    uint64 code;
  } KmerPos;

  //  NOTE(review): the code that fills SeedPair is outside this part of the file;
  //    judging by the field names it records a seed hit as an (A-read, B-read) pair with
  //    an A-position and a diagonal -- confirm against the merge code.

typedef struct
  { int    aread;
    int    bread;
    int    apos;
    int    diag;
  } SeedPair;

/*******************************************************************************************
 *
 *  PARAMETER SETUP
+ * + ********************************************************************************************/ + + // K-mer selection strategy control: + // Select modimizers mod MODULUS < ModThr (best) + +#define MODULUS 101 + +static int Kmer; +static uint64 ModThr; +static int Koff; // Kmer + 1; +static int Kshift; // 2*Kmer +static uint64 Kmask; // 2^Kshift - 1 + +static uint64 LFmask; // 4^floor(Kmer/2)-1 +static uint64 HFmask; // Kmask - LFmask; +static uint64 LRmask; // 4^ceil(Kmer/2)-1 +static uint64 HRmask; // Kmask - LRmask; + +static int Hitmin; +static int Binshift; +static int Suppress; +static int TooFrequent; // (Suppress != 0) ? Suppress : INT32_MAX + +static int NTHREADS; // # of threads to use + +void Set_Filter_Params(int kmer, int mod, int binshift, int suppress, int hitmin, int nthread) +{ if (kmer > 32) + { fprintf(stderr,"%s: Kmer length must be <= 32\n",Prog_Name); + exit (1); + } + + Kmer = kmer; + Koff = kmer+1; + ModThr = mod; + Binshift = binshift; + Suppress = suppress; + Hitmin = hitmin; + + Kmer = kmer; + if (Kmer >= 32) + { Kshift = 64; + Kmask = MAX_CODE_64; + } + else + { Kshift = 2*Kmer; + Kmask = (0x1llu << Kshift) - 1; + } + + LFmask = (0x1llu<<(Kshift/2))-1; + HFmask = Kmask - LFmask; + + LRmask = (0x1llu<<((Kshift+1)/2))-1; + HRmask = Kmask - LRmask; + + if (Suppress == 0) + TooFrequent = INT32_MAX; + else + TooFrequent = Suppress; + + NTHREADS = nthread; +} + + +/******************************************************************************************* + * + * INDEX BUILD + * + ********************************************************************************************/ + +static DAZZ_DB *TA_block; +static DAZZ_TRACK *TA_track; + +static KmerPos *FR_src; +static KmerPos *FR_trg; + +static uint64 Cumber[4]; // Cumber[i] = (3-i) << (Kshift-2) + +typedef struct + { int beg; + int end; + int fill; + } Tuple_Arg; + + // for reads [beg,end) computing how many k-tuples are not masked + +static void *mask_thread(void *arg) +{ Tuple_Arg *data = 
(Tuple_Arg *) arg; + DAZZ_READ *reads = TA_block->reads; + int km1 = Kmer-1; + int beg, end, idx; + int64 a, b, f; + int i, p, q; + + int x; + uint64 c, u; + uint64 d, v; + char *s; + + beg = data->beg; + end = data->end; + idx = 0; + + s = ((char *) (TA_block->bases)) + TA_block->reads[beg].boff; + if (TA_track != NULL) + { int64 *anno1 = ((int64 *) (TA_track->anno)) + 1; + int *point = (int *) (TA_track->data); + + q = 0; + f = anno1[beg-1]; + for (i = beg; i < end; i++) + { b = f; + f = anno1[i]; + for (a = b; a <= f; a += 2) + { if (a == b) + p = 0; + else + p = point[a-1]; + if (a == f) + q = reads[i].rlen; + else + q = point[a]; + if (q-p > km1) + { c = u = 0; + for (x = 1; x < Kmer; x++) + { c = (c << 2) | s[p]; + u = (u >> 2) | Cumber[(int) s[p++]]; + } + while (p < q) + { x = s[p]; + + d = (c & HFmask); + c = ((c << 2) | x) & Kmask; + d = d | (c & LFmask); + + v = (u & LRmask); + u = (u >> 2) | Cumber[x]; + v = v | (u & HRmask); + + if (u < c) + { if (u % MODULUS < ModThr) + idx += 1; + } + else + { if (c % MODULUS < ModThr) + idx += 1; + } + + if (v < d) + { if (v % MODULUS < ModThr) + idx += 1; + } + else + { if (d % MODULUS < ModThr) + idx += 1; + } + p += 1; + } + } + } + s += (q+1); + } + } + else + for (i = beg; i < end; i++) + { q = reads[i].rlen; + c = u = 0; + for (p = 0; p < km1; p++) + { x = s[p]; + c = (c << 2) | x; + u = (u >> 2) | Cumber[x]; + } + for (p = km1; p < q; p++) + { x = s[p]; + + d = (c & HFmask); + c = ((c << 2) | x) & Kmask; + d = d | (c & LFmask); + + v = (u & LRmask); + u = (u >> 2) | Cumber[x]; + v = v | (u & HRmask); + + if (u < c) + { if (u % MODULUS < ModThr) + idx += 1; + } + else + { if (c % MODULUS < ModThr) + idx += 1; + } + + if (v < d) + { if (v % MODULUS < ModThr) + idx += 1; + } + else + { if (d % MODULUS < ModThr) + idx += 1; + } + } + s += (q+1); + } + + data->fill = idx; + return (NULL); +} + + // for reads [beg,end) place their k-tuples in list starting at index idx + +static void *tuple_thread(void *arg) +{ 
Tuple_Arg *data = (Tuple_Arg *) arg; + DAZZ_READ *reads = TA_block->reads; + int km1 = Kmer-1; + KmerPos *list = FR_src; + int beg, end, idx; + int64 a, b, f; + int i, p, q, x, r; + uint64 c, u; + uint64 d, v; + uint32 lbit; + char *s; + + beg = data->beg; + end = data->end; + idx = data->fill; + + s = ((char *) (TA_block->bases)) + TA_block->reads[beg].boff; + if (TA_track != NULL) + { int64 *anno1 = ((int64 *) (TA_track->anno)) + 1; + int *point = (int *) (TA_track->data); + + q = 0; + f = anno1[beg-1]; + for (i = beg; i < end; i++) + { r = (i << 1); + b = f; + f = anno1[i]; + for (a = b; a <= f; a += 2) + { if (a == b) + p = 0; + else + p = point[a-1]; + if (a == f) + q = reads[i].rlen; + else + q = point[a]; + if (q-p > km1) + { c = 0; + u = 0; + for (x = 1; x < Kmer; x++) + { c = ((c << 2) | s[p]); + u = (u >> 2) | Cumber[(int) s[p]]; + p += 1; + } + + lbit = 0; + while (p < q) + { x = s[p++]; + + d = (c & HFmask); + c = ((c << 2) | x) & Kmask; + d = d | (c & LFmask); + + v = (u & LRmask); + u = (u >> 2) | Cumber[x]; + v = v | (u & HRmask); + + if (u < c) + { if (u % MODULUS < ModThr) + { list[idx].code = u; + list[idx].read = r | SIGN_BIT; + list[idx].rpos = p; + idx += 1; + } + } + else + { if (c % MODULUS < ModThr) + { list[idx].code = c; + list[idx].read = r; + list[idx].rpos = p; + idx += 1; + } + } + + if (v < d) + { if (v % MODULUS < ModThr) + { list[idx].code = v; + list[idx].read = r | SIGN_BIT; + list[idx].rpos = p | lbit; + idx += 1; + } + } + else + { if (d % MODULUS < ModThr) + { list[idx].code = d; + list[idx].read = r; + list[idx].rpos = p | lbit; + idx += 1; + } + } + lbit = LONG_BIT; + } + } + } + s += (q+1); + } + } + + else + for (i = beg; i < end; i++) + { q = reads[i].rlen; + r = (i << 1); + c = 0; + u = 0; + for (p = 0; p < km1; p++) + { x = s[p]; + c = (c << 2) | x; + u = (u >> 2) | Cumber[x]; + } + lbit = 0; + while (p < q) + { x = s[p++]; + + d = (c & HFmask); + c = ((c << 2) | x) & Kmask; + d = d | (c & LFmask); + + v = (u & LRmask); 
+ u = (u >> 2) | Cumber[x]; + v = v | (u & HRmask); + + if (u < c) + { if (u % MODULUS < ModThr) + { list[idx].code = u; + list[idx].read = r | SIGN_BIT; + list[idx].rpos = p; + idx += 1; + } + } + else + { if (c % MODULUS < ModThr) + { list[idx].code = c; + list[idx].read = r; + list[idx].rpos = p; + idx += 1; + } + } + + if (v < d) + { if (v % MODULUS < ModThr) + { list[idx].code = v; + list[idx].read = r | SIGN_BIT; + list[idx].rpos = p | lbit; + idx += 1; + } + } + else + { if (d % MODULUS < ModThr) + { list[idx].code = d; + list[idx].read = r; + list[idx].rpos = p | lbit; + idx += 1; + } + } + lbit = LONG_BIT; + } + s += (q+1); + } + + return (NULL); +} + +static void *compsize_thread(void *arg) +{ Tuple_Arg *data = (Tuple_Arg *) arg; + int end = data->end; + KmerPos *src = FR_src; + int n, i, c, p; + uint64 h, g; + + i = data->beg; + h = src[i].code; + n = 0; + while (i < end) + { p = i++; + while (1) + { g = src[i].code; + if (g != h) + break; + i += 1; + } + if ((c = (i-p)) < TooFrequent) + n += c; + h = g; + } + + data->fill = n; + return (NULL); +} + +static void *compress_thread(void *arg) +{ Tuple_Arg *data = (Tuple_Arg *) arg; + int end = data->end; + KmerPos *src = FR_src; + KmerPos *trg = FR_trg; + int n, i, p; + uint64 h, g; + + i = data->beg; + h = src[i].code; + n = data->fill; + while (i < end) + { p = i++; + while (1) + { g = src[i].code; + if (g != h) + break; + i += 1; + } + if (i-p < TooFrequent) + { while (p < i) + trg[n++] = src[p++]; + } + h = g; + } + + return (NULL); +} + +void *Sort_Kmers(DAZZ_DB *block, int *len) +{ THREAD threads[NTHREADS]; + Tuple_Arg parmt[NTHREADS]; + + KmerPos *src, *trg, *rez; + int kmers, nreads; + + nreads = block->nreads; + + if (block->reads[nreads].boff > 0x7fffffffll) + { fprintf(stderr,"%s: Fatal error, DB blocks are greater than 2Gbp!\n",Prog_Name); + Clean_Exit(1); + } + + if (nreads <= 0) + goto no_mers; + + TA_block = block; + TA_track = block->tracks; + + Cumber[0] = (0x3llu << (Kshift-2)); + 
Cumber[1] = (0x2llu << (Kshift-2)); + Cumber[2] = (0x1llu << (Kshift-2)); + Cumber[3] = (0x0llu << (Kshift-2)); + + // Determine how many k-tuples will be listed for each thread + // and use that to set up index drop points + + { int i, x, z; + + parmt[0].beg = 0; + for (i = 1; i < NTHREADS; i++) + parmt[i].beg = parmt[i-1].end = (((int64) nreads) * i) / NTHREADS; + parmt[NTHREADS-1].end = nreads; + + for (i = 0; i < NTHREADS; i++) + pthread_create(threads+i,NULL,mask_thread,parmt+i); + for (i = 0; i < NTHREADS; i++) + pthread_join(threads[i],NULL); + + x = 0; + for (i = 0; i < NTHREADS; i++) + { z = parmt[i].fill; + parmt[i].fill = x; + x += z; + } + kmers = x; + + if (kmers <= 0) + goto no_mers; + } + + // Allocate k-mer sorting arrays now that # of kmers is known + + if (( (Kshift-1)/8 + (TooFrequent < INT32_MAX) ) & 0x1) + { src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); + trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); + } + else + { trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); + src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); + } + if (src == NULL || trg == NULL) + Clean_Exit(1); + +#ifdef PROFILE + printf("K %d\n",kmers); +#endif + + if (VERBOSE) + { printf("\n Kmer count = "); + Print_Number((int64) kmers,0,stdout); + printf("\n Using %.2fGb of space\n",(1. 
* kmers) / (0x20000000/sizeof(KmerPos))); + fflush(stdout); + } + + // Build the k-mer list + + { int i; + + FR_src = src; + + for (i = 0; i < NTHREADS; i++) + pthread_create(threads+i,NULL,tuple_thread,parmt+i); + for (i = 0; i < NTHREADS; i++) + pthread_join(threads[i],NULL); + } + + // Sort the k-mer list + + { int i; + int mersort[11]; + +#if __ORDER_LITTLE_ENDIAN__ == __BYTE_ORDER__ + for (i = 0; i < (Kmer-1)/4+1; i++) + mersort[i] = 8+i; +#else + for (i = 0; i < (Kmer-1)/4+1; i++) + mersort[i] = 17-i; +#endif + mersort[i] = -1; + + rez = (KmerPos *) LSD_Sort(kmers,src,trg,16,16,mersort); + } + + // Compress frequent tuples if requested + + if (TooFrequent < INT32_MAX && kmers > 0) + { int i, x, z; + uint64 h; + + parmt[0].beg = 0; + for (i = 1; i < NTHREADS; i++) + { x = (((int64) i)*kmers) / NTHREADS; + h = rez[x-1].code; + while (rez[x].code == h) + x += 1; + parmt[i-1].end = parmt[i].beg = x; + } + parmt[NTHREADS-1].end = kmers; + + if (rez[kmers-1].code == MAX_CODE_64) + rez[kmers].code = 0; + else + rez[kmers].code = MAX_CODE_64; + + if (src == rez) + { FR_src = src; + FR_trg = rez = trg; + } + else + { FR_src = trg; + FR_trg = rez = src; + } + + for (i = 0; i < NTHREADS; i++) + pthread_create(threads+i,NULL,compsize_thread,parmt+i); + + for (i = 0; i < NTHREADS; i++) + pthread_join(threads[i],NULL); + + x = 0; + for (i = 0; i < NTHREADS; i++) + { z = parmt[i].fill; + parmt[i].fill = x; + x += z; + } + kmers = x; + + for (i = 0; i < NTHREADS; i++) + pthread_create(threads+i,NULL,compress_thread,parmt+i); + + for (i = 0; i < NTHREADS; i++) + pthread_join(threads[i],NULL); + } + + rez[kmers].code = MAX_CODE_64; + rez[kmers+1].code = 0; + + if (src != rez) + free(src); + else + free(trg); + +#ifdef TEST_KSORT + { int i; + + printf("\nKMER SORT:\n"); + for (i = 0 /*100000000*/; i < 100000000+HOW_MANY && i < kmers; i++) + { KmerPos *c = rez+i; + printf(" %9d: %6d%c / %6d / %016llx\n",i,c->read>>1, + (c->read&0x1)?'c':'n',(c->rpos & POST_MASK),c->code); + } + 
fflush(stdout); + } +#endif + +#ifdef HISTOGRAM_KSORT + { int hist[100]; + uint64 ca; + int i, j; + + for (i = 0; i < 100; i++) + hist[i] = 0; + + i = 0; + while (i < kmers) + { ca = rez[i].code; + j = i++; + while (rez[i].code == ca) + i += 1; + if (i-j >= 100) + hist[99] += 1; + else + hist[i-j] += 1; + } + + for (i = 99; i >= 0; i--) + printf(" %2d: %6d\n",i,hist[i]); + } +#endif + + if (VERBOSE) + { if (TooFrequent < INT32_MAX) + { printf(" Revised kmer count = "); + Print_Number((int64) kmers,0,stdout); + printf("\n"); + } + printf(" Index occupies %.2fGb\n",(1. * kmers) / (0x40000000/sizeof(KmerPos))); + fflush(stdout); + } + + if (kmers <= 0) + { free(rez); + goto no_mers; + } + + if (kmers > (int64) (MEM_LIMIT/(4*sizeof(KmerPos)))) + { fprintf(stderr,"Warning: Block size too big, index occupies more than 1/4 of"); + if (MEM_LIMIT == MEM_PHYSICAL) + fprintf(stderr," physical memory (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll); + else + fprintf(stderr," desired memory allocation (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll); + fflush(stderr); + } + + *len = kmers; + return (rez); + +no_mers: + *len = 0; + return (NULL); +} + + +/******************************************************************************************* + * + * FILTER MATCH + * + ********************************************************************************************/ + +static int find_tuple(uint64 x, KmerPos *a, int n) +{ int l, r, m; + + // smallest k s.t. 
a[k].code >= x (or n if does not exist) + + l = 0; + r = n; + while (l < r) + { m = ((l+r) >> 1); + if (a[m].code < x) + l = m+1; + else + r = m; + } + return (l); +} + + // Determine what *will* be the size of the merged list and histogram of sizes for given cutoffs + +static KmerPos *MG_alist; +static KmerPos *MG_blist; +static DAZZ_DB *MG_ablock; +static DAZZ_DB *MG_bblock; +static SeedPair *MG_hits; +static int MG_self; + +typedef struct + { int abeg, aend; + int bbeg, bend; + int64 nhits; + int limit; + int64 hitgram[MAXGRAM]; + } Merge_Arg; + +static void *count_thread(void *arg) +{ Merge_Arg *data = (Merge_Arg *) arg; + KmerPos *asort = MG_alist; + KmerPos *bsort = MG_blist; + int64 *gram = data->hitgram; + int64 nhits = 0; + int aend = data->aend; + + int64 ct; + int ia, ja; + uint64 ca, da; + + ia = data->abeg; + ca = asort[ia].code; + if (MG_self) + { uint32 ar; + int ka; + + while (1) + { ja = ka = ia++; + ct = 0; + if (IDENTITY) + while (1) + { da = asort[ia].code; + if (da != ca) + break; + ct += (ia-ja); + ia += 1; + } + else + while (1) + { da = asort[ia].code; + if (da != ca) + break; + ar = (asort[ia].read & ~0x1u); + while (ka < ia && asort[ka].read < ar) + ka += 1; + ct += (ka-ja); + ia += 1; + } + + ca = da; + if (ia > aend) + { if (ja >= aend) + break; + ia = aend; + ca = asort[ia].code; + ct -= (ka-ja); + } + + nhits += ct; + if (ct < MAXGRAM) + gram[ct] += 1; + } + } + else + { int ib, jb; + uint64 cb; + + ib = data->bbeg; + cb = bsort[ib].code; + while (1) + { ja = ia++; + while (1) + { da = asort[ia].code; + if (da != ca) + break; + ia += 1; + } + + if (ia > aend) + { if (ja >= aend) + break; + ia = aend; + da = asort[ia].code; + } + + while (cb < ca) + { ib += 1; + cb = bsort[ib].code; + } + if (cb != ca) + { ca = da; + continue; + } + + jb = ib++; + while (1) + { cb = bsort[ib].code; + if (cb != ca) + break; + ib += 1; + } + ca = da; + + ct = ((int64) (ia-ja))*(ib-jb); + nhits += ct; + if (ct < MAXGRAM) + gram[ct] += 1; + } + } + + 
data->nhits = nhits; + + return (NULL); +} + + // Produce the merged list now that the list has been allocated and + // the appropriate cutoff determined. + +static void *merge_thread(void *arg) +{ Merge_Arg *data = (Merge_Arg *) arg; + KmerPos *asort = MG_alist; + KmerPos *bsort = MG_blist; + DAZZ_READ *reads = MG_bblock->reads; + SeedPair *hits = MG_hits; + int64 nhits = data->nhits; + int aend = data->aend; + int limit = data->limit; + + int64 ct; + int ia, ja; + uint64 ca, da; + int nread = MG_ablock->nreads; + + ia = data->abeg; + ca = asort[ia].code; + if (MG_self) + { uint32 ar, br; + uint32 ap, bp; + uint32 as, bs; + int a, ka; + + while (1) + { ja = ka = ia++; + ct = 0; + if (IDENTITY) + while (1) + { da = asort[ia].code; + if (da != ca) + break; + ct += (ia-ja); + ia += 1; + } + else + while (1) + { da = asort[ia].code; + if (da != ca) + break; + ar = (asort[ia].read & ~0x1u); + while (ka < ia && asort[ka].read < ar) + ka += 1; + ct += (ka-ja); + ia += 1; + } + + ca = da; + if (ia > aend) + { if (ja >= aend) + break; + ia = aend; + ca = asort[ia].code; + ct -= (ka-ja); + } + + if (ct >= limit) + continue; + + if (IDENTITY) + for (ka = ja+1; ka < ia; ka++) + { ar = asort[ka].read; + as = (ar & SIGN_BIT); + ar >>= 1; + ap = (asort[ka].rpos & POST_MASK); + for (a = ja; a < ka; a++) + { br = asort[a].read; + bs = (br & SIGN_BIT); + br >>= 1; + bp = asort[a].rpos; + if (bs == as) + { bp = (bp & POST_MASK); + hits[nhits].aread = ar; + } + else + { if ((bp & LONG_BIT) != 0) + bp = (reads[br].rlen - (bp & POST_MASK)) + Koff; + else + bp = (reads[br].rlen - (bp & POST_MASK)) + Kmer; + hits[nhits].aread = ar + nread; + } + hits[nhits].bread = br; + hits[nhits].apos = ap; + hits[nhits].diag = ap - bp; + nhits += 1; + } + } + else + for (ka = ja+1; ka < ia; ka++) + { ar = asort[ka].read; + as = (ar & SIGN_BIT); + ar >>= 1; + ap = (asort[ka].rpos & POST_MASK); + for (a = ja; a < ka; a++) + { br = asort[a].read; + bs = (br & SIGN_BIT); + br >>= 1; + if (br >= ar) + 
break; + bp = asort[a].rpos; + if (bs == as) + { bp = (bp & POST_MASK); + hits[nhits].aread = ar; + } + else + { if ((bp & LONG_BIT) != 0) + bp = (reads[br].rlen - (bp & POST_MASK)) + Koff; + else + bp = (reads[br].rlen - (bp & POST_MASK)) + Kmer; + hits[nhits].aread = ar + nread; + } + hits[nhits].bread = br; + hits[nhits].apos = ap; + hits[nhits].diag = ap - bp; + nhits += 1; + } + } + } + } + else + { int ib, jb; + uint64 cb; + uint32 ar, br; + uint32 ap, bp; + uint32 as, bs; + int a, b; + + ib = data->bbeg; + cb = bsort[ib].code; + while (1) + { ja = ia++; + while (1) + { da = asort[ia].code; + if (da != ca) + break; + ia += 1; + } + + if (ia > aend) + { if (ja >= aend) + break; + ia = aend; + da = asort[ia].code; + } + + while (cb < ca) + { ib += 1; + cb = bsort[ib].code; + } + if (cb != ca) + { ca = da; + continue; + } + + jb = ib++; + while (1) + { cb = bsort[ib].code; + if (cb != ca) + break; + ib += 1; + } + ca = da; + + if (((int64) (ia-ja))*(ib-jb) >= limit) + continue; + + for (a = ja; a < ia; a++) + { ar = asort[a].read; + as = (ar & SIGN_BIT); + ar >>= 1; + ap = (asort[a].rpos & POST_MASK); + for (b = jb; b < ib; b++) + { br = bsort[b].read; + bs = (br & SIGN_BIT); + br >>= 1; + bp = bsort[b].rpos; + if (bs == as) + { bp = (bp & POST_MASK); + hits[nhits].aread = ar; + } + else + { if ((bp & LONG_BIT) != 0) + bp = (reads[br].rlen - (bp & POST_MASK)) + Koff; + else + bp = (reads[br].rlen - (bp & POST_MASK)) + Kmer; + hits[nhits].aread = ar + nread; + } + hits[nhits].bread = br; + hits[nhits].apos = ap; + hits[nhits].diag = ap - bp; + nhits += 1; + } + } + + } + } + + return (NULL); +} + + // Report threads: given a segment of merged list, find all seeds and from them all alignments. 
+ +static DAZZ_DB *MR_ablock; +static DAZZ_DB *MR_bblock; +static SeedPair *MR_hits; +static int MR_two; +static Align_Spec *MR_spec; +static int MR_tspace; + +typedef struct + { uint64 max; + uint64 top; + uint16 *trace; + } Trace_Buffer; + +static inline int MapToTPAbove(Path *path, int *x, int isA, Trace_Buffer *tbuf) +{ uint16 *trace = tbuf->trace + (uint64) path->trace; + int a, b, i; + + a = (path->abpos / MR_tspace) * MR_tspace; + b = path->bbpos; + for (i = 1; i < path->tlen; i += 2) + { a += MR_tspace; + b += trace[i]; + if (a > path->aepos) + a = path->aepos; + if (isA) + { if (a >= *x) + { *x = a; + return (b); + } + } + else + { if (b >= *x) + { *x = b; + return (a); + } + } + } + if (isA) + { *x = a; + return (b); + } + else + { *x = b; + return (a); + } +} + +static inline int MapToTPBelow(Path *path, int *x, int isA, Trace_Buffer *tbuf) +{ uint16 *trace = tbuf->trace + (uint64) path->trace; + int a, b, i; + + a = ((path->aepos + (MR_tspace-1)) / MR_tspace) * MR_tspace; + b = path->bepos; + for (i = path->tlen-1; i >= 0; i -= 2) + { a -= MR_tspace; + b -= trace[i]; + if (a < path->abpos) + a = path->abpos; + if (isA) + { if (a <= *x) + { *x = a; + return (b); + } + } + else + { if (b <= *x) + { *x = b; + return (a); + } + } + } + if (isA) + { *x = a; + return (b); + } + else + { *x = b; + return (a); + } +} + +static int Check_Bridge(Path *path, Trace_Buffer *tbuf) +{ uint16 *trace = tbuf->trace + (uint64) path->trace; + int i; + + if (MR_tspace <= TRACE_XOVR) + { for (i = 0; i < path->tlen; i++) + if (trace[i] > 250) + return (1); + } + return (0); +} + +static void Compute_Bridge_Path(Path *path1, Path *path2, Alignment *align, int comp, + int aovl, int bovl, Work_Data *work, Trace_Buffer *tbuf) +{ Path *apath; + int ain, aout; + int bin, bout, boff; + int i, j, p; + uint16 *trk; + + apath = align->path; + + if (bovl > aovl) + { bin = path2->bbpos; + bout = path1->bepos; + ain = MapToTPBelow(path1,&bin,0,tbuf); + aout = 
MapToTPAbove(path2,&bout,0,tbuf); + } + else + { ain = path2->abpos; + aout = path1->aepos; + bin = MapToTPBelow(path1,&ain,1,tbuf); + bout = MapToTPAbove(path2,&aout,1,tbuf); + } + +#ifdef TEST_BRIDGE + printf("\n Tangle [%5d..%5d] vs [%5d..%5d] %4d\n", + path1->abpos,path1->aepos,path2->abpos,path2->aepos,abs(aovl-bovl)); + printf(" [%5d..%5d] vs [%5d..%5d] %4d vs %4d\n", + path1->bbpos,path1->bepos,path2->bbpos,path2->bepos,aovl,bovl); + printf(" (%d,%d) to (%d,%d)\n",ain,bin,aout,bout); + fflush(stdout); +#endif + + apath->abpos = ain - 2*MR_tspace; + apath->aepos = aout + 2*MR_tspace; + apath->bbpos = MapToTPBelow(path1,&(apath->abpos),1,tbuf); + apath->bepos = MapToTPAbove(path2,&(apath->aepos),1,tbuf); + + if (comp) + { boff = MR_tspace - apath->aepos % MR_tspace; + + p = align->alen - apath->abpos; + apath->abpos = align->alen - apath->aepos; + apath->aepos = p; + p = align->blen - apath->bbpos; + apath->bbpos = align->blen - apath->bepos; + apath->bepos = p; + + boff = boff - apath->abpos % MR_tspace; + align->aseq -= boff; + apath->abpos += boff; + apath->aepos += boff; + align->alen += boff; + } + +#ifdef TEST_BRIDGE + printf("\n (%d,%d) to (%d,%d)\n",apath->abpos,apath->bbpos,apath->aepos,apath->bepos); + fflush(stdout); + + Compute_Alignment(align,work,DIFF_ALIGN,0); + Print_Reference(stdout,align,work,8,100,10,0,6); + fflush(stdout); +#endif + + Compute_Alignment(align,work,DIFF_TRACE,MR_tspace); + + trk = (uint16 *) apath->trace; + if (comp) + { j = apath->tlen-2; + i = 0; + while (i < j) + { p = trk[i]; + trk[i] = trk[j]; + trk[j] = p; + p = trk[i+1]; + trk[i+1] = trk[j+1]; + trk[j+1] = p; + i += 2; + j -= 2; + } + + align->aseq += boff; + apath->abpos -= boff; + apath->aepos -= boff; + align->alen -= boff; + + p = align->alen - apath->abpos; + apath->abpos = align->alen - apath->aepos; + apath->aepos = p; + p = align->blen - apath->bbpos; + apath->bbpos = align->blen - apath->bepos; + apath->bepos = p; + } + +#ifdef TEST_BRIDGE + { int err; + + bin 
= apath->bbpos; + bout = apath->bepos; + err = apath->diffs; + + p = 2*(ain / MR_tspace - apath->abpos / MR_tspace); + for (i = 0; i < p; i += 2) + { bin += trk[i+1]; + err -= trk[i]; + } + + p = 2*(apath->aepos / MR_tspace - aout / MR_tspace); + for (i = align->path->tlen, p = i-p; i > p; i -= 2) + { bout -= trk[i-1]; + err -= trk[i-2]; + } + + printf(" (%d,%d) to (%d,%d)\n",ain,bin,aout,bout); + printf(" Box %d vs %d -> %d %d%%\n",aout-ain,bout-bin,err, + (200*err)/((aout-ain)+(bout-bin))); + fflush(stdout); + } +#endif + + if (tbuf->top + apath->tlen >= tbuf->max) + { tbuf->max = 1.2*(tbuf->top+apath->tlen) + TRACE_CHUNK; + tbuf->trace = (uint16 *) Realloc(tbuf->trace,sizeof(uint16)*tbuf->max,"Allocating paths"); + if (tbuf->trace == NULL) + Clean_Exit(1); + } + trk = tbuf->trace + tbuf->top; + memcpy(trk,apath->trace,apath->tlen*sizeof(uint16)); + apath->trace = (void *) (tbuf->top); + tbuf->top += apath->tlen; +} + +static int Entwine(Path *jpath, Path *kpath, Trace_Buffer *tbuf, int *where) +{ int ac, b2, y2, ae; + int i, j, k; + int num, den, min; +#ifdef SEE_ENTWINE + int strt = 1; + int iflare, oflare; +#endif + + uint16 *ktrace = tbuf->trace + (uint64) (kpath->trace); + uint16 *jtrace = tbuf->trace + (uint64) (jpath->trace); + + min = 10000; + num = 0; + den = 0; + +#ifdef SEE_ENTWINE + printf("\n"); +#endif + + y2 = jpath->bbpos; + j = jpath->abpos/MR_tspace; + + b2 = kpath->bbpos; + k = kpath->abpos/MR_tspace; + + if (jpath->abpos == kpath->abpos) + { min = abs(y2-b2); + if (min == 0) + *where = kpath->abpos; + } + + if (j < k) + { ac = k*MR_tspace; + + j = 1 + 2*(k-j); + k = 1; + + for (i = 1; i < j; i += 2) + y2 += jtrace[i]; + } + else + { ac = j*MR_tspace; + + k = 1 + 2*(j-k); + j = 1; + + for (i = 1; i < k; i += 2) + b2 += ktrace[i]; + } + + ae = jpath->aepos; + if (ae > kpath->aepos) + ae = kpath->aepos; + + while (1) + { ac += MR_tspace; + if (ac >= ae) + break; + y2 += jtrace[j]; + b2 += ktrace[k]; + j += 2; + k += 2; + +#ifdef SEE_ENTWINE + 
printf(" @ %5d : %5d %5d = %4d\n",ac,y2,b2,abs(b2-y2)); +#endif + + i = abs(y2-b2); + if (i <= min) + { min = i; + if (i == 0) + *where = ac; + } + num += i; + den += 1; +#ifdef SEE_ENTWINE + if (strt) + { strt = 0; + iflare = i; + } + oflare = i; +#endif + } + + if (jpath->aepos == kpath->aepos) + { i = abs(jpath->bepos-kpath->bepos); + if (i <= min) + { min = i; + if (i == 0) + *where = kpath->aepos; + } + } + +#ifdef SEE_ENTWINE + if (den == 0) + printf("Nothing\n"); + else + printf("MINIM = %d AVERAGE = %d IFLARE = %d OFLARE = %d\n",min,num/den,iflare,oflare); +#endif + + if (den == 0) + return (-1); + else + return (min); +} + + +// Produce the concatentation of path1 and path2 where they are known to meet at +// the trace point with coordinate ap. Place this result in a big growing buffer, +// that gets reset when fusion is called with path1 = NULL + +static void Fusion(Path *path1, int ap, Path *path2, Trace_Buffer *tbuf) +{ int k, k1, k2; + int len, diff; + uint16 *trace; + + k1 = 2 * ((ap/MR_tspace) - (path1->abpos/MR_tspace)); + k2 = 2 * ((ap/MR_tspace) - (path2->abpos/MR_tspace)); + + len = k1+(path2->tlen-k2); + + if (tbuf->top + len >= tbuf->max) + { tbuf->max = 1.2*(tbuf->top+len) + TRACE_CHUNK; + tbuf->trace = (uint16 *) Realloc(tbuf->trace,sizeof(uint16)*tbuf->max,"Allocating paths"); + if (tbuf->trace == NULL) + Clean_Exit(1); + } + + trace = tbuf->trace + tbuf->top; + tbuf->top += len; + + diff = 0; + len = 0; + if (k1 > 0) + { uint16 *t = tbuf->trace + (uint64) (path1->trace); + for (k = 0; k < k1; k += 2) + { trace[len++] = t[k]; + trace[len++] = t[k+1]; + diff += t[k]; + } + } + if (k2 < path2->tlen) + { uint16 *t = tbuf->trace + (uint64) (path2->trace); + for (k = k2; k < path2->tlen; k += 2) + { trace[len++] = t[k]; + trace[len++] = t[k+1]; + diff += t[k]; + } + } + + path1->aepos = path2->aepos; + path1->bepos = path2->bepos; + path1->diffs = diff; + path1->trace = (void *) (trace - tbuf->trace); + path1->tlen = len; +} + +// Produce the 
concatentation of path1, path2, and path3 where they are known to meet at +// the ends of path2 which was produced by Compute-Alignment. Place this result in +// a big growing buffer. + +static void Bridge(Path *path1, Path *path2, Path *path3, Trace_Buffer *tbuf) +{ int k, k1, k2; + int len, diff; + uint16 *trace; + + k1 = 2 * ((path2->abpos/MR_tspace) - (path1->abpos/MR_tspace)); + if (path2->aepos == path3->aepos) + k2 = path3->tlen; + else + k2 = 2 * ((path2->aepos/MR_tspace) - (path3->abpos/MR_tspace)); + + len = k1 + path2->tlen + (path3->tlen-k2); + + if (tbuf->top + len >= tbuf->max) + { tbuf->max = 1.2*(tbuf->top+len) + TRACE_CHUNK; + tbuf->trace = (uint16 *) Realloc(tbuf->trace,sizeof(uint16)*tbuf->max,"Allocating paths"); + if (tbuf->trace == NULL) + Clean_Exit(1); + } + + trace = tbuf->trace + tbuf->top; + tbuf->top += len; + + diff = 0; + len = 0; + if (k1 > 0) + { uint16 *t = tbuf->trace + (uint64) (path1->trace); + for (k = 0; k < k1; k += 2) + { trace[len++] = t[k]; + trace[len++] = t[k+1]; + diff += t[k]; + } + } + if (path2->tlen > 0) + { uint16 *t = tbuf->trace + (uint64) (path2->trace); + for (k = 0; k < path2->tlen; k += 2) + { trace[len++] = t[k]; + trace[len++] = t[k+1]; + diff += t[k]; + } + } + if (k2 < path3->tlen) + { uint16 *t = tbuf->trace + (uint64) (path3->trace); + for (k = k2; k < path3->tlen; k += 2) + { trace[len++] = t[k]; + trace[len++] = t[k+1]; + diff += t[k]; + } + } + + path1->aepos = path3->aepos; + path1->bepos = path3->bepos; + path1->diffs = diff; + path1->trace = (void *) (trace - tbuf->trace); + path1->tlen = len; +} + +static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, + Alignment *align, Work_Data *work, Trace_Buffer *tbuf) +{ Path *jpath, *kpath, *apath; + Path _bpath, *bpath = &_bpath; + Alignment _blign, *blign = &_blign; + + int j, k, no; + int dist; + int awhen = 0, bwhen = 0; + int comp; + +#if defined(TEST_CONTAIN) || defined(TEST_BRIDGE) + for (j = 0; j < novls; j++) + printf(" %3d: 
[%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos, + amatch[j].bbpos,amatch[j].bepos); +#endif + + (void) work; + + // Loop to catch LA's that share a common trace point and fuse them + + apath = align->path; + comp = COMP(align->flags); + + blign->aseq = align->bseq; + blign->bseq = align->aseq; + blign->alen = align->blen; + blign->blen = align->alen; + blign->path = bpath; + + (void) apath->tlen; // Just to shut up stupid compilers + + for (j = 1; j < novls; j++) + { jpath = amatch+j; + for (k = j-1; k >= 0; k--) + { kpath = amatch+k; + + if (kpath->abpos < 0) + continue; + + if (jpath->abpos < kpath->abpos) + + { if (kpath->abpos <= jpath->aepos && kpath->bbpos <= jpath->bepos) + { dist = Entwine(jpath,kpath,tbuf,&awhen); + if (dist == 0) + { if (kpath->aepos > jpath->aepos) + { if (comp) + { dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen); + if (dist != 0) + continue; + Fusion(jpath,awhen,kpath,tbuf); + Fusion(bmatch+k,bwhen,bmatch+j,tbuf); + bmatch[j] = bmatch[k]; +#ifdef TEST_CONTAIN + printf(" Really 1"); +#endif + } + else + { dist = Entwine(bmatch+j,bmatch+k,tbuf,&bwhen); + if (dist != 0) + continue; + Fusion(jpath,awhen,kpath,tbuf); + Fusion(bmatch+j,bwhen,bmatch+k,tbuf); +#ifdef TEST_CONTAIN + printf(" Really 2"); +#endif + } + k = j; + } + kpath->abpos = -1; +#ifdef TEST_CONTAIN + printf(" Fuse! 
A %d %d\n",j,k); +#endif + } + } + } + + else // kpath->abpos <= jpath->abpos + + { if (jpath->abpos <= kpath->aepos && jpath->bbpos <= kpath->bepos) + { dist = Entwine(kpath,jpath,tbuf,&awhen); + if (dist == 0) + { if (kpath->abpos == jpath->abpos) + { if (kpath->aepos > jpath->aepos) + { *jpath = *kpath; + bmatch[j] = bmatch[k]; + } + } + else if (jpath->aepos > kpath->aepos) + { if (comp) + { dist = Entwine(bmatch+j,bmatch+k,tbuf,&bwhen); + if (dist != 0) + continue; + Fusion(kpath,awhen,jpath,tbuf); + *jpath = *kpath; + Fusion(bmatch+j,bwhen,bmatch+k,tbuf); +#ifdef TEST_CONTAIN + printf(" Really 4"); +#endif + } + else + { dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen); + if (dist != 0) + continue; + Fusion(kpath,awhen,jpath,tbuf); + *jpath = *kpath; + Fusion(bmatch+k,bwhen,bmatch+j,tbuf); + bmatch[j] = bmatch[k]; +#ifdef TEST_CONTAIN + printf(" Really 5"); +#endif + } + k = j; + } + else + { *jpath = *kpath; + bmatch[j] = bmatch[k]; + } + kpath->abpos = -1; +#ifdef TEST_CONTAIN + printf(" Fuse! 
B %d %d\n",j,k); +#endif + } + } + } + } + } + + // Loop to catch LA's that have a narrow parallel overlap and bridge them + + if (BRIDGE) + { for (j = 1; j < novls; j++) + { jpath = amatch+j; + if (jpath->abpos < 0) + continue; + + for (k = j-1; k >= 0; k--) + { Path *path1, *path2; + Path *bath1, *bath2; + int aovl, bovl; + + kpath = amatch+k; + if (kpath->abpos < 0) + continue; + + if (jpath->abpos < kpath->abpos) + { path1 = jpath; + path2 = kpath; + } + else + { path1 = kpath; + path2 = jpath; + } + + if (path2->abpos >= path1->aepos || path1->aepos >= path2->aepos || + path1->bbpos >= path2->bbpos || path2->bbpos >= path1->bepos || + path1->bepos >= path2->bepos) + continue; + aovl = path1->aepos - path2->abpos; + bovl = path1->bepos - path2->bbpos; + if (abs(aovl-bovl) > .2 * (aovl+bovl)) + continue; + + if (comp == (jpath->abpos < kpath->abpos)) + { bath1 = bmatch+k; + bath2 = bmatch+j; + } + else + { bath1 = bmatch+j; + bath2 = bmatch+k; + } + if (bath1->abpos > bath2->abpos) + { printf(" SYMFAIL %d %d\n",j,k); + continue; + } + + Compute_Bridge_Path(path1,path2,align,0,aovl,bovl,work,tbuf); + Compute_Bridge_Path(bath1,bath2,blign,comp,bovl,aovl,work,tbuf); + + if (Check_Bridge(apath,tbuf) || Check_Bridge(bpath,tbuf)) + continue; + + Bridge(path1,apath,path2,tbuf); + *jpath = *path1; + + Bridge(bath1,bpath,bath2,tbuf); + bmatch[j] = *bath1; + + kpath->abpos = -1; + +#ifdef TEST_BRIDGE + { Alignment extra; + Path pcopy; + + pcopy = *jpath; + extra = *align; + pcopy.trace = tbuf->trace + (uint64) jpath->trace; + extra.path = &pcopy; + Compute_Trace_PTS(&extra,work,MR_tspace,GREEDIEST); + Print_Reference(stdout,&extra,work,8,100,10,0,6); + fflush(stdout); + + pcopy = *bath1; + extra = *blign; + pcopy.trace = tbuf->trace + (uint64) bmatch[j].trace; + extra.path = &pcopy; + if (comp) + { Complement_Seq(extra.aseq,extra.alen); + Complement_Seq(extra.bseq,extra.blen); + } + Compute_Trace_PTS(&extra,work,MR_tspace,GREEDIEST); + 
Print_Reference(stdout,&extra,work,8,100,10,0,6); + fflush(stdout); + if (comp) + { Complement_Seq(extra.aseq,extra.alen); + Complement_Seq(extra.bseq,extra.blen); + } + } +#endif + } + } + } + + no = 0; + for (j = 0; j < novls; j++) + if (amatch[j].abpos >= 0) + { bmatch[no] = bmatch[j]; + amatch[no] = amatch[j]; + no += 1; + } + novls = no; + +#if defined(TEST_CONTAIN) || defined(TEST_BRIDGE) + for (j = 0; j < novls; j++) + printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos, + amatch[j].bbpos,amatch[j].bepos); +#endif + + return (novls); +} + +static void Diagonal_Span(Path *path, int *mind, int *maxd) +{ uint16 *points; + int i, tlen; + int dd, low, hgh; + + points = path->trace; + tlen = path->tlen; + + dd = path->abpos - path->bbpos; + low = hgh = dd; + + dd = path->aepos - path->bepos; + if (dd < low) + low = dd; + else if (dd > hgh) + hgh = dd; + + dd = (path->abpos/MR_tspace)*MR_tspace - path->bbpos; + tlen -= 2; + for (i = 1; i < tlen; i += 2) + { dd += MR_tspace - points[i]; + if (dd < low) + low = dd; + else if (dd > hgh) + hgh = dd; + } + + *mind = (low >> Binshift)-1; + *maxd = (hgh >> Binshift)+1; +} + +static void CopyAndComp(char *bcomp,char *bseq, int blen) +{ char *s, *t; + + t = bcomp + (blen-1); + s = bseq; + t[1] = 4; + while (t >= bcomp) + *t-- = 3-*s++; + t[0] = 4; +} + +typedef struct + { int64 beg, end; + int *score; + int *lastp; + int *lasta; + Work_Data *work; + FILE *ofile1; + FILE *ofile2; + int64 nfilt; + int64 nlas; +#ifdef PROFILE + int profyes[MAXHIT+1]; + int profno[MAXHIT+1]; +#endif + } Report_Arg; + +typedef struct + { uint64 p1; // The lower half + uint64 p2; + } Double; + +static void *report_thread(void *arg) +{ Report_Arg *data = (Report_Arg *) arg; + SeedPair *hits = MR_hits; + Double *hitd = (Double *) MR_hits; + DAZZ_READ *bread = MR_bblock->reads; + DAZZ_READ *aread = MR_ablock->reads; + char *aseq = (char *) (MR_ablock->bases); + char *bseq = (char *) (MR_bblock->bases); + int *score = 
data->score; + int *scorp = data->score + 1; + int *scorm = data->score - 1; + int *lastp = data->lastp; + int *lasta = data->lasta; + int afirst = MR_ablock->tfirst; + int bfirst = MR_bblock->tfirst; + FILE *ofile1 = data->ofile1; + FILE *ofile2 = data->ofile2; + Work_Data *work = data->work; + int maxdiag = ( MR_ablock->maxlen >> Binshift); + int mindiag = (-MR_bblock->maxlen >> Binshift); + int areads = MR_ablock->nreads; +#ifdef PROFILE + int *profyes = data->profyes; + int *profno = data->profno; + int maxhit; +#endif + + Overlap _ovlb, *ovlb = &_ovlb; + Overlap _ovla, *ovla = &_ovla; + Alignment _align, *align = &_align; + Path *apath = &(ovla->path); + Path *bpath; + char *bcomp; + + int Omax, novl; + Path *amatch, *bmatch; + + Trace_Buffer _tbuf, *tbuf = &_tbuf; + int small, tbytes; + + Double *hitc; + int minhit; + uint64 cpair; + uint64 npair = 0; + int64 nidx, eidx; + + int64 nfilt = 0; + int64 nlas = 0; + int64 ahits = 0; + int64 bhits = 0; + + // In ovl and align roles of A and B are reversed, as the B sequence must be the + // complemented sequence !! 
+ + align->path = apath; + bcomp = New_Read_Buffer(MR_bblock); + + if (MR_tspace <= TRACE_XOVR) + { small = 1; + tbytes = sizeof(uint8); + } + else + { small = 0; + tbytes = sizeof(uint16); + } + + Omax = MATCH_CHUNK; + amatch = Malloc(sizeof(Path)*Omax,"Allocating match vector"); + bmatch = Malloc(sizeof(Path)*Omax,"Allocating match vector"); + + tbuf->max = 2*TRACE_CHUNK; + tbuf->trace = Malloc(sizeof(short)*tbuf->max,"Allocating trace vector"); + + if (amatch == NULL || bmatch == NULL || tbuf->trace == NULL) + Clean_Exit(1); + + fwrite(&ahits,sizeof(int64),1,ofile1); + fwrite(&MR_tspace,sizeof(int),1,ofile1); + if (MR_two) + { fwrite(&bhits,sizeof(int64),1,ofile2); + fwrite(&MR_tspace,sizeof(int),1,ofile2); + } + +#ifdef PROFILE + { int i; + for (i = 0; i <= MAXHIT; i++) + profyes[i] = profno[i] = 0; + } +#endif + + minhit = (Hitmin-1)/Kmer + 1; + hitc = hitd + (minhit-1); + eidx = data->end - minhit; + nidx = data->beg; + for (cpair = hitd[nidx].p1; nidx <= eidx; cpair = npair) + if (hitc[nidx].p1 != cpair) + { nidx += 1; + while ((npair = hitd[nidx].p1) == cpair) + nidx += 1; + } + else + { int ar, br, bc; + int alen, blen; + int doA, doB; + int setaln, amark, amark2; + int apos, bpos, diag; + int64 lidx, sidx; + int64 f, h2; + + ar = hits[nidx].aread; + br = hits[nidx].bread; + if (ar >= areads) + { bc = 1; + ar -= areads; + } + else + bc = 0; + alen = aread[ar].rlen; + blen = bread[br].rlen; + doA = (alen >= HGAP_MIN); + doB = (SYMMETRIC && blen >= HGAP_MIN && ! (ar == br && MG_self)); + if (! 
(doA || doB)) + { nidx += 1; + while ((npair = hitd[nidx].p1) == cpair) + nidx += 1; + continue; + } + +#ifdef TEST_GATHER + printf("%5d vs %5d%c : %5d x %5d\n",ar+afirst,br+bfirst,bc?'c':'n',alen,blen); + fflush(stdout); +#endif + setaln = 1; + amark2 = 0; + novl = 0; + tbuf->top = 0; + for (sidx = nidx; hitd[nidx].p1 == cpair; nidx = h2) + { amark = amark2 + PANEL_SIZE; + amark2 = amark - PANEL_OVERLAP; + + h2 = lidx = nidx; + do + { apos = hits[nidx].apos; + npair = hitd[++nidx].p1; + if (apos <= amark2) + h2 = nidx; + } + while (npair == cpair && apos <= amark); + + if (nidx-lidx < minhit) continue; + + for (f = lidx; f < nidx; f++) + { apos = hits[f].apos; + diag = hits[f].diag >> Binshift; + if (apos - lastp[diag] >= Kmer) + score[diag] += Kmer; + else + score[diag] += apos - lastp[diag]; + lastp[diag] = apos; + } + +#ifdef TEST_GATHER + printf(" %6lld upto %6d",nidx-lidx,amark); + fflush(stdout); +#endif + + for (f = lidx; f < nidx; f++) + { apos = hits[f].apos; + diag = hits[f].diag; + bpos = apos - diag; + diag = diag >> Binshift; + if (apos > lasta[diag] && + (score[diag] + scorp[diag] >= Hitmin || score[diag] + scorm[diag] >= Hitmin)) + { if (setaln) + { setaln = 0; + align->aseq = aseq + aread[ar].boff; + align->bseq = bseq + bread[br].boff; + if (bc) + { CopyAndComp(bcomp,align->bseq,blen); + align->bseq = bcomp; + } + align->alen = alen; + align->blen = blen; + align->flags = ovla->flags = ovlb->flags = bc; + ovlb->bread = ovla->aread = ar + afirst; + ovlb->aread = ovla->bread = br + bfirst; + } +#ifdef TEST_GATHER + else + printf("\n "); + + if (scorm[diag] > scorp[diag]) + printf(" %5d.. x %5d.. %5d (%3d)", + bpos,apos,apos-bpos,score[diag]+scorm[diag]); + else + printf(" %5d.. x %5d.. 
%5d (%3d)", + bpos,apos,apos-bpos,score[diag]+scorp[diag]); + fflush(stdout); +#endif + nfilt += 1; +#ifdef PROFILE + if (scorm[diag] > scorp[diag]) + maxhit = score[diag] + scorm[diag]; + else + maxhit = score[diag] + scorp[diag]; + if (maxhit > MAXHIT) + maxhit = MAXHIT; +#endif + +#ifdef DO_ALIGNMENT + bpath = Local_Alignment(align,work,MR_spec,apos-bpos,apos-bpos,apos+bpos,-1,-1); + + { int low, hgh, ae; + + Diagonal_Span(apath,&low,&hgh); + if (diag < low) + low = diag; + else if (diag > hgh) + hgh = diag; + ae = apath->aepos; + for (diag = low; diag <= hgh; diag++) + if (ae > lasta[diag]) + lasta[diag] = ae; +#ifdef TEST_GATHER + printf(" %d - %d @ %d",low,hgh,apath->aepos); + fflush(stdout); +#endif + } + + if ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos) >= MINOVER) + { if (novl >= Omax) + { Omax = 1.2*novl + MATCH_CHUNK; + amatch = Realloc(amatch,sizeof(Path)*Omax, + "Reallocating match vector"); + bmatch = Realloc(bmatch,sizeof(Path)*Omax, + "Reallocating match vector"); + if (amatch == NULL || bmatch == NULL) + Clean_Exit(1); + } + + if (tbuf->top + (apath->tlen + bpath->tlen) > tbuf->max) + { tbuf->max = 1.2*(tbuf->top+(apath->tlen+bpath->tlen)) + TRACE_CHUNK; + tbuf->trace = Realloc(tbuf->trace,sizeof(short)*tbuf->max, + "Reallocating trace vector"); + if (tbuf->trace == NULL) + Clean_Exit(1); + } + + amatch[novl] = *apath; + amatch[novl].trace = (void *) (tbuf->top); + memmove(tbuf->trace+tbuf->top,apath->trace,sizeof(short)*apath->tlen); + tbuf->top += apath->tlen; + + bmatch[novl] = *bpath; + bmatch[novl].trace = (void *) (tbuf->top); + memmove(tbuf->trace+tbuf->top,bpath->trace,sizeof(short)*bpath->tlen); + tbuf->top += bpath->tlen; + + novl += 1; +#ifdef PROFILE + profyes[maxhit] += 1; +#endif + +#ifdef TEST_GATHER + printf(" [%5d,%5d] x [%5d,%5d] = %4d", + apath->abpos,apath->aepos,apath->bbpos,apath->bepos,apath->diffs); + fflush(stdout); +#endif +#ifdef SHOW_OVERLAP + printf("\n\n %d(%d) vs %d(%d)\n\n", + 
ovla->aread,ovla->alen,ovla->bread,ovla->blen); + Print_ACartoon(stdout,align,ALIGN_INDENT); +#ifdef SHOW_ALIGNMENT + Compute_Trace_ALL(align,work); + printf("\n Diff = %d\n",align->path->diffs); + Print_Alignment(stdout,align,work, + ALIGN_INDENT,ALIGN_WIDTH,ALIGN_BORDER,0,5); +#endif +#endif // SHOW_OVERLAP + + } + else +#ifdef TEST_GATHER + printf(" No alignment %d", + ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos))/2); + fflush(stdout); +#else +#ifdef PROFILE + { if (ar != br) + profno[maxhit] += 1; + } +#else + ; +#endif +#endif + +#endif // DO_ALIGNMENT + } + } + + for (f = lidx; f < nidx; f++) + { diag = hits[f].diag >> Binshift; + score[diag] = lastp[diag] = 0; + } +#ifdef TEST_GATHER + printf("\n"); + fflush(stdout); +#endif + } + + for (f = sidx; f < nidx; f++) + { int d; + + diag = hits[f].diag >> Binshift; + for (d = diag; d <= maxdiag; d++) + if (lasta[d] == 0) + break; + else + lasta[d] = 0; + for (d = diag-1; d >= mindiag; d--) + if (lasta[d] == 0) + break; + else + lasta[d] = 0; + } + + + { int i; + +#ifdef TEST_CONTAIN + if (novl > 1) + printf("\n%5d vs %5d:\n",ar,br); +#endif + + novl = Handle_Redundancies(amatch,novl,bmatch,align,work,tbuf); + + if (doA) + { for (i = 0; i < novl; i++) + { ovla->path = amatch[i]; + ovla->path.trace = tbuf->trace + (uint64) (ovla->path.trace); + if (small) + Compress_TraceTo8(ovla,1); + if (Write_Overlap(ofile1,ovla,tbytes)) + { fprintf(stderr,"%s: Cannot write to %s too small?\n",SORT_PATH,Prog_Name); + Clean_Exit(1); + } + } + ahits += novl; + } + if (doB) + { for (i = 0; i < novl; i++) + { ovlb->path = bmatch[i]; + ovlb->path.trace = tbuf->trace + (uint64) (ovlb->path.trace); + if (small) + Compress_TraceTo8(ovlb,1); + if (Write_Overlap(ofile2,ovlb,tbytes)) + { fprintf(stderr,"%s: Cannot write to %s, too small?\n",SORT_PATH,Prog_Name); + Clean_Exit(1); + } + } + bhits += novl; + } + + nlas += novl; + } + } + + free(tbuf->trace); + free(bmatch); + free(amatch); + free(bcomp-1); + + data->nfilt = 
nfilt; + data->nlas = nlas; + + if (MR_two) + { rewind(ofile2); + fwrite(&bhits,sizeof(int64),1,ofile2); + fclose(ofile2); + } + else + ahits += bhits; + + rewind(ofile1); + fwrite(&ahits,sizeof(int64),1,ofile1); + fclose(ofile1); + + return (NULL); +} + + +/******************************************************************************************* + * + * THE ALGORITHM + * + ********************************************************************************************/ + +static char *NameBuffer(char *aname, char *bname) +{ static char *cat = NULL; + static int max = -1; + int len; + + len = strlen(aname) + strlen(bname) + 100; + if (len > max) + { max = ((int) (1.2*len)) + 100; + if ((cat = (char *) realloc(cat,max+1)) == NULL) + { fprintf(stderr,"%s: Out of memory (Making path name)\n",Prog_Name); + Clean_Exit(1); + } + } + return (cat); +} + +void Match_Filter(char *aname, DAZZ_DB *ablock, char *bname, DAZZ_DB *bblock, + void *vasort, int alen, void *vbsort, int blen, Align_Spec *aspec) +{ THREAD threads[NTHREADS]; + Merge_Arg parmm[NTHREADS]; + Report_Arg parmr[NTHREADS]; + char *fname; + + SeedPair *khit, *hhit; + SeedPair *work1, *work2; + int64 nhits; + int64 nfilt, nlas; + + KmerPos *asort, *bsort; + int64 atot, btot; + + asort = (KmerPos *) vasort; + bsort = (KmerPos *) vbsort; + + atot = ablock->totlen; + btot = bblock->totlen; + + MR_tspace = Trace_Spacing(aspec); + + nfilt = nlas = nhits = 0; + + if (VERBOSE) + printf("\nComparing %s to %s\n",aname,bname); + + if (alen == 0 || blen == 0) + goto zerowork; + + { int i, j, p; + uint64 c; + int limit; + + MG_alist = asort; + MG_blist = bsort; + MG_ablock = ablock; + MG_bblock = bblock; + MG_self = (ablock == bblock); + + parmm[0].abeg = parmm[0].bbeg = 0; + for (i = 1; i < NTHREADS; i++) + { p = (int) ((((int64) alen) * i) / NTHREADS); + if (p > 0) + { c = asort[p-1].code; + while (asort[p].code == c) + p += 1; + } + parmm[i].abeg = parmm[i-1].aend = p; + parmm[i].bbeg = parmm[i-1].bend = 
find_tuple(asort[p].code,bsort,blen); + } + parmm[NTHREADS-1].aend = alen; + parmm[NTHREADS-1].bend = blen; + + for (i = 0; i < NTHREADS; i++) + for (j = 0; j < MAXGRAM; j++) + parmm[i].hitgram[j] = 0; + + for (i = 0; i < NTHREADS; i++) + pthread_create(threads+i,NULL,count_thread,parmm+i); + + for (i = 0; i < NTHREADS; i++) + pthread_join(threads[i],NULL); + + if (VERBOSE) + printf("\n"); + if (MEM_LIMIT > 0) + { int64 histo[MAXGRAM]; + int64 tom, avail; + + for (j = 0; j < MAXGRAM; j++) + histo[j] = parmm[0].hitgram[j]; + for (i = 1; i < NTHREADS; i++) + for (j = 0; j < MAXGRAM; j++) + histo[j] += parmm[i].hitgram[j]; + + avail = (int64) (MEM_LIMIT - (sizeof_DB(ablock) + sizeof_DB(bblock))) / sizeof(KmerPos); + if (asort == bsort || avail > alen + 2*blen) + avail = (avail - alen) / 2; + else + avail = avail - (alen + blen); + avail *= (.98 * sizeof(KmerPos)) / sizeof(SeedPair); + + tom = 0; + for (j = 0; j < MAXGRAM; j++) + { tom += j*histo[j]; + if (tom > avail) + break; + } + limit = j; + + if (limit <= 1) + { fprintf(stderr,"\nError: Insufficient "); + if (MEM_LIMIT == MEM_PHYSICAL) + fprintf(stderr," physical memory (%.1fGb), reduce block size\n", + (1.*MEM_LIMIT)/0x40000000ll); + else + { fprintf(stderr," memory allocation (%.1fGb),",(1.*MEM_LIMIT)/0x40000000ll); + fprintf(stderr," reduce block size or increase allocation\n"); + } + fflush(stderr); + Clean_Exit(1); + } + if (limit < 30) + { fprintf(stderr,"\nWarning: Sensitivity hampered by low "); + if (MEM_LIMIT == MEM_PHYSICAL) + fprintf(stderr," physical memory (%.1fGb), reduce block size\n", + (1.*MEM_LIMIT)/0x40000000ll); + else + { fprintf(stderr," memory allocation (%.1fGb),",(1.*MEM_LIMIT)/0x40000000ll); + fprintf(stderr," reduce block size or increase allocation\n"); + } + fflush(stderr); + } + if (VERBOSE) + { printf(" Capping mutual k-mer matches over %d (effectively -t%d)\n", + limit,(int) sqrt(1.*limit)); + fflush(stdout); + } + + for (i = 0; i < NTHREADS; i++) + { parmm[i].nhits = 0; + for (j 
= 1; j < limit; j++) + parmm[i].nhits += j * parmm[i].hitgram[j]; + parmm[i].limit = limit; + } + } + else + for (i = 0; i < NTHREADS; i++) + parmm[i].limit = INT32_MAX; + + nhits = parmm[0].nhits; + for (i = 1; i < NTHREADS; i++) + parmm[i].nhits = nhits += parmm[i].nhits; + + if (VERBOSE) + { printf(" Hit count = "); + Print_Number(nhits,0,stdout); + if (asort == bsort || nhits*sizeof(SeedPair) >= blen*sizeof(KmerPos)) + printf("\n Highwater of %.2fGb space\n", + (1. * (alen*sizeof(KmerPos) + 2*nhits*sizeof(SeedPair)) / 0x40000000ll)); + else + printf("\n Highwater of %.2fGb space\n", + (1. * ((alen + blen)*sizeof(KmerPos) + nhits*sizeof(SeedPair)) / 0x40000000ll)); + fflush(stdout); + } + + if (nhits == 0) + goto zerowork; + + if (asort == bsort) + hhit = work1 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1), + "Allocating daligner hit vectors"); + else + { if (nhits*sizeof(SeedPair) >= blen*sizeof(KmerPos)) + bsort = (KmerPos *) Realloc(bsort,sizeof(SeedPair)*(nhits+1), + "Reallocating daligner sort vectors"); + hhit = work1 = (SeedPair *) bsort; + } + khit = work2 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1), + "Allocating daligner hit vectors"); + if (hhit == NULL || khit == NULL || bsort == NULL) + Clean_Exit(1); + + MG_blist = bsort; + MG_hits = khit; + + for (i = NTHREADS-1; i > 0; i--) + parmm[i].nhits = parmm[i-1].nhits; + parmm[0].nhits = 0; + + for (i = 0; i < NTHREADS; i++) + pthread_create(threads+i,NULL,merge_thread,parmm+i); + + for (i = 0; i < NTHREADS; i++) + pthread_join(threads[i],NULL); + +#ifdef TEST_PAIRS + printf("\nSETUP SORT:\n"); + for (i = 0; i < HOW_MANY && i < nhits; i++) + printf(" %6d / %6d / %5d / %5d\n",khit[i].aread,khit[i].bread,khit[i].apos,khit[i].diag); +#endif + } + + { int i, j; + int pairsort[13]; + int areads = ablock->nreads-1; + int breads = bblock->nreads-1; + int maxlen = ablock->maxlen; + int abits, bbits, pbits; + + abits = 1; + while (areads > 0) + { areads >>= 1; + abits += 1; + } + + bbits = 0; + while 
(breads > 0) + { breads >>= 1; + bbits += 1; + } + + pbits = 1; + while (maxlen > 0) + { maxlen >>= 1; + pbits += 1; + } + +#if __ORDER_LITTLE_ENDIAN__ == __BYTE_ORDER__ + for (i = 0; i <= (pbits-1)/8; i++) + pairsort[i] = 8+i; + j = i; + for (i = 0; i <= (bbits-1)/8; i++) + pairsort[j+i] = 4+i; + j += i; + for (i = 0; i <= (abits-1)/8; i++) + pairsort[j+i] = i; +#else + for (i = 0; i <= (pbits-1)/8; i++) + pairsort[i] = 11+i; + j = i; + for (i = 0; i <= (bbits-1)/8; i++) + pairsort[j+i] = 7-i; + j += i; + for (i = 0; i <= (abits-1)/8; i++) + pairsort[j+i] = 3-i; +#endif + pairsort[j+i] = -1; + + khit = (SeedPair *) LSD_Sort(nhits,khit,hhit,16,16,pairsort); + + khit[nhits].aread = 0x7fffffff; + khit[nhits].bread = 0x7fffffff; + khit[nhits].apos = 0x7fffffff; + khit[nhits].diag = 0x7fffffff; + } + +#ifdef TEST_CSORT + { int i; + + printf("\nCROSS SORT %lld:\n",nhits); + for (i = 0; i < HOW_MANY && i <= nhits; i++) + printf(" %6d / %6d / %5d / %5d\n",khit[i].aread,khit[i].bread,khit[i].apos,khit[i].diag); + } +#endif + + { int max_diag = ((ablock->maxlen >> Binshift) - ((-bblock->maxlen) >> Binshift)) + 3; + int *space; + int i; + + MR_ablock = ablock; + MR_bblock = bblock; + MR_hits = khit; + MR_two = ! 
MG_self && SYMMETRIC; + MR_spec = aspec; + + { int p, r; + + parmr[0].beg = 0; + for (i = 1; i < NTHREADS; i++) + { p = (nhits * i) / NTHREADS; + if (p > 0) + { r = khit[p-1].bread; + while (khit[p].bread == r) + p += 1; + } + parmr[i].beg = parmr[i-1].end = p; + } + parmr[NTHREADS-1].end = nhits; + } + + space = (int *) Malloc(NTHREADS*3*max_diag*sizeof(int),"Allocating space for report thread"); + if (space == NULL) + Clean_Exit(1); + + fname = NameBuffer(aname,bname); + + for (i = 0; i < 3*max_diag*NTHREADS; i++) + space[i] = 0; + for (i = 0; i < NTHREADS; i++) + { if (i == 0) + parmr[i].score = space - (((-bblock->maxlen) >> Binshift) - 1); + else + parmr[i].score = parmr[i-1].lasta + max_diag; + parmr[i].lastp = parmr[i].score + max_diag; + parmr[i].lasta = parmr[i].lastp + max_diag; + parmr[i].work = New_Work_Data(); + + sprintf(fname,"%s/%s.%s.N%d.las",SORT_PATH,aname,bname,i+1); + parmr[i].ofile1 = Fopen(fname,"w"); + if (parmr[i].ofile1 == NULL) + Clean_Exit(1); + + if (MG_self) + parmr[i].ofile2 = parmr[i].ofile1; + else if (SYMMETRIC) + { sprintf(fname,"%s/%s.%s.N%d.las",SORT_PATH,bname,aname,i+1); + parmr[i].ofile2 = Fopen(fname,"w"); + if (parmr[i].ofile2 == NULL) + Clean_Exit(1); + } + } + +#ifdef NOTHREAD + + for (i = 0; i < NTHREADS; i++) + report_thread(parmr+i); + +#else + + for (i = 0; i < NTHREADS; i++) + pthread_create(threads+i,NULL,report_thread,parmr+i); + + for (i = 0; i < NTHREADS; i++) + pthread_join(threads[i],NULL); + +#endif + + for (i = 0; i < NTHREADS; i++) + { nfilt += parmr[i].nfilt; + nlas += parmr[i].nlas; + Free_Work_Data(parmr[i].work); + } + free(space); + +#ifdef PROFILE + { int64 nyes, nno; + + printf("H %lld\n",nhits); + printf("S %lld\n",nfilt); + printf("A %lld\n",nlas); + + nyes = 0; + nno = 0; + for (i = MAXHIT; i >= 0; i--) + { int j; + int64 ny, nn; + + ny = nn = 0; + for (j = 0; j < NTHREADS; j++) + { ny += parmr[j].profyes[i]; + nn += parmr[j].profno[i]; + } + nyes += ny; + nno += nn; + if (ny+nn > 0) + printf(" %4d 
%6lld %6lld\n",i,nyes,nno); + } + } +#endif + } + + free(work2); + free(work1); + goto epilogue; + +zerowork: + { FILE *ofile; + int i; + + fname = NameBuffer(aname,bname); + + nhits = 0; + for (i = 0; i < NTHREADS; i++) + { sprintf(fname,"%s/%s.%s.N%d.las",SORT_PATH,aname,bname,i+1); + ofile = Fopen(fname,"w"); + fwrite(&nhits,sizeof(int64),1,ofile); + fwrite(&MR_tspace,sizeof(int),1,ofile); + fclose(ofile); + if (! MG_self && SYMMETRIC) + { sprintf(fname,"%s/%s.%s.N%d.las",SORT_PATH,bname,aname,i+1); + ofile = Fopen(fname,"w"); + fwrite(&nhits,sizeof(int64),1,ofile); + fwrite(&MR_tspace,sizeof(int),1,ofile); + fclose(ofile); + } + } + } + +epilogue: + + if (VERBOSE) + { int width; + + if (nhits <= 0) + width = 1; + else + width = ((int) log10((double) nhits)) + 1; + width += (width-1)/3; + + printf("\n "); + Print_Number(nhits,width,stdout); + printf(" %d-mers (%e of matrix)\n ",Kmer,(1.*nhits/atot)/btot); + Print_Number(nfilt,width,stdout); + printf(" seed hits (%e of matrix)\n ",(1.*nfilt/atot)/btot); + Print_Number(nlas,width,stdout); + printf(" confirmed hits (%e of matrix)\n",(1.*nlas/atot)/btot); + fflush(stdout); + } +} diff --git a/filter.h b/filter.h new file mode 100644 index 0000000..8527924 --- /dev/null +++ b/filter.h @@ -0,0 +1,39 @@ +/******************************************************************************************* + * + * Filter interface for the dazzler. + * + * Author: Gene Myers + * Date : July 2013 + * + ********************************************************************************************/ + +#ifndef _FILTER + +#define _FILTER + +#include "DB.h" +#include "align.h" + +#undef PROFILE + +extern int VERBOSE; // -v flag is set? +extern int MINOVER; // minimum overlap (-l) +extern int HGAP_MIN; // HGap minimum (-H) +extern int SYMMETRIC; // output both A vs B and B vs A? ( ! -A) +extern int IDENTITY; // compare reads against themselves? 
(-I) +extern int BRIDGE; // bridge consecutive, chainable alignments (-B) +extern char *SORT_PATH; // where to place temporary files (-P) + +extern uint64 MEM_LIMIT; // memory limit (-M) +extern uint64 MEM_PHYSICAL; + +void Set_Filter_Params(int kmer, int mod, int binshift, int suppress, int hitmin, int nthreads); + +void *Sort_Kmers(DAZZ_DB *block, int *len); + +void Match_Filter(char *aname, DAZZ_DB *ablock, char *bname, DAZZ_DB *bblock, + void *atable, int alen, void *btable, int blen, Align_Spec *asettings); + +void Clean_Exit(int val); + +#endif diff --git a/lsd.sort.c b/lsd.sort.c new file mode 100644 index 0000000..1c98f75 --- /dev/null +++ b/lsd.sort.c @@ -0,0 +1,268 @@ +/******************************************************************************************* + * + * Fast threaded lexical sort routine. Can be compiled to accommodate any element size + * (set WORD_SIZE), and makes only n+1 passes to sort n radix bytes. The radix order + * for the bytes of an element may be sorted in any order as listed in the array bytes + * (that is -1 terminated). + * + * Author : Gene Myers + * First : May 2018 + * + ********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "lsd.sort.h" + +typedef unsigned char uint8; +typedef long long int64; + +#undef TEST_LSORT + +static int RSIZE; // Span between records +static int DSIZE; // Size of record + +static int NTHREADS; // # of threads to use +static int VERBOSE; // Print each byte as it is sorted + +void Set_LSD_Params(int nthread, int verbose) +{ NTHREADS = nthread; + VERBOSE = verbose; +} + +// Global variables for every "lex_thread" + +static int LEX_byte; // Current byte to sort on +static int LEX_next; // Next byte to sort on (if >= 0) +static int64 LEX_zdiv; // Size of thread segments (in bytes) +static uint8 *LEX_src; // Source data goes to ... 
static uint8 *LEX_trg;    //  Target data

//  Thread control record

typedef struct
  { int64  beg;           //  Sort [beg,end) of LEX_src (byte offsets)
    int64  end;
    int    check[256];    //  Not all of bucket will go to the same thread in the next cycle?
    int    next[256];     //  Thread assignment for next cycle, stored as thread number * 0x100
                          //    (updated if check true)
    int64  thresh[256];   //  If check then multiple of LEX_zdiv to check for thread assignment
    int64  tptr[256];     //  Finger (byte offset into LEX_trg) for each 8-bit value
    int64 *sptr;          //  Conceptually [NTHREADS][256].  At end of sorting pass
  } Lex_Arg;              //    sptr[(n<<8)|b] = # of occurrences of value b in range of
                          //    thread n for the *next* pass

//  Threaded sorting pass: distribute the records in [beg,end) of LEX_src into LEX_trg
//    according to the byte at offset LEX_byte, advancing the per-value fingers in tptr.
//    If another pass follows (LEX_next >= 0) also count, per thread segment of the next
//    pass, the values of the byte at offset LEX_next so that pass needs no counting sweep.
//    arg is the thread's Lex_Arg; always returns NULL.

static void *lex_thread(void *arg)
{ Lex_Arg    *data  = (Lex_Arg *) arg;
  int64      *sptr  = data->sptr;
  int64      *tptr  = data->tptr;
  uint8      *src   = LEX_src;
  uint8      *dig   = LEX_src + LEX_byte;   //  dig[i] = current sort byte of record at i
  uint8      *nig   = LEX_src + LEX_next;   //  nig[i] = next sort byte (only used if LEX_next >= 0)
  uint8      *trg   = LEX_trg;
  int64       zdiv  = LEX_zdiv;
  int        *check = data->check;
  int        *next  = data->next;
  int64      *thresh = data->thresh;

  int64       i, n, x;
  uint8       d;

  n = data->end;
  if (LEX_next < 0)

    //  Last pass: just move the records

    for (i = data->beg; i < n; i += RSIZE)
      { d = dig[i];
        x = tptr[d];
        tptr[d] += RSIZE;
        memcpy(trg+x,src+i,DSIZE);
      }
  else
    for (i = data->beg; i < n; i += RSIZE)
      { d = dig[i];
        x = tptr[d];
        tptr[d] += RSIZE;
        memcpy(trg+x,src+i,DSIZE);

        //  If this bucket straddles a segment boundary, step to the next thread once
        //    the finger has reached the boundary

        if (check[d])
          { if (x >= thresh[d])
              { next[d] += 0x100;
                thresh[d] += zdiv;
              }
          }
        sptr[next[d] | nig[i]] += 1;
      }
  return (NULL);
}

//  Threaded sort initiation pass: count bucket sizes, i.e. add to tptr[v] the number of
//    records in [beg,end) of LEX_src whose byte at offset LEX_byte is v.
//    arg is the thread's Lex_Arg; always returns NULL.

static void *lexbeg_thread(void *arg)
{ Lex_Arg    *data  = (Lex_Arg *) arg;
  int64      *tptr  = data->tptr;
  uint8      *dig   = LEX_src + LEX_byte;

  int64       i, n;

  n = data->end;
  for (i = data->beg; i < n; i += RSIZE)
    tptr[dig[i]] += 1;
  return (NULL);
}

//  Radix sort the indicated "bytes" of src, using array trg as the secondary array.
//  The arrays contain nelem records spaced rsize bytes apart, of which dsize bytes are
//  moved on each pass; bytes is a -1 terminated list of byte offsets within a record,
//  least significant first.
//  Return a pointer to the array containing the final result.
void *LSD_Sort(int64 nelem, void *src, void *trg, int rsize, int dsize, int *bytes)
{ pthread_t threads[NTHREADS];
  Lex_Arg   parmx[NTHREADS];   //  Thread control record for sorting

  uint8 *xch;
  int64  x, y, asize;
  int    i, j, z, b;

  asize = nelem*rsize;         //  Total size of the data in bytes
  RSIZE = rsize;
  DSIZE = dsize;

  //  Each thread handles a segment of ceiling(nelem/NTHREADS) records

  LEX_zdiv = ((nelem-1)/NTHREADS + 1)*RSIZE;
  LEX_src  = (uint8 *) src;
  LEX_trg  = (uint8 *) trg;

  //  NOTE(review): the counter tables live on the stack, NTHREADS*NTHREADS*256 int64's
  //    in total, which grows quadratically with the thread count.

  for (i = 0; i < NTHREADS; i++)
    parmx[i].sptr = (int64 *) alloca(NTHREADS*256*sizeof(int64));

  //  For each requested byte b in order, radix sort

  for (b = 0; bytes[b] >= 0; b++)
    { LEX_byte  = bytes[b];
      LEX_next  = bytes[b+1];   //  -1 on the last pass

      if (VERBOSE)
        { printf(" Sorting byte %d\n",LEX_byte);
          fflush(stdout);
        }

      //  Setup beg, end, and zero tptr counters

      x = 0;
      for (i = 0; i < NTHREADS; i++)
        { parmx[i].beg = x;
          x = LEX_zdiv*(i+1);
          if (x > asize)
            x = asize;
          parmx[i].end = x;
          for (j = 0; j < 256; j++)
            parmx[i].tptr[j] = 0;
        }
      parmx[NTHREADS-1].end = asize;

      //  If first pass, then explicitly sweep to get tptr counts
      //    otherwise accumulate from sptr counts of last sweep

      if (b == 0)
        { for (i = 1; i < NTHREADS; i++)
            pthread_create(threads+i,NULL,lexbeg_thread,parmx+i);
          lexbeg_thread(parmx);   //  The calling thread does segment 0 itself
          for (i = 1; i < NTHREADS; i++)
            pthread_join(threads[i],NULL);
        }
      else
        { int64 *pxt, *pxs;

          //  Counts for thread i = sum over all threads z of what z sent to segment i

          for (i = 0; i < NTHREADS; i++)
            { pxt = parmx[i].tptr;
              for (z = 0; z < NTHREADS; z++)
                { pxs = parmx[z].sptr + (i<<8);
                  for (j = 0; j < 256; j++)
                    pxt[j] += pxs[j];
                }
            }
        }

      //  Zero sptr array counters in preparation of pass

      for (i = 0; i < NTHREADS; i++)
        for (z = (NTHREADS<<8)-1; z >= 0; z--)
          parmx[i].sptr[z] = 0;

      //  Convert tptr from counts to fingers, and determine thread assignment arrays
      //    to avoid a division in the inner most loop

      { int64 thr;
        int   nxt;

        thr = LEX_zdiv;   //  Byte offset at which the next thread's segment begins
        nxt = 0;          //  Thread owning the current target offset, times 0x100
        x = 0;            //  Running target offset, in value-major then thread order
        for (j = 0; j < 256; j++)
          for (i = 0; i < NTHREADS; i++)
            { y = parmx[i].tptr[j]*RSIZE;
              parmx[i].tptr[j] = x;
              x += y;
              parmx[i].next[j] = nxt;
              if (x < thr)
                parmx[i].check[j] = 0;
              else
                { parmx[i].check[j] = 1;
                  parmx[i].thresh[j] = thr;
                  while (x >= thr)
                    { thr += LEX_zdiv;
                      nxt += 0x100;
                    }
                }
            }
      }

      //  Threaded pass

      for (i = 1; i < NTHREADS; i++)
        pthread_create(threads+i,NULL,lex_thread,parmx+i);
      lex_thread(parmx);
      for (i = 1; i < NTHREADS; i++)
        pthread_join(threads[i],NULL);

      //  The target of this pass is the source of the next one

      xch = LEX_src;
      LEX_src = LEX_trg;
      LEX_trg = xch;

#ifdef TEST_LSORT
      { int64 c;
        uint8 *psort = LEX_src-RSIZE;   //  psort[c+j] is byte j of the record before c

        printf("\nLSORT %d\n",LEX_byte);
        for (c = 0; c < 1000*RSIZE; c += RSIZE)
          { printf(" %4lld: ",c/RSIZE);
            for (j = 0; j < DSIZE; j++)
              printf(" %02x",LEX_src[c+j]);
            printf("\n");
          }

        for (c = RSIZE; c < asize; c += RSIZE)
          { for (j = LEX_byte; j >= 2; j--)
              if (LEX_src[c+j] > psort[c+j])
                break;
              else if (LEX_src[c+j] < psort[c+j])
                { printf(" Order: %lld",c/RSIZE);
                  for (x = 2; x <= LEX_byte; x++)
                    printf(" %02x",psort[c+x]);
                  printf(" vs");
                  for (x = 2; x <= LEX_byte; x++)
                    printf(" %02x",LEX_src[c+x]);
                  printf("\n");
                  break;
                }
          }
      }
#endif
    }

  //  After the final swap LEX_src is whichever of src/trg received the last pass

  return ((void *) LEX_src);
}
patches/cflags.patch | 15 +++++ patches/cross.patch | 60 ++++++++++++++++++++ patches/destdir-install.patch | 16 ++++++ patches/lddflags.patch | 62 ++++++++++++++++++++ patches/series | 4 ++ rules | 21 +++++++ salsa-ci.yml | 4 ++ source/format | 1 + tests/control | 3 + tests/run-unit-test | 29 ++++++++++ upstream/metadata | 21 +++++++ watch | 7 +++ 20 files changed, 465 insertions(+) create mode 100644 README.source create mode 100644 README.test create mode 100644 changelog create mode 100644 control create mode 100644 copyright create mode 100644 docs create mode 100644 man/daligner.1 create mode 100644 manpages create mode 100644 patches/cflags.patch create mode 100644 patches/cross.patch create mode 100644 patches/destdir-install.patch create mode 100644 patches/lddflags.patch create mode 100644 patches/series create mode 100755 rules create mode 100644 salsa-ci.yml create mode 100644 source/format create mode 100644 tests/control create mode 100644 tests/run-unit-test create mode 100644 upstream/metadata create mode 100644 watch diff --git a/README.source b/README.source new file mode 100644 index 0000000..82fcd43 --- /dev/null +++ b/README.source @@ -0,0 +1,10 @@ +daligner for Debian +=================== + +Please note: + + The files DB.[ch] and QV.[ch] are identical code copies from + dazzdb package. It would make sense to create a real library + package from dazzdb and re-use the code here + + -- Andreas Tille Tue, 28 Jan 2020 08:37:57 +0100 diff --git a/README.test b/README.test new file mode 100644 index 0000000..b936868 --- /dev/null +++ b/README.test @@ -0,0 +1,9 @@ +Notes on how this package can be tested. +──────────────────────────────────────── + +This package can be tested by running the provided test: + + sudo apt install dazzdb + sh run-unit-test + +in order to confirm its integrity. 
diff --git a/changelog b/changelog new file mode 100644 index 0000000..848999f --- /dev/null +++ b/changelog @@ -0,0 +1,128 @@ +daligner (1.0+git20200727.ed40ce5-3) unstable; urgency=medium + + [ Helmut Grohne ] + * Do not hard code gcc + Closes: #968434 + + -- Andreas Tille Sat, 15 Aug 2020 21:39:59 +0200 + +daligner (1.0+git20200727.ed40ce5-2) unstable; urgency=medium + + * Team upload. + * added autopkgtests + * Install docs + + -- Shruti Sridhar Wed, 12 Aug 2020 22:52:11 +0200 + +daligner (1.0+git20200727.ed40ce5-1) unstable; urgency=medium + + * Team upload. + * New upstream version + + -- Steffen Moeller Mon, 10 Aug 2020 16:01:48 +0200 + +daligner (1.0+git20200608.c18a2fb-1) unstable; urgency=medium + + * Team upload. + * New upstream version + * debhelper-compat 13 (routine-update) + * Add salsa-ci file (routine-update) + * Rules-Requires-Root: no (routine-update) + * Set upstream metadata fields: Bug-Database, Bug-Submit. + + -- Steffen Moeller Sat, 13 Jun 2020 15:37:29 +0200 + +daligner (1.0+git20200115.c2b47da-1) unstable; urgency=medium + + * New upstream version + * Afif removed himself from Uploaders + * Add myself to Uploaders + * New upstream version + * debhelper-compat 12 + * Standards-Version: 4.5.0 (routine-update) + * debian/README.source: Document code copy from dazzdb + + -- Andreas Tille Tue, 28 Jan 2020 09:32:56 +0100 + +daligner (1.0+git20180524.fd21879-1) unstable; urgency=medium + + * Team upload. + + [ Jelmer Vernooij ] + * Use secure copyright file specification URI. 
+ + [ Andreas Tille ] + * New upstream commit + * debhelper 11 + * Point Vcs fields to salsa.debian.org + * Standards-Version: 4.2.1 + * Remove trailing whitespace in debian/copyright + * hardening=+all + + -- Andreas Tille Sun, 28 Oct 2018 08:34:03 +0100 + +daligner (1.0+20180108-1) unstable; urgency=medium + + * New upstream snapshot (git 233274a) + * Bump Standards-Version to 4.1.3 + * Bump copyright years + + -- Afif Elghraoui Sun, 04 Feb 2018 05:26:26 -0500 + +daligner (1.0+20171010-2) unstable; urgency=low + + [ Steffen Moeller ] + * [debian/upstream/metadata] Reference to OMICtools added. + + -- Afif Elghraoui Thu, 19 Oct 2017 23:42:51 -0400 + +daligner (1.0+20171010-1) unstable; urgency=medium + + * New upstream snapshot (git b966696) + * Bump Standards Version to 4.1.1 + * Bump copyright years + + -- Afif Elghraoui Thu, 19 Oct 2017 23:18:52 -0400 + +daligner (1.0+20161119-1) unstable; urgency=medium + + * New upstream snapshot (git a9458dc) + + -- Afif Elghraoui Wed, 18 Jan 2017 22:16:19 -0800 + +daligner (1.0+20160927-2) unstable; urgency=low + + * Use LDLIBS rather than LDFLAGS. 
+ Thanks to Logan Rosen (Closes: #849431) + * Suggest dascrubber + * d/rules: Don't combine CPPFLAGS with CFLAGS + * Use debhelper compat 10 + + -- Afif Elghraoui Tue, 27 Dec 2016 13:03:16 -0800 + +daligner (1.0+20160927-1) unstable; urgency=medium + + * Imported Upstream version 1.0+20160927 (git ca167d3) + * Update patches + * Refer to upstream README for command reference + * Update Standards-Version + * Use encrypted protocols for VCS URLs + * Update email address and copyright years + + -- Afif Elghraoui Tue, 11 Oct 2016 21:12:16 -0700 + +daligner (1.0+20151214-1) unstable; urgency=medium + + * New upstream revision (git2923450) + * Add manpages for new executables + * Stylistic consistency in packaging + * Refresh/adjust patches + * Suggest dazzdb + + -- Afif Elghraoui Fri, 08 Jan 2016 22:55:34 -0800 + +daligner (1.0-1) unstable; urgency=low + + * Initial release (Closes: #797352) + + -- Afif Elghraoui Sat, 29 Aug 2015 13:25:02 -0700 diff --git a/control b/control new file mode 100644 index 0000000..3a49554 --- /dev/null +++ b/control @@ -0,0 +1,23 @@ +Source: daligner +Maintainer: Debian Med Packaging Team +Uploaders: Andreas Tille +Section: science +Priority: optional +Build-Depends: debhelper-compat (= 13) +Standards-Version: 4.5.0 +Vcs-Browser: https://salsa.debian.org/med-team/daligner +Vcs-Git: https://salsa.debian.org/med-team/daligner.git +Homepage: https://dazzlerblog.wordpress.com +Rules-Requires-Root: no + +Package: daligner +Architecture: any +Depends: ${shlibs:Depends}, + ${misc:Depends} +Suggests: dazzdb, + dascrubber +Description: local alignment discovery between long nucleotide sequencing reads + These tools permit one to find all significant local alignments between + reads encoded in a Dazzler database. The assumption is that the reads are + from a Pacific Biosciences RS II long read sequencer. That is, the reads + are long and noisy, up to 15% on average. 
diff --git a/copyright b/copyright new file mode 100644 index 0000000..f050988 --- /dev/null +++ b/copyright @@ -0,0 +1,36 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: daligner +Upstream-Contact: Eugene W. Myers, Jr. +Source: http://github.com/thegenemyers/DALIGNER + +Files: * +Copyright: 2013-2018 Eugene W. Myers, Jr. +License: BSD-3-Clause~EWM + +Files: debian/* +Copyright: 2015-2018 Afif Elghraoui +License: BSD-3-Clause~EWM + +License: BSD-3-Clause~EWM + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + . + · Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + . + · Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + . + · The name of EWM may not be used to endorse or promote products derived from + this software without specific prior written permission. + . + THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/docs b/docs new file mode 100644 index 0000000..06e0915 --- /dev/null +++ b/docs @@ -0,0 +1,3 @@ +README.md +debian/README.* +debian/tests/run-unit-test diff --git a/man/daligner.1 b/man/daligner.1 new file mode 100644 index 0000000..2ab3eb8 --- /dev/null +++ b/man/daligner.1 @@ -0,0 +1,12 @@ +.TH DALIGNER 1 "October 2016" +.SH NAME +daligner \- local alignment discovery between long nucleotide sequencing reads +.SH SYNOPSIS +.B daligner +.RI [ options ] " files" ... +.SH DESCRIPTION +These tools permit one to find all significant local alignments between reads encoded in a Dazzler database. +The assumption is that the reads are from a Pacific Biosciences RS II long read sequencer. +That is, the reads are long and noisy, up to 15% on average. +.SH OPTIONS +For a complete description of available commands and their options, see /usr/share/doc/daligner/README.md diff --git a/manpages b/manpages new file mode 100644 index 0000000..13cdaf4 --- /dev/null +++ b/manpages @@ -0,0 +1 @@ +debian/man/*.1 diff --git a/patches/cflags.patch b/patches/cflags.patch new file mode 100644 index 0000000..c962c7f --- /dev/null +++ b/patches/cflags.patch @@ -0,0 +1,15 @@ +Description: Append to CFLAGS +Author: Afif Elghraoui +Forwarded: not-needed +Last-Update: 2016-01-08 +--- a/Makefile ++++ b/Makefile +@@ -3,7 +3,7 @@ DEST_DIR = ~/bin + # CFLAGS = -O0 -g -Wall -Wextra -Wno-unused-result -fno-strict-aliasing -fsanitize=address -fsanitize=undefined + # Above is for debug out of bound addresses, must compile with -lASAN -lUBSAN if gcc instead of clang + +-CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ++CFLAGS += -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing + + ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAa2b LAb2a dumpLA + diff --git a/patches/cross.patch b/patches/cross.patch new file mode 100644 index 0000000..2d1f3b3 --- /dev/null +++ b/patches/cross.patch @@ -0,0 +1,60 @@ +Date: Sat, 15 Aug 2020 
07:24:26 +0200 +From: Helmut Grohne +Description: Do not hard code gcc +Bug-Debian: https://bugs.debian.org/968434 + +--- a/Makefile ++++ b/Makefile +@@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmer + all: $(ALL) + + daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) + + HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) + + LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) + + LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) + + LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) + + LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) + + LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) + + LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) + + LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAcheck LAcheck.c 
align.c DB.c QV.c -lm $(LDFLAGS) + + LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) + + LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) + + dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) + + clean: + rm -f $(ALL) diff --git a/patches/destdir-install.patch b/patches/destdir-install.patch new file mode 100644 index 0000000..8f98f17 --- /dev/null +++ b/patches/destdir-install.patch @@ -0,0 +1,16 @@ +Description: Use DESTDIR during installation +Author: Afif Elghraoui +Forwarded: not-needed +Last-Update: 2016-01-08 +--- a/Makefile ++++ b/Makefile +@@ -51,7 +51,8 @@ clean: + rm -f daligner.tar.gz + + install: +- cp $(ALL) $(DEST_DIR) ++ install -d $(DESTDIR)/usr/bin ++ install $(ALL) $(DESTDIR)/usr/bin + + package: + make clean diff --git a/patches/lddflags.patch b/patches/lddflags.patch new file mode 100644 index 0000000..fb8230e --- /dev/null +++ b/patches/lddflags.patch @@ -0,0 +1,62 @@ +Description: Support LDFLAGS in upstream build system + This patch was made using the following command: + sed -i 's/\(gcc.*\)/\1 \$(LDFLAGS)/' Makefile +Author: Afif Elghraoui +Forwarded: not-needed +Last-Update: 2016-01-08 + +--- a/Makefile ++++ b/Makefile +@@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmer + all: $(ALL) + + daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm ++ gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) + + HPC.daligner: HPC.daligner.c DB.c DB.h 
QV.c QV.h +- gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) + + LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) + + LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) + + LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) + + LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) + + LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) + + LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) + + LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) + + LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) + + LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) + + dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) + + clean: + rm -f $(ALL) diff --git a/patches/series b/patches/series new file mode 100644 index 
0000000..5089cec --- /dev/null +++ b/patches/series @@ -0,0 +1,4 @@ +cflags.patch +lddflags.patch +destdir-install.patch +cross.patch diff --git a/rules b/rules new file mode 100755 index 0000000..080b3d0 --- /dev/null +++ b/rules @@ -0,0 +1,21 @@ +#!/usr/bin/make -f + +#DH_VERBOSE = 1 +include /usr/share/dpkg/default.mk + +export DEB_BUILD_MAINT_OPTIONS=hardening=+all + +%: + dh $@ + +override_dh_auto_install: + dh_auto_install + mkdir -p $(CURDIR)/debian/$(DEB_SOURCE)/usr/share/man/man1 + for command in $(CURDIR)/debian/$(DEB_SOURCE)/usr/bin/*; \ + do \ + if [ $$(basename $$command) != daligner ] ; \ + then \ + ln -s daligner.1 \ + $(CURDIR)/debian/$(DEB_SOURCE)/usr/share/man/man1/$$(basename $$command).1 ; \ + fi; \ + done diff --git a/salsa-ci.yml b/salsa-ci.yml new file mode 100644 index 0000000..33c3a64 --- /dev/null +++ b/salsa-ci.yml @@ -0,0 +1,4 @@ +--- +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml diff --git a/source/format b/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/tests/control b/tests/control new file mode 100644 index 0000000..bedafb6 --- /dev/null +++ b/tests/control @@ -0,0 +1,3 @@ +Tests: run-unit-test +Depends: @, dazzdb +Restrictions: allow-stderr diff --git a/tests/run-unit-test b/tests/run-unit-test new file mode 100644 index 0000000..463e336 --- /dev/null +++ b/tests/run-unit-test @@ -0,0 +1,29 @@ +#!/bin/bash +set -e + +pkg=daligner + +export LC_ALL=C.UTF-8 +if [ "${AUTOPKGTEST_TMP}" = "" ] ; then + AUTOPKGTEST_TMP=$(mktemp -d /tmp/${pkg}-test.XXXXXX) + trap "rm -rf ${AUTOPKGTEST_TMP}" 0 INT QUIT ABRT PIPE TERM +fi + +cd "${AUTOPKGTEST_TMP}" + +rangen 1.0 -r2020 >R.fasta +fasta2DAM R R.fasta +dsimulator R -c20. 
-r2020 >G.fasta +fasta2DB G G.fasta +rm G.fasta +DBsplit -s11 G +DBdust G.1 +DBdust G.2 +DBstats -mdust G +cat G.db + +HPC.daligner -mdust -t5 G | sh -v +LAshow -c G -w50 G.1 | head + +# Since the output is not consistent. +#It cannot be compared with the reference. diff --git a/upstream/metadata b/upstream/metadata new file mode 100644 index 0000000..aac65ad --- /dev/null +++ b/upstream/metadata @@ -0,0 +1,21 @@ +Bug-Database: https://github.com/thegenemyers/DALIGNER/issues +Bug-Submit: https://github.com/thegenemyers/DALIGNER/issues/new +Reference: + Author: Gene Myers + Title: Efficient Local Alignment Discovery amongst Noisy Long Reads + Booktitle: Algorithms in Bioinformatics + ISSN: 0302-9743 + Year: 2014 + Volume: 8701 + Pages: 52-67 + DOI: 10.1007/978-3-662-44753-6_5 + URL: http://link.springer.com/chapter/10.1007%2F978-3-662-44753-6_5 +Registry: + - Name: OMICtools + Entry: OMICS_08897 + - Name: bio.tools + Entry: NA + - Name: SciCrunch + Entry: SCR_016066 + - Name: conda:bioconda + Entry: daligner diff --git a/watch b/watch new file mode 100644 index 0000000..3698f95 --- /dev/null +++ b/watch @@ -0,0 +1,7 @@ +version=4 + +opts="mode=git,pretty=1.0+git%cd.%h" \ + https://github.com/thegenemyers/DALIGNER.git HEAD + +#opts="filenamemangle=s/(?:.*)?v?(\d[\d\.]*)\.tar\.gz/daligner-$1.tar.gz/" \ +# https://github.com/thegenemyers/DALIGNER/tags (?:.*/)?[vV]?(\d[\d\.]*)\.tar\.gz -- cgit v1.2.3 From 7164be233d9bf39b269e6b18d4880693b6983d61 Mon Sep 17 00:00:00 2001 From: Afif Elghraoui Date: Sat, 15 Aug 2020 21:39:59 +0200 Subject: Append to CFLAGS Forwarded: not-needed Last-Update: 2016-01-08 Gbp-Pq: Name cflags.patch --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fc3d8d4..2afddb4 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ DEST_DIR = ~/bin # CFLAGS = -O0 -g -Wall -Wextra -Wno-unused-result -fno-strict-aliasing -fsanitize=address -fsanitize=undefined # Above is for debug out of bound addresses, 
must compile with -lASAN -lUBSAN if gcc instead of clang -CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing +CFLAGS += -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAa2b LAb2a dumpLA -- cgit v1.2.3 From b2e32ad053dbade42ce91c9ec87e574ba85eaa49 Mon Sep 17 00:00:00 2001 From: Afif Elghraoui Date: Sat, 15 Aug 2020 21:39:59 +0200 Subject: Support LDFLAGS in upstream build system Forwarded: not-needed Last-Update: 2016-01-08 This patch was made using the following command: sed -i 's/\(gcc.*\)/\1 \$(LDFLAGS)/' Makefile Gbp-Pq: Name lddflags.patch --- Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 2afddb4..1cbe3f5 100644 --- a/Makefile +++ b/Makefile @@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck L all: $(ALL) daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm + gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm + gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAdump 
LAdump.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) clean: rm -f $(ALL) -- cgit v1.2.3 From c2725b23312de07005c211598983a5a18f3f27ad Mon Sep 17 00:00:00 2001 From: Afif Elghraoui Date: Sat, 15 Aug 2020 21:39:59 +0200 Subject: Use DESTDIR during installation Forwarded: not-needed Last-Update: 2016-01-08 Gbp-Pq: Name destdir-install.patch --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1cbe3f5..3fbf5ab 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,8 @@ clean: rm -f daligner.tar.gz install: - cp $(ALL) $(DEST_DIR) + install -d $(DESTDIR)/usr/bin + install $(ALL) $(DESTDIR)/usr/bin package: make clean -- cgit v1.2.3 From a9609070b758545cd231019a4b8557def787b4b0 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 15 Aug 2020 07:24:26 +0200 Subject: Do not hard code gcc Bug-Debian: https://bugs.debian.org/968434 Gbp-Pq: Name cross.patch --- Makefile | 24 
++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 3fbf5ab..e442f02 100644 --- a/Makefile +++ b/Makefile @@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck L all: $(ALL) daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAcheck LAcheck.c 
align.c DB.c QV.c -lm $(LDFLAGS) LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) clean: rm -f $(ALL) -- cgit v1.2.3 From 8503dc511d15908e446c34c3742fc097be8d8409 Mon Sep 17 00:00:00 2001 From: Afif Elghraoui Date: Wed, 3 Aug 2022 16:35:57 +0200 Subject: Append to CFLAGS Forwarded: not-needed Last-Update: 2016-01-08 Gbp-Pq: Name cflags.patch --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fc3d8d4..2afddb4 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ DEST_DIR = ~/bin # CFLAGS = -O0 -g -Wall -Wextra -Wno-unused-result -fno-strict-aliasing -fsanitize=address -fsanitize=undefined # Above is for debug out of bound addresses, must compile with -lASAN -lUBSAN if gcc instead of clang -CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing +CFLAGS += -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAa2b LAb2a dumpLA -- cgit v1.2.3 From 636cf404387d637cbcc6631f1d60eed68e41961b Mon Sep 17 00:00:00 2001 From: Afif Elghraoui Date: Wed, 3 Aug 2022 16:35:57 +0200 Subject: Support LDFLAGS in upstream build system Forwarded: not-needed Last-Update: 2016-01-08 This patch was made using the following command: sed -i 's/\(gcc.*\)/\1 \$(LDFLAGS)/' Makefile Gbp-Pq: Name lddflags.patch --- Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 
2afddb4..1cbe3f5 100644 --- a/Makefile +++ b/Makefile @@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck L all: $(ALL) daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm + gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm + gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) LAb2a: LAb2a.c align.c 
align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm + gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) clean: rm -f $(ALL) -- cgit v1.2.3 From 8c247d48722e6ffe70e81e3be7428dd765eec353 Mon Sep 17 00:00:00 2001 From: Afif Elghraoui Date: Wed, 3 Aug 2022 16:35:57 +0200 Subject: Use DESTDIR during installation Forwarded: not-needed Last-Update: 2016-01-08 Gbp-Pq: Name destdir-install.patch --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1cbe3f5..3fbf5ab 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,8 @@ clean: rm -f daligner.tar.gz install: - cp $(ALL) $(DEST_DIR) + install -d $(DESTDIR)/usr/bin + install $(ALL) $(DESTDIR)/usr/bin package: make clean -- cgit v1.2.3 From 1a7378333062b1137b4e4814aedfc7e2ebb80f7c Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 15 Aug 2020 07:24:26 +0200 Subject: Do not hard code gcc Bug-Debian: https://bugs.debian.org/968434 Gbp-Pq: Name cross.patch --- Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 3fbf5ab..e442f02 100644 --- a/Makefile +++ b/Makefile @@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck L all: $(ALL) daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) LAsort: LAsort.c 
align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h - gcc $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) clean: rm -f $(ALL) -- cgit v1.2.3 From b9cb0e1ee2b96bfc5e090915313c4d119d74ff6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Wed, 3 Aug 2022 16:35:57 +0200 Subject: apply 
CPPFLAGS too. Forwarded: not-needed Last-Update: 2022-08-03 Among other things, it enables fortifying sources. Last-Update: 2022-08-03 Gbp-Pq: Name cppflags.patch --- Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index e442f02..d3be2ea 100644 --- a/Makefile +++ b/Makefile @@ -10,40 +10,40 @@ ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck L all: $(ALL) daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm $(LDFLAGS) LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h - $(CC) 
$(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) LAa2b: LAa2b.c align.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAa2b LAa2b.c align.c DB.c QV.c -lm $(LDFLAGS) LAb2a: LAb2a.c align.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o LAb2a LAb2a.c align.c DB.c QV.c -lm $(LDFLAGS) dumpLA: dumpLA.c align.c align.h DB.c DB.h QV.c QV.h - $(CC) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) + $(CC) $(CPPFLAGS) $(CFLAGS) -o dumpLA dumpLA.c align.c DB.c QV.c -lm $(LDFLAGS) clean: rm -f $(ALL) -- cgit v1.2.3 From 63bb7a11cf97b1f256417a4fb514e2b894b4bf83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Wed, 3 Aug 2022 16:35:57 +0200 Subject: fix a couple of typos caught by lintian. 
Forwarded: no Last-Update: 2022-08-03 Last-Update: 2022-08-03 Gbp-Pq: Name typos.patch --- DB.c | 4 ++-- LAsplit.c | 2 +- daligner.c | 2 +- lsd.sort.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/DB.c b/DB.c index d674a86..bdf6791 100644 --- a/DB.c +++ b/DB.c @@ -2208,7 +2208,7 @@ int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra) } if (accum != extra->accum) { EPRINTF(EPLACE, - "%s: Reduction indicator of extra %s does not agree with previos .anno block files\n", + "%s: Reduction indicator of extra %s does not agree with previous .anno block files\n", Prog_Name,name); goto error; } @@ -2829,7 +2829,7 @@ static Block_Looper *parse_block_arg(char *arg, int isDB) first = last = -1; else { if (index(ppnt+1,BLOCK_SYMBOL) != NULL) - { EPRINTF(EPLACE,"%s: Two or more occurences of %c-sign in source name '%s'\n", + { EPRINTF(EPLACE,"%s: Two or more occurrences of %c-sign in source name '%s'\n", Prog_Name,BLOCK_SYMBOL,root); goto error; } diff --git a/LAsplit.c b/LAsplit.c index 966a0ff..e060528 100644 --- a/LAsplit.c +++ b/LAsplit.c @@ -105,7 +105,7 @@ int main(int argc, char *argv[]) exit (1); } if (index(root2+1,BLOCK_SYMBOL) != NULL) - { fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n", + { fprintf(stderr,"%s: Two or more occurrences of %c-sign in source name '%s'\n", Prog_Name,BLOCK_SYMBOL,root); exit (1); } diff --git a/daligner.c b/daligner.c index 67fc16c..c1fbfce 100644 --- a/daligner.c +++ b/daligner.c @@ -531,7 +531,7 @@ int main(int argc, char *argv[]) ARG_POSITIVE(HIT_MIN,"Hit threshold (in bp.s)") break; case 't': - ARG_POSITIVE(MAX_REPS,"Tuple supression frequency") + ARG_POSITIVE(MAX_REPS,"Tuple suppression frequency") break; case 'H': ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)") diff --git a/lsd.sort.c b/lsd.sort.c index 1c98f75..87d258a 100644 --- a/lsd.sort.c +++ b/lsd.sort.c @@ -55,7 +55,7 @@ typedef struct int64 thresh[256]; // If check then multiple of LEX_zdiv to check for thread 
assignment int64 tptr[256]; // Finger for each 8-bit value int64 *sptr; // Conceptually [256][NTHREADS]. At end of sorting pass - } Lex_Arg; // sprtr[b][n] = # of occurences of value b in rangd of + } Lex_Arg; // sprtr[b][n] = # of occurrences of value b in rangd of // thread n for the *next* pass // Threaded sorting pass -- cgit v1.2.3 From 27ce5c6bb85fb7ad8be2ea758eee92c68a298bf6 Mon Sep 17 00:00:00 2001 From: Andreas Tille Date: Mon, 6 Feb 2023 11:01:05 +0100 Subject: Import daligner_1.0+git20221215.bd26967-1.debian.tar.xz [dgit import tarball daligner 1.0+git20221215.bd26967-1 daligner_1.0+git20221215.bd26967-1.debian.tar.xz] --- README.source | 10 +++ README.test | 9 +++ changelog | 146 ++++++++++++++++++++++++++++++++++++++++++ control | 24 +++++++ copyright | 36 +++++++++++ docs | 3 + man/daligner.1 | 12 ++++ manpages | 1 + patches/cflags.patch | 15 +++++ patches/cppflags.patch | 54 ++++++++++++++++ patches/cross.patch | 52 +++++++++++++++ patches/destdir-install.patch | 16 +++++ patches/lddflags.patch | 54 ++++++++++++++++ patches/series | 6 ++ patches/typos.patch | 59 +++++++++++++++++ rules | 21 ++++++ salsa-ci.yml | 4 ++ source/format | 1 + tests/control | 3 + tests/run-unit-test | 29 +++++++++ upstream/metadata | 23 +++++++ watch | 7 ++ 22 files changed, 585 insertions(+) create mode 100644 README.source create mode 100644 README.test create mode 100644 changelog create mode 100644 control create mode 100644 copyright create mode 100644 docs create mode 100644 man/daligner.1 create mode 100644 manpages create mode 100644 patches/cflags.patch create mode 100644 patches/cppflags.patch create mode 100644 patches/cross.patch create mode 100644 patches/destdir-install.patch create mode 100644 patches/lddflags.patch create mode 100644 patches/series create mode 100644 patches/typos.patch create mode 100755 rules create mode 100644 salsa-ci.yml create mode 100644 source/format create mode 100644 tests/control create mode 100644 tests/run-unit-test create mode 
100644 upstream/metadata create mode 100644 watch diff --git a/README.source b/README.source new file mode 100644 index 0000000..82fcd43 --- /dev/null +++ b/README.source @@ -0,0 +1,10 @@ +daligner for Debian +=================== + +Please note: + + The files DB.[ch] and QV.[ch] are identical code copies from + dazzdb package. It would make sense to create a real library + package from dazzdb and re-use the code here + + -- Andreas Tille Tue, 28 Jan 2020 08:37:57 +0100 diff --git a/README.test b/README.test new file mode 100644 index 0000000..b936868 --- /dev/null +++ b/README.test @@ -0,0 +1,9 @@ +Notes on how this package can be tested. +──────────────────────────────────────── + +This package can be tested by running the provided test: + + sudo apt install dazzdb + sh run-unit-test + +in order to confirm its integrity. diff --git a/changelog b/changelog new file mode 100644 index 0000000..84a5049 --- /dev/null +++ b/changelog @@ -0,0 +1,146 @@ +daligner (1.0+git20221215.bd26967-1) unstable; urgency=medium + + * New upstream version + * Standards-Version: 4.6.2 (routine-update) + + -- Andreas Tille Mon, 06 Feb 2023 11:01:05 +0100 + +daligner (1.0+git20210330.f61b8cf-1) unstable; urgency=medium + + * New upstream version + * Standards-Version: 4.6.1 (routine-update) + * cppflags.patch: added to fortify hardened functions. + * typos.patch: added to fix typos caught by lintian. + * d/u/metadata: add repository information. + * d/control: add myself to uploaders. + + -- Étienne Mollier Wed, 03 Aug 2022 16:35:57 +0200 + +daligner (1.0+git20200727.ed40ce5-3) unstable; urgency=medium + + [ Helmut Grohne ] + * Do not hard code gcc + Closes: #968434 + + -- Andreas Tille Sat, 15 Aug 2020 21:39:59 +0200 + +daligner (1.0+git20200727.ed40ce5-2) unstable; urgency=medium + + * Team upload. + * added autopkgtests + * Install docs + + -- Shruti Sridhar Wed, 12 Aug 2020 22:52:11 +0200 + +daligner (1.0+git20200727.ed40ce5-1) unstable; urgency=medium + + * Team upload. 
+ * New upstream version + + -- Steffen Moeller Mon, 10 Aug 2020 16:01:48 +0200 + +daligner (1.0+git20200608.c18a2fb-1) unstable; urgency=medium + + * Team upload. + * New upstream version + * debhelper-compat 13 (routine-update) + * Add salsa-ci file (routine-update) + * Rules-Requires-Root: no (routine-update) + * Set upstream metadata fields: Bug-Database, Bug-Submit. + + -- Steffen Moeller Sat, 13 Jun 2020 15:37:29 +0200 + +daligner (1.0+git20200115.c2b47da-1) unstable; urgency=medium + + * New upstream version + * Afif removed himself from Uploaders + * Add myself to Uploaders + * New upstream version + * debhelper-compat 12 + * Standards-Version: 4.5.0 (routine-update) + * debian/README.source: Document code copy from dazzdb + + -- Andreas Tille Tue, 28 Jan 2020 09:32:56 +0100 + +daligner (1.0+git20180524.fd21879-1) unstable; urgency=medium + + * Team upload. + + [ Jelmer Vernooij ] + * Use secure copyright file specification URI. + + [ Andreas Tille ] + * New upstream commit + * debhelper 11 + * Point Vcs fields to salsa.debian.org + * Standards-Version: 4.2.1 + * Remove trailing whitespace in debian/copyright + * hardening=+all + + -- Andreas Tille Sun, 28 Oct 2018 08:34:03 +0100 + +daligner (1.0+20180108-1) unstable; urgency=medium + + * New upstream snapshot (git 233274a) + * Bump Standards-Version to 4.1.3 + * Bump copyright years + + -- Afif Elghraoui Sun, 04 Feb 2018 05:26:26 -0500 + +daligner (1.0+20171010-2) unstable; urgency=low + + [ Steffen Moeller ] + * [debian/upstream/metadata] Reference to OMICtools added. 
+ + -- Afif Elghraoui Thu, 19 Oct 2017 23:42:51 -0400 + +daligner (1.0+20171010-1) unstable; urgency=medium + + * New upstream snapshot (git b966696) + * Bump Standards Version to 4.1.1 + * Bump copyright years + + -- Afif Elghraoui Thu, 19 Oct 2017 23:18:52 -0400 + +daligner (1.0+20161119-1) unstable; urgency=medium + + * New upstream snapshot (git a9458dc) + + -- Afif Elghraoui Wed, 18 Jan 2017 22:16:19 -0800 + +daligner (1.0+20160927-2) unstable; urgency=low + + * Use LDLIBS rather than LDFLAGS. + Thanks to Logan Rosen (Closes: #849431) + * Suggest dascrubber + * d/rules: Don't combine CPPFLAGS with CFLAGS + * Use debhelper compat 10 + + -- Afif Elghraoui Tue, 27 Dec 2016 13:03:16 -0800 + +daligner (1.0+20160927-1) unstable; urgency=medium + + * Imported Upstream version 1.0+20160927 (git ca167d3) + * Update patches + * Refer to upstream README for command reference + * Update Standards-Version + * Use encrypted protocols for VCS URLs + * Update email address and copyright years + + -- Afif Elghraoui Tue, 11 Oct 2016 21:12:16 -0700 + +daligner (1.0+20151214-1) unstable; urgency=medium + + * New upstream revision (git2923450) + * Add manpages for new executables + * Stylistic consistency in packaging + * Refresh/adjust patches + * Suggest dazzdb + + -- Afif Elghraoui Fri, 08 Jan 2016 22:55:34 -0800 + +daligner (1.0-1) unstable; urgency=low + + * Initial release (Closes: #797352) + + -- Afif Elghraoui Sat, 29 Aug 2015 13:25:02 -0700 diff --git a/control b/control new file mode 100644 index 0000000..b2867f7 --- /dev/null +++ b/control @@ -0,0 +1,24 @@ +Source: daligner +Maintainer: Debian Med Packaging Team +Uploaders: Andreas Tille , + Étienne Mollier +Section: science +Priority: optional +Build-Depends: debhelper-compat (= 13) +Standards-Version: 4.6.2 +Vcs-Browser: https://salsa.debian.org/med-team/daligner +Vcs-Git: https://salsa.debian.org/med-team/daligner.git +Homepage: https://dazzlerblog.wordpress.com +Rules-Requires-Root: no + +Package: daligner 
+Architecture: any +Depends: ${shlibs:Depends}, + ${misc:Depends} +Suggests: dazzdb, + dascrubber +Description: local alignment discovery between long nucleotide sequencing reads + These tools permit one to find all significant local alignments between + reads encoded in a Dazzler database. The assumption is that the reads are + from a Pacific Biosciences RS II long read sequencer. That is, the reads + are long and noisy, up to 15% on average. diff --git a/copyright b/copyright new file mode 100644 index 0000000..f050988 --- /dev/null +++ b/copyright @@ -0,0 +1,36 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: daligner +Upstream-Contact: Eugene W. Myers, Jr. +Source: http://github.com/thegenemyers/DALIGNER + +Files: * +Copyright: 2013-2018 Eugene W. Myers, Jr. +License: BSD-3-Clause~EWM + +Files: debian/* +Copyright: 2015-2018 Afif Elghraoui +License: BSD-3-Clause~EWM + +License: BSD-3-Clause~EWM + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + . + · Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + . + · Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + . + · The name of EWM may not be used to endorse or promote products derived from + this software without specific prior written permission. + . + THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/docs b/docs new file mode 100644 index 0000000..06e0915 --- /dev/null +++ b/docs @@ -0,0 +1,3 @@ +README.md +debian/README.* +debian/tests/run-unit-test diff --git a/man/daligner.1 b/man/daligner.1 new file mode 100644 index 0000000..2ab3eb8 --- /dev/null +++ b/man/daligner.1 @@ -0,0 +1,12 @@ +.TH DALIGNER 1 "October 2016" +.SH NAME +daligner \- local alignment discovery between long nucleotide sequencing reads +.SH SYNOPSIS +.B daligner +.RI [ options ] " files" ... +.SH DESCRIPTION +These tools permit one to find all significant local alignments between reads encoded in a Dazzler database. +The assumption is that the reads are from a Pacific Biosciences RS II long read sequencer. +That is, the reads are long and noisy, up to 15% on average. 
+.SH OPTIONS +For a complete description of available commands and their options, see /usr/share/doc/daligner/README.md diff --git a/manpages b/manpages new file mode 100644 index 0000000..13cdaf4 --- /dev/null +++ b/manpages @@ -0,0 +1 @@ +debian/man/*.1 diff --git a/patches/cflags.patch b/patches/cflags.patch new file mode 100644 index 0000000..1d640ea --- /dev/null +++ b/patches/cflags.patch @@ -0,0 +1,15 @@ +Description: Append to CFLAGS +Author: Afif Elghraoui +Forwarded: not-needed +Last-Update: 2016-01-08 +--- a/Makefile ++++ b/Makefile +@@ -3,7 +3,7 @@ DEST_DIR = ~/bin + # CFLAGS = -O0 -g -Wall -Wextra -Wno-unused-result -fno-strict-aliasing -fsanitize=address -fsanitize=undefined + # Above is for debug out of bound addresses, must compile with -lASAN -lUBSAN if gcc instead of clang + +-CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ++CFLAGS += -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing + + ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LA2ONE LAcheck ONE2LA + diff --git a/patches/cppflags.patch b/patches/cppflags.patch new file mode 100644 index 0000000..8a14c18 --- /dev/null +++ b/patches/cppflags.patch @@ -0,0 +1,54 @@ +Description: apply CPPFLAGS too. + Among other things, it enables fortifying sources. 
+Author: Étienne Mollier +Forwarded: not-needed +Last-Update: 2022-08-03 +--- +This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ +--- a/Makefile ++++ b/Makefile +@@ -10,34 +10,34 @@ ALL = daligner HPC.daligner LAsort LAmer + all: $(ALL) + + daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h +- $(CC) $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) + + HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h +- $(CC) $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) + + LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h +- $(CC) $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) + + LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h +- $(CC) $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) + + LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h +- $(CC) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) + + LA2ONE: LA2ONE.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h +- $(CC) $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) + + LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h +- $(CC) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) + + LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h +- $(CC) $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm 
$(LDFLAGS) + + LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h +- $(CC) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) + + ONE2LA: ONE2LA.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h +- $(CC) $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) ++ $(CC) $(CPPFLAGS) $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) + + clean: + rm -f $(ALL) diff --git a/patches/cross.patch b/patches/cross.patch new file mode 100644 index 0000000..0b4babc --- /dev/null +++ b/patches/cross.patch @@ -0,0 +1,52 @@ +Date: Sat, 15 Aug 2020 07:24:26 +0200 +From: Helmut Grohne +Description: Do not hard code gcc +Bug-Debian: https://bugs.debian.org/968434 + +--- a/Makefile ++++ b/Makefile +@@ -10,34 +10,34 @@ ALL = daligner HPC.daligner LAsort LAmer + all: $(ALL) + + daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) + + HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) + + LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) + + LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) + + LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) + + LA2ONE: LA2ONE.c align.c 
align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h +- gcc $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) + + LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) + + LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) + + LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) + + ONE2LA: ONE2LA.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h +- gcc $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) + + clean: + rm -f $(ALL) diff --git a/patches/destdir-install.patch b/patches/destdir-install.patch new file mode 100644 index 0000000..48e60f8 --- /dev/null +++ b/patches/destdir-install.patch @@ -0,0 +1,16 @@ +Description: Use DESTDIR during installation +Author: Afif Elghraoui +Forwarded: not-needed +Last-Update: 2016-01-08 +--- a/Makefile ++++ b/Makefile +@@ -45,7 +45,8 @@ clean: + rm -f daligner.tar.gz + + install: +- cp $(ALL) $(DEST_DIR) ++ install -d $(DESTDIR)/usr/bin ++ install $(ALL) $(DESTDIR)/usr/bin + + package: + make clean diff --git a/patches/lddflags.patch b/patches/lddflags.patch new file mode 100644 index 0000000..d4f5a52 --- /dev/null +++ b/patches/lddflags.patch @@ -0,0 +1,54 @@ +Description: Support LDFLAGS in upstream build system + This patch was made using the following command: + sed -i 's/\(gcc.*\)/\1 \$(LDFLAGS)/' Makefile +Author: Afif Elghraoui +Forwarded: not-needed +Last-Update: 2016-01-08 + +--- a/Makefile ++++ b/Makefile +@@ -10,34 +10,34 @@ ALL = daligner 
HPC.daligner LAsort LAmer + all: $(ALL) + + daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm ++ gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm $(LDFLAGS) + + HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm $(LDFLAGS) + + LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm $(LDFLAGS) + + LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm $(LDFLAGS) + + LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm $(LDFLAGS) + + LA2ONE: LA2ONE.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h +- gcc $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm ++ gcc $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) + + LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm $(LDFLAGS) + + LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm $(LDFLAGS) + + LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h +- gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm ++ gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm $(LDFLAGS) + + ONE2LA: ONE2LA.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h +- gcc $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm ++ gcc $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm $(LDFLAGS) + + clean: + rm 
-f $(ALL) diff --git a/patches/series b/patches/series new file mode 100644 index 0000000..cfd0cec --- /dev/null +++ b/patches/series @@ -0,0 +1,6 @@ +cflags.patch +lddflags.patch +destdir-install.patch +cross.patch +cppflags.patch +typos.patch diff --git a/patches/typos.patch b/patches/typos.patch new file mode 100644 index 0000000..93fc1b7 --- /dev/null +++ b/patches/typos.patch @@ -0,0 +1,59 @@ +Description: fix a couple of typos caught by lintian. +Author: Étienne Mollier +Forwarded: no +Last-Update: 2022-08-03 +--- +This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ +--- daligner.orig/DB.c ++++ daligner/DB.c +@@ -2208,7 +2208,7 @@ + } + if (accum != extra->accum) + { EPRINTF(EPLACE, +- "%s: Reduction indicator of extra %s does not agree with previos .anno block files\n", ++ "%s: Reduction indicator of extra %s does not agree with previous .anno block files\n", + Prog_Name,name); + goto error; + } +@@ -2829,7 +2829,7 @@ + first = last = -1; + else + { if (index(ppnt+1,BLOCK_SYMBOL) != NULL) +- { EPRINTF(EPLACE,"%s: Two or more occurences of %c-sign in source name '%s'\n", ++ { EPRINTF(EPLACE,"%s: Two or more occurrences of %c-sign in source name '%s'\n", + Prog_Name,BLOCK_SYMBOL,root); + goto error; + } +--- daligner.orig/LAsplit.c ++++ daligner/LAsplit.c +@@ -105,7 +105,7 @@ + exit (1); + } + if (index(root2+1,BLOCK_SYMBOL) != NULL) +- { fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n", ++ { fprintf(stderr,"%s: Two or more occurrences of %c-sign in source name '%s'\n", + Prog_Name,BLOCK_SYMBOL,root); + exit (1); + } +--- daligner.orig/lsd.sort.c ++++ daligner/lsd.sort.c +@@ -55,7 +55,7 @@ + int64 thresh[256]; // If check then multiple of LEX_zdiv to check for thread assignment + int64 tptr[256]; // Finger for each 8-bit value + int64 *sptr; // Conceptually [256][NTHREADS]. 
At end of sorting pass +- } Lex_Arg; // sprtr[b][n] = # of occurences of value b in rangd of ++ } Lex_Arg; // sprtr[b][n] = # of occurrences of value b in rangd of + // thread n for the *next* pass + + // Threaded sorting pass +--- daligner.orig/daligner.c ++++ daligner/daligner.c +@@ -531,7 +531,7 @@ + ARG_POSITIVE(HIT_MIN,"Hit threshold (in bp.s)") + break; + case 't': +- ARG_POSITIVE(MAX_REPS,"Tuple supression frequency") ++ ARG_POSITIVE(MAX_REPS,"Tuple suppression frequency") + break; + case 'H': + ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)") diff --git a/rules b/rules new file mode 100755 index 0000000..080b3d0 --- /dev/null +++ b/rules @@ -0,0 +1,21 @@ +#!/usr/bin/make -f + +#DH_VERBOSE = 1 +include /usr/share/dpkg/default.mk + +export DEB_BUILD_MAINT_OPTIONS=hardening=+all + +%: + dh $@ + +override_dh_auto_install: + dh_auto_install + mkdir -p $(CURDIR)/debian/$(DEB_SOURCE)/usr/share/man/man1 + for command in $(CURDIR)/debian/$(DEB_SOURCE)/usr/bin/*; \ + do \ + if [ $$(basename $$command) != daligner ] ; \ + then \ + ln -s daligner.1 \ + $(CURDIR)/debian/$(DEB_SOURCE)/usr/share/man/man1/$$(basename $$command).1 ; \ + fi; \ + done diff --git a/salsa-ci.yml b/salsa-ci.yml new file mode 100644 index 0000000..33c3a64 --- /dev/null +++ b/salsa-ci.yml @@ -0,0 +1,4 @@ +--- +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml diff --git a/source/format b/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/tests/control b/tests/control new file mode 100644 index 0000000..bedafb6 --- /dev/null +++ b/tests/control @@ -0,0 +1,3 @@ +Tests: run-unit-test +Depends: @, dazzdb +Restrictions: allow-stderr diff --git a/tests/run-unit-test b/tests/run-unit-test new file mode 100644 index 0000000..463e336 --- /dev/null +++ b/tests/run-unit-test @@ -0,0 +1,29 @@ +#!/bin/bash +set -e + 
+pkg=daligner + +export LC_ALL=C.UTF-8 +if [ "${AUTOPKGTEST_TMP}" = "" ] ; then + AUTOPKGTEST_TMP=$(mktemp -d /tmp/${pkg}-test.XXXXXX) + trap "rm -rf ${AUTOPKGTEST_TMP}" 0 INT QUIT ABRT PIPE TERM +fi + +cd "${AUTOPKGTEST_TMP}" + +rangen 1.0 -r2020 >R.fasta +fasta2DAM R R.fasta +dsimulator R -c20. -r2020 >G.fasta +fasta2DB G G.fasta +rm G.fasta +DBsplit -s11 G +DBdust G.1 +DBdust G.2 +DBstats -mdust G +cat G.db + +HPC.daligner -mdust -t5 G | sh -v +LAshow -c G -w50 G.1 | head + +# Since the output is not consistent. +#It cannot be compared with the reference. diff --git a/upstream/metadata b/upstream/metadata new file mode 100644 index 0000000..4d55e0e --- /dev/null +++ b/upstream/metadata @@ -0,0 +1,23 @@ +Bug-Database: https://github.com/thegenemyers/DALIGNER/issues +Bug-Submit: https://github.com/thegenemyers/DALIGNER/issues/new +Reference: + Author: Gene Myers + Title: Efficient Local Alignment Discovery amongst Noisy Long Reads + Booktitle: Algorithms in Bioinformatics + ISSN: 0302-9743 + Year: 2014 + Volume: 8701 + Pages: 52-67 + DOI: 10.1007/978-3-662-44753-6_5 + URL: http://link.springer.com/chapter/10.1007%2F978-3-662-44753-6_5 +Registry: + - Name: OMICtools + Entry: OMICS_08897 + - Name: bio.tools + Entry: NA + - Name: SciCrunch + Entry: SCR_016066 + - Name: conda:bioconda + Entry: daligner +Repository: https://github.com/thegenemyers/DALIGNER.git +Repository-Browse: https://github.com/thegenemyers/DALIGNER diff --git a/watch b/watch new file mode 100644 index 0000000..3698f95 --- /dev/null +++ b/watch @@ -0,0 +1,7 @@ +version=4 + +opts="mode=git,pretty=1.0+git%cd.%h" \ + https://github.com/thegenemyers/DALIGNER.git HEAD + +#opts="filenamemangle=s/(?:.*)?v?(\d[\d\.]*)\.tar\.gz/daligner-$1.tar.gz/" \ +# https://github.com/thegenemyers/DALIGNER/tags (?:.*/)?[vV]?(\d[\d\.]*)\.tar\.gz -- cgit v1.2.3 From eb8ab916d34faf2956f3c5c99a37d77743c72b6a Mon Sep 17 00:00:00 2001 From: Andreas Tille Date: Mon, 6 Feb 2023 11:01:05 +0100 Subject: Import 
daligner_1.0+git20221215.bd26967.orig.tar.xz [dgit import orig daligner_1.0+git20221215.bd26967.orig.tar.xz] --- DB.c | 2923 ++++++++++++++++++++++++++++++ DB.h | 729 ++++++++ HPC.daligner.c | 1159 ++++++++++++ LA2ONE.c | 617 +++++++ LAcat.c | 199 +++ LAcheck.c | 397 +++++ LAmerge.c | 524 ++++++ LAshow.c | 650 +++++++ LAsort.c | 413 +++++ LAsplit.c | 229 +++ LICENSE | 34 + Makefile | 52 + ONE2LA.c | 275 +++ ONElib.c | 3924 ++++++++++++++++++++++++++++++++++++++++ ONElib.h | 410 +++++ QV.c | 1481 +++++++++++++++ QV.h | 99 + README.md | 531 ++++++ align.c | 5453 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ align.h | 377 ++++ daligner.c | 758 ++++++++ filter.c | 2677 ++++++++++++++++++++++++++++ filter.h | 39 + lsd.sort.c | 268 +++ lsd.sort.h | 8 + 25 files changed, 24226 insertions(+) create mode 100644 DB.c create mode 100644 DB.h create mode 100644 HPC.daligner.c create mode 100644 LA2ONE.c create mode 100644 LAcat.c create mode 100644 LAcheck.c create mode 100644 LAmerge.c create mode 100644 LAshow.c create mode 100644 LAsort.c create mode 100644 LAsplit.c create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 ONE2LA.c create mode 100644 ONElib.c create mode 100644 ONElib.h create mode 100644 QV.c create mode 100644 QV.h create mode 100644 README.md create mode 100644 align.c create mode 100644 align.h create mode 100644 daligner.c create mode 100644 filter.c create mode 100644 filter.h create mode 100644 lsd.sort.c create mode 100644 lsd.sort.h diff --git a/DB.c b/DB.c new file mode 100644 index 0000000..d674a86 --- /dev/null +++ b/DB.c @@ -0,0 +1,2923 @@ +/******************************************************************************************* + * + * Compressed data base module. Auxiliary routines to open and manipulate a data base for + * which the sequence and read information are separated into two separate files, and the + * sequence is compressed into 2-bits for each base. 
Support for tracks of additional + * information, and trimming according to the current partition. + * + * Author : Gene Myers + * Date : July 2013 + * Revised: April 2014 + * + ********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" + +#ifdef HIDE_FILES +#define PATHSEP "/." +#else +#define PATHSEP "/" +#endif + + +/******************************************************************************************* + * + * GENERAL UTILITIES + * + ********************************************************************************************/ + +char *Prog_Name; + +#ifdef INTERACTIVE + +char Ebuffer[1000]; + +#endif + +int Count_Args(char *var) +{ int cnt, lev; + char *s; + + cnt = 1; + lev = 0; + for (s = var; *s != '\0'; s++) + if (*s == ',') + { if (lev == 0) + cnt += 1; + } + else if (*s == '(') + lev += 1; + else if (*s == ')') + lev -= 1; + return (cnt); +} + +void *Malloc(int64 size, char *mesg) +{ void *p; + + if ((p = malloc(size)) == NULL) + { if (mesg == NULL) + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); + else + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); + } + return (p); +} + +void *Realloc(void *p, int64 size, char *mesg) +{ if (size <= 0) + size = 1; + if ((p = realloc(p,size)) == NULL) + { if (mesg == NULL) + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); + else + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); + } + return (p); +} + +char *Strdup(char *name, char *mesg) +{ char *s; + + if (name == NULL) + return (NULL); + if ((s = strdup(name)) == NULL) + { if (mesg == NULL) + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); + else + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); + } + return (s); +} + +FILE *Fopen(char *name, char *mode) +{ FILE *f; + + if (name == NULL || mode == NULL) + return (NULL); + if ((f = fopen(name,mode)) == NULL) + EPRINTF(EPLACE,"%s: Cannot open %s 
for '%s'\n",Prog_Name,name,mode); + return (f); +} + +char *PathTo(char *name) +{ char *path, *find; + + if (name == NULL) + return (NULL); + if ((find = rindex(name,'/')) != NULL) + { *find = '\0'; + path = Strdup(name,"Extracting path from"); + *find = '/'; + } + else + path = Strdup(".","Allocating default path"); + return (path); +} + +char *Root(char *name, char *suffix) +{ char *path, *find, *dot; + int epos; + + if (name == NULL) + return (NULL); + find = rindex(name,'/'); + if (find == NULL) + find = name; + else + find += 1; + if (suffix == NULL) + { dot = strchr(find,'.'); + if (dot != NULL) + *dot = '\0'; + path = Strdup(find,"Extracting root from"); + if (dot != NULL) + *dot = '.'; + } + else + { epos = strlen(find); + epos -= strlen(suffix); + if (epos > 0 && strcasecmp(find+epos,suffix) == 0) + { find[epos] = '\0'; + path = Strdup(find,"Extracting root from"); + find[epos] = suffix[0]; + } + else + path = Strdup(find,"Allocating root"); + } + return (path); +} + +char *Catenate(char *path, char *sep, char *root, char *suffix) +{ static char *cat = NULL; + static int max = -1; + int len; + + if (path == NULL || root == NULL || sep == NULL || suffix == NULL) + return (NULL); + len = strlen(path); + len += strlen(sep); + len += strlen(root); + len += strlen(suffix); + if (len > max) + { max = ((int) (1.2*len)) + 100; + cat = (char *) realloc(cat,max+1); + if (cat == NULL) + { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); + return (NULL); + } + } + sprintf(cat,"%s%s%s%s",path,sep,root,suffix); + return (cat); +} + +char *Numbered_Suffix(char *left, int num, char *right) +{ static char *sfx = NULL; + static int max = -1; + int len; + + if (left == NULL || right == NULL) + return (NULL); + len = strlen(left); + len += strlen(right) + 40; + if (len > max) + { max = ((int) (1.2*len)) + 100; + sfx = (char *) realloc(sfx,max+1); + if (sfx == NULL) + { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for 
%d)\n",Prog_Name,num); + return (NULL); + } + } + sprintf(sfx,"%s%d%s",left,num,right); + return (sfx); +} + +static char *MyCatenate(char *path, char *sep, char *root, char *suffix) +{ static char *cat = NULL; + static int max = -1; + int len; + + if (path == NULL || root == NULL || sep == NULL || suffix == NULL) + return (NULL); + len = strlen(path); + len += strlen(sep); + len += strlen(root); + len += strlen(suffix); + if (len > max) + { max = ((int) (1.2*len)) + 100; + cat = (char *) realloc(cat,max+1); + if (cat == NULL) + { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); + return (NULL); + } + } + sprintf(cat,"%s%s%s%s",path,sep,root,suffix); + return (cat); +} + +static char *MyNumbered_Suffix(char *left, int num, char *right) +{ static char *sfx = NULL; + static int max = -1; + int len; + + if (left == NULL || right == NULL) + return (NULL); + len = strlen(left); + len += strlen(right) + 40; + if (len > max) + { max = ((int) (1.2*len)) + 100; + sfx = (char *) realloc(sfx,max+1); + if (sfx == NULL) + { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); + return (NULL); + } + } + sprintf(sfx,"%s%d%s",left,num,right); + return (sfx); +} + + +#define COMMA ',' + +// Print big integers with commas/periods for better readability + +void Print_Number(int64 num, int width, FILE *out) +{ if (width == 0) + { if (num < 1000ll) + fprintf(out,"%lld",num); + else if (num < 1000000ll) + fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); + else if (num < 1000000000ll) + fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, + COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); + else + fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, + COMMA,(num%1000000000ll)/1000000ll, + COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); + } + else + { if (num < 1000ll) + fprintf(out,"%*lld",width,num); + else if (num < 1000000ll) + { if (width <= 4) + fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); + else + 
fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); + } + else if (num < 1000000000ll) + { if (width <= 8) + fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, + COMMA,num%1000ll); + else + fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, + COMMA,num%1000ll); + } + else + { if (width <= 12) + fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, + (num%1000000000ll)/1000000ll,COMMA, + (num%1000000ll)/1000ll,COMMA,num%1000ll); + else + fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, + (num%1000000000ll)/1000000ll,COMMA, + (num%1000000ll)/1000ll,COMMA,num%1000ll); + } + } +} + +// Return the number of digits, base 10, of num + +int Number_Digits(int64 num) +{ int digit; + + digit = 0; + while (num >= 1) + { num /= 10; + digit += 1; + } + return (digit); +} + + +/******************************************************************************************* + * + * READ COMPRESSION/DECOMPRESSION UTILITIES + * + ********************************************************************************************/ + +// Compress read into 2-bits per base (from [0-3] per byte representation + +void Compress_Read(int len, char *s) +{ int i; + char c, d; + char *s0, *s1, *s2, *s3; + + s0 = s; + s1 = s0+1; + s2 = s1+1; + s3 = s2+1; + + c = s1[len]; + d = s2[len]; + s0[len] = s1[len] = s2[len] = 0; + + for (i = 0; i < len; i += 4) + *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); + + s1[len] = c; + s2[len] = d; +} + +// Uncompress read form 2-bits per base into [0-3] per byte representation + +void Uncompress_Read(int len, char *s) +{ int i, tlen, byte; + char *s0, *s1, *s2, *s3; + char *t; + + s0 = s; + s1 = s0+1; + s2 = s1+1; + s3 = s2+1; + + tlen = (len-1)/4; + + t = s+tlen; + for (i = tlen*4; i >= 0; i -= 4) + { byte = *t--; + s0[i] = (char) ((byte >> 6) & 0x3); + s1[i] = (char) ((byte >> 4) & 0x3); + s2[i] = (char) ((byte >> 2) & 0x3); + 
s3[i] = (char) (byte & 0x3); + } + s[len] = 4; +} + +// Convert read in [0-3] representation to ascii representation (end with '\n') + +void Lower_Read(char *s) +{ static char letter[4] = { 'a', 'c', 'g', 't' }; + + for ( ; *s != 4; s++) + *s = letter[(int) *s]; + *s = '\0'; +} + +void Upper_Read(char *s) +{ static char letter[4] = { 'A', 'C', 'G', 'T' }; + + for ( ; *s != 4; s++) + *s = letter[(int) *s]; + *s = '\0'; +} + +void Letter_Arrow(char *s) +{ static char letter[4] = { '1', '2', '3', '4' }; + + for ( ; *s != 4; s++) + *s = letter[(int) *s]; + *s = '\0'; +} + +// Convert read in ascii representation to [0-3] representation (end with 4) + +void Number_Read(char *s) +{ static char number[128] = + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + + for ( ; *s != '\0'; s++) + *s = number[(int) *s]; + *s = 4; +} + +void Number_Arrow(char *s) +{ static char arrow[128] = + { 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 0, 1, 2, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + }; + + for ( ; *s != '\0'; s++) + *s = arrow[(int) *s]; + *s = 4; +} + +void Change_Read(char *s) +{ static char change[128] = + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 'a', 0, 'c', 0, 0, 0, 'g', + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 't', 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 'A', 0, 'C', 0, 0, 0, 'G', + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 'T', 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + + for ( ; *s != '\0'; s++) + *s = change[(int) *s]; +} + + +/******************************************************************************************* + * + * DB STUB HANDLING ROUTINES + * + ********************************************************************************************/ + + // Read the contents of the DB stub file at "path" and return it encoded in a DAZZ_STUB + // structure. This is allocated by the routine. "path" is assumed to be the complete + // name of the file. + +DAZZ_STUB *Read_DB_Stub(char *path, int what) +{ FILE *dbfile; + DAZZ_STUB *stub; + + char buf1[MAX_NAME+100]; + char buf2[MAX_NAME+100]; + int nread; + + int i; + int nfiles; + int nblocks; + int64 size; + int all, cutoff; + + dbfile = Fopen(path,"r"); + if (dbfile == NULL) + EXIT(NULL); + + stub = Malloc(sizeof(DAZZ_STUB),"Allocating DB stub record"); + if (stub == NULL) + { fclose(dbfile); + EXIT(NULL); + } + + stub->nreads = NULL; + stub->fname = NULL; + stub->prolog = NULL; + stub->ublocks = NULL; + stub->tblocks = NULL; + + if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) + goto stub_trash; + + if (what & DB_STUB_NREADS) + { stub->nreads = (int *) Malloc(sizeof(int)*(nfiles+1),"Allocating DB stub record"); + if (stub->nreads == NULL) + goto stub_error; + stub->nreads += 1; + } + + if (what & DB_STUB_FILES) + { stub->fname = (char **) Malloc(sizeof(char *)*(nfiles+1),"Allocating DB stub record"); + if (stub->fname == NULL) + goto stub_error; + stub->fname += 1; + + stub->nfiles = nfiles; + for (i = 0; i < nfiles; i++) + stub->fname[i] = NULL; + } + + if (what & DB_STUB_PROLOGS) + { stub->prolog = (char **) Malloc(sizeof(char *)*(nfiles+1),"Allocating DB stub record"); + if (stub->prolog == NULL) + goto stub_error; + stub->prolog += 1; + + 
for (i = 0; i < nfiles; i++) + stub->prolog[i] = NULL; + } + + for (i = 0; i < nfiles; i++) + { if (fscanf(dbfile,DB_FDATA,&nread,buf1,buf2) != 3) + goto stub_trash; + if (what & DB_STUB_NREADS) + stub->nreads[i] = nread; + if (what & DB_STUB_FILES) + { stub->fname[i] = Strdup(buf1,"Alloacting DB stub record"); + if (stub->fname[i] == NULL) + goto stub_error; + } + if (what & DB_STUB_PROLOGS) + { stub->prolog[i] = Strdup(buf2,"Alloacting DB stub record"); + if (stub->prolog[i] == NULL) + goto stub_error; + } + } + + if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) + goto stub_trash; + + if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) + goto stub_trash; + + if (what & DB_STUB_BLOCKS) + { stub->ublocks = (int *) Malloc(sizeof(int)*(nblocks+1),"Allocating DB stub record"); + stub->tblocks = (int *) Malloc(sizeof(int)*(nblocks+1),"Allocating DB stub record"); + if (stub->ublocks == NULL || stub->tblocks == NULL) + goto stub_error; + + for (i = 0; i <= nblocks; i++) + if (fscanf(dbfile,DB_BDATA,stub->ublocks+i,stub->tblocks+i) != 2) + goto stub_trash; + } + + fclose(dbfile); + + stub->nfiles = nfiles; + stub->all = all; + stub->cutoff = cutoff; + stub->bsize = size; + stub->nblocks = nblocks; + return (stub); + +stub_trash: + EPRINTF(EPLACE,"%s: Stub file %s is junk\n",Prog_Name,path); +stub_error: + Free_DB_Stub(stub); + fclose(dbfile); + EXIT(NULL); +} + + // Read the DB stub file "path" and extract the read index range [*first,*last) + // for block n, for the trimmed DB if trim is set, the untrimmed DB otherwise. 
+ +int Fetch_Block_Range(char *path, int trim, int n, int *first, int *last) +{ FILE *dbfile; + char buffer[2*MAX_NAME+100]; + int nfiles; + int nblocks; + int64 size; + int all, cutoff; + int tfirst, tlast; + int ufirst, ulast; + int i; + + dbfile = Fopen(path,"r"); + if (dbfile == NULL) + EXIT(1); + if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) + goto stub_error; + for (i = 0; i < nfiles; i++) + if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) + goto stub_error; + if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1) + goto stub_error; + + if (n < 0 || n >= nblocks) + { *first = *last = -1; + return (0); + } + + if (fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all) != 3) + goto stub_error; + for (i = 1; i <= n; i++) + if (fscanf(dbfile,DB_BDATA,&ufirst,&tfirst) != 2) + goto stub_error; + if (fscanf(dbfile,DB_BDATA,&ulast,&tlast) != 2) + goto stub_error; + + fclose(dbfile); + + if (trim) + { *first = tfirst; + *last = tlast; + } + else + { *first = ufirst; + *last = ulast; + } + + return (0); + +stub_error: + fclose(dbfile); + EPRINTF(EPLACE,"%s: Stub file %s is junk\n",Prog_Name,path); + EXIT(1); +} + + // Free a DAZZ_STUB data structure returned by Read_DB_Stub + +void Free_DB_Stub(DAZZ_STUB *stub) +{ int i; + + if (stub == NULL) + return; + if (stub->fname != NULL) + { for (i = 0; i < stub->nfiles; i++) + free(stub->fname[i]); + free(stub->fname-1); + } + if (stub->prolog != NULL) + { for (i = 0; i < stub->nfiles; i++) + free(stub->prolog[i]); + free(stub->prolog-1); + } + if (stub->nreads != NULL) + free(stub->nreads-1); + free(stub->ublocks); + free(stub->tblocks); + free(stub); +} + + +/******************************************************************************************* + * + * DB OPEN, TRIM, SIZE_OF, LIST_FILES & CLOSE ROUTINES + * + ********************************************************************************************/ + + +// Open the given database or dam, "path" into the supplied DAZZ_DB record "db". 
If the name has +// a part # in it then just the part is opened. The index array is allocated (for all or +// just the part) and read in. +// Return status of routine: +// -1: The DB could not be opened for a reason reported by the routine to EPLACE +// 0: Open of DB proceeded without mishap +// 1: Open of DAM proceeded without mishap + +static char *atrack_name = ".@arw"; +static char *qtrack_name = ".@qvs"; + +int Open_DB(char* path, DAZZ_DB *db) +{ DAZZ_DB dbcopy; + char *root, *pwd, *bptr, *fptr, *cat; + int nreads; + FILE *index, *dbvis, *bases; + int status, plen, isdam; + int part, cutoff, all; + int ufirst, tfirst, ulast, tlast; + + status = -1; + dbcopy = *db; + + plen = strlen(path); + if (strcmp(path+(plen-4),".dam") == 0) + { root = Root(path,".dam"); + isdam = 1; + } + else + { if (strcmp(path+(plen-3),".db") == 0) + isdam = -1; + else + isdam = 0; + root = Root(path,".db"); + } + pwd = PathTo(path); + + bptr = rindex(root,'.'); + if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') + { part = strtol(bptr+1,&fptr,10); + if (*fptr != '\0' || part == 0) + part = 0; + else + *bptr = '\0'; + } + else + part = 0; + + if (isdam > 0) + cat = MyCatenate(pwd,"/",root,".dam"); + else + cat = MyCatenate(pwd,"/",root,".db"); + if (cat == NULL) + return (-1); + if ((dbvis = fopen(cat,"r")) == NULL) + { if (isdam < 0) + { EPRINTF(EPLACE,"%s: Could not open DB %s\n",Prog_Name,path); + goto error; + } + if (isdam > 0) + { EPRINTF(EPLACE,"%s: Could not open DAM %s\n",Prog_Name,path); + goto error; + } + cat = MyCatenate(pwd,"/",root,".dam"); + if (cat == NULL) + return (-1); + if ((dbvis = fopen(cat,"r")) == NULL) + { EPRINTF(EPLACE,"%s: Could not open %s as a DB or a DAM\n",Prog_Name,path); + goto error; + } + isdam = 1; + } + if (isdam < 0) + isdam = 0; + + if ((index = Fopen(MyCatenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) + goto error1; + if (fread(db,sizeof(DAZZ_DB),1,index) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + 
goto error2; + } + + { int p, nblocks, nfiles; + int64 size; + char fname[MAX_NAME], prolog[MAX_NAME]; + + nblocks = 0; + if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + for (p = 0; p < nfiles; p++) + if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) + if (part == 0) + { cutoff = 0; + all = DB_ALL; + } + else + { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", + Prog_Name,root); + goto error2; + } + else + { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + if (part > nblocks) + { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); + goto error2; + } + } + + if (part > 0) + { for (p = 1; p <= part; p++) + if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } + } + else + { ufirst = tfirst = 0; + ulast = db->ureads; + tlast = db->treads; + } + } + + db->trimmed = 0; + db->tracks = NULL; + db->part = part; + db->cutoff = cutoff; + db->allarr |= all; + db->ufirst = ufirst; + db->tfirst = tfirst; + + nreads = ulast-ufirst; + if (part <= 0) + { db->reads = (DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); + if (db->reads == NULL) + goto error2; + + db->reads += 1; + if (fread(db->reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + free(db->reads-1); + goto error2; + } + } + else + { DAZZ_READ *reads; + int i, r, maxlen; + int64 totlen; + + reads = 
(DAZZ_READ *) Malloc(sizeof(DAZZ_READ)*(nreads+2),"Allocating Open_DB index"); + if (reads == NULL) + goto error2; + reads += 1; + + fseeko(index,sizeof(DAZZ_READ)*ufirst,SEEK_CUR); + if (fread(reads,sizeof(DAZZ_READ),nreads,index) != (size_t) nreads) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + free(reads-1); + goto error2; + } + + totlen = 0; + maxlen = 0; + for (i = 0; i < nreads; i++) + { r = reads[i].rlen; + totlen += r; + if (r > maxlen) + maxlen = r; + } + + db->maxlen = maxlen; + db->totlen = totlen; + db->reads = reads; + } + + ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part + ((int *) (db->reads))[-2] = tlast - tfirst; + + db->nreads = nreads; + db->path = Strdup(MyCatenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); + if (db->path == NULL) + { free(db->reads-1); + goto error2; + } + bases = Fopen(MyCatenate(db->path,"","",".bps"),"r"); + if (bases == NULL) + { free(db->path); + free(db->reads-1); + goto error2; + } + db->bases = (void *) bases; + db->loaded = 0; + + status = isdam; + +error2: + fclose(index); +error1: + fclose(dbvis); +error: + if (bptr != NULL) + *bptr = '.'; + + free(pwd); + free(root); + + if (status < 0) + *db = dbcopy; + + return (status); +} + + +// Trim the DB or part thereof and all opened tracks according to the cuttof and all settings +// of the current DB partition. Reallocate smaller memory blocks for the information kept +// for the retained reads. 
+ +void Trim_DB(DAZZ_DB *db) +{ int i, j, r, f; + int allflag, cutoff, css; + int64 totlen; + int maxlen, nreads; + DAZZ_TRACK *record; + DAZZ_READ *reads; + + if (db->trimmed) return; + + if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return; + + { int load_error; + + load_error = db->loaded; + for (record = db->tracks; record != NULL; record = record->next) + if (record->name == atrack_name) + { if (((DAZZ_ARROW *) record)->loaded) + load_error = 1; + } + else if (record->name != qtrack_name) + { if (record->loaded) + load_error = 1; + } + if (load_error) + { EPRINTF(EPLACE,"%s: Cannot load anything before trim (Trim_DB)\n",Prog_Name); + return; + } + } + + cutoff = db->cutoff; + if ((db->allarr & DB_ALL) != 0) + allflag = 0; + else + allflag = DB_BEST; + + reads = db->reads; + nreads = db->nreads; + + for (record = db->tracks; record != NULL; record = record->next) + if (record->name == qtrack_name) + { uint16 *table = ((DAZZ_QV *) record)->table; + + j = 0; + for (i = 0; i < db->nreads; i++) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + table[j++] = table[i]; + } + else if (record->name == atrack_name) + { DAZZ_ARROW *atrack = (DAZZ_ARROW *) record; + int64 *aoff = atrack->aoff; + + for (j = i = 0; i < nreads; i++) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + aoff[j++] = aoff[i]; + atrack->aoff = Realloc(aoff,sizeof(int64)*j,NULL); + } + else + { int size; + + size = record->size; + if (record->data == NULL) + { char *anno = (char *) record->anno; + j = 0; + for (i = r = 0; i < db->nreads; i++, r += size) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + { memmove(anno+j,anno+r,size); + j += size; + } + record->anno = Realloc(record->anno,record->size*j,NULL); + } + else if (size == 4) + { int *anno4 = (int *) (record->anno); + int *alen = record->alen; + + j = 0; + for (i = 0; i < db->nreads; i++) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + { 
anno4[j] = anno4[i]; + alen[j] = alen[i]; + j += 1; + } + record->alen = Realloc(record->alen,sizeof(int)*j,NULL); + record->anno = Realloc(record->anno,record->size*(j+1),NULL); + } + else // size == 8 + { int64 *anno8 = (int64 *) (record->anno); + int *alen = record->alen; + + j = 0; + for (i = 0; i < db->nreads; i++) + if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) + { anno8[j] = anno8[i]; + alen[j] = alen[i]; + j += 1; + } + record->alen = Realloc(record->alen,sizeof(int)*j,NULL); + record->anno = Realloc(record->anno,record->size*(j+1),NULL); + } + record->nreads = j; + } + + css = 0; + totlen = maxlen = 0; + for (j = i = 0; i < nreads; i++) + { f = reads[i].flags; + if ((f & DB_CCS) == 0) + css = 0; + r = reads[i].rlen; + if ((f & DB_BEST) >= allflag && r >= cutoff) + { totlen += r; + if (r > maxlen) + maxlen = r; + reads[j] = reads[i]; + if (css) + reads[j++].flags |= DB_CCS; + else + reads[j++].flags &= ~DB_CCS; + css = 1; + } + } + + db->totlen = totlen; + db->maxlen = maxlen; + db->nreads = j; + db->trimmed = 1; + + if (j < nreads) + { db->reads = Realloc(reads-1,sizeof(DAZZ_READ)*(j+2),NULL); + db->reads += 1; + } +} + + +// Return the size in bytes of the memory occupied by a given DB + +int64 sizeof_DB(DAZZ_DB *db) +{ int64 s; + DAZZ_TRACK *t; + + s = sizeof(DAZZ_DB) + + sizeof(DAZZ_READ)*(db->nreads+2) + + strlen(db->path)+1 + + (db->totlen+db->nreads+4); + + t = db->tracks; + if (t != NULL && strcmp(t->name,".@qvs") == 0) + { DAZZ_QV *q = (DAZZ_QV *) t; + s += sizeof(DAZZ_QV) + + sizeof(uint16) * db->nreads + + q->ncodes * sizeof(QVcoding) + + 6; + t = t->next; + } + + for (; t != NULL; t = t->next) + { s += sizeof(DAZZ_TRACK) + + strlen(t->name)+1 + + t->size * (db->nreads+1); + if (t->data != NULL) + { if (t->size == 8) + s += sizeof(int)*((int64 *) t->anno)[db->nreads]; + else // t->size == 4 + s += sizeof(int)*((int *) t->anno)[db->nreads]; + } + } + + return (s); +} + + +// For the DB or DAM "path" = 
"prefix/root.[db|dam]", find all the files for that DB, i.e. all +// those of the form "prefix/[.]root.part" and call actor with the complete path to each file +// pointed at by path, and the suffix of the path by extension. The . proceeds the root +// name if the defined constant HIDE_FILES is set. Always the first call is with the +// path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for +// "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and +// so this routine gives one a way to know all the tracks associated with a given DB. +// -1 is returned if the path could not be found, and 1 is returned if an error (reported +// to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. + +int List_DB_Files(char *path, void actor(char *path, char *extension)) +{ int status, plen, rlen, dlen; + char *root, *pwd, *name; + int isdam; + DIR *dirp; + struct dirent *dp; + + status = 0; + pwd = PathTo(path); + plen = strlen(path); + if (strcmp(path+(plen-4),".dam") == 0) + root = Root(path,".dam"); + else + root = Root(path,".db"); + rlen = strlen(root); + + if (root == NULL || pwd == NULL) + { free(pwd); + free(root); + EXIT(1); + } + + if ((dirp = opendir(pwd)) == NULL) + { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); + status = -1; + goto error; + } + + isdam = 0; + while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) + { name = dp->d_name; + if (strcmp(name,MyCatenate("","",root,".db")) == 0) + break; + if (strcmp(name,MyCatenate("","",root,".dam")) == 0) + { isdam = 1; + break; + } + } + if (dp == NULL) + { status = -1; + closedir(dirp); + goto error; + } + + if (isdam) + actor(MyCatenate(pwd,"/",root,".dam"),"dam"); + else + actor(MyCatenate(pwd,"/",root,".db"),"db"); + + rewinddir(dirp); // Report each auxiliary file + while ((dp = readdir(dirp)) != NULL) + { name = dp->d_name; + dlen = strlen(name); +#ifdef HIDE_FILES + if 
(name[0] != '.') + continue; + dlen -= 1; + name += 1; +#endif + if (dlen < rlen+1) + continue; + if (name[rlen] != '.') + continue; + if (strncmp(name,root,rlen) != 0) + continue; + actor(MyCatenate(pwd,PATHSEP,name,""),name+(rlen+1)); + } + closedir(dirp); + +error: + free(pwd); + free(root); + return (status); +} + +void Print_Read(char *s, int width) +{ int i; + + if (s[0] < 4) + { for (i = 0; s[i] != 4; i++) + { if (i%width == 0 && i != 0) + printf("\n"); + printf("%d",s[i]); + } + printf("\n"); + } + else + { for (i = 0; s[i] != '\0'; i++) + { if (i%width == 0 && i != 0) + printf("\n"); + printf("%c",s[i]); + } + printf("\n"); + } +} + + +// Shut down an open 'db' by freeing all associated space, including tracks and QV structures, +// and any open file pointers. The record pointed at by db however remains (the user +// supplied it and so should free it). + +void Close_DB(DAZZ_DB *db) +{ if (db->loaded) + free(((char *) (db->bases)) - 1); + else if (db->bases != NULL) + fclose((FILE *) db->bases); + if (db->reads != NULL) + free(db->reads-1); + free(db->path); + + Close_QVs(db); + + Close_Arrow(db); + + while (db->tracks != NULL) + Close_Track(db,db->tracks); +} + + +/******************************************************************************************* + * + * READ AND ARROW BUFFER ALLOCATION, LOAD, & LOAD_ALL + * + ********************************************************************************************/ + +// Allocate and return a buffer big enough for the largest read in 'db', leaving room +// for an initial delimiter character + +char *New_Read_Buffer(DAZZ_DB *db) +{ char *read; + + read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); + if (read == NULL) + EXIT(NULL); + return (read+1); +} + +// Load into 'read' the i'th read in 'db'. As an upper case ASCII string if ascii is 2, as a +// lower-case ASCII string is ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and +// 3(T) otherwise. 
+// +// **NB**, the byte before read will be set to a delimiter character! + +int Load_Read(DAZZ_DB *db, int i, char *read, int ascii) +{ FILE *bases = (FILE *) db->bases; + int64 off; + int len, clen; + DAZZ_READ *r = db->reads; + + if (i < 0 || i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); + EXIT(1); + } + + if (db->loaded) + { len = r[i].rlen; + strncpy(read,(char *) bases + r[i].boff,len); + if (ascii == 0) + { if (*read < 4) + read[-1] = read[len] = 4; + else + { read[len] = '\0'; + Number_Read(read); + read[-1] = 4; + } + } + else + { if (*read < 4) + { read[len] = 4; + if (ascii == 1) + Lower_Read(read); + else + Upper_Read(read); + read[-1] = '\0'; + } + else + { read[len] = '\0'; + if ((ascii == 1) != islower(*read)) + Change_Read(read); + } + read[-1] = '\0'; + } + return (0); + } + + off = r[i].boff; + len = r[i].rlen; + + if (ftello(bases) != off) + fseeko(bases,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(read,clen,1,bases) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); + EXIT(1); + } + } + Uncompress_Read(len,read); + if (ascii == 1) + { Lower_Read(read); + read[-1] = '\0'; + } + else if (ascii == 2) + { Upper_Read(read); + read[-1] = '\0'; + } + else + read[-1] = 4; + return (0); +} + + +// Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the +// the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii +// string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string +// over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to +// the string holding the substring so it has a delimeter for traversals in either direction. +// A NULL pointer is returned if an error occured and INTERACTIVE is defined. 
+ +char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii) +{ FILE *bases = (FILE *) db->bases; + int64 off; + int len, clen; + int bbeg, bend; + DAZZ_READ *r = db->reads; + + if (i < 0 || i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); + EXIT(NULL); + } + + if (db->loaded) + { len = end-beg; + strncpy(read,(char *) bases + r[i].boff + beg,len); + if (ascii == 0) + { if (*read < 4) + read[-1] = read[len] = 4; + else + { read[len] = '\0'; + Number_Read(read); + read[-1] = 4; + } + } + else + { if (*read < 4) + { read[len] = 4; + if (ascii == 1) + Lower_Read(read); + else + Upper_Read(read); + read[-1] = '\0'; + } + else + { read[len] = '\0'; + if ((ascii == 1) != islower(*read)) + Change_Read(read); + } + read[-1] = '\0'; + } + return (read); + } + + bbeg = beg/4; + bend = (end-1)/4+1; + + off = r[i].boff + bbeg; + len = end - beg; + + if (ftello(bases) != off) + fseeko(bases,off,SEEK_SET); + clen = bend-bbeg; + if (clen > 0) + { if (fread(read,clen,1,bases) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); + EXIT(NULL); + } + } + Uncompress_Read(4*clen,read); + read += beg%4; + read[len] = 4; + if (ascii == 1) + { Lower_Read(read); + read[-1] = '\0'; + } + else if (ascii == 2) + { Upper_Read(read); + read[-1] = '\0'; + } + else + read[-1] = 4; + + return (read); +} + +// Allocate a block big enough for all the uncompressed sequences, read them into it, +// reset the 'off' in each read record to be its in-memory offset, and set the +// bases pointer to point at the block after closing the bases file. If ascii is +// non-zero then the reads are converted to ACGT ascii, otherwise the reads are left +// as numeric strings over 0(A), 1(C), 2(G), and 3(T). 
+ +int Load_All_Reads(DAZZ_DB *db, int ascii) +{ FILE *bases = (FILE *) db->bases; + int nreads = db->nreads; + DAZZ_READ *reads = db->reads; + void (*translate)(char *s); + + char *seq; + int64 o, off; + int i, len, clen; + + if (db->loaded) + return (0); + + seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); + if (seq == NULL) + EXIT(1); + + *seq++ = 4; + + if (ascii == 1) + translate = Lower_Read; + else + translate = Upper_Read; + + o = 0; + for (i = 0; i < nreads; i++) + { len = reads[i].rlen; + off = reads[i].boff; + if (ftello(bases) != off) + fseeko(bases,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(seq+o,clen,1,bases) != 1) + { EPRINTF(EPLACE,"%s: Read of .bps file failed (Load_All_Sequences)\n",Prog_Name); + free(seq-1); + EXIT(1); + } + } + Uncompress_Read(len,seq+o); + if (ascii) + translate(seq+o); + reads[i].boff = o; + o += (len+1); + } + reads[nreads].boff = o; + + fclose(bases); + + db->bases = (void *) seq; + db->loaded = 1; + + return (0); +} + + +/******************************************************************************************* + * + * ARROW OPEN, LOAD, LOAD_ALL, & CLOSE + * + ********************************************************************************************/ + +DAZZ_DB *Arrow_DB = NULL; // Last db/arw used by "Load_Arrow" +DAZZ_ARROW *Arrow_Ptr; // Becomes invalid after closing + +// If the Arrow pseudo track is not already in db's track list, then load it and set it up. +// The database reads must not have been loaded with Load_All_Reads yet. +// -1 is returned if a .arw file is not present, and 1 is returned if an error (reported +// to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. 
+ +int Open_Arrow(DAZZ_DB *db) +{ int64 *avector; + DAZZ_ARROW *atrack; + FILE *afile; + DAZZ_READ *reads; + int i, nreads; + + if (db->tracks != NULL && db->tracks->name == atrack_name) + return (0); + + if ((db->allarr & DB_ARROW) == 0) + { EPRINTF(EPLACE,"%s: The DB is not an Arrow database (Open_Arrow)\n",Prog_Name); + EXIT(1); + } + if (db->loaded) + { EPRINTF(EPLACE,"%s: Cannot open Arrow vectors after loading all reads (Open_Arrow)\n", + Prog_Name); + EXIT(1); + } + + afile = Fopen(MyCatenate(db->path,"","",".arw"),"r"); + if (afile == NULL) + return (-1); + + nreads = db->nreads; + avector = (int64 *) Malloc(sizeof(int64)*nreads,"Allocating Arrow index"); + atrack = (DAZZ_ARROW *) Malloc(sizeof(DAZZ_ARROW),"Allocating Arrow track"); + if (avector == NULL || atrack == NULL) + { fclose(afile); + if (avector != NULL) + free(avector); + EXIT(1); + } + db->tracks = (DAZZ_TRACK *) atrack; + atrack->next = NULL; + atrack->name = atrack_name; + atrack->aoff = avector; + atrack->arrow = (void *) afile; + atrack->loaded = 0; + + + reads = db->reads; + for (i = 0; i < nreads; i++) + avector[i] = reads[i].boff; + return (0); +} + +// Load into 'read' the i'th arrow in 'db'. As an ASCII string if ascii is 1, +// and as a numeric string otherwise. 
+ +int Load_Arrow(DAZZ_DB *db, int i, char *arrow, int ascii) +{ FILE *afile; + int64 off; + int len, clen; + + if (db != Arrow_DB) + { if (db->tracks == NULL || db->tracks->name != atrack_name) + { EPRINTF(EPLACE,"%s: Arrow data is not available (Load_Arrow)\n",Prog_Name); + EXIT(1); + } + Arrow_Ptr = (DAZZ_ARROW *) db->tracks; + Arrow_DB = db; + } + + if (i < 0 || i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Arrow)\n",Prog_Name); + EXIT(1); + } + + afile = (FILE *) Arrow_Ptr->arrow; + off = Arrow_Ptr->aoff[i]; + len = db->reads[i].rlen; + + if (ftello(afile) != off) + fseeko(afile,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(arrow,clen,1,afile) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .arw file (Load_Arrow)\n",Prog_Name); + EXIT(1); + } + } + Uncompress_Read(len,arrow); + if (ascii == 1) + { Letter_Arrow(arrow); + arrow[-1] = '\0'; + } + else + arrow[-1] = 4; + return (0); +} + +// Allocate a block big enough for all the uncompressed Arrow vectors, read them into it, +// reset the 'off' in each arrow record to be its in-memory offset, and set the +// arrow pointer to point at the block after closing the arrow file. If ascii is +// non-zero then the arrows are converted to 0123 ascii, otherwise the arrows are left +// as numeric strings over [0-3]. 
+ +int Load_All_Arrows(DAZZ_DB *db, int ascii) +{ int nreads = db->nreads; + DAZZ_READ *reads = db->reads; + FILE *afile; + int64 *aoff; + + char *seq; + int64 o, off; + int i, len, clen; + + if (db != Arrow_DB) + { if (db->tracks == NULL || db->tracks->name != atrack_name) + { EPRINTF(EPLACE,"%s: Arrow data is not available (Load_All_Arrows)\n",Prog_Name); + EXIT(1); + } + Arrow_Ptr = (DAZZ_ARROW *) db->tracks; + Arrow_DB = db; + } + + if (Arrow_Ptr->loaded) + return (0); + + afile = (FILE *) Arrow_Ptr->arrow; + aoff = Arrow_Ptr->aoff; + + seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Arrows"); + if (seq == NULL) + EXIT(1); + + *seq++ = 4; + o = 0; + for (i = 0; i < nreads; i++) + { len = reads[i].rlen; + off = aoff[i]; + if (ftello(afile) != off) + fseeko(afile,off,SEEK_SET); + clen = COMPRESSED_LEN(len); + if (clen > 0) + { if (fread(seq+o,clen,1,afile) != 1) + { EPRINTF(EPLACE,"%s: Read of .bps file failed (Load_All_Sequences)\n",Prog_Name); + free(seq-1); + EXIT(1); + } + } + Uncompress_Read(len,seq+o); + if (ascii) + Letter_Arrow(seq+o); + aoff[i] = o; + o += (len+1); + } + aoff[nreads] = o; + + fclose(afile); + + Arrow_Ptr->arrow = (void *) seq; + Arrow_Ptr->loaded = 1; + + return (0); +} + +// Remove the Arrow pseudo track, all space associated with it, and close the .arw file. 
+ +void Close_Arrow(DAZZ_DB *db) +{ DAZZ_ARROW *atrack; + + Arrow_DB = NULL; + if (db->tracks != NULL && db->tracks->name == atrack_name) + { atrack = (DAZZ_ARROW *) db->tracks; + if (atrack->loaded) + free(atrack->arrow); + else + fclose((FILE *) atrack->arrow); + free(atrack->aoff); + db->tracks = db->tracks->next; + free(atrack); + } +} + + +/******************************************************************************************* + * + * TRACK CHECK, OPEN, BUFFER ALLOCATION, LOAD, LOAD_ALL & CLOSE ROUTINES + * TRACK EXTRAS READING & WRITING + * + ********************************************************************************************/ + +// Return status of track: +// 1: Track is for trimmed DB +// 0: Track is for untrimmed DB +// -1: Track is not the right size of DB either trimmed or untrimmed +// -2: Could not find the track +// -3: Error return (if INTERACTIVE mode only) + +int Check_Track(DAZZ_DB *db, char *track, int *kind) +{ FILE *afile; + int tracklen, size, ispart; + int ureads, treads; + + afile = NULL; + if (db->part > 0) + { afile = fopen(MyCatenate(db->path,MyNumbered_Suffix(".",db->part,"."),track,".anno"),"r"); + ispart = 1; + } + if (afile == NULL) + { afile = fopen(MyCatenate(db->path,".",track,".anno"),"r"); + ispart = 0; + } + if (afile == NULL) + return (-2); + + if (fread(&tracklen,sizeof(int),1,afile) != 1) + { EPRINTF(EPLACE,"%s: track files for %s are corrupted\n",Prog_Name,track); + fclose(afile); + EXIT(-3); + } + if (fread(&size,sizeof(int),1,afile) != 1) + { EPRINTF(EPLACE,"%s: track files for %s are corrupted\n",Prog_Name,track); + fclose(afile); + EXIT(-3); + } + + if (size == 0) + *kind = MASK_TRACK; + else if (size > 0) + *kind = CUSTOM_TRACK; + else + { EPRINTF(EPLACE,"%s: track files for %s are corrupted\n",Prog_Name,track); + fclose(afile); + EXIT(-3); + } + + fclose(afile); + + if (ispart) + { ureads = ((int *) (db->reads))[-1]; + treads = ((int *) (db->reads))[-2]; + } + else + { ureads = db->ureads; + treads = 
db->treads; + } + + if (tracklen == ureads) + return (0); + else if (tracklen == treads) + return (1); + else + return (-1); +} + +// The DB has already been trimmed, but a track over the untrimmed DB needs to be opened. +// Trim the track by rereading the untrimmed DB index from the file system. + +static int Late_Track_Trim(DAZZ_DB *db, DAZZ_TRACK *track, int ispart) +{ int i, j, r; + int allflag, cutoff; + int ureads; + char *root; + DAZZ_READ read; + FILE *indx; + + if (db->cutoff <= 0 && (db->allarr & DB_ALL) != 0) return (0); + + cutoff = db->cutoff; + if ((db->allarr & DB_ALL) != 0) + allflag = 0; + else + allflag = DB_BEST; + + root = rindex(db->path,'/') + 2; + indx = Fopen(MyCatenate(db->path,"","",".idx"),"r"); + fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*db->ufirst,SEEK_SET); + if (ispart) + ureads = ((int *) (db->reads))[-1]; + else + ureads = db->ureads; + + { int size; + + size = track->size; + if (track->data == NULL) + { char *anno = (char *) track->anno; + j = r = 0; + for (i = r = 0; i < ureads; i++, r += size) + { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + fclose(indx); + EXIT(1); + } + if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) + { memmove(anno+j,anno+r,size); + j += size; + } + r += size; + } + track->anno = Realloc(track->anno,track->size*j,NULL); + } + else if (size == 4) + { int *anno4 = (int *) (track->anno); + int *alen = track->alen; + + j = 0; + for (i = 0; i < ureads; i++) + { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + fclose(indx); + EXIT(1); + } + if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) + { anno4[j] = anno4[i]; + alen[j] = alen[i]; + j += 1; + } + } + track->alen = Realloc(track->alen,sizeof(int)*j,NULL); + track->anno = Realloc(track->anno,track->size*(j+1),NULL); + } + else // size == 8 + { int64 *anno8 = (int64 *) 
(track->anno); + int *alen = track->alen; + + j = 0; + for (i = 0; i < ureads; i++) + { if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1) + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + fclose(indx); + EXIT(1); + } + if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) + { anno8[j] = anno8[i]; + alen[j] = alen[i]; + j += 1; + } + } + track->alen = Realloc(track->alen,sizeof(int)*j,NULL); + track->anno = Realloc(track->anno,track->size*(j+1),NULL); + } + } + + fclose(indx); + return (0); +} + +// If track is not already in the db's track list, then allocate all the storage for it, +// read it in from the appropriate file, add it to the track list, and return a pointer +// to the newly created DAZZ_TRACK record. If the track does not exist or cannot be +// opened for some reason, then NULL is returned. + +DAZZ_TRACK *Open_Track(DAZZ_DB *db, char *track) +{ FILE *afile, *dfile; + int tracklen, size; + int nreads, ispart; + int treads, ureads; + int64 dmax; + void *anno; + int *alen; + void *data; + char *name; + DAZZ_TRACK *record; + + if (track[0] == '.') + { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); + EXIT(NULL); + } + + for (record = db->tracks; record != NULL; record = record->next) + if (strcmp(record->name,track) == 0) + return (record); + + afile = NULL; + if (db->part) + { afile = fopen(MyCatenate(db->path,MyNumbered_Suffix(".",db->part,"."),track,".anno"),"r"); + ispart = 1; + } + if (afile == NULL) + { afile = fopen(MyCatenate(db->path,".",track,".anno"),"r"); + ispart = 0; + } + if (afile == NULL) + { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); + return (NULL); + } + + dfile = NULL; + anno = NULL; + alen = NULL; + data = NULL; + record = NULL; + + if (ispart) + name = MyCatenate(db->path,MyNumbered_Suffix(".",db->part,"."),track,".data"); + else + name = MyCatenate(db->path,".",track,".data"); + if (name == NULL) + goto error; + dfile = fopen(name,"r"); + + if 
(fread(&tracklen,sizeof(int),1,afile) != 1) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + if (fread(&size,sizeof(int),1,afile) != 1) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + + if (size < 0) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + if (size == 0) + size = 8; + + if (ispart) + { ureads = ((int *) (db->reads))[-1]; + treads = ((int *) (db->reads))[-2]; + } + else + { ureads = db->ureads; + treads = db->treads; + } + + if (db->trimmed) + { if (tracklen != treads && tracklen != ureads) + { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); + goto error; + } + if ( ! ispart && db->part > 0) + { if (tracklen == treads) + fseeko(afile,size*db->tfirst,SEEK_CUR); + else + fseeko(afile,size*db->ufirst,SEEK_CUR); + } + } + else + { if (tracklen != ureads) + { if (tracklen == treads) + EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track); + else + EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); + goto error; + } + if ( ! 
ispart && db->part > 0) + fseeko(afile,size*db->ufirst,SEEK_CUR); + } + if (tracklen == treads) + nreads = ((int *) (db->reads))[-2]; + else + nreads = ((int *) (db->reads))[-1]; + + anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); + if (anno == NULL) + goto error; + + if (dfile != NULL) + { int64 *anno8; + int *anno4; + int64 x, y; + int i; + + alen = (int *) Malloc(sizeof(int)*nreads,"Allocating Track Anno Lengths"); + if (alen == NULL) + goto error; + + if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + + dmax = 0; + if (size == 4) + { anno4 = (int *) anno; + y = anno4[0]; + for (i = 1; i <= nreads; i++) + { x = anno4[i]; + y = x-y; + if (y > dmax) + dmax = y; + alen[i-1] = y; + y = x; + } + } + else + { anno8 = (int64 *) anno; + y = anno8[0]; + for (i = 1; i <= nreads; i++) + { x = anno8[i]; + y = x-y; + if (y > dmax) + dmax = y; + alen[i-1] = y; + y = x; + } + } + } + else + { dmax = 0; + if (fread(anno,size,nreads,afile) != (size_t) nreads) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + } + + fclose(afile); + + record = (DAZZ_TRACK *) Malloc(sizeof(DAZZ_TRACK),"Allocating Track Record"); + if (record == NULL) + goto error; + record->name = Strdup(track,"Allocating Track Name"); + if (record->name == NULL) + goto error; + if (dfile == NULL) + record->data = NULL; + else + record->data = (void *) dfile; + record->anno = anno; + record->alen = alen; + record->size = size; + record->nreads = nreads; + record->loaded = 0; + record->dmax = dmax; + + if (db->trimmed && tracklen != treads) + { if (Late_Track_Trim(db,record,ispart)) + goto error; + } + + if (db->tracks != NULL && (db->tracks->name == qtrack_name || db->tracks->name == atrack_name)) + { record->next = db->tracks->next; + db->tracks->next = record; + } + else + { record->next = db->tracks; + db->tracks = record; + } + + 
return (record); + +error: + if (record != NULL) + free(record); + if (data != NULL) + free(data); + if (alen != NULL) + free(alen); + if (anno != NULL) + free(anno); + if (dfile != NULL) + fclose(dfile); + fclose(afile); + EXIT (NULL); +} + +// Allocate a data buffer large enough to hold the longest read data block that will occur +// in the track. If cannot allocate memory then return NULL if INTERACTIVE is defined, +// or print error to stderr and exit otherwise. + +void *New_Track_Buffer(DAZZ_TRACK *track) +{ void *data; + + data = (void *) Malloc(track->dmax,"Allocating New Track Data Buffer"); + if (data == NULL) + EXIT(NULL); + return (data); +} + +// Load into 'data' the read data block for read i's "track" data. Return the length of +// the data in bytes, unless an error occurs and INTERACTIVE is defined in which case +// return wtih -1. + +int Load_Track_Data(DAZZ_TRACK *track, int i, void *data) +{ FILE *dfile; + int64 off; + int len; + + if (i < 0 || i >= track->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Track_Data)\n",Prog_Name); + EXIT(-1); + } + + if (track->size == 4) + off = ((int *) track->anno)[i]; + else + off = ((int64 *) track->anno)[i]; + len = track->alen[i]; + + if (track->loaded) + { strncpy(data,(void *) track->data + off,len); + return (len); + } + + dfile = (FILE *) track->data; + if (ftello(dfile) != off) + fseeko(dfile,off,SEEK_SET); + if (len > 0) + if (fread(data,len,1,dfile) != 1) + { EPRINTF(EPLACE,"%s: Failed read of .data file (Load_Track_Data)\n",Prog_Name); + EXIT(-1); + } + return (len); +} + +// Allocate a block big enough for all the track data and read the data into it, +// reset the 'off' in each anno pointer to be its in-memory offset, and set the +// data pointer to point at the block after closing the data file. Return with a +// zero, except when an error occurs and INTERACTIVE is defined in which +// case return wtih 1. 
+
+int Load_All_Track_Data(DAZZ_TRACK *track)
+{ FILE  *dfile;
+  char  *data;
+  int   *alen;
+  int64  dlen, off, o;
+  int    i, len, nreads;
+
+  //  Nothing to do if the data is already in memory or the track has no data file
+
+  if (track->loaded || track->data == NULL)
+    return (0);
+
+  nreads = track->nreads;
+  dfile  = (FILE *) track->data;
+  alen   = track->alen;
+
+  dlen = 0;
+  for (i = 0; i < nreads; i++)
+    dlen += alen[i];
+
+  //  Always request at least one byte: malloc(0) may return NULL without being out of memory
+
+  data = (char *) Malloc(dlen > 0 ? dlen : 1,"Allocating All Track Data");
+  if (data == NULL)
+    EXIT(1);
+
+  //  Pass 1: read every block using the *file* offsets in anno.  The anno vector is left
+  //    untouched so that if a read fails the track is still valid in its unloaded form.
+
+  o = 0;
+  for (i = 0; i < nreads; i++)
+    { len = alen[i];
+      if (len > 0)
+        { if (track->size == 4)
+            off = ((int *) track->anno)[i];
+          else
+            off = ((int64 *) track->anno)[i];
+          if ((ftello(dfile) != off && fseeko(dfile,off,SEEK_SET) < 0) ||
+              fread(data+o,len,1,dfile) != 1)
+            { EPRINTF(EPLACE,"%s: Read of .data failed (Load_All_Track_Data)\n",Prog_Name);
+              free(data);
+              EXIT(1);
+            }
+        }
+      o += len;
+    }
+
+  //  Pass 2: all reads succeeded, now turn the anno offsets into in-memory offsets
+
+  o = 0;
+  if (track->size == 4)
+    { int *anno4 = (int *) track->anno;
+
+      for (i = 0; i < nreads; i++)
+        { anno4[i] = o;
+          o += alen[i];
+        }
+      anno4[nreads] = o;
+    }
+  else
+    { int64 *anno8 = (int64 *) track->anno;
+
+      for (i = 0; i < nreads; i++)
+        { anno8[i] = o;
+          o += alen[i];
+        }
+      anno8[nreads] = o;
+    }
+
+  fclose(dfile);
+
+  track->data   = (void *) data;
+  track->loaded = 1;
+
+  return (0);
+}
+
+
+// Assuming the file pointer for afile is correctly positioned at the start of an extra item,
+//   and aname is the name of the .anno file, decode the value present and place it in
+//   extra if extra->nelem == 0, otherwise reduce the value just read into extra
+//   according to the directive given by 'accum'.  Leave the read pointer at the next
+//   extra or end-of-file.
+//   Returns:
+//      1 if at the end of file,
+//      0 if item was read and folded correctly,
+//     -1 if there was a system IO or allocation error (if interactive), and
+//     -2 if the new value could not be reduced into the current value of extra (interactive)
+//   If extra is NULL the item is simply skipped over.  On the first call for a given extra
+//   (extra->nelem == 0) ownership of the allocated name and value passes to extra.
+
+int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra)
+{ int   vtype, nelem, accum, slen;
+  char *name;
+  void *value;
+
+  //  EREAD may only be used while nothing is allocated: it returns directly on failure
+
+#define EREAD(v,s,n,file,ret)                                                           \
+  { if (fread(v,s,n,file) != (size_t) n)                                                \
+      { if (ferror(file))                                                               \
+          EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name);                 \
+        else if (ret)                                                                   \
+          return (1);                                                                   \
+        else                                                                            \
+          EPRINTF(EPLACE,"%s: The file %s is corrupted\n",Prog_Name,aname);             \
+        EXIT(-1);                                                                       \
+      }                                                                                 \
+  }
+
+  EREAD(&vtype,sizeof(int),1,afile,1)
+  EREAD(&nelem,sizeof(int),1,afile,0)
+  EREAD(&accum,sizeof(int),1,afile,0)
+  EREAD(&slen,sizeof(int),1,afile,0)
+
+  //  Negative counts can only come from a damaged file; reject them before they are used
+  //    as allocation sizes or seek distances
+
+  if (slen < 0 || nelem < 0)
+    { EPRINTF(EPLACE,"%s: The file %s is corrupted\n",Prog_Name,aname);
+      EXIT(-1);
+    }
+
+  if (extra == NULL)
+    { if (fseeko(afile,((int64) slen) + 8*((int64) nelem),SEEK_CUR) < 0)
+        { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name);
+          EXIT(-1);
+        }
+      return (0);
+    }
+
+  name  = (char *) Malloc(((int64) slen)+1,"Allocating extra name");
+  value = Malloc(8*((int64) nelem),"Allocating extra value");
+  if (value == NULL || name == NULL)
+    { free(name);
+      free(value);
+      EXIT(-1);
+    }
+
+  //  Same diagnostics as EREAD, but name and value must be released before leaving
+
+  if (fread(name,1,slen,afile) != (size_t) slen ||
+      fread(value,8,nelem,afile) != (size_t) nelem)
+    { if (ferror(afile))
+        EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name);
+      else
+        EPRINTF(EPLACE,"%s: The file %s is corrupted\n",Prog_Name,aname);
+      free(value);
+      free(name);
+      EXIT(-1);
+    }
+  name[slen] = '\0';
+
+  if (extra->nelem == 0)
+    { extra->vtype = vtype;
+      extra->nelem = nelem;
+      extra->accum = accum;
+      extra->name  = name;
+      extra->value = value;
+      return (0);
+    }
+
+  if (vtype != extra->vtype)
+    { EPRINTF(EPLACE,"%s: Type of extra %s does not agree with previous .anno block files\n",
+                     Prog_Name,name);
+      goto error;
+    }
+  if (nelem != extra->nelem)
+    { EPRINTF(EPLACE,"%s: Length of extra %s does not agree with previous .anno block files\n",
+                     Prog_Name,name);
+      goto error;
+    }
+  if (accum != extra->accum)
+    { EPRINTF(EPLACE,
+              "%s: Reduction indicator of extra %s does not agree with previos .anno block files\n",
+                     Prog_Name,name);
+      goto error;
+    }
+  if (strcmp(name,extra->name) != 0)
+    { EPRINTF(EPLACE,"%s: Expecting extra %s in .anno block file, not %s\n",
+                     Prog_Name,extra->name,name);
+      goto error;
+    }
+
+  //  Fold the new value into extra: DB_EXACT demands equality, otherwise values are summed
+
+  if (vtype == DB_INT)
+    { int64 *ival = (int64 *) value;
+      int64 *eval = (int64 *) (extra->value);
+      int    j;
+
+      if (accum == DB_EXACT)
+        { for (j = 0; j < nelem; j++)
+            if (eval[j] != ival[j])
+              { EPRINTF(EPLACE,
+                        "%s: Value of extra %s doe not agree with previous .anno block files\n",
+                        Prog_Name,name);
+                goto error;
+              }
+        }
+      else
+        { for (j = 0; j < nelem; j++)
+            eval[j] += ival[j];
+        }
+    }
+
+  else
+    { double *ival = (double *) value;
+      double *eval = (double *) (extra->value);
+      int     j;
+
+      if (accum == DB_EXACT)
+        { for (j = 0; j < nelem; j++)
+            if (eval[j] != ival[j])
+              { EPRINTF(EPLACE,
+                        "%s: Value of extra %s doe not agree with previous .anno block files\n",
+                        Prog_Name,name);
+                goto error;
+              }
+        }
+      else
+        { for (j = 0; j < nelem; j++)
+            eval[j] += ival[j];
+        }
+    }
+
+  free(value);
+  free(name);
+  return (0);
+
+error:
+  free(value);
+  free(name);
+  EXIT(-2);
+}
+
+//  Write extra record to end of file afile and advance write pointer.
+//    Each field is written with FFWRITE, which prints a message and calls exit(2) on a
+//    failed write, so in the current implementation the routine only ever returns 0.
+
+int Write_Extra(FILE *afile, DAZZ_EXTRA *extra)
+{ int slen;
+
+  FFWRITE(&(extra->vtype),sizeof(int),1,afile)
+  FFWRITE(&(extra->nelem),sizeof(int),1,afile)
+  FFWRITE(&(extra->accum),sizeof(int),1,afile)
+  slen = strlen(extra->name);
+  FFWRITE(&slen,sizeof(int),1,afile)
+  FFWRITE(extra->name,1,slen,afile)
+  FFWRITE(extra->value,8,extra->nelem,afile)
+
+  return (0);
+}
+
+//  Unlink 'track' from db's track list and free everything it owns.  Does nothing if
+//    'track' is not in the list.
+
+void Close_Track(DAZZ_DB *db, DAZZ_TRACK *track)
+{ DAZZ_TRACK *record, *prev;
+
+  prev = NULL;
+  for (record = db->tracks; record != NULL; record = record->next)
+    { if (track == record)
+        { free(record->anno);
+          free(record->alen);
+          if (record->loaded)
+            free(record->data);
+          else if (record->data != NULL)     //  data is NULL for a track with no data file,
+            fclose((FILE *) record->data);   //    and fclose(NULL) is undefined
+          free(record->name);
+          if (prev == NULL)
+            db->tracks = record->next;
+          else
+            prev->next = record->next;
+          free(record);
+
return;
+        }
+      prev = record;
+    }
+  return;
+}
+
+
+/*******************************************************************************************
+ *
+ *  QV OPEN, BUFFER ALLOCATION, LOAD, & CLOSE ROUTINES
+ *
+ ********************************************************************************************/
+
+DAZZ_DB *Active_DB = NULL;  //  Last db/qv used by "Load_QVentry"
+DAZZ_QV *Active_QV;         //    Becomes invalid after closing
+
+//  Open the .qvs file of db, read the coding scheme of every source file that contributes
+//    reads to the active part of db, and push a DAZZ_QV pseudo-track onto the front of
+//    db->tracks.  Returns 0 on success (or if the QVs are already open), -1 if the .qvs
+//    file cannot be opened, and on any other error EXIT(1) (return 1 if INTERACTIVE).
+//    On failure everything acquired here is released again.
+
+int Open_QVs(DAZZ_DB *db)
+{ FILE        *quiva, *istub, *indx;
+  char        *root;
+  uint16      *table;
+  DAZZ_QV     *qvtrk;
+  QVcoding    *coding, *nx;
+  int          ncodes = 0;     //  # of entries of coding[] that hold a live coding scheme
+
+  if (db->tracks != NULL && db->tracks->name == qtrack_name)
+    return (0);
+
+  if (db->trimmed)
+    { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name);
+      EXIT(1);
+    }
+
+  if (db->reads[db->nreads-1].coff < 0)
+    { if (db->part > 0)
+        { EPRINTF(EPLACE,"%s: All QVs for this block have not been added to the DB!\n",Prog_Name);
+          EXIT(1);
+        }
+      else
+        { EPRINTF(EPLACE,"%s: All QVs for this DB have not been added!\n",Prog_Name);
+          EXIT(1);
+        }
+    }
+
+  //  Open .qvs, .idx, and .db files
+
+  quiva = Fopen(MyCatenate(db->path,"","",".qvs"),"r");
+  if (quiva == NULL)
+    return (-1);
+
+  istub  = NULL;
+  indx   = NULL;
+  table  = NULL;
+  coding = NULL;
+  qvtrk  = NULL;
+
+  root = rindex(db->path,'/');
+  if (root[1] == '.')
+    { *root = '\0';
+      istub = Fopen(MyCatenate(db->path,"/",root+2,".db"),"r");
+      *root = '/';
+    }
+  else
+    istub = Fopen(MyCatenate(db->path,"","",".db"),"r");
+  if (istub == NULL)
+    goto error;
+
+  { int   first, last, nfiles;
+    char  prolog[MAX_NAME], fname[MAX_NAME];
+    int   i, j;
+
+    if (fscanf(istub,DB_NFILE,&nfiles) != 1)
+      { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+        goto error;
+      }
+
+    if (db->part > 0)
+      { int       pfirst, plast;
+        int       fbeg, fend;
+        int       n, k;
+
+        //  NB: the function-level 'indx' is used here (no local redeclaration) so that
+        //    the error exit below can see and close the .idx file.
+
+        //  Determine first how many and which files span the block (fbeg to fend)
+
+        pfirst = db->ufirst;
+        plast  = pfirst + db->nreads;
+
+        first = 0;
+        for (fbeg = 0; fbeg < nfiles; fbeg++)
+          { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                goto error;
+              }
+            if (last > pfirst)
+              break;
+            first = last;
+          }
+        for (fend = fbeg+1; fend <= nfiles; fend++)
+          { if (last >= plast)
+              break;
+            if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                goto error;
+              }
+            first = last;
+          }
+
+        indx   = Fopen(MyCatenate(db->path,"","",".idx"),"r");
+        ncodes = fend-fbeg;
+        coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes");
+        table  = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices");
+        if (indx == NULL || coding == NULL || table == NULL)
+          { ncodes = 0;
+            goto error;
+          }
+
+        // Carefully get the first coding scheme (its offset is most likely in a DAZZ_RECORD
+        //   in .idx that is *not* in memory).  Get all the other coding schemes normally and
+        //   assign the tables # for each read in the block in "tables".
+
+        rewind(istub);
+        (void) fscanf(istub,DB_NFILE,&nfiles);
+
+        first = 0;
+        for (n = 0; n < fbeg; n++)
+          { (void) fscanf(istub,DB_FDATA,&last,fname,prolog);
+            first = last;
+          }
+
+        for (n = fbeg; n < fend; n++)
+          { (void) fscanf(istub,DB_FDATA,&last,fname,prolog);
+
+            i = n-fbeg;
+            if (first < pfirst)
+              { DAZZ_READ read;
+
+                fseeko(indx,sizeof(DAZZ_DB) + sizeof(DAZZ_READ)*first,SEEK_SET);
+                if (fread(&read,sizeof(DAZZ_READ),1,indx) != 1)
+                  { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root);
+                    ncodes = i;
+                    goto error;
+                  }
+                fseeko(quiva,read.coff,SEEK_SET);
+                nx = Read_QVcoding(quiva);
+                if (nx == NULL)
+                  { ncodes = i;
+                    goto error;
+                  }
+                coding[i] = *nx;
+              }
+            else
+              { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET);
+                nx = Read_QVcoding(quiva);
+                if (nx == NULL)
+                  { ncodes = i;
+                    goto error;
+                  }
+                coding[i] = *nx;
+                db->reads[first-pfirst].coff = ftello(quiva);
+              }
+
+            j = first-pfirst;
+            if (j < 0)
+              j = 0;
+            k = last-pfirst;
+            if (k > db->nreads)
+              k = db->nreads;
+            while (j < k)
+              table[j++] = (uint16) i;
+
+            first = last;
+          }
+
+        fclose(indx);
+        indx = NULL;
+      }
+
+    else
+      { //  Load in coding scheme for each file, adjust .coff of first read in the file, and
+        //    record which table each read uses
+
+        coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes");
+        table  = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices");
+        if (coding == NULL || table == NULL)
+          goto error;                         //  ncodes is still 0: no scheme to free yet
+
+        first = 0;
+        for (i = 0; i < nfiles; i++)
+          { ncodes = i;                       //  schemes 0..i-1 are live should we bail out
+
+            if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3)
+              { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root);
+                goto error;
+              }
+
+            fseeko(quiva,db->reads[first].coff,SEEK_SET);
+            nx = Read_QVcoding(quiva);
+            if (nx == NULL)
+              goto error;
+            coding[i] = *nx;
+            db->reads[first].coff = ftello(quiva);
+
+            for (j = first; j < last; j++)
+              table[j] = (uint16) i;
+
+            first = last;
+          }
+        ncodes = nfiles;
+      }
+
+    //  Allocate and fill in the DAZZ_QV record and add it to the front of the
+    //    track list
+
+    qvtrk = (DAZZ_QV *) Malloc(sizeof(DAZZ_QV),"Allocating QV pseudo-track");
+    if (qvtrk == NULL)
+      goto error;
+    qvtrk->name   = qtrack_name;
+    if (qvtrk->name == NULL)
+      goto error;
+    qvtrk->next   = db->tracks;
+    db->tracks    = (DAZZ_TRACK *) qvtrk;
+    qvtrk->ncodes = ncodes;
+    qvtrk->table  = table;
+    qvtrk->coding = coding;
+    qvtrk->quiva  = quiva;
+  }
+
+  fclose(istub);
+  return (0);
+
+error:
+  if (qvtrk != NULL)
+    free(qvtrk);
+  if (table != NULL)
+    free(table);
+  if (coding != NULL)
+    { int i;
+      for (i = 0; i < ncodes; i++)
+        Free_QVcoding(coding+i);
+      free(coding);
+    }
+  if (indx != NULL)
+    fclose(indx);
+  if (istub != NULL)
+    fclose(istub);
+  fclose(quiva);
+  EXIT(1);
+}
+
+// Allocate and return a buffer of 5 vectors big enough for the largest read in 'db'.
+//   The 5 vectors are slices of a single allocation that entry[0] points at.  Returns
+//   NULL on allocation failure if INTERACTIVE, otherwise prints an error and exits.
+
+char **New_QV_Buffer(DAZZ_DB *db)
+{ char **entry;
+  char  *qvs;
+  int    i;
+
+  qvs   = (char *) Malloc(((int64) db->maxlen)*5,"Allocating New QV Buffer");
+  entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer");
+  if (qvs == NULL || entry == NULL)
+    { free(qvs);           //  one of the two may have succeeded, do not leak it
+      free(entry);
+      EXIT(NULL);
+    }
+  for (i = 0; i < 5; i++)
+    entry[i] = qvs + i*db->maxlen;
+  return (entry);
+}
+
+// Load into entry the QV streams for the i'th read from db.  The parameter ascii applies to
+//  the DELTAG stream as described for Load_Read.
+
+int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii)
+{ DAZZ_READ *reads;
+  FILE      *quiva;
+  int        rlen;
+
+  //  The QV pseudo-track of the last db used is cached in Active_QV/Active_DB
+
+  if (db != Active_DB)
+    { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0)
+        { EPRINTF(EPLACE,"%s: QV's have not been opened (Load_QVentry)\n",Prog_Name);
+          EXIT(1);
+        }
+      Active_QV = (DAZZ_QV *) db->tracks;
+      Active_DB = db;
+    }
+
+  if (i < 0 || i >= db->nreads)
+    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name);
+      EXIT(1);
+    }
+
+  reads = db->reads;
+  quiva = Active_QV->quiva;
+  rlen  = reads[i].rlen;
+
+  fseeko(quiva,reads[i].coff,SEEK_SET);
+  if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen))
+    EXIT(1);
+
+  //  entry[1] is the DELTAG stream: convert it unless ascii == 1
+
+  if (ascii != 1)
+    { char *deltag = entry[1];
+
+      if (ascii != 2)
+        { char x = deltag[rlen];       //  Number_Read needs a terminated string: park the
+          deltag[rlen] = '\0';         //    byte after the stream and restore it afterwards
+          Number_Read(deltag);
+          deltag[rlen] = x;
+        }
+      else
+        { int j;
+          int u = 'A'-'a';
+
+          for (j = 0; j < rlen; j++)
+            deltag[j] = (char) (deltag[j]+u);
+        }
+    }
+
+  return (0);
+}
+
+// Close the QV stream, free the QV pseudo track and all associated memory
+
+void Close_QVs(DAZZ_DB *db)
+{ DAZZ_TRACK *track;
+  DAZZ_QV    *qvtrk;
+  int         i;
+
+  Active_DB = NULL;
+
+  track = db->tracks;
+  if (track != NULL && strcmp(track->name,".@qvs") == 0)
+    { qvtrk = (DAZZ_QV *) track;
+      for (i = 0; i < qvtrk->ncodes; i++)
+        Free_QVcoding(qvtrk->coding+i);
+      free(qvtrk->coding);
+      free(qvtrk->table);
+      fclose(qvtrk->quiva);
+      db->tracks = track->next;
+      free(track);
+    }
+  return;
+}
+
+
+/*******************************************************************************************
+ *
+ *  COMMAND LINE @-EXPANSION PARSER
+ *    Take a command line argument and interpret the '@' block number ranges.
+ *    Parse_Block_Arg produces a Block_Looper iterator object that can then
+ *    be invoked multiple times to iterate through all the files implied by
+ *    the @ pattern/range.
+ *
+ ********************************************************************************************/
+
+typedef struct
+  { int first, last, next;      //  block range [first,last] and the current block ('next')
+    char *root, *pwd, *ppnt;    //  root name, directory, and text following the block number
+    int   isDB;                 //  non-zero if iterating over DB blocks rather than .las files
+    char *slice;                //  buffer for Next_Block_Slice (allocated on first use)
+  } _Block_Looper;
+
+// Return non-zero if advancing the iterator e_parse would yield another block.  For a DB
+//   iterator only the range is consulted, for a .las iterator the next file must also exist.
+//   The iterator itself is not changed.
+
+int Next_Block_Exists(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  char       *disp;
+  struct stat sts;
+
+  if (parse->isDB)
+    { if (parse->next+1 > parse->last)
+        return (0);
+      else
+        return (1);
+    }
+
+  if (parse->next+1 > parse->last)
+    return (0);
+
+  if (parse->next < 0)
+    disp  = parse->root;
+  else
+    disp = MyNumbered_Suffix(parse->root,parse->next+1,parse->ppnt);
+
+  if (stat(MyCatenate(parse->pwd,"/",disp,".las"),&sts))
+    return (0);
+  else
+    return (1);
+}
+
+// Advance the iterator e_parse to the next file, open it, and return the file pointer
+//   to it.  Return NULL if at the end of the list of files.
+
+FILE *Next_Block_Arg(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  char *disp;
+  FILE *input;
+
+  if (parse->isDB)
+    { fprintf(stderr,"%s: Cannot open a DB block as a file (Next_Block_Arg)\n",Prog_Name);
+      exit (1);     //  exit even in interactive mode as this is a programming bug on
+    }               //    the part of the caller
+
+  parse->next += 1;
+  if (parse->next > parse->last)
+    return (NULL);
+
+  if (parse->next < 0)
+    disp  = parse->root;
+  else
+    disp = MyNumbered_Suffix(parse->root,parse->next,parse->ppnt);
+
+  //  A missing file is only an error if the range had an explicit upper bound
+
+  if ((input = fopen(MyCatenate(parse->pwd,"/",disp,".las"),"r")) == NULL)
+    { if (parse->last != INT_MAX)
+        { EPRINTF(EPLACE,"%s: %s.las is not present\n",Prog_Name,disp);
+          EXIT(NULL);
+        }
+      return (NULL);
+    }
+  return (input);
+}
+
+// Reset the iterator e_parse to the first file
+
+void Reset_Block_Arg(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  parse->next = parse->first - 1;
+}
+
+// Advance the iterator e_parse to the next file.  Return 1 if it was advanced, 0 if there
+//   is no next file (in which case the iterator is unchanged).
+
+int Advance_Block_Arg(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  if (Next_Block_Exists(e_parse))
+    { parse->next += 1;
+      return (1);
+    }
+  else
+    return (0);
+}
+
+// Return a pointer to the path for the current file (a fresh copy the caller must free)
+
+char *Block_Arg_Path(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  return (Strdup(parse->pwd,"Allocating block path"));
+}
+
+// Return a pointer to the root name for the current file (a fresh copy the caller must free)
+
+char *Block_Arg_Root(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+  char *name;
+
+  if (parse->next < 0)
+    name = parse->root;
+  else
+    name = MyNumbered_Suffix(parse->root,parse->next,parse->ppnt);
+  return (Strdup(name,"Allocating block root"));
+}
+
+// Free the iterator
+
+void Free_Block_Arg(Block_Looper *e_parse)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  free(parse->root);
+  free(parse->pwd);
+  free(parse->slice);
+  free(parse);
+}
+
+// Return the name of the next 'slice' blocks as a single <pwd>/<root>@<i>-<j><suffix> string
+//   (or just <pwd>/<root> if the argument had no block symbol) and advance the iterator
+//   past them.  The slice is clipped to the end of the range.  Returns NULL when the range
+//   is exhausted.  The string lives in a buffer owned by the iterator that is overwritten
+//   by the next call and released by Free_Block_Arg.
+
+char *Next_Block_Slice(Block_Looper *e_parse, int slice)
+{ _Block_Looper *parse = (_Block_Looper *) e_parse;
+
+  if (parse->slice == NULL)
+    { int size;
+
+      //  Room for pwd, root, the trailing text, and "/", the block symbol, two integers,
+      //    "-" and the terminator.  Computed directly from the parts rather than through
+      //    Block_Arg_Root, whose freshly allocated result would be leaked (or be NULL).
+
+      size = strlen(parse->pwd) + strlen(parse->root) + 40;
+      if (parse->ppnt != NULL)
+        size += strlen(parse->ppnt);
+      parse->slice = (char *) Malloc(size,"Block argument slice");
+      if (parse->slice == NULL)
+        EXIT(NULL);
+    }
+
+  if (parse->next+1 > parse->last)
+    return (NULL);
+  if (parse->next+slice > parse->last)
+    slice = parse->last-parse->next;
+
+  if (parse->first < 0)
+    sprintf(parse->slice,"%s/%s",parse->pwd,parse->root);
+  else
+    sprintf(parse->slice,"%s/%s%c%d-%d%s",parse->pwd,parse->root,BLOCK_SYMBOL,parse->next+1,
+                                          parse->next+slice,parse->ppnt);
+  parse->next += slice;
+  return (parse->slice);
+}
+
+//  Parse the command line argument and return an iterator to move through the
+//    file names, setting it up to report the first file.
+//    arg is <path>/<root>[.db|.dam|.las] where <root> may contain one block symbol
+//    optionally followed by <first> or <first>-<last>.  isDB is non-zero for a DB argument.
+//    Returns NULL on error if INTERACTIVE, otherwise prints an error and exits.
+
+static Block_Looper *parse_block_arg(char *arg, int isDB)
+{ _Block_Looper *parse;
+  char *pwd, *root;
+  char *ppnt, *cpnt;
+  int   first, last;
+
+  parse = (_Block_Looper *) Malloc(sizeof(_Block_Looper),"Allocating parse node");
+  pwd   = PathTo(arg);
+  if (isDB)
+    { int len = strlen(arg);
+
+      //  Only look at the last 4 characters if there are that many
+
+      if (len >= 4 && strcmp(arg+(len-4),".dam") == 0)
+        { root = Root(arg,".dam");
+          isDB = 2;
+        }
+      else
+        root = Root(arg,".db");
+    }
+  else
+    root  = Root(arg,".las");
+  if (parse == NULL || pwd == NULL || root == NULL)
+    goto error;
+
+  ppnt = index(root,BLOCK_SYMBOL);
+  if (ppnt == NULL)
+    first = last = -1;             //  no block symbol: a single, un-numbered file
+  else
+    { if (index(ppnt+1,BLOCK_SYMBOL) != NULL)
+        { EPRINTF(EPLACE,"%s: Two or more occurences of %c-sign in source name '%s'\n",
+                         Prog_Name,BLOCK_SYMBOL,root);
+          goto error;
+        }
+      *ppnt++ = '\0';              //  root now ends where the block symbol was
+      first = strtol(ppnt,&cpnt,10);
+      if (cpnt == ppnt)
+        { first = 1;               //  bare block symbol: all blocks from 1 on
+          last  = INT_MAX;
+        }
+      else
+        { if (first < 1)
+            { EPRINTF(EPLACE,
+                      "%s: Integer following %c-sigan is less than 1 in source name '%s'\n",
+                      Prog_Name,BLOCK_SYMBOL,root);
+              goto error;
+            }
+          if (*cpnt == '-')
+            { ppnt = cpnt+1;
+              last = strtol(ppnt,&cpnt,10);
+              if (cpnt == ppnt)
+                { EPRINTF(EPLACE,"%s: Second integer must follow - in source name '%s'\n",
+                                 Prog_Name,root);
+                  goto error;
+                }
+              if (last < first)
+                { EPRINTF(EPLACE,
+                          "%s: 2nd integer is less than 1st integer in source name '%s'\n",
+                          Prog_Name,root);
+                  goto error;
+                }
+              ppnt = cpnt;
+            }
+          else
+            { last = INT_MAX;
+              ppnt = cpnt;
+            }
+        }
+    }
+
+  parse->pwd   = pwd;
+  parse->root  = root;
+  parse->ppnt  = ppnt;
+  parse->first = first;
+  parse->last  = last;
+  parse->next  = first-1;
+  parse->slice = NULL;
+  parse->isDB  = isDB;
+
+  //  For an open-ended DB range the last block is the block count recorded in the stub file.
+  //    The suffix is catenated without a '.': NOTE(review) this presumably relies on root
+  //    ending in '.' once cut at the block symbol -- confirm against callers.
+
+  if (isDB && first >= 0 && last == INT_MAX)
+    { char  buffer[2*MAX_NAME+100];
+      char *dbname;
+      FILE *dbfile;
+      int   i, nfiles, nblocks;
+
+      dbname = MyCatenate(pwd,"/",root,"db");
+      dbfile = fopen(dbname,"r");
+      if (dbfile == NULL)
+        { dbname = MyCatenate(pwd,"/",root,"dam");
+          dbfile = fopen(dbname,"r");
+          if (dbfile == NULL)
+            { EPRINTF(EPLACE,"%s: Cannot open database %s[db|dam]\n",Prog_Name,root);
+              goto error;
+            }
+        }
+
+      if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+        SYSTEM_READ_ERROR
+      for (i = 0; i < nfiles; i++)
+        if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
+          SYSTEM_READ_ERROR
+      if (fscanf(dbfile,DB_NBLOCK,&nblocks) != 1)
+        SYSTEM_READ_ERROR
+      fclose(dbfile);
+
+      parse->last = nblocks;
+    }
+
+  return ((Block_Looper *) parse);
+
+error:
+  free(parse);
+  free(root);
+  free(pwd);
+  EXIT(NULL);
+}
+
+//  Public entry points: iterate over the .las files, respectively DB blocks, named by arg
+
+Block_Looper *Parse_Block_LAS_Arg(char *arg)
+{ return (parse_block_arg(arg, 0)); }
+
+Block_Looper *Parse_Block_DB_Arg(char *arg)
+{ return (parse_block_arg(arg, 1)); }
diff --git a/DB.h b/DB.h
new file mode 100644
index 0000000..a001d3a
--- /dev/null
+++ b/DB.h
@@ -0,0 +1,729 @@
+/*******************************************************************************************
+ *
+ *  Compressed data base module.  Auxiliary routines to open and manipulate a data base for
+ *    which the sequence and read information are separated into two separate files, and the
+ *    sequence is compressed into 2-bits for each base.  Support for tracks of additional
+ *    information, and trimming according to the current partition.  Eventually will also
+ *    support compressed quality information.
+ *
+ *  Author :  Gene Myers
+ *  Date   :  July 2013
+ *  Revised:  April 2014
+ *
+ ********************************************************************************************/
+
+#ifndef _DAZZ_DB
+
+#define _DAZZ_DB
+
+#include <stdio.h>
+
+#include "QV.h"
+
+#define HIDE_FILES          //  Auxiliary DB files start with a . so they are "hidden"
+                            //    Undefine if you don't want this
+
+//  For interactive applications where it is inappropriate to simply exit with an error
+//    message to standard error, define the constant INTERACTIVE.  If set, then error
+//    messages are put in the global variable Ebuffer and the caller of a DB routine
+//    can decide how to deal with the error.
+//
+//  DB, QV, or alignment routines that can encounter errors function as before in
+//    non-INTERACTIVE mode by exiting after printing an error message to stderr.  In
+//    INTERACTIVE mode the routines place a message at EPLACE and return an error
+//    value.  For such routines that were previously void, they are now int, and
+//    return 1 if an error occurred, 0 otherwise.
+//
+//  Note that in BATCH mode EXIT(x) ignores its argument and always exits with status 1.
+
+#ifdef INTERACTIVE
+
+#define EPRINTF sprintf
+#define EPLACE  Ebuffer
+#define EXIT(x) return (x)
+
+#else // BATCH
+
+#define EPRINTF fprintf
+#define EPLACE  stderr
+#define EXIT(x) exit (1)
+
+#endif
+
+typedef unsigned char      uint8;
+typedef unsigned short     uint16;
+typedef unsigned int       uint32;
+typedef unsigned long long uint64;
+typedef signed char        int8;
+typedef signed short       int16;
+typedef signed int         int32;
+typedef signed long long   int64;
+typedef float              float32;
+typedef double             float64;
+
+#define LAST_READ_SYMBOL  '$'
+#define BLOCK_SYMBOL      '@'
+
+/*******************************************************************************************
+ *
+ *  COMMAND LINE INTERPRETATION MACROS
+ *
+ *    These macros expand to bare statements that use variables of the invoking function:
+ *    i, k, flags[], argv[], and eptr must be declared by the caller.
+ *
+ ********************************************************************************************/
+
+extern char *Prog_Name;   //  Name of program
+
+#ifdef INTERACTIVE
+
+extern char Ebuffer[];
+
+#endif
+
+//  Set Prog_Name and clear flags[0..127]
+
+#define ARG_INIT(name)                  \
+  Prog_Name = Strdup(name,"");          \
+  for (i = 0; i < 128; i++)             \
+    flags[i] = 0;
+
+//  Set flags[c] for every option letter c of argv[i], exiting if c is not in 'set'
+
+#define ARG_FLAGS(set)                                                                  \
+  for (k = 1; argv[i][k] != '\0'; k++)                                                  \
+    { if (index(set,argv[i][k]) == NULL)                                                \
+        { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]);        \
+          exit (1);                                                                     \
+        }                                                                               \
+      flags[(int) argv[i][k]] = 1;                                                      \
+    }
+
+//  Parse the integer following the option letter of argv[i] into var, exiting unless > 0
+
+#define ARG_POSITIVE(var,name)                                                          \
+  var = strtol(argv[i]+2,&eptr,10);                                                     \
+  if (*eptr != '\0' || argv[i][2] == '\0')                                              \
+    { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n",                       \
+                     Prog_Name,argv[i][1],argv[i]+2);                                   \
+      exit (1);                                                                         \
+    }                                                                                   \
+  if (var <= 0)                                                                         \
+    { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var);              \
+      exit (1);                                                                         \
+    }
+
+//  As ARG_POSITIVE, but 0 is accepted
+
+#define ARG_NON_NEGATIVE(var,name)                                                      \
+  var = strtol(argv[i]+2,&eptr,10);                                                     \
+  if (*eptr != '\0' || argv[i][2] == '\0')                                              \
+    { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n",                       \
+                     Prog_Name,argv[i][1],argv[i]+2);                                   \
+      exit (1);                                                                         \
+    }                                                                                   \
+  if (var < 0)                                                                          \
+    { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var);          \
+      exit (1);                                                                         \
+    }
+
+//  Parse the real number following the option letter of argv[i] into var
+
+#define ARG_REAL(var)                                                                   \
+  var = strtod(argv[i]+2,&eptr);                                                        \
+  if (*eptr != '\0' || argv[i][2] == '\0')                                              \
+    { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n",                    \
+                     Prog_Name,argv[i][1],argv[i]+2);                                   \
+      exit (1);                                                                         \
+    }
+
+
+/*******************************************************************************************
+ *
+ *  GUARDED BATCH IO MACROS
+ *
+ *    Each macro is a complete brace-enclosed statement (no trailing ';' is needed) that
+ *    prints a message to stderr and exits if the underlying call fails, regardless of
+ *    whether INTERACTIVE is defined.
+ *
+ ********************************************************************************************/
+
+    //  Utilities
+
+int Count_Args(char *arg);
+
+#define SYSTEM_READ_ERROR                                                               \
+  { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name);                       \
+    exit (2);                                                                           \
+  }
+
+#define SYSTEM_WRITE_ERROR                                                              \
+  { fprintf(stderr,"%s: System error, write failed!\n",Prog_Name);                      \
+    exit (2);                                                                           \
+  }
+
+#define SYSTEM_CLOSE_ERROR                                                              \
+  { fprintf(stderr,"%s: System error, file close failed!\n",Prog_Name);                 \
+    exit (2);                                                                           \
+  }
+
+    // Output
+
+#define FFWRITE(v,s,n,file)                                                             \
+  { if (fwrite(v,s,n,file) != (size_t) n)                                               \
+      SYSTEM_WRITE_ERROR                                                                \
+  }
+
+#define FPRINTF(file,...)                                                               \
+  { if (fprintf(file,__VA_ARGS__) < 0)                                                  \
+      SYSTEM_WRITE_ERROR                                                                \
+  }
+
+#define PRINTF(...)                                                                     \
+  { if (printf(__VA_ARGS__) < 0)                                                        \
+      SYSTEM_WRITE_ERROR                                                                \
+  }
+
+#define FPUTS(x,file)                                                                   \
+  { if (fputs(x,file) == EOF)                                                           \
+      SYSTEM_WRITE_ERROR                                                                \
+  }
+
+    // Close
+
+#define FCLOSE(file)                                                                    \
+  { if (fclose(file) != 0)                                                              \
+      SYSTEM_CLOSE_ERROR                                                                \
+  }
+
+    // Input
+    //   FFREAD, FSCANF, and FGETS name the file in their "corrupted" message through the
+    //   token-pasted identifier <file>_name, which must therefore be a string variable
+    //   in scope where the macro is used.
+
+#define FFREAD(v,s,n,file)                                                              \
+  { if (fread(v,s,n,file) != (size_t) n)                                                \
+      { if (ferror(file))                                                               \
+          SYSTEM_READ_ERROR                                                             \
+        else                                                                            \
+          { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name);   \
+            exit (1);                                                                   \
+          }                                                                             \
+      }                                                                                 \
+  }
+
+#define FSCANF(file,...)                                                                \
+  { if (fscanf(file,__VA_ARGS__) != Count_Args(#__VA_ARGS__)-1)                         \
+      { if (ferror(file))                                                               \
+          SYSTEM_READ_ERROR                                                             \
+        else                                                                            \
+          { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name);   \
+            exit (1);                                                                   \
+          }                                                                             \
+      }                                                                                 \
+  }
+
+#define FGETS(v,n,file)                                                                 \
+  { if (fgets(v,n,file) == NULL)                                                        \
+      { if (ferror(file))                                                               \
+          SYSTEM_READ_ERROR                                                             \
+        else                                                                            \
+          { fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name);   \
+            exit (1);                                                                   \
+          }                                                                             \
+      }                                                                                 \
+  }
+
+#define FSEEKO(file,p,d)                                                                \
+  { if (fseeko(file,p,d) < 0)                                                           \
+      SYSTEM_READ_ERROR                                                                 \
+  }
+
+#define FTELLO(val,file)                                                                \
+  { val = ftello(file);                                                                 \
+    if (val < 0)                                                                        \
+      SYSTEM_READ_ERROR                                                                 \
+  }
+
+/*******************************************************************************************
+ *
+ *  UTILITIES
+ *
+ ********************************************************************************************/
+
+//  The following general utilities return NULL if any of their input pointers are NULL, or if they
+//    could not perform their function (in which case they also print an error to stderr).
+
+void *Malloc(int64 size, char *mesg);                    //  Guarded versions of malloc, realloc
+void *Realloc(void *object, int64 size, char *mesg);     //  and strdup, that output "mesg" to
+char *Strdup(char *string, char *mesg);                  //  stderr if out of memory
+
+FILE *Fopen(char *path, char *mode);     // Open file path for "mode"
+char *PathTo(char *path);                // Return path portion of file name "path"
+char *Root(char *path, char *suffix);    // Return the root name, excluding suffix, of "path"
+
+// Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer
+// Numbered_Suffix returns concatenation of left.<num>.right in a *temporary* buffer
+
+char *Catenate(char *path, char *sep, char *root, char *suffix);
+char *Numbered_Suffix(char *left, int num, char *right);
+
+
+// DB-related utilities
+
+void Print_Number(int64 num, int width, FILE *out);   //  Print big integer with commas
+int  Number_Digits(int64 num);                        //  Return # of digits in printed number
+
+#define COMPRESSED_LEN(len)  (((len)+3) >> 2)         //  # of bytes for len bases at 4 per byte
+
+void   Compress_Read(int len, char *s);   //  Compress read in-place into 2-bit form
+void Uncompress_Read(int len, char *s);   //  Uncompress read in-place into numeric form
+void      Print_Read(char *s, int width);
+
+void Lower_Read(char *s);     //  Convert read from numbers to lowercase letters (0-3 to acgt)
+void Upper_Read(char *s);     //  Convert read from numbers to uppercase letters (0-3 to ACGT)
+void Number_Read(char *s);    //  Convert read from letters to numbers
+void Change_Read(char *s);    //  Convert read from one case to the other
+
+void Letter_Arrow(char *s);   //  Convert arrow pw's from numbers to uppercase letters (0-3 to 1234)
+void Number_Arrow(char *s);   //  Convert arrow pw string from letters to numbers
+
+
+/*******************************************************************************************
+ *
+ *  DB IN-CORE DATA STRUCTURES
+ *
+ ********************************************************************************************/
+
+#define DB_QV   0x03ff   //  Mask for 3-digit quality value
+#define DB_CCS  0x0400   //  This is the second or later of a group of subreads from a given insert
+#define DB_BEST 0x0800   //  This is the "best" subread of a given insert (may be the only 1)
+
+#define DB_ARROW 0x2     //  DB is an arrow DB
+#define DB_ALL   0x1     //  all wells are in the trimmed DB
+
+//  Fields have different interpretations if a .db versus a .dam
+
+typedef struct
+  { int     origin; //  Well # (DB), Contig # (DAM)
+    int     rlen;   //  Length of the sequence (Last pulse = fpulse + rlen)
+    int     fpulse; //  First pulse (DB), left index of contig in scaffold (DAM)
+    int64   boff;   //  Offset (in bytes) of compressed read in 'bases' file, or offset of
+                    //    uncompressed bases in memory block
+    int64   coff;   //  Offset (in bytes) of compressed quiva streams in '.qvs' file (DB),
+                    //  Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
+                    //  4 compressed shorts containing snr info if an arrow DB.
+    int     flags;  //  QV of read + flags above (DB only)
+  } DAZZ_READ;
+
+//  A track can be of 3 types:
+//    data == NULL: there are nreads 'anno' records of size 'size'.
+//    data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1])
+//                                    contains the variable length data
+//    data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1])
+//                                    contains the variable length data
+//    if loaded is not set then the data, if present, is not in memory, rather data is an
+//                                    open file pointer set for reading.
+
+typedef struct _track
+  { struct _track *next;   //  Link to next track
+    char          *name;   //  Symbolic name of track
+    int            size;   //  Size in bytes of anno records
+    int            nreads; //  Number of reads in track
+    void          *anno;   //  over [0,nreads]: read i annotation: int, int64, or 'size' records
+    int           *alen;   //  length of track data for read i (if data != NULL)
+    void          *data;   //  data[anno[i] .. anno[i]+alen[i]) is data for read i (if data != NULL)
+    int            loaded; //  Is track data loaded in memory?
+    int64          dmax;   //  Largest read data segment in bytes
+  } DAZZ_TRACK;
+
+//  The trailing part of a .anno track file can contain meta-information produced by the
+//    command that produced the track.  For example, the coverage, or good/bad parameters
+//    for trimming, or even say a histogram of QV values.  Each item is an array of 'nelem'
+//    64-bit ints or floats ('vtype' = DB_INT or DB_REAL), has a 'name' string that
+//    describes it, and an indicator as to whether the values should be equal across all
+//    block tracks, or summed across all block tracks (by Catrack).  'value' points at the
+//    array of values
+
+#define DB_INT  0
+#define DB_REAL 1
+
+#define DB_EXACT 0
+#define DB_SUM   1
+
+typedef struct
+  { int   vtype;  //  INT64 or FLOAT64
+    int   nelem;  //  >= 1
+    int   accum;  //  EXACT, SUM
+    char *name;
+    void *value;
+  } DAZZ_EXTRA;
+
+//  The information for accessing QV streams is in a DAZZ_QV record that is a "pseudo-track"
+//    named ".@qvs" and is always the first track record in the list (if present).  Since normal
+//    track names cannot begin with a . (this is enforced), this pseudo-track is never confused
+//    with a normal track.
+
+typedef struct
+  { struct _track *next;
+    char          *name;
+    int            ncodes;  //  # of coding tables
+    QVcoding      *coding;  //  array [0..ncodes-1] of coding schemes (see QV.h)
+    uint16        *table;   //  for i in [0,db->nreads-1]: read i should be decompressed with
+                            //    scheme coding[table[i]]
+    FILE          *quiva;   //  the open file pointer to the .qvs file
+  } DAZZ_QV;
+
+//  The information for accessing Arrow streams is in a DAZZ_ARW record that is a "pseudo-track"
+//    named ".@arw" and is always the first track record in the list (if present).
+//    Since normal track names cannot begin with a . (this is enforced), this pseudo-track
+//    is never confused with a normal track.
+ +typedef struct + { struct _track *next; + char *name; + int64 *aoff; // offset in file or memory of arrow vector for read i + void *arrow; // FILE * to the .arw file if not loaded, memory block otherwise + int loaded; // Are arrow vectors loaded in memory? + } DAZZ_ARROW; + +// Every DB is referred to by an ASCII stub file with extension .db or .dam. This file +// contains the information about the SMRT cells in the DB and the current division of +// the DB into blocks for HPC processing. This file can be read into the following +// data structure: + +typedef struct + { int nfiles; // Number of files/SMRT cells in DB + int *nreads; // [0..nfiles) = # of reads from cell + char **fname; // [0..nfiles) = file name of cell + char **prolog; // [0..nfiles) = fasta header prolog for cell + int all; // Keep only best read from each well? + int cutoff; // Trim reads less than cutoff + int64 bsize; // Target size for blocks + int nblocks; // Number of blocks for DB + int *ublocks; // [0..nblocks] = index of 1st read in block in untrimmed DB + int *tblocks; // [0..nblocks] = index of 1st read in block in trimmed DB + } DAZZ_STUB; + +// The DB record holds all information about the current state of an active DB including an +// array of DAZZ_READs, one per read, and a linked list of DAZZ_TRACKs the first of which +// is always a DAZZ_QV pseudo-track (if the QVs have been loaded). 
+ +typedef struct + { int ureads; // Total number of reads in untrimmed DB + int treads; // Total number of reads in trimmed DB + int cutoff; // Minimum read length in block (-1 if not yet set) + int allarr; // DB_ALL | DB_ARROW + float freq[4]; // frequency of A, C, G, T, respectively + + // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) + + int maxlen; // length of maximum read (initially over all DB) + int64 totlen; // total # of bases (initially over all DB) + + int nreads; // # of reads in actively loaded portion of DB + int trimmed; // DB has been trimmed by cutoff/all + int part; // DB block (if > 0), total DB (if == 0) + int ufirst; // Index of first read in block (without trimming) + int tfirst; // Index of first read in block (with trimming) + + // In order to avoid forcing users to have to rebuild all their DBs to accommodate + // the addition of fields for the size of the actively loaded trimmed and untrimmed + // blocks, an additional read record is allocated in "reads" when a DB is loaded into + // memory (reads[-1]) and the two desired fields are crammed into the first two + // integer spaces of the record. + + char *path; // Root name of DB for .bps, .qvs, and tracks + int loaded; // Are reads loaded in memory? + void *bases; // file pointer for bases file (to fetch reads from), + // or memory pointer to uncompressed block of all sequences. 
+ DAZZ_READ *reads; // Array [-1..nreads] of DAZZ_READ + DAZZ_TRACK *tracks; // Linked list of loaded tracks + } DAZZ_DB; + + +/******************************************************************************************* + * + * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock + * + ********************************************************************************************/ + +#define MAX_NAME 10000 // Longest file name or fasta header line + +#define DB_NFILE "files = %9d\n" // number of files +#define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name +#define DB_NBLOCK "blocks = %9d\n" // number of blocks +#define DB_PARAMS "size = %11lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well +#define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed) + + // Read the specified contents of the DB stub file at "path" and return it encoded in a DAZZ_STUB + // structure. This is allocated by the routine. "path" is assumed to be the complete + // name of the file. If all flags are off, then just the scalar parts of the stub + // are returned (i.e. nfiles, all, cutoff, bsize, nblocks). Returns NULL if an error + // occurred in INTERACTIVE mode + +#define DB_STUB_NREADS 0x1 +#define DB_STUB_FILES 0x2 +#define DB_STUB_PROLOGS 0x4 +#define DB_STUB_BLOCKS 0x8 + +DAZZ_STUB *Read_DB_Stub(char *path, int what); + + // Read the DB stub file "path" and extract the read index range [*first,*last) + // for block n, for the trimmed DB if trim is set, the untrimmed DB otherwise. + // If n is out of range first and last will be set to -1. Returns 0 unless + // an error occurs in INTERACTIVE mode in which case it returns 1. 
+ +int Fetch_Block_Range(char *path, int trim, int n, int *first, int *last); + + // Free a DAZZ_STUB data structure returned by Read_DB_Stub + +void Free_DB_Stub(DAZZ_STUB *stub); + + +/******************************************************************************************* + * + * DB ROUTINES + * + ********************************************************************************************/ + + // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps, + // .DB.qvs, and files .DB.<track>.anno and DB.<track>.data where <track> is a track name + // (not containing a . !). + + // A DAM is basically a DB except that: + // 1. there are no QV's, instead .coff points to the '\0' terminated fasta header of the read + // in an additional file: .DB.hdr + // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences + // contain N-separated contigs), and .fpulse the first base of the contig in the + // fasta entry + + // Open the given database or dam, "path", into the supplied DAZZ_DB record "db". If the name has + // a part # in it then just the part is opened. The index array is allocated (for all or + // just the part) and read in. + // Return status of routine: + // -1: The DB could not be opened for a reason reported by the routine to EPLACE + // 0: Open of DB proceeded without mishap + // 1: Open of DAM proceeded without mishap + +int Open_DB(char *path, DAZZ_DB *db); + + // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings + // of the current DB partition. Reallocate smaller memory blocks for the information kept + // for the retained reads. + +void Trim_DB(DAZZ_DB *db); + + // Return the size in bytes of the given DB + +int64 sizeof_DB(DAZZ_DB *db); + + // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. 
all + // those of the form "prefix/[.]root.part" and call actor with the complete path to each file + // pointed at by path, and the suffix of the path by extension. The . precedes the root + // name if the defined constant HIDE_FILES is set. Always the first call is with the + // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for + // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and + // so this routine gives one a way to know all the tracks associated with a given DB. + // -1 is returned if the path could not be found, and 1 is returned if an error (reported + // to EPLACE) occurred and INTERACTIVE is defined. Otherwise a 0 is returned. + +int List_DB_Files(char *path, void actor(char *path, char *extension)); + + // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, + // and any open file pointers. The record pointed at by db however remains (the user + // supplied it and so should free it). + +void Close_DB(DAZZ_DB *db); + + +/******************************************************************************************* + * + * READ ROUTINES + * + ********************************************************************************************/ + + // Allocate and return a buffer big enough for the largest read in 'db'. + // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte + // are needed by the alignment algorithms. If cannot allocate memory then return NULL + // if INTERACTIVE is defined, or print error to stderr and exit otherwise. + +char *New_Read_Buffer(DAZZ_DB *db); + + // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an + // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) + // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimiter + // for traversals in either direction. 
A non-zero value is returned if an error occured + // and INTERACTIVE is defined. + +int Load_Read(DAZZ_DB *db, int i, char *read, int ascii); + + // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the + // start of the subinterval (not necessarily = to read !!! ). As a lower case ascii + // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string + // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to + // the string holding the substring so it has a delimiter for traversals in either direction. + // A NULL pointer is returned if an error occurred and INTERACTIVE is defined. + +char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii); + + // Allocate a block big enough for all the uncompressed read sequences and read and uncompress + // the reads into it, reset the 'boff' in each read record to be its in-memory offset, + // and set the bases pointer to point at the block after closing the bases file. Return + // with a zero, except when an error occurs and INTERACTIVE is defined in which + // case return with 1. + +int Load_All_Reads(DAZZ_DB *db, int ascii); + + +/******************************************************************************************* + * + * ARROW ROUTINES + * + ********************************************************************************************/ + + // If the Arrow pseudo track is not already in db's track list, then load it and set it up. + // The database reads must not have been loaded with Load_All_Reads yet. + // -1 is returned if a .arw file is not present, and 1 is returned if an error (reported + // to EPLACE) occurred and INTERACTIVE is defined. Otherwise a 0 is returned. 
+ +int Open_Arrow(DAZZ_DB *db); + + // Exactly the same as Load_Read, except that the arrow information is loaded, not the DNA + // sequence, and there is only a choice between numeric (0) or ascii (1); + +int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii); + + // Allocate a block big enough for all the uncompressed Arrow vectors, read them into it, + // reset the 'aoff' of each read in the arrow record to be its in-memory offset, and set the + // arrow pointer to point at the block after closing the arrow file. If ascii is + // non-zero then the arrows are converted to 0123 ascii, otherwise the arrows are left + // as numeric strings over [0-3]. + // NOTE(review): the return value is not described here; presumably 0 on success and 1 on + // error under INTERACTIVE, as for Load_All_Reads -- confirm against DB.c. + +int Load_All_Arrows(DAZZ_DB *db, int ascii); + + // Remove the Arrow pseudo track, all space associated with it, and close the .arw file. + +void Close_Arrow(DAZZ_DB *); + + +/******************************************************************************************* + * + * TRACK ROUTINES + * + ********************************************************************************************/ + + // Look up the file and header in the file of the indicated track. Return: + // 1: Track is for trimmed DB + // 0: Track is for untrimmed DB + // -1: Track is not the right size of DB either trimmed or untrimmed + // -2: Could not find the track + // -3: Error return (if INTERACTIVE mode only) + // In addition, if opened (0 or 1 returned), then kind points at an integer indicating + // the type of track as follows: + // CUSTOM 0 => a custom track + // MASK 1 => a mask track + +#define CUSTOM_TRACK 0 +#define MASK_TRACK 1 + +int Check_Track(DAZZ_DB *db, char *track, int *kind); + + // If track is not already in the db's track list, then allocate all the storage for the anno + // index, read it in from the appropriate file, add it to the track list, and return a pointer + // to the newly created DAZZ_TRACK record. If the track does not exist or cannot be + // opened for some reason, then NULL is returned if INTERACTIVE is defined. 
Otherwise + // the routine prints an error message to stderr and exits if an error occurs, and returns + // with NULL only if the track does not exist. + +DAZZ_TRACK *Open_Track(DAZZ_DB *db, char *track); + + // Allocate a data buffer large enough to hold the longest read data block that will occur + // in the track. If cannot allocate memory then return NULL if INTERACTIVE is defined, + // or print error to stderr and exit otherwise. + +void *New_Track_Buffer(DAZZ_TRACK *track); + + // Load into 'data' the read data block for read i's "track" data. Return the length of + // the data in bytes, unless an error occurs and INTERACTIVE is defined in which case + // return with -1. + +int Load_Track_Data(DAZZ_TRACK *track, int i, void *data); + + // Allocate a block big enough for all the track data and read the data into it, + // reset the 'off' in each anno pointer to be its in-memory offset, and set the + // data pointer to point at the block after closing the data file. Return with a + // zero, except when an error occurs and INTERACTIVE is defined in which + // case return with 1. + +int Load_All_Track_Data(DAZZ_TRACK *track); + + // Assuming file pointer for afile is correctly positioned at the start of an extra item, + // and aname is the name of the .anno file, decode the value present and place it in + // extra if extra->nelem == 0, otherwise reduce the value just read into extra according + // to the directive given by 'accum'. Leave the read pointer at the next + // extra or end-of-file. 
+ // Returns: + // 1 if at the end of file, + // 0 if item was read and folded correctly, + // -1 if there was a system IO or allocation error (if interactive), and + // -2 if the new value could not be reduced into the current value of extra (interactive) + +int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra); + + // Write extra record to end of file afile and advance write pointer. + // If interactive, then return non-zero on error, if batch, then print + // and halt on error. + +int Write_Extra(FILE *afile, DAZZ_EXTRA *extra); + + // If track is on the db's track list, then it is removed and all storage associated with it + // is freed. + +void Close_Track(DAZZ_DB *db, DAZZ_TRACK *track); + + +/******************************************************************************************* + * + * QV ROUTINES + * + ********************************************************************************************/ + + // If QV pseudo track is not already in db's track list, then load it and set it up. + // The database must not have been trimmed yet. -1 is returned if a .qvs file is not + // present, and 1 is returned if an error (reported to EPLACE) occurred and INTERACTIVE + // is defined. Otherwise a 0 is returned. + +int Open_QVs(DAZZ_DB *db); + + // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur + // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined, + // or print error to stderr and exit otherwise. + +#define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer +#define DEL_TAG 1 // The deleted characters +#define INS_QV 2 // The insertion QVs +#define SUB_QV 3 // The substitution QVs +#define MRG_QV 4 // The merge QVs + +char **New_QV_Buffer(DAZZ_DB *db); + + // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters + // are converted to a numeric or upper/lower case ascii string as per ascii. 
Return with + // a zero, except when an error occurs and INTERACTIVE is defined in which case return with 1. + +int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii); + + // Remove the QV pseudo track, all space associated with it, and close the .qvs file. + +void Close_QVs(DAZZ_DB *db); + + +/******************************************************************************************* + * + * @-SIGN EXPANSION ROUTINES + * + ********************************************************************************************/ + + // Take a command line argument and interpret the '@' block number ranges. + // Parse_Block_[LAS,DB]_Arg produces a Block_Looper iterator object that can then + // be invoked multiple times to iterate through all the file names implied by + // the @ pattern/range. Next_Block_Slice returns a string encoding the next + // 'slice' files represented by an @-notation, and advances the iterator by + // that many files. + +typedef void Block_Looper; + +Block_Looper *Parse_Block_LAS_Arg(char *arg); +Block_Looper *Parse_Block_DB_Arg(char *arg); + +int Next_Block_Exists(Block_Looper *e_parse); +FILE *Next_Block_Arg(Block_Looper *e_parse); +void Reset_Block_Arg(Block_Looper *e_parse); // Reset iterator to first file +int Advance_Block_Arg(Block_Looper *e_parse); // Advance iterator to next file, 0 if none +void Free_Block_Arg(Block_Looper *e_parse); // Free the iterator + +char *Next_Block_Slice(Block_Looper *e_parse,int slice); + +char *Block_Arg_Path(Block_Looper *e_parse); // Path of current file, must free +char *Block_Arg_Root(Block_Looper *e_parse); // Root name of current file, must free + +#endif // _DAZZ_DB diff --git a/HPC.daligner.c b/HPC.daligner.c new file mode 100644 index 0000000..b9e5187 --- /dev/null +++ b/HPC.daligner.c @@ -0,0 +1,1159 @@ +/*********************************************************************************************\ + * + * Produce a script to compute overlaps for all block pairs of a DB, and then sort and merge + * 
them into as many .las files as their are blocks. + * + * Author: Gene Myers + * Date : June 1, 2014 + * + *********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" + +#undef LSF // define if want a directly executable LSF script +#undef SLURM // define if want a directly executable SLURM script + +static char *Usage[] = + { "[-vad] [-l] [-s] [-t] [-M]", + " [-P] [-B] [-T] [-f]", + " ( [-k] [-%] [-h] [-e] [-H]", + " [-k] [-%] [-h] [-e] )", + " [-m]+ [[-]]" + }; + + // Command Options + +static int BUNIT; +static int VON, CON, DON; +static int WINT, TINT, HGAP, HINT, KINT, SINT, PINT, LINT, MINT; +static int NTHREADS; +static double EREL; +static int MMAX, MTOP; +static char **MASK; +static char *ONAME; +static char *PDIR; + +#ifdef LSF + +#define HPC + +#define HPC_ALIGN \ + "bsub -q medium -n %d -o DALIGNER.out -e DALIGNER.err -R span[hosts=1] -J align#%d" +#define HPC_MERGE \ + "bsub -q short -n 12 -o MERGE.DAL.out -e MERGE.DAL.err -R span[hosts=1] -J merge#%d" +#define HPC_CHECK \ + "bsub -q short -n 12 -o CHECK.DAL.out -e CHECK.DAL.err -R span[hosts=1] -J check#%d" + +#endif + +#ifdef SLURM + +#define HPC + +#define HPC_ALIGN \ + "srun -p batch -n 1 -c %d --mem_per_cpu=%d -o DALIGNER.out -e DALIGNER.err -J align#%d" +#define HPC_MERGE \ + "srun -p batch -n 1 -c 12 -t 00:05:00 -o MERGE.DAL.out -e MERGE.DAL.err -J merge#%d" +#define HPC_CHECK \ + "srun -p batch -n 1 -c 12 -t 00:05:00 -o CHECK.DAL.out -e CHECK.DAL.err -J check#%d" + +#endif + + // Write the overlap script for the single DB named by argv[1] (optionally restricted to the + // block range given in argv[2] when argc == 3). Output goes to stdout, or, when ONAME is + // set, to the files <ONAME>.00.MKDIR, .01.OVL, .02.CHECK.OPT, .03.MERGE and .04.RM.OPT. + // Exits with status 1 on any argument or file-state error. + +void daligner_script(int argc, char *argv[]) +{ int nblocks; + int usepath; + int useblock; + int fblock, lblock; +#ifdef HPC + int jobid; +#endif + + FILE *out; + char name[100]; + char *pwd, *root; + + // Make sure DB exists and is partitioned, get number of blocks in partition + // NOTE(review): the .dam suffix test below indexes strlen(argv[1])-4 bytes into argv[1]; + // an argument shorter than 4 characters would point before the string -- confirm that + // callers guarantee a longer name. + + pwd = PathTo(argv[1]); + if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) + root = 
Root(argv[1],".dam"); + else + root = Root(argv[1],".db"); + + { int i, nfiles; + FILE *dbvis; + + dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r"); + if (dbvis == NULL) + { dbvis = Fopen(Catenate(pwd,"/",root,".db"),"r"); + if (dbvis == NULL) + exit (1); + } + + if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) + SYSTEM_READ_ERROR + for (i = 0; i < nfiles; i++) + { char buffer[30001]; + + if (fgets(buffer,30000,dbvis) == NULL) + SYSTEM_READ_ERROR + } + + useblock = 1; + if (fscanf(dbvis,"blocks = %d\n",&nblocks) != 1 || nblocks == 1) + { useblock = 0; + nblocks = 1; + } + + usepath = (strcmp(pwd,".") != 0); + + fclose(dbvis); + } + + // Set range fblock-lblock checking that DB.<fblock-1>.las exists & DB.<fblock>.las does not + + { char *eptr, *fptr; + FILE *file; + + if (argc == 3) + { fblock = strtol(argv[2],&eptr,10); + if (*eptr != '\0' && *eptr != '-') + { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", + Prog_Name,argv[2]); + exit (1); + } + useblock = 1; + if (*eptr == '-') + { lblock = strtol(eptr+1,&fptr,10); + if (*fptr != '\0') + { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", + Prog_Name,eptr+1); + exit (1); + } + } + else + lblock = fblock; + if (fblock < 1 || lblock > nblocks || fblock > lblock) + { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); + exit (1); + } + } + else + { fblock = 1; + lblock = nblocks; + } + + if (fblock > 1) + { file = fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",fblock-1,".las")),"r"); + if (file == NULL) + { if (usepath) + fprintf(stderr,"%s: File %s/%s.%d.las should already be present!\n", + Prog_Name,pwd,root,fblock-1); + else + fprintf(stderr,"%s: File %s.%d.las should already be present!\n", + Prog_Name,root,fblock-1); + exit (1); + } + else + fclose(file); + } + if (useblock) + file = fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",fblock,".las")),"r"); + else + file = fopen(Catenate(pwd,"/",root,".las"),"r"); + if (file != NULL) + { if (usepath) + if 
(useblock) + fprintf(stderr,"%s: File %s/%s.%d.las should not yet exist!\n", + Prog_Name,pwd,root,fblock); + else + fprintf(stderr,"%s: File %s/%s.las should not yet exist!\n",Prog_Name,pwd,root); + else + if (useblock) + fprintf(stderr,"%s: File %s.%d.las should not yet exist!\n",Prog_Name,root,fblock); + else + fprintf(stderr,"%s: File %s.las should not yet exist!\n",Prog_Name,root); + exit (1); + } + + DON = (DON && (lblock > 1)); + out = stdout; + } + + { int njobs; + int i, j, k; + + // Create all work subdirectories if DON + // NOTE(review): here and at each later stage, name is a char[100] filled by sprintf from + // ONAME and the fopen result is used unchecked -- a long ONAME or an unwritable + // directory is not handled. + + if (DON && lblock > 1) + { if (ONAME != NULL) + { sprintf(name,"%s.00.MKDIR",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Create work subdirectories\n"); + for (i = 1; i <= lblock; i++) + fprintf(out,"mkdir -p work%d\n",i); + + if (ONAME != NULL) + fclose(out); + } + + // Produce all necessary daligner jobs: block i is compared against blocks 1..i, with + // that range split into (i-1)/BUNIT+1 consecutive groups [low,hgh), one job per group + + if (ONAME != NULL) + { sprintf(name,"%s.01.OVL",ONAME); + out = fopen(name,"w"); + } + + njobs = 0; + for (i = fblock; i <= lblock; i++) + njobs += (i-1)/BUNIT+1; + + fprintf(out,"# Daligner jobs (%d)\n",njobs); + +#ifdef HPC + jobid = 1; +#endif + for (i = fblock; i <= lblock; i++) + { int bits; + int low, hgh; + + bits = (i-1)/BUNIT+1; + low = 1; + for (j = 1; j <= bits; j++) + { +#ifdef LSF + fprintf(out,HPC_ALIGN,NTHREADS,jobid++); + fprintf(out," \""); +#endif +#ifdef SLURM + if (MINT >= 0) + fprintf(out,HPC_ALIGN,NTHREADS,(MINT*1024)/NTHREADS,jobid++); + else + fprintf(out,HPC_ALIGN,NTHREADS,(16*1024)/NTHREADS,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"daligner"); + if (VON) + fprintf(out," -v"); + if (CON) + fprintf(out," -a"); + if (KINT != 16) + fprintf(out," -k%d",KINT); + if (PINT != 28) + fprintf(out," -%%%d",PINT); + if (WINT != 6) + fprintf(out," -w%d",WINT); + if (HINT != 50) + fprintf(out," -h%d",HINT); + if (TINT > 0) + fprintf(out," -t%d",TINT); + if (HGAP > 0) + fprintf(out," -H%d",HGAP); + if (EREL > 0.) 
+ fprintf(out," -e%g",EREL); + if (LINT != 1500) + fprintf(out," -l%d",LINT); + if (SINT != 100) + fprintf(out," -s%d",SINT); + if (MINT >= 0) + fprintf(out," -M%d",MINT); + if (PDIR != NULL) + fprintf(out," -P%s",PDIR); + if (NTHREADS != 4) + fprintf(out," -T%d",NTHREADS); + for (k = 0; k < MTOP; k++) + fprintf(out," -m%s",MASK[k]); + if (useblock) + if (usepath) + fprintf(out," %s/%s.%d",pwd,root,i); + else + fprintf(out," %s.%d",root,i); + else + if (usepath) + fprintf(out," %s/%s",pwd,root); + else + fprintf(out," %s",root); + hgh = (i*j)/bits + 1; + + if (useblock) + if (usepath) + fprintf(out," %s/%s.@%d-%d",pwd,root,low,hgh-1); + else + fprintf(out," %s.@%d-%d",root,low,hgh-1); + else + if (usepath) + fprintf(out," %s/%s",pwd,root); + else + fprintf(out," %s",root); + + if (lblock == 1) // ==> i = 1, [low,hgh) = [1,2) + { fprintf(out," && mv"); + if (useblock) + fprintf(out," %s.1.%s.1.las",root,root); + else + fprintf(out," %s.%s.las",root,root); + if (usepath) + fprintf(out," %s/",pwd); + else + fprintf(out," "); + if (useblock) + fprintf(out,"%s.1.las",root); + else + fprintf(out,"%s.las",root); + } + else if (DON) + { fprintf(out," && mv"); + for (k = low; k < hgh; k++) + fprintf(out," %s.%d.%s.%d.las",root,i,root,k); + fprintf(out," work%d",i); + for (k = low; k < hgh; k++) + if (k != i) + fprintf(out," && mv %s.%d.%s.%d.las work%d",root,k,root,i,k); + } + +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + low = hgh; + } + } + + // Check .las files (optional) + + if (ONAME != NULL) + { fclose(out); + sprintf(name,"%s.02.CHECK.OPT",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Check initial .las files jobs (%d) (optional but recommended)\n",lblock); + +#ifdef HPC + jobid = 1; +#endif + for (i = 1; i <= lblock; i++) + { +#ifdef HPC + fprintf(out,HPC_CHECK,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"LAcheck -v%sS",CON?"a":""); + if (usepath) + fprintf(out," %s/%s",pwd,root); + else + fprintf(out," %s",root); + if (lblock 
== 1) + { if (usepath) + if (useblock) + fprintf(out," %s/%s.1",pwd,root); + else + fprintf(out," %s/%s",pwd,root); + else + if (useblock) + fprintf(out," %s.1",root); + else + fprintf(out," %s",root); + } + else if (i < fblock) + { if (DON) + fprintf(out," work%d/%s.%d.%s.%c%d",i,root,i,root,BLOCK_SYMBOL,fblock); + else + fprintf(out," %s.%d.%s.%c%d",root,i,root,BLOCK_SYMBOL,fblock); + } + else + { if (DON) + fprintf(out," work%d/%s.%d.%s.%c",i,root,i,root,BLOCK_SYMBOL); + else + fprintf(out," %s.%d.%s.%c",root,i,root,BLOCK_SYMBOL); + } +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + if (ONAME != NULL) + fclose(out); + + // Merges required if lblock > 1 + + if (lblock > 1) + { if (ONAME != NULL) + { sprintf(name,"%s.03.MERGE",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Merge jobs (%d)\n",lblock); + + // Incremental update merges: for each already-processed block j < fblock, move its + // existing .las aside as _root.j.las, merge it with the new fblock-lblock pair files, + // check the result, and then remove the moved-aside copy + +#ifdef HPC + jobid = 1; +#endif + for (j = 1; j < fblock; j++) + { +#ifdef HPC + fprintf(out,HPC_MERGE,jobid++); + fprintf(out," \""); +#endif + if (DON) + { if (usepath) + fprintf(out,"mv %s/%s.%d.las work%d/_%s.%d.las && ", + pwd,root,j,j,root,j); + else + fprintf(out,"mv %s.%d.las work%d/_%s.%d.las && ",root,j,j,root,j); + } + else + { if (usepath) + fprintf(out,"mv %s/%s.%d.las _%s.%d.las && ",pwd,root,j,root,j); + else + fprintf(out,"mv %s.%d.las _%s.%d.las && ",root,j,root,j); + } + fprintf(out,"LAmerge"); + if (VON) + fprintf(out," -v"); + if (CON) + fprintf(out," -a"); + if (usepath) + fprintf(out," %s/%s.%d",pwd,root,j); + else + fprintf(out," %s.%d",root,j); + if (DON) + fprintf(out," work%d/_%s.%d",j,root,j); + else + fprintf(out," _%s.%d",root,j); + if (DON) + fprintf(out," work%d/%s.%d.%s.%c%d-%d",j,root,j,root,BLOCK_SYMBOL,fblock,lblock); + else + fprintf(out," %s.%d.%s.%c%d-%d",root,j,root,BLOCK_SYMBOL,fblock,lblock); + if (usepath) + fprintf(out," && LAcheck -v%sS %s/%s %s/%s.%d",CON?"a":"",pwd,root,pwd,root,j); + else + fprintf(out," && LAcheck -v%sS %s %s.%d",CON?"a":"",root,root,j); + 
if (DON) + fprintf(out," && rm work%d/_%s.%d.las",j,root,j); + else + fprintf(out," && rm _%s.%d.las",root,j); +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + // New block merges: each block j in fblock-lblock merges all of its pair files into + // root.j.las and then checks the result + + for (j = fblock; j <= lblock; j++) + { +#ifdef HPC + fprintf(out,HPC_MERGE,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"LAmerge"); + if (VON) + fprintf(out," -v"); + if (CON) + fprintf(out," -a"); + if (usepath) + fprintf(out," %s/%s.%d",pwd,root,j); + else + fprintf(out," %s.%d",root,j); + if (DON) + fprintf(out," work%d/%s.%d.%s.%c",j,root,j,root,BLOCK_SYMBOL); + else + fprintf(out," %s.%d.%s.%c",root,j,root,BLOCK_SYMBOL); + if (usepath) + fprintf(out," && LAcheck -v%sS %s/%s %s/%s.%d",CON?"a":"",pwd,root,pwd,root,j); + else + fprintf(out," && LAcheck -v%sS %s %s.%d",CON?"a":"",root,root,j); +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + // Cleanup (optional) + + if (ONAME != NULL) + { fclose(out); + sprintf(name,"%s.04.RM.OPT",ONAME); + out = fopen(name,"w"); + } + fprintf(out,"# Remove block .las files (optional)\n"); + + for (i = 1; i <= lblock; i++) + { if (DON) + fprintf(out,"cd work%d; ",i); + fprintf(out,"rm %s.%d.%s.*.las",root,i,root); + if (DON) + fprintf(out,"; cd .."); + fprintf(out,"\n"); + } + + if (ONAME != NULL) + fclose(out); + } + } + + free(root); + free(pwd); +} + +/*********************************************************************************************\ + * + * Produce a script to compute overlaps for all block pairs between two DBs, and then sort + * and merge them into as many .las files as there are blocks of the 1st DB. 
+ * + * Author: Gene Myers + * Date : December 31, 2014 + * + *********************************************************************************************/ + +#ifdef LSF + +#define HPC_MALIGN \ + "bsub -q medium -n %d -o MAP.ALL.out -e MAP.ALL.err -R span[hosts=1] -J malign#%d" +#define HPC_MMERGE \ + "bsub -q short -n 12 -o MERGE.ALL.out -e MERGE.ALL.err -R span[hosts=1] -J mmerge#%d" +#define HPC_MCHECK \ + "bsub -q short -n 12 -o CHECK.ALL.out -e CHECK.ALL.err -R span[hosts=1] -J mcheck#%d" + +#endif + +#ifdef SLURM + +#define HPC_MALIGN \ + "srun -p batch -n 1 -c %d --mem_per_cpu=%d -o MAP.ALL.out -e MAP.ALL.err -J malign#%d" +#define HPC_MMERGE \ + "srun -p batch -n 1 -c 12 -t 00:05:00 -o MERGE.ALL.out -e MERGE.ALL.err -J mmerge#%d" +#define HPC_MCHECK \ + "srun -p batch -n 1 -c 12 -t 00:05:00 -o CHECK.ALL.out -e CHECK.DAL.err -J mcheck#%d" + +#endif + +void mapper_script(int argc, char *argv[]) +{ int nblocks1, nblocks2; + int useblock1, useblock2; + int usepath1, usepath2; + int fblock, lblock; +#ifdef HPC + int jobid; +#endif + + FILE *out; + char name[100]; + char *pwd1, *root1; + char *pwd2, *root2; + + // Make sure DAM and DB exist and the DB is partitioned, get number of blocks in partition + + pwd1 = PathTo(argv[1]); + if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) + root1 = Root(argv[1],".dam"); + else + root1 = Root(argv[1],".db"); + + { int i, nfiles; + FILE *dbvis; + + dbvis = fopen(Catenate(pwd1,"/",root1,".dam"),"r"); + if (dbvis == NULL) + { dbvis = Fopen(Catenate(pwd1,"/",root1,".db"),"r"); + if (dbvis == NULL) + exit (1); + } + + if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) + SYSTEM_READ_ERROR + for (i = 0; i < nfiles; i++) + { char buffer[30001]; + + if (fgets(buffer,30000,dbvis) == NULL) + SYSTEM_READ_ERROR + } + + useblock1 = 1; + if (fscanf(dbvis,"blocks = %d\n",&nblocks1) != 1 || nblocks1 == 1) + { useblock1 = 0; + nblocks1 = 1; + } + + usepath1 = (strcmp(pwd1,".") != 0); + + fclose(dbvis); + } + + pwd2 = PathTo(argv[2]); + if 
(strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) + root2 = Root(argv[2],".dam"); + else + root2 = Root(argv[2],".db"); + + if (strcmp(root2,root1) == 0 && strcmp(pwd1,pwd2) == 0) + { fprintf(stderr,"%s: Comparing the same data base %s/%s against itself, use HPCdaligner\n", + Prog_Name,pwd1,root1); + exit (1); + } + + { int i, nfiles; + FILE *dbvis; + + dbvis = fopen(Catenate(pwd2,"/",root2,".dam"),"r"); + if (dbvis == NULL) + { dbvis = Fopen(Catenate(pwd2,"/",root2,".db"),"r"); + if (dbvis == NULL) + exit (1); + } + + if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) + SYSTEM_READ_ERROR + for (i = 0; i < nfiles; i++) + { char buffer[30001]; + + if (fgets(buffer,30000,dbvis) == NULL) + SYSTEM_READ_ERROR + } + + useblock2 = 1; + if (fscanf(dbvis,"blocks = %d\n",&nblocks2) != 1 || nblocks2 == 1) + { useblock2 = 0; + nblocks2 = 1; + } + + usepath2 = (strcmp(pwd2,".") != 0); + + fclose(dbvis); + } + + // Set range fblock-lblock checking that DB..las exists & DB..las does not + + { char *eptr, *fptr, *src2; + FILE *file; + + if (argc == 4) + { fblock = strtol(argv[3],&eptr,10); + if ((*eptr != '\0' && *eptr != '-') || eptr <= argv[3]) + { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", + Prog_Name,argv[3]); + exit (1); + } + useblock2 = 1; + if (*eptr == '-') + { lblock = strtol(eptr+1,&fptr,10); + if (*fptr != '\0' || fptr <= eptr+1) + { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", + Prog_Name,eptr+1); + exit (1); + } + } + else + lblock = fblock; + if (fblock < 1 || lblock > nblocks2 || fblock > lblock) + { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); + exit (1); + } + } + else + { fblock = 1; + lblock = nblocks2; + } + + if (usepath2) + src2 = Strdup(Catenate(pwd2,"/",root2,""),"Allocating small string!"); + else + src2 = Strdup(root2,"Allocating small string!"); + if (src2 == NULL) + exit (1); + + if (fblock > 1) + { file = 
fopen(Catenate(src2,".",root1,Numbered_Suffix(".",fblock-1,".las")),"r"); + if (file == NULL) + { fprintf(stderr,"%s: File %s.%d.%s.las should already be present!\n", + Prog_Name,src2,fblock-1,root1); + exit (1); + } + else + fclose(file); + } + if (useblock2) + { file = fopen(Catenate(src2,".",root1,Numbered_Suffix(".",fblock,".las")),"r"); + if (file != NULL) + { fprintf(stderr,"%s: File %s.%d.%s.las should not yet exist!\n", + Prog_Name,src2,fblock,root1); + exit (1); + } + } + else + { file = fopen(Catenate(src2,".",root1,".las"),"r"); + if (file != NULL) + { fprintf(stderr,"%s: File %s.%s.las should not yet exist!\n", + Prog_Name,src2,root1); + exit (1); + } + } + + free(src2); + + DON = (DON && (nblocks1 > 1)); + out = stdout; + } + + { int njobs; + int i, j, k; + + // Create all work subdirectories if DON + + if (DON && nblocks1 > 1) + { if (ONAME != NULL) + { sprintf(name,"%s.00.MKDIR",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Create work subdirectories\n"); + for (i = fblock; i <= lblock; i++) + fprintf(out,"mkdir -p work%d\n",i); + + if (ONAME != NULL) + fclose(out); + } + + // Produce all necessary daligner jobs ... 
+ + if (ONAME != NULL) + { sprintf(name,"%s.01.CMP",ONAME); + out = fopen(name,"w"); + } + + njobs = nblocks1 * ( (lblock-fblock)/BUNIT + 1); + + fprintf(out,"# Daligner jobs (%d)\n",njobs); + +#ifdef HPC + jobid = 1; +#endif + for (i = fblock; i <= lblock; i++) + { int bits; + int low, hgh; + + bits = (nblocks1-1)/BUNIT+1; + low = 1; + for (j = 1; j <= bits; j++) + { +#ifdef LSF + fprintf(out,HPC_MALIGN,NTHREADS,jobid++); +#endif +#ifdef SLURM + if (MINT >= 0) + fprintf(out,HPC_MALIGN,NTHREADS,(MINT*1024)/NTHREADS,jobid++); + else + fprintf(out,HPC_MALIGN,NTHREADS,(16*1024)/NTHREADS,jobid++); +#endif +#ifdef HPC + fprintf(out," \""); +#endif + fprintf(out,"daligner -A"); + if (VON) + fprintf(out," -v"); + if (CON) + fprintf(out," -a"); + if (KINT != 20) + fprintf(out," -k%d",KINT); + if (PINT != 50) + fprintf(out," -%%%d",PINT); + if (WINT != 6) + fprintf(out," -w%d",WINT); + if (HINT != 70) + fprintf(out," -h%d",HINT); + if (TINT > 0) + fprintf(out," -t%d",TINT); + if (EREL > 0.) + fprintf(out," -e%g",EREL); + else + fprintf(out," -e.85"); + if (LINT != 1000) + fprintf(out," -l%d",LINT); + if (SINT != 100) + fprintf(out," -s%d",SINT); + if (NTHREADS != 4) + fprintf(out," -T%d",NTHREADS); + if (MINT >= 0) + fprintf(out," -M%d",MINT); + if (PDIR != NULL) + fprintf(out," -P%s",PDIR); + for (k = 0; k < MTOP; k++) + fprintf(out," -m%s",MASK[k]); + + fprintf(out," "); + if (usepath2) + fprintf(out,"%s/",pwd2); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",i); + + hgh = 1 + (nblocks1*j)/bits; + for (k = low; k < hgh; k++) + { fprintf(out," "); + if (usepath1) + fprintf(out,"%s/",pwd1); + fprintf(out,"%s",root1); + if (useblock1) + fprintf(out,".%d",k); + } + + if (nblocks1 == 1) + { if (usepath2) + { fprintf(out," && mv %s",root2); + if (useblock2) + fprintf(out,".%d",i); + fprintf(out,".%s.las %s",root1,pwd2); + } + } + else if (DON) + { fprintf(out," && mv"); + for (k = low; k < hgh; k++) + { fprintf(out," %s",root2); + if (useblock2) + 
fprintf(out,".%d",i); + fprintf(out,".%s.%d.las",root1,k); + } + fprintf(out," work%d",i); + } +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + low = hgh; + } + } + + // Check .las files (optional) + + if (ONAME != NULL) + { fclose(out); + sprintf(name,"%s.02.CHECK.OPT",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Check initial .las files jobs (%d) (optional but recommended)\n", + (lblock-fblock)+1); + +#ifdef HPC + jobid = 1; +#endif + for (j = fblock; j <= lblock; j++) + { +#ifdef HPC + fprintf(out,HPC_MCHECK,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"LAcheck -v%sS",CON?"a":""); + if (usepath2) + fprintf(out," %s/%s",pwd2,root2); + else + fprintf(out," %s",root2); + if (usepath1) + fprintf(out," %s/%s",pwd1,root1); + else + fprintf(out," %s",root1); + fprintf(out," "); + if (nblocks1 == 1) + { if (usepath2) + fprintf(out,"%s/",pwd2); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",j); + fprintf(out,".%s",root1); + } + else + { if (DON) + fprintf(out,"work%d/",j); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",j); + fprintf(out,".%s.%c",root1,BLOCK_SYMBOL); + } +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + if (ONAME != NULL) + fclose(out); + + // Higher level merges (if lblock > 1) + + if (nblocks1 > 1) + { if (ONAME != NULL) + { sprintf(name,"%s.03.MERGE",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Merge jobs (%d)\n",(lblock-fblock)+1); + +#ifdef HPC + jobid = 1; +#endif + for (j = fblock; j <= lblock; j++) + { +#ifdef HPC + fprintf(out,HPC_MMERGE,jobid++); + fprintf(out," \""); +#endif + fprintf(out,"LAmerge "); + if (VON) + fprintf(out,"-v "); + if (CON) + fprintf(out,"-a "); + if (usepath2) + fprintf(out,"%s/",pwd2); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",j); + fprintf(out,".%s",root1); + if (DON) + fprintf(out," work%d/",j); + else + fprintf(out," "); + fprintf(out,"%s",root2); + if (useblock2) + fprintf(out,".%d",j); + 
fprintf(out,".%s.%c",root1,BLOCK_SYMBOL); + +#ifdef HPC + fprintf(out,"\""); +#endif + fprintf(out,"\n"); + } + + // Cleanup (optional) + + if (ONAME != NULL) + { fclose(out); + sprintf(name,"%s.04.RM",ONAME); + out = fopen(name,"w"); + } + + fprintf(out,"# Remove temporary .las files\n"); + + for (j = fblock; j <= lblock; j++) + { if (DON) + fprintf(out,"cd work%d; ",j); + fprintf(out,"rm %s",root2); + if (useblock2) + fprintf(out,".%d",j); + fprintf(out,".%s.*.las",root1); + if (DON) + fprintf(out,"; cd .."); + fprintf(out,"\n"); + } + + if (ONAME != NULL) + fclose(out); + } + } + + free(root2); + free(pwd2); + free(root1); + free(pwd1); + + exit (0); +} + +int main(int argc, char *argv[]) +{ int i, j, k; + int flags[128]; + char *eptr; + int mapper; + + // Process options and decide if its a overlap or mapper script + + ARG_INIT("HPC.daligner") + + KINT = 0; + HINT = 0; + HGAP = 0; + EREL = 0.; + + BUNIT = 4; + TINT = 0; + WINT = 6; + LINT = 1500; + SINT = 100; + MINT = -1; + PINT = -1; + PDIR = NULL; + + MTOP = 0; + MMAX = 10; + MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); + if (MASK == NULL) + exit (1); + ONAME = NULL; + + NTHREADS = 4; + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("vadAI"); + break; + case 'e': + ARG_REAL(EREL) + if (EREL < .7 || EREL >= 1.) + { fprintf(stderr,"%s: Average correlation must be in [.7,1.) 
(%g)\n",Prog_Name,EREL); + exit (1); + } + break; + case 'f': + ONAME = argv[i]+2; + break; + case 'h': + ARG_POSITIVE(HINT,"Hit threshold (in bp.s)") + break; + case 'k': + ARG_POSITIVE(KINT,"K-mer length") + if (KINT > 32) + { fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name); + exit (1); + } + break; + case 'l': + ARG_POSITIVE(LINT,"Minimum ovlerap length") + break; + case 'm': + if (MTOP >= MMAX) + { MMAX = 1.2*MTOP + 10; + MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); + if (MASK == NULL) + exit (1); + } + MASK[MTOP++] = argv[i]+2; + break; + case 's': + ARG_POSITIVE(SINT,"Trace spacing") + break; + case 't': + ARG_POSITIVE(TINT,"Tuple suppression frequency") + break; + case 'w': + ARG_POSITIVE(WINT,"Log of bin width") + break; + case 'B': + ARG_NON_NEGATIVE(BUNIT,"Blocks per command") + break; + case 'H': + ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)") + break; + case 'M': + ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)") + break; + case 'P': + PDIR = argv[i]+2; + break; + case 'T': + ARG_POSITIVE(NTHREADS,"Number of threads") + break; + case '%': + ARG_POSITIVE(PINT,"Modimer percentage") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + VON = flags['v']; + CON = flags['a']; + DON = flags['d']; + + if (argc < 2 || argc > 4) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[4]); + fprintf(stderr,"\n"); + fprintf(stderr," Passed through to daligner.\n"); + fprintf(stderr," -k: k-mer size (must be <= 32).\n"); + fprintf(stderr," -%%: modimer percentage (take %% of the k-mers).\n"); + fprintf(stderr," -w: Look for k-mers in averlapping bands of size 2^-w.\n"); + fprintf(stderr," -h: A seed hit if the k-mers in band cover >= -h bps in 
the"); + fprintf(stderr," targest read.\n"); + fprintf(stderr," -t: Ignore k-mers that occur >= -t times in a block.\n"); + fprintf(stderr," -M: Use only -M GB of memory by ignoring most frequent k-mers.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -e: Look for alignments with -e percent similarity.\n"); + fprintf(stderr," -l: Look for alignments of length >= -l.\n"); + fprintf(stderr," -s: Use -s as the trace point spacing for encoding alignments.\n"); + fprintf(stderr," -H: HGAP option: align only target reads of length >= -H.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -T: Use -T threads.\n"); + fprintf(stderr," -P: Do first level sort and merge in directory -P.\n"); + fprintf(stderr," -m: Soft mask the blocks with the specified mask.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," Script control.\n"); + fprintf(stderr," -v: Run all commands in script in verbose mode.\n"); + fprintf(stderr," -a: Instruct LAsort & LAmerge to sort only on (a,ab).\n"); + fprintf(stderr," -d: Put .las files for each target block in a sub-directory\n"); + fprintf(stderr," -B: # of block compares per daligner job\n"); + fprintf(stderr," -f: Place script bundles in separate files with prefix \n"); + exit (1); + } + + if (argc == 2) + mapper = 0; + else if (argc == 4) + mapper = 1; + else + { (void) strtol(argv[2],&eptr,10); + if ((*eptr == '\0' || *eptr == '-') && eptr > argv[2]) + mapper = 0; + else + mapper = 1; + } + + if (mapper) + { if (HGAP > 0) + { fprintf(stderr,"%s: Cannot use -H option in a comparison script\n",Prog_Name); + exit (1); + } + if (KINT <= 0) + KINT = 20; + if (HINT <= 0) + HINT = 70; + if (EREL <= 0.) 
+ EREL = .85; + if (PINT <= 0) + PINT = 50; + } + else + { if (KINT <= 0) + KINT = 16; + if (HINT <= 0) + HINT = 50; + if (PINT <= 0) + PINT = 28; + } + + if (mapper) + mapper_script(argc,argv); + else + daligner_script(argc,argv); + + exit (0); +} diff --git a/LA2ONE.c b/LA2ONE.c new file mode 100644 index 0000000..a69db40 --- /dev/null +++ b/LA2ONE.c @@ -0,0 +1,617 @@ +/******************************************************************************************* + * + * Utility for displaying the information in the overlaps of a .las file in a very + * simple to parse format. + * + * Author: Gene Myers + * Creation: July 2013 + * Last Mod: Jan 2015 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" +#include "ONElib.h" + +static char *Usage = + "[-cto] [] [ | ...]"; + +static char *One_Schema = + "P 3 dal This is a 1-code las file from daligner\n" + "D X 1 3 INT Data prolog: trace spacing\n" + "O P 2 3 INT 8 INT_LIST A-read and B-read list\n" + ". All per B-read/l.a.s:\n" + "D O 1 6 STRING Orientation [+-]\n" + "D C 1 6 STRING Chain directive [>+-.]\n" + "D A 1 8 INT_LIST A-read alignment intervals: (ab,ae)\n" + "D B 1 8 INT_LIST B-read alignment intervals: (bb,be)\n" + "D D 1 8 INT_LIST Diffs in alignment\n" + "D L 2 3 INT 8 INT_LIST A-read length and then each B-read length\n" + ". 
One line per B-read:\n" + "D T 1 8 INT_LIST Trace segment length\n" + "D Q 1 8 INT_LIST Trace segment diffs\n"; + +static Overlap *ovls; +static uint16 *trace; +static int64 *list; +static char *string; +static OneFile *file1; +static DAZZ_READ *read1, *read2; + +static int OVERLAP; +static int DOCOORDS; +static int DOTRACE; + +static void output_pile(Overlap *optr) +{ int i, k; + Overlap *o; + + i = 0; + for (o = ovls; o < optr; o++) + list[i++] = o->bread+1; + oneInt(file1,0) = ovls->aread+1; + oneWriteLine(file1,'P',i,list); + + i = 0; + for (o = ovls; o < optr; o++) + if (COMP(o->flags)) + string[i++] = 'c'; + else + string[i++] = 'n'; + oneWriteLine(file1,'O',i,string); + + i = 0; + for (o = ovls; o < optr; o++) + if (CHAIN_NEXT(o->flags)) + string[i++] = '-'; + else if (BEST_CHAIN(o->flags)) + string[i++] = '>'; + else if (CHAIN_START(o->flags)) + string[i++] = '+'; + else + string[i++] = '.'; + oneWriteLine(file1,'C',i,string); + + if (DOCOORDS) + { i = 0; + for (o = ovls; o < optr; o++) + { list[i++] = o->path.abpos; + list[i++] = o->path.aepos; + } + oneWriteLine(file1,'A',i,list); + + i = 0; + for (o = ovls; o < optr; o++) + { list[i++] = o->path.bbpos; + list[i++] = o->path.bepos; + } + oneWriteLine(file1,'B',i,list); + + oneInt(file1,0) = read1[ovls->aread].rlen; + i = 0; + for (o = ovls; o < optr; o++) + list[i++] = read2[o->bread].rlen; + oneWriteLine(file1,'L',i,list); + + i = 0; + for (o = ovls; o < optr; o++) + list[i++] = o->path.diffs; + oneWriteLine(file1,'D',i,list); + } + + if (DOTRACE) + { uint16 *trace; + int tlen; + + for (o = ovls; o < optr; o++) + { trace = (uint16 *) o->path.trace; + tlen = o->path.tlen; + + i = 0; + for (k = 1; k < tlen; k += 2) + list[i++] = trace[k]; + oneWriteLine(file1,'T',i,list); + + i = 0; + for (k = 0; k < tlen; k += 2) + list[i++] = trace[k]; + oneWriteLine(file1,'Q',i,list); + } + } +} + +static int ORDER(const void *l, const void *r) +{ int x = *((int *) l); + int y = *((int *) r); + return (x-y); +} + +int 
main(int argc, char *argv[]) +{ DAZZ_DB _db1, *db1 = &_db1; + DAZZ_DB _db2, *db2 = &_db2; + OneSchema *schema; + char *command; + + FILE *input; + int64 novl, omax, tmax; + int tspace, tbytes; + int reps, *pts; + int input_pts; + + int ISTWO; + + // Process options and capture command line for provenance + + { int i, j, k; + int flags[128]; + + ARG_INIT("LA2ONE") + + { int n, t; + char *c; + + n = 0; + for (t = 1; t < argc; t++) + n += strlen(argv[t])+1; + + command = Malloc(n+1,"Allocating command string"); + if (command == NULL) + exit (1); + + c = command; + if (argc >= 1) + { c += sprintf(c,"%s",argv[1]); + for (t = 2; t < argc; t++) + c += sprintf(c," %s",argv[t]); + } + *c = '\0'; + } + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("cto") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + OVERLAP = flags['o']; + DOCOORDS = flags['c']; + DOTRACE = flags['t']; + + if (DOTRACE) + DOCOORDS = 1; + + if (argc <= 2) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," Output pile reads, orientation, and chains by default"); + fprintf(stderr," (P, O, C lines)\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -c: Ootput also aligned intervals, read lengths, and diffs"); + fprintf(stderr," (B, E, L, and D lines)\n"); + fprintf(stderr," -t: Output also traces (T and Q lines)\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -o: Output proper overlaps only\n"); + + exit (1); + } + } + + // Open trimmed DB or DB pair + + { int status; + char *pwd, *root; + FILE *input; + + ISTWO = 0; + status = Open_DB(argv[1],db1); + if (status < 0) + exit (1); + if (db1->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); + exit (1); + } + + if (argc > 3) + { pwd = PathTo(argv[3]); + root = Root(argv[3],".las"); + if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) + { ISTWO = 1; + fclose(input); + status = Open_DB(argv[2],db2); 
+ if (status < 0) + exit (1); + if (db2->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); + exit (1); + } + Trim_DB(db2); + } + else + db2 = db1; + free(root); + free(pwd); + } + else + db2 = db1; + Trim_DB(db1); + } + + // Process read index arguments into a sorted list of read ranges + + input_pts = 0; + if (argc == ISTWO+4) + { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') + { char *eptr, *fptr; + int b, e; + + b = strtol(argv[ISTWO+3],&eptr,10); + if (eptr > argv[ISTWO+3] && b > 0) + { if (*eptr == '-') + { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') + { e = strtol(eptr+1,&fptr,10); + input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); + } + } + else + input_pts = (*eptr != '\0'); + } + else + input_pts = 1; + } + } + + if (input_pts) + { int v, x; + FILE *input; + + input = Fopen(argv[ISTWO+3],"r"); + if (input == NULL) + exit (1); + + reps = 0; + while ((v = fscanf(input," %d",&x)) != EOF) + if (v == 0) + { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", + Prog_Name,reps+1,argv[2]); + exit (1); + } + else + reps += 1; + + reps *= 2; + pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); + if (pts == NULL) + exit (1); + + rewind(input); + for (v = 0; v < reps; v += 2) + { fscanf(input," %d",&x); + pts[v] = pts[v+1] = x; + } + + fclose(input); + } + + else + { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); + if (pts == NULL) + exit (1); + + reps = 0; + if (argc > 3+ISTWO) + { int c, b, e; + char *eptr, *fptr; + + for (c = 3+ISTWO; c < argc; c++) + { if (argv[c][0] == LAST_READ_SYMBOL) + { b = db1->nreads; + eptr = argv[c]+1; + } + else + b = strtol(argv[c],&eptr,10); + if (eptr > argv[c]) + { if (b <= 0) + { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); + exit (1); + } + if (*eptr == '\0') + { pts[reps++] = b; + pts[reps++] = b; + continue; + } + else if (*eptr == '-') + { if (eptr[1] == LAST_READ_SYMBOL) + { 
e = INT32_MAX; + fptr = eptr+2; + } + else + e = strtol(eptr+1,&fptr,10); + if (fptr > eptr+1 && *fptr == 0 && e > 0) + { pts[reps++] = b; + pts[reps++] = e; + if (b > e) + { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); + exit (1); + } + continue; + } + } + } + fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); + exit (1); + } + + qsort(pts,reps/2,sizeof(int64),ORDER); + + b = 0; + for (c = 0; c < reps; c += 2) + if (b > 0 && pts[b-1] >= pts[c]-1) + { if (pts[c+1] > pts[b-1]) + pts[b-1] = pts[c+1]; + } + else + { pts[b++] = pts[c]; + pts[b++] = pts[c+1]; + } + pts[b++] = INT32_MAX; + reps = b; + } + else + { pts[reps++] = 1; + pts[reps++] = INT32_MAX; + } + } + + // Initiate file reading and read header + + { char *over, *pwd, *root; + + pwd = PathTo(argv[2+ISTWO]); + root = Root(argv[2+ISTWO],".las"); + over = Catenate(pwd,"/",root,".las"); + input = Fopen(over,"r"); + if (input == NULL) + exit (1); + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + free(pwd); + free(root); + } + + schema = oneSchemaCreateFromText(One_Schema); + file1 = oneFileOpenWriteNew("-",schema,"dal",true,1); + oneAddProvenance(file1,Prog_Name,"1.0","%s >?.dal",command); + + // Scan to determine max trace length and max pile size + + { int in, npt, idx; + int j, ar, al; + int tlen; + int64 odeg; + Overlap _ovl, *ovl = &_ovl; + + in = 0; + npt = pts[0]; + idx = 1; + + // For each record do + + omax = tmax = 0; + odeg = 0; + + al = 0; + for (j = 0; j < novl; j++) + + // Read it in + + { Read_Overlap(input,ovl); + tlen = ovl->path.tlen; + fseeko(input,tlen*tbytes,SEEK_CUR); + + // Determine if it should be displayed + + ar = ovl->aread+1; + if (in) + { while (ar > npt) + { npt = pts[idx++]; + if (ar < npt) + { in = 0; + break; + } + npt = pts[idx++]; + } + } + 
else + { while (ar >= npt) + { npt = pts[idx++]; + if (ar <= npt) + { in = 1; + break; + } + npt = pts[idx++]; + } + } + if (!in) + continue; + + // If -o check display only overlaps + + if (OVERLAP) + { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) + continue; + if (ovl->path.aepos != db1->reads[ovl->aread].rlen && + ovl->path.bepos != db2->reads[ovl->bread].rlen) + continue; + } + + if (ar != al) + { if (odeg > omax) + omax = odeg; + al = ar; + } + odeg += 1; + if (tlen > tmax) + tmax = tlen; + } + if (odeg > omax) + omax = odeg; + } + + // Read the file and display selected records + + { int j; + Overlap *optr; + uint16 *tptr; + int in, npt, idx, ar, last; + + rewind(input); + fread(&novl,sizeof(int64),1,input); + fread(&tspace,sizeof(int),1,input); + + ovls = Malloc(sizeof(Overlap)*omax,"Allocating alignment array"); + trace = Malloc(sizeof(uint16)*omax*tmax,"Allocating trace buffer"); + string = Malloc(sizeof(int64)*omax,"Allocating 1-string"); + if (tmax > 2*omax) + list = Malloc(sizeof(int64)*tmax,"Allocating 1-list"); + else + list = Malloc(sizeof(int64)*omax*2,"Allocating 1-list"); + if (ovls == NULL || trace == NULL || string == NULL || list == NULL) + exit (1); + + read1 = db1->reads; + read2 = db2->reads; + + if (DOTRACE) + { oneInt(file1,0) = tspace; + oneWriteLine(file1,'X',0,NULL); + } + + // For each record do + + in = 0; + npt = pts[0]; + idx = 1; + + optr = ovls; + tptr = trace; + last = -1; + for (j = 0; j < novl; j++) + + // Read it in + + { Read_Overlap(input,optr); + ar = optr->aread+1; + + if (in) + + { if (ar == last) + { optr->path.trace = (void *) tptr; + Read_Trace(input,optr,tbytes); + if (tbytes == 1) + Decompress_TraceTo16(optr); + tptr += sizeof(uint16)*optr->path.tlen; + optr += 1; + } + + else + { output_pile(optr); + + while (ar > npt) + { npt = pts[idx++]; + if (ar < npt) + { in = 0; + break; + } + npt = pts[idx++]; + } + + if (in) + { ovls[0] = *optr++; + tptr = trace; + optr = ovls; + last = ar; + + optr->path.trace = (void 
*) tptr; + Read_Trace(input,optr,tbytes); + if (tbytes == 1) + Decompress_TraceTo16(optr); + tptr += sizeof(uint16)*optr->path.tlen; + optr += 1; + } + else + { fseeko(input,optr->path.tlen*tbytes,SEEK_CUR); + optr = ovls; + tptr = trace; + } + } + } + + else + { while (ar >= npt) + { npt = pts[idx++]; + if (ar <= npt) + { in = 1; + break; + } + npt = pts[idx++]; + } + + if (in) + { last = ar; + + optr->path.trace = (void *) tptr; + Read_Trace(input,optr,tbytes); + if (tbytes == 1) + Decompress_TraceTo16(optr); + tptr += sizeof(uint16)*optr->path.tlen; + optr += 1; + } + else + fseeko(input,optr->path.tlen*tbytes,SEEK_CUR); + } + } + + if (in) + output_pile(optr); + + free(string); + free(list); + free(trace); + free(ovls); + } + + oneFileClose(file1); + oneSchemaDestroy(schema); + + Close_DB(db1); + if (ISTWO) + Close_DB(db2); + + free(command); + + exit (0); +} diff --git a/LAcat.c b/LAcat.c new file mode 100644 index 0000000..fa6b03b --- /dev/null +++ b/LAcat.c @@ -0,0 +1,199 @@ +/******************************************************************************************* + * + * Merge together in index order, overlap files .1.las, .2.las, ... into a + * single overlap file and output to the standard output + * + * Author: Gene Myers + * Date : July 2013 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = "[-v] ... 
> .las"; + +#define MEMORY 1000 // How many megabytes for output buffer + +int main(int argc, char *argv[]) +{ char *iblock, *oblock; + FILE *input; + int64 novl, bsize, ovlsize, ptrsize; + int tspace, tbytes; + int c; + + int VERBOSE; + + // Process options + + { int i, j, k; + int flags[128]; + + ARG_INIT("LAcat") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + { ARG_FLAGS("v") } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + + if (argc <= 1) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," 's may contain a template that is %c-sign optionally\n", + BLOCK_SYMBOL); + fprintf(stderr," followed by an integer or integer range\n"); + exit (1); + } + } + + ptrsize = sizeof(void *); + ovlsize = sizeof(Overlap) - ptrsize; + bsize = MEMORY * 1000000ll; + oblock = (char *) Malloc(bsize,"Allocating output block"); + iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); + if (oblock == NULL || iblock == NULL) + exit (1); + iblock += ptrsize; + + novl = 0; + tspace = -1; + for (c = 1; c < argc; c++) + { Block_Looper *parse; + FILE *input; + + parse = Parse_Block_LAS_Arg(argv[c]); + + while ((input = Next_Block_Arg(parse)) != NULL) + { int64 povl; + int mspace; + + if (fread(&povl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + novl += povl; + if (fread(&mspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + if (tspace < 0) + tspace = mspace; + else if (tspace != mspace) + { fprintf(stderr,"%s: trace-point spacing conflict between %s and earlier files", + Prog_Name,Block_Arg_Root(parse)); + fprintf(stderr," (%d vs %d)\n",tspace,mspace); + exit (1); + } + + fclose(input); + } + + Free_Block_Arg(parse); + } + + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + if (fwrite(&novl,sizeof(int64),1,stdout) != 1) + SYSTEM_READ_ERROR + if (fwrite(&tspace,sizeof(int),1,stdout) != 1) + SYSTEM_READ_ERROR + + { Block_Looper *parse; + 
int c, j; + Overlap *w; + int64 tsize, povl; + int mspace; + char *iptr, *itop; + char *optr, *otop; + + optr = oblock; + otop = oblock + bsize; + + for (c = 1; c < argc; c++) + { parse = Parse_Block_LAS_Arg(argv[c]); + + while ((input = Next_Block_Arg(parse)) != NULL) + { if (fread(&povl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&mspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + + if (VERBOSE) + { fprintf(stderr, + " Concatenating %s: %lld la\'s\n",Block_Arg_Root(parse),povl); + fflush(stderr); + } + + iptr = iblock; + itop = iblock + fread(iblock,1,bsize,input); + + for (j = 0; j < povl; j++) + { if (iptr + ovlsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,input); + } + + w = (Overlap *) (iptr - ptrsize); + tsize = w->path.tlen*tbytes; + + if (optr + ovlsize + tsize > otop) + { if (fwrite(oblock,1,optr-oblock,stdout) != (size_t) (optr-oblock)) + SYSTEM_READ_ERROR + optr = oblock; + } + + memmove(optr,iptr,ovlsize); + optr += ovlsize; + iptr += ovlsize; + + if (iptr + tsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,input); + } + + memmove(optr,iptr,tsize); + optr += tsize; + iptr += tsize; + } + + fclose(input); + } + + Free_Block_Arg(parse); + } + + if (optr > oblock) + { if (fwrite(oblock,1,optr-oblock,stdout) != (size_t) (optr-oblock)) + SYSTEM_READ_ERROR + } + } + + if (VERBOSE) + { fprintf(stderr," Totalling %lld la\'s\n",novl); + fflush(stderr); + } + + free(oblock); + free(iblock-ptrsize); + + exit (0); +} diff --git a/LAcheck.c b/LAcheck.c new file mode 100644 index 0000000..f9e545e --- /dev/null +++ b/LAcheck.c @@ -0,0 +1,397 @@ +/******************************************************************************************* + * + * Check the structural integrity of .las files + * 
+ * Author: Gene Myers + * Date : July 2014 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = "[-vaS] [ ] ..."; + +#define MEMORY 1000 // How many megabytes for output buffer + +int main(int argc, char *argv[]) +{ DAZZ_DB _db1, *db1 = &_db1; + DAZZ_DB _db2, *db2 = &_db2; + int VERBOSE; + int MAP_ORDER; + int SORTED; + int ISTWO; + int status; + + // Process options + + { int i, j, k; + int flags[128]; + + ARG_INIT("LAcheck") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("vaS") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + MAP_ORDER = flags['a']; + SORTED = flags['S']; + + if (argc <= 2) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," -v: Verbose mode, output error messages.\n"); + fprintf(stderr," -S: Check that .las is in sorted order.\n"); + fprintf(stderr," -a: If -S, then check sorted by A-read, A-position pairs\n"); + fprintf(stderr," off => check sorted by A,B-read pairs (LA-piles)\n"); + exit (1); + } + } + + // Open trimmed DB + + { Block_Looper *parse; + int status; + + ISTWO = 0; + status = Open_DB(argv[1],db1); + if (status < 0) + exit (1); + if (db1->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); + exit (1); + } + + if (argc <= 3) + db2 = db1; + else + { parse = Parse_Block_LAS_Arg(argv[2]); + if (! 
Next_Block_Exists(parse)) + { ISTWO = 1; + status = Open_DB(argv[2],db2); + if (status < 0) + exit (1); + if (db2->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); + exit (1); + } + Trim_DB(db2); + } + else + db2 = db1; + Free_Block_Arg(parse); + } + Trim_DB(db1); + } + + { char *iblock; + int64 bsize, ovlsize, ptrsize; + int i, j; + DAZZ_READ *reads1 = db1->reads; + int nreads1 = db1->nreads; + DAZZ_READ *reads2 = db2->reads; + int nreads2 = db2->nreads; + + // Setup IO buffers + + ptrsize = sizeof(void *); + ovlsize = sizeof(Overlap) - ptrsize; + bsize = MEMORY * 1000000ll; + iblock = (char *) Malloc(bsize+ptrsize,"Allocating input block"); + if (iblock == NULL) + exit (1); + iblock += ptrsize; + + // For each file do + + status = 0; + for (i = 2+ISTWO; i < argc; i++) + { Block_Looper *parse; + FILE *input; + char *disp; + char *iptr, *itop; + Overlap last, prev; + int64 novl; + int tspace, tbytes; + int has_chains; + + // Establish IO and (novl,tspace) header + + parse = Parse_Block_LAS_Arg(argv[i]); + + while ((input = Next_Block_Arg(parse)) != NULL) + { disp = Block_Arg_Root(parse); + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + if (novl < 0) + { if (VERBOSE) + fprintf(stderr," %s: Number of alignments < 0\n",disp); + goto error; + } + if (tspace < 0) + { if (VERBOSE) + fprintf(stderr," %s: Trace spacing < 0\n",disp); + goto error; + } + + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + iptr = iblock; + itop = iblock + fread(iblock,1,bsize,input); + + // For each record in file do + + has_chains = 0; + last.aread = -1; + last.bread = -1; + last.flags = 0; + last.path.bbpos = last.path.abpos = 0; + last.path.bepos = last.path.aepos = 0; + prev = last; + for (j = 0; j < novl; j++) + { Overlap ovl; + int tsize; + int equal; + + // Fetch next record + + if (iptr + ovlsize > itop) 
+ { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,input); + if (iptr + ovlsize > itop) + { if (VERBOSE) + fprintf(stderr," %s: Too few alignment records\n",disp); + goto error; + } + } + + ovl = *((Overlap *) (iptr - ptrsize)); + iptr += ovlsize; + tsize = ovl.path.tlen*tbytes; + + if (iptr + tsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,input); + if (iptr + tsize > itop) + { if (VERBOSE) + fprintf(stderr," %s: Too few alignment records\n",disp); + goto error; + } + } + ovl.path.trace = iptr; + iptr += tsize; + + // Basic checks + + if (ovl.aread < 0 || ovl.bread < 0) + { if (VERBOSE) + fprintf(stderr," %s: Read indices < 0\n",disp); + goto error; + } + if (ovl.aread >= nreads1 || ovl.bread >= nreads2) + { if (VERBOSE) + fprintf(stderr," %s: Read indices out of range\n",disp); + goto error; + } + + if (ovl.path.abpos >= ovl.path.aepos || ovl.path.aepos > reads1[ovl.aread].rlen || + ovl.path.bbpos >= ovl.path.bepos || ovl.path.bepos > reads2[ovl.bread].rlen || + ovl.path.abpos < 0 || ovl.path.bbpos < 0 ) + { if (VERBOSE) + fprintf(stderr," %s: Non-sense alignment intervals\n",disp); + goto error; + } + + if (ovl.path.diffs < 0 || ovl.path.diffs > reads1[ovl.aread].rlen || + ovl.path.diffs > reads2[ovl.bread].rlen) + { if (VERBOSE) + fprintf(stderr," %s: Non-sense number of differences\n",disp); + goto error; + } + + if (Check_Trace_Points(&ovl,tspace,VERBOSE,disp)) + goto error; + + if (j == 0) + has_chains = ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0); + if (has_chains) + { if (CHAIN_START(ovl.flags) && CHAIN_NEXT(ovl.flags)) + { if (VERBOSE) + fprintf(stderr," %s: LA has both start & next flag set\n",disp); + goto error; + } + if (BEST_CHAIN(ovl.flags) && CHAIN_NEXT(ovl.flags)) + { if (VERBOSE) + 
fprintf(stderr," %s: LA has both best & next flag set\n",disp); + goto error; + } + } + else + { if ((ovl.flags & (START_FLAG | NEXT_FLAG | BEST_FLAG)) != 0) + { if (VERBOSE) + fprintf(stderr," %s: LAs should not have chain flags\n",disp); + goto error; + } + } + + // Duplicate check and sort check if -S set + + equal = 0; + if (SORTED) + { if (CHAIN_NEXT(ovl.flags)) + { if (ovl.aread == last.aread && ovl.bread != last.bread && + COMP(ovl.flags) != COMP(last.flags) && + ovl.path.abpos >= last.path.abpos && + ovl.path.bbpos >= last.path.bbpos) + goto dupcheck; + if (VERBOSE) + fprintf(stderr," %s: Chain is not valid (%d vs %d)\n", + disp,ovl.aread+1,ovl.bread+1); + goto error; + } + else if (!has_chains) + { if (ovl.aread > last.aread) goto inorder; + if (ovl.aread == last.aread) + { if (MAP_ORDER) + { if (ovl.path.abpos > prev.path.abpos) goto inorder; + if (ovl.path.abpos == prev.path.abpos) + goto dupcheck; + } + else + { if (ovl.bread > last.bread) goto inorder; + if (ovl.bread == last.bread) + { if (COMP(ovl.flags) > COMP(last.flags)) goto inorder; + if (COMP(ovl.flags) == COMP(last.flags)) + { if (ovl.path.abpos > last.path.abpos) goto inorder; + if (ovl.path.abpos == last.path.abpos) + { equal = 1; + goto inorder; + } + } + } + } + } + if (VERBOSE) + fprintf(stderr," %s: LAs are not sorted (%d vs %d)\n", + disp,ovl.aread+1,ovl.bread+1); + goto error; + } + else // First element of a chain + { if (ovl.aread > prev.aread) goto inorder; + if (ovl.aread == prev.aread) + { if (MAP_ORDER) + { if (ovl.path.abpos > prev.path.abpos) goto inorder; + if (ovl.path.abpos == prev.path.abpos) + goto dupcheck; + } + else + { if (ovl.bread > prev.bread) goto inorder; + if (ovl.bread == prev.bread) + { if (COMP(ovl.flags) > COMP(prev.flags)) goto inorder; + if (COMP(ovl.flags) == COMP(prev.flags)) + { if (ovl.path.abpos > prev.path.abpos) goto inorder; + if (ovl.path.abpos == prev.path.abpos) + { equal = 1; + goto dupcheck; + } + } + } + } + } + if (VERBOSE) + fprintf(stderr," 
%s: Chains are not sorted (%d vs %d)\n", + disp,ovl.aread+1,ovl.bread+1); + goto error; + } + } + dupcheck: + if (ovl.aread == last.aread && ovl.bread == last.bread && + COMP(ovl.flags) == COMP(last.flags) && ovl.path.abpos == last.path.abpos) + equal = 1; + inorder: + if (equal) + { if (ovl.path.aepos == last.path.aepos && + ovl.path.bbpos == last.path.bbpos && + ovl.path.bepos == last.path.bepos) + { if (VERBOSE) + fprintf(stderr," %s: Duplicate LAs (%d vs %d)\n", + disp,ovl.aread+1,ovl.bread+1); + goto error; + } + } + + last = ovl; + if (CHAIN_START(ovl.flags)) + prev = ovl; + } + + // File processing epilog: Check all data read and print OK if -v + + if (iptr < itop) + { if (VERBOSE) + fprintf(stderr," %s: Too many alignment records\n",disp); + goto error; + } + + if (VERBOSE) + { printf(" %s: ",disp); + Print_Number(novl,0,stdout); + printf(" all OK\n"); + fflush(stdout); + } + goto cleanup; + + error: + status = 1; + if (VERBOSE) + { printf(" %s: Not OK, see stderr\n",disp); + fflush(stdout); + } + cleanup: + if (input != NULL) + fclose(input); + } + + Free_Block_Arg(parse); + } + + free(iblock-ptrsize); + } + + Close_DB(db1); + if (ISTWO) + Close_DB(db2); + + exit (status); +} diff --git a/LAmerge.c b/LAmerge.c new file mode 100644 index 0000000..2d72545 --- /dev/null +++ b/LAmerge.c @@ -0,0 +1,524 @@ +/******************************************************************************************* + * + * Given a list of sorted .las files, merge them into a single sorted .las file. 
+ * + * Author: Gene Myers + * Date : July 2013 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +#undef DEBUG + +static char *Usage = "[-va] [-P] ..."; + +#define MEMORY 4000 // in Mb + +#define MAX_FILES 250 + + // Heap sort of records according to (aread,bread,COMP(flags),abpos) order + +#define COMPARE(lp,rp) \ + if (lp->aread > rp->aread) \ + bigger = 1; \ + else if (lp->aread < rp->aread) \ + bigger = 0; \ + else if (lp->bread > rp->bread) \ + bigger = 1; \ + else if (lp->bread < rp->bread) \ + bigger = 0; \ + else if (COMP(lp->flags) > COMP(rp->flags)) \ + bigger = 1; \ + else if (COMP(lp->flags) < COMP(rp->flags)) \ + bigger = 0; \ + else if (lp->path.abpos > rp->path.abpos) \ + bigger = 1; \ + else if (lp->path.abpos < rp->path.abpos) \ + bigger = 0; \ + else if (lp > rp) \ + bigger = 1; \ + else \ + bigger = 0; + +static void reheap(int s, Overlap **heap, int hsize) +{ int c, l, r; + int bigger; + Overlap *hs, *hr, *hl; + + c = s; + hs = heap[s]; + while ((l = 2*c) <= hsize) + { r = l+1; + hl = heap[l]; + if (r > hsize) + bigger = 1; + else + { hr = heap[r]; + COMPARE(hr,hl) + } + if (bigger) + { COMPARE(hs,hl) + if (bigger) + { heap[c] = hl; + c = l; + } + else + break; + } + else + { COMPARE(hs,hr) + if (bigger) + { heap[c] = hr; + c = r; + } + else + break; + } + } + if (c != s) + heap[c] = hs; +} + + // Heap sort of records according to (aread,abpos) order + +#define MAPARE(lp,rp) \ + if (lp->aread > rp->aread) \ + bigger = 1; \ + else if (lp->aread < rp->aread) \ + bigger = 0; \ + else if (lp->path.abpos > rp->path.abpos) \ + bigger = 1; \ + else if (lp->path.abpos < rp->path.abpos) \ + bigger = 0; \ + else if (lp > rp) \ + bigger = 1; \ + else \ + bigger = 0; + +static void maheap(int s, Overlap **heap, int hsize) +{ int c, l, r; + int bigger; + Overlap *hs, *hr, *hl; + 
+ c = s; + hs = heap[s]; + while ((l = 2*c) <= hsize) + { r = l+1; + hl = heap[l]; + if (r > hsize) + bigger = 1; + else + { hr = heap[r]; + MAPARE(hr,hl) + } + if (bigger) + { MAPARE(hs,hl) + if (bigger) + { heap[c] = hl; + c = l; + } + else + break; + } + else + { MAPARE(hs,hr) + if (bigger) + { heap[c] = hr; + c = r; + } + else + break; + } + } + if (c != s) + heap[c] = hs; +} + +#ifdef DEBUG + +static void showheap(Overlap **heap, int hsize) +{ int i; + printf("\n"); + for (i = 1; i <= hsize; i++) + printf(" %3d: %5d, %5d\n",i,heap[i]->aread,heap[i]->bread); +} + +#endif + + // Input block data structure and block fetcher + +typedef struct + { FILE *stream; + char *block; + char *ptr; + char *top; + int64 count; + } IO_block; + +static void ovl_reload(IO_block *in, int64 bsize) +{ int64 remains; + + remains = in->top - in->ptr; + if (remains > 0) + memmove(in->block, in->ptr, remains); + in->ptr = in->block; + in->top = in->block + remains; + in->top += fread(in->top,1,bsize-remains,in->stream); +} + + // The program + +int main(int argc, char *argv[]) +{ IO_block *in; + int64 bsize, osize, psize; + char *block, *oblock; + int i, c, fway, clen, nfile[argc]; + Overlap **heap; + int hsize; + Overlap *ovls; + int64 totl; + int tspace, tbytes; + FILE *output; + char *optr, *otop; + + int VERBOSE; + int MAP_SORT; + char *TEMP_PATH; + + // Process command line + + { int j, k; + int flags[128]; + DIR *dirp; + + ARG_INIT("LAmerge") + + TEMP_PATH = "/tmp"; + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("va") + break; + case 'P': + TEMP_PATH = argv[i]+2; + if ((dirp = opendir(TEMP_PATH)) == NULL) + { fprintf(stderr,"%s: -P option: cannot open directory %s\n",Prog_Name,TEMP_PATH); + exit (1); + } + closedir(dirp); + break; + } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + MAP_SORT = flags['a']; + + if (argc < 3) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + 
fprintf(stderr,"\n"); + fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); + fprintf(stderr," -a: sort .las by A-read,A-position pairs for map usecase\n"); + fprintf(stderr," off => sort .las by A,B-read pairs for overlap piles\n"); + fprintf(stderr," -P: Do any intermediate merging in directory -P.\n"); + exit (1); + } + } + + // Determine the number of files and check they are all mergeable + + clen = 2*strlen(TEMP_PATH) + 50; + fway = 0; + totl = 0; + tspace = -1; + for (c = 2; c < argc; c++) + { Block_Looper *parse; + FILE *input; + char *root, *path; + + parse = Parse_Block_LAS_Arg(argv[c]); + + path = Block_Arg_Path(parse); + root = Block_Arg_Root(parse); + + clen += strlen(path) + strlen(root) + 30; + + free(root); + free(path); + + nfile[c] = 0; + while ((input = Next_Block_Arg(parse)) != NULL) + { int64 povl; + int mspace; + + if (fread(&povl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + totl += povl; + if (fread(&mspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + if (tspace < 0) + tspace = mspace; + else if (tspace != mspace) + { fprintf(stderr,"%s: trace-point spacing conflict between %s and earlier files", + Prog_Name,Block_Arg_Root(parse)); + fprintf(stderr," (%d vs %d)\n",tspace,mspace); + exit (1); + } + + fclose(input); + nfile[c] += 1; + } + + Free_Block_Arg(parse); + fway += nfile[c]; + } + + if (VERBOSE) + { printf(" Merging %d files totaling ",fway); + Print_Number(totl,0,stdout); + printf(" records\n"); + fflush(stdout); + } + + // Must recursively merge, emit sub-merges, then merge their results + + if (fway > MAX_FILES) + { Block_Looper *parse; + int mul, dim, fsum, cut; + char command[clen], *com; + int pid; + + mul = 1; + for (c = 0; mul < fway; c++) + mul *= MAX_FILES; + dim = pow(1.*fway,1./c)+1; + + fsum = 0; + c = 2; + + parse = Parse_Block_LAS_Arg(argv[c]); + + pid = getpid(); + for (i = 1; i <= dim; i++) + { com = command; + com += sprintf(com,"LAmerge"); + if (MAP_SORT) + com += sprintf(com," -a"); + if 
(mul > 2) + com += sprintf(com," -P%s",TEMP_PATH); + com += sprintf(com," %s/LM%d.P%d",TEMP_PATH,pid,i); + + cut = (fway * i) / dim; + while (fsum + nfile[c] <= cut) + { com += sprintf(com," %s",Next_Block_Slice(parse,nfile[c])); + fsum += nfile[c]; + + c += 1; + if (c >= argc) + break; + + Free_Block_Arg(parse); + + parse = Parse_Block_LAS_Arg(argv[c]); + } + if (c < argc && fsum < cut) + { int n = cut-fsum; + com += sprintf(com," %s",Next_Block_Slice(parse,n)); + nfile[c] -= n; + fsum += n; + } + system(command); + } + + Free_Block_Arg(parse); + + com = command; + com += sprintf(com,"LAmerge"); + if (MAP_SORT) + com += sprintf(com," -a"); + com += sprintf(com," %s %s/LM%d.P%c",argv[1],TEMP_PATH,pid,BLOCK_SYMBOL); + system(command); + + sprintf(command,"rm %s/LM%d.P*.las",TEMP_PATH,pid); + system(command); + + exit (0); + } + + // Base level merge: Open all the input files and initialize their buffers + + psize = sizeof(void *); + osize = sizeof(Overlap) - psize; + bsize = (MEMORY*1000000ll)/(fway + 1); + block = (char *) Malloc(bsize*(fway+1)+psize,"Allocating LAmerge blocks"); + in = (IO_block *) Malloc(sizeof(IO_block)*fway,"Allocating LAmerge IO-reacords"); + if (block == NULL || in == NULL) + exit (1); + block += psize; + + fway = 0; + for (c = 2; c < argc; c++) + { Block_Looper *parse; + FILE *input; + + parse = Parse_Block_LAS_Arg(argv[c]); + + while ((input = Next_Block_Arg(parse)) != NULL) + { int64 novl; + int mspace; + char *iblock; + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&mspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + + in[fway].stream = input; + in[fway].block = iblock = block+fway*bsize; + in[fway].ptr = iblock; + in[fway].top = iblock + fread(in[fway].block,1,bsize,input); + in[fway].count = 0; + fway += 1; + } + + Free_Block_Arg(parse); + } + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + // Open the output file buffer and write (novl,tspace) 
header + + { char *pwd, *root; + + pwd = PathTo(argv[1]); + root = Root(argv[1],".las"); + output = Fopen(Catenate(pwd,"/",root,".las"),"w"); + if (output == NULL) + exit (1); + free(pwd); + free(root); + + if (fwrite(&totl,sizeof(int64),1,output) != 1) + SYSTEM_READ_ERROR + if (fwrite(&tspace,sizeof(int),1,output) != 1) + SYSTEM_READ_ERROR + + oblock = block+fway*bsize; + optr = oblock; + otop = oblock + bsize; + } + + // Initialize the heap + + heap = (Overlap **) Malloc(sizeof(Overlap *)*(fway+1),"Allocating heap"); + ovls = (Overlap *) Malloc(sizeof(Overlap)*fway,"Allocating heap"); + if (heap == NULL || ovls == NULL) + exit (1); + + hsize = 0; + for (i = 0; i < fway; i++) + { if (in[i].ptr < in[i].top) + { ovls[i] = *((Overlap *) (in[i].ptr - psize)); + in[i].ptr += osize; + hsize += 1; + heap[hsize] = ovls + i; + } + } + + if (hsize > 3) + { if (MAP_SORT) + for (i = hsize/2; i > 1; i--) + maheap(i,heap,hsize); + else + for (i = hsize/2; i > 1; i--) + reheap(i,heap,hsize); + } + + // While the heap is not empty do + + while (hsize > 0) + { Overlap *ov; + IO_block *src; + int64 tsize, span; + + if (MAP_SORT) + maheap(1,heap,hsize); + else + reheap(1,heap,hsize); + + ov = heap[1]; + src = in + (ov - ovls); + + do + { src->count += 1; + + tsize = ov->path.tlen*tbytes; + span = osize + tsize; + if (src->ptr + span > src->top) + ovl_reload(src,bsize); + if (optr + span > otop) + { if (fwrite(oblock,1,optr-oblock,output) != (size_t) (optr-oblock)) + SYSTEM_READ_ERROR + optr = oblock; + } + + memmove(optr,((char *) ov) + psize,osize); + optr += osize; + memmove(optr,src->ptr,tsize); + optr += tsize; + + src->ptr += tsize; + if (src->ptr >= src->top) + { heap[1] = heap[hsize]; + hsize -= 1; + break; + } + *ov = *((Overlap *) (src->ptr - psize)); + src->ptr += osize; + } + while (CHAIN_NEXT(ov->flags)); + } + + // Flush output buffer and wind up + + if (optr > oblock) + { if (fwrite(oblock,1,optr-oblock,output) != (size_t) (optr-oblock)) + SYSTEM_READ_ERROR + } + 
fclose(output); + + for (i = 0; i < fway; i++) + fclose(in[i].stream); + + for (i = 0; i < fway; i++) + totl -= in[i].count; + if (totl != 0) + { fprintf(stderr,"%s: Did not write all records to %s (%lld)\n",argv[0],argv[1],totl); + exit (1); + } + + free(ovls); + free(heap); + free(in); + free(block-psize); + + exit (0); +} diff --git a/LAshow.c b/LAshow.c new file mode 100644 index 0000000..ede907f --- /dev/null +++ b/LAshow.c @@ -0,0 +1,650 @@ +/******************************************************************************************* + * + * Utility for displaying the overlaps in a .las file in a variety of ways including + * a minimal listing of intervals, a cartoon, and a full out alignment. + * + * Author: Gene Myers + * Creation: July 2013 + * Last Mod: Jan 2015 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage[] = + { "[-caroUF] [-i] [-w] [-b] ", + " [ ] [ | ... 
]" + }; + +static int ORDER(const void *l, const void *r) +{ int x = *((int *) l); + int y = *((int *) r); + return (x-y); +} + +int main(int argc, char *argv[]) +{ DAZZ_DB _db1, *db1 = &_db1; + DAZZ_DB _db2, *db2 = &_db2; + Overlap _ovl, *ovl = &_ovl; + Alignment _aln, *aln = &_aln; + + FILE *input; + int sameDB; + int64 novl; + int tspace, tbytes, small; + int reps, *pts; + int input_pts; + + int ALIGN, CARTOON, REFERENCE, OVERLAP; + int FLIP, MAP; + int INDENT, WIDTH, BORDER, UPPERCASE; + int ISTWO; + + // Process options + + { int i, j, k; + int flags[128]; + char *eptr; + + ARG_INIT("LAshow") + + INDENT = 4; + WIDTH = 100; + BORDER = 10; + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + switch (argv[i][1]) + { default: + ARG_FLAGS("caroUFM") + break; + case 'i': + ARG_NON_NEGATIVE(INDENT,"Indent") + break; + case 'w': + ARG_POSITIVE(WIDTH,"Alignment width") + break; + case 'b': + ARG_NON_NEGATIVE(BORDER,"Alignment border") + break; + } + else + argv[j++] = argv[i]; + argc = j; + + CARTOON = flags['c']; + ALIGN = flags['a']; + REFERENCE = flags['r']; + OVERLAP = flags['o']; + UPPERCASE = flags['U']; + FLIP = flags['F']; + MAP = flags['M']; + + if (argc <= 2) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); + fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); + fprintf(stderr,"\n"); + fprintf(stderr," -c: Show a cartoon of the LA between reads.\n"); + fprintf(stderr," -a: Show the alignment of each LA.\n"); + fprintf(stderr," -r: Show the alignment of each LA with -w bp's of A in each row.\n"); + fprintf(stderr," -o: Show only proper overlaps.\n"); + fprintf(stderr," -F: Switch the roles of A- and B-reads.\n"); + fprintf(stderr,"\n"); + fprintf(stderr," -U: Show alignments in upper case.\n"); + fprintf(stderr," -i: Indent alignments and cartoons by -i.\n"); + fprintf(stderr," -w: Width of each row of alignment in symbols (-a) or bps (-r).\n"); + fprintf(stderr," -b: # of border bp.s to show on each side of LA.\n"); + exit 
(1); + } + } + + // Open trimmed DB or DB pair + + { int status; + char *pwd, *root; + FILE *input; + struct stat stat1, stat2; + + ISTWO = 0; + status = Open_DB(argv[1],db1); + if (status < 0) + exit (1); + if (db1->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); + exit (1); + } + + sameDB = 1; + if (argc > 3) + { pwd = PathTo(argv[3]); + root = Root(argv[3],".las"); + if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) + { ISTWO = 1; + fclose(input); + status = Open_DB(argv[2],db2); + if (status < 0) + exit (1); + if (db2->part > 0) + { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); + exit (1); + } + stat(Catenate(db1->path,"","",".idx"),&stat1); + stat(Catenate(db2->path,"","",".idx"),&stat2); + if (stat1.st_ino != stat2.st_ino) + sameDB = 0; + Trim_DB(db2); + } + else + db2 = db1; + free(root); + free(pwd); + } + else + db2 = db1; + Trim_DB(db1); + } + + // Process read index arguments into a sorted list of read ranges + + input_pts = 0; + if (argc == ISTWO+4) + { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') + { char *eptr, *fptr; + int b, e; + + b = strtol(argv[ISTWO+3],&eptr,10); + if (eptr > argv[ISTWO+3] && b > 0) + { if (*eptr == '-') + { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') + { e = strtol(eptr+1,&fptr,10); + input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); + } + } + else + input_pts = (*eptr != '\0'); + } + else + input_pts = 1; + } + } + + if (input_pts) + { int v, x; + FILE *input; + + input = Fopen(argv[ISTWO+3],"r"); + if (input == NULL) + exit (1); + + reps = 0; + while ((v = fscanf(input," %d",&x)) != EOF) + if (v == 0) + { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", + Prog_Name,reps+1,argv[2]); + exit (1); + } + else + reps += 1; + + reps *= 2; + pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); + if (pts == NULL) + exit (1); + + rewind(input); + for (v = 0; v < reps; v += 2) + 
{ fscanf(input," %d",&x); + pts[v] = pts[v+1] = x; + } + + fclose(input); + } + + else + { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); + if (pts == NULL) + exit (1); + + reps = 0; + if (argc > 3+ISTWO) + { int c, b, e; + char *eptr, *fptr; + + for (c = 3+ISTWO; c < argc; c++) + { if (argv[c][0] == LAST_READ_SYMBOL) + { b = db1->nreads; + eptr = argv[c]+1; + } + else + b = strtol(argv[c],&eptr,10); + if (eptr > argv[c]) + { if (b <= 0) + { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); + exit (1); + } + if (*eptr == '\0') + { pts[reps++] = b; + pts[reps++] = b; + continue; + } + else if (*eptr == '-') + { if (eptr[1] == LAST_READ_SYMBOL) + { e = INT32_MAX; + fptr = eptr+2; + } + else + e = strtol(eptr+1,&fptr,10); + if (fptr > eptr+1 && *fptr == 0 && e > 0) + { pts[reps++] = b; + pts[reps++] = e; + if (b > e) + { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); + exit (1); + } + continue; + } + } + } + fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); + exit (1); + } + + qsort(pts,reps/2,sizeof(int64),ORDER); + + b = 0; + for (c = 0; c < reps; c += 2) + if (b > 0 && pts[b-1] >= pts[c]-1) + { if (pts[c+1] > pts[b-1]) + pts[b-1] = pts[c+1]; + } + else + { pts[b++] = pts[c]; + pts[b++] = pts[c+1]; + } + pts[b++] = INT32_MAX; + reps = b; + } + else + { pts[reps++] = 1; + pts[reps++] = INT32_MAX; + } + } + + // Initiate file reading and read (novl, tspace) header + + { char *over, *pwd, *root; + + pwd = PathTo(argv[2+ISTWO]); + root = Root(argv[2+ISTWO],".las"); + over = Catenate(pwd,"/",root,".las"); + input = Fopen(over,"r"); + if (input == NULL) + exit (1); + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + if (tspace < 0) + { fprintf(stderr,"%s: Garbage .las file, trace spacing < 0 !\n",Prog_Name); + exit (1); + } + + if (tspace <= TRACE_XOVR && tspace != 0) + { small = 1; + tbytes = sizeof(uint8); + 
} + else + { small = 0; + tbytes = sizeof(uint16); + } + + printf("\n%s: ",root); + Print_Number(novl,0,stdout); + printf(" records\n"); + + free(pwd); + free(root); + } + + // Read the file and display selected records + + { int j; + uint16 *trace; + Work_Data *work; + int tmax; + int in, npt, idx, ar; + int64 tps; + + char *abuffer, *bbuffer; + int ar_wide, br_wide; + int ai_wide, bi_wide; + int mn_wide, mx_wide; + int tp_wide; + int blast, match, seen, lhalf, rhalf; + + aln->path = &(ovl->path); + if (ALIGN || REFERENCE) + { work = New_Work_Data(); + abuffer = New_Read_Buffer(db1); + bbuffer = New_Read_Buffer(db2); + } + else + { abuffer = NULL; + bbuffer = NULL; + work = NULL; + } + + tmax = 1000; + trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); + if (trace == NULL) + exit (1); + + in = 0; + npt = pts[0]; + idx = 1; + + ar_wide = Number_Digits((int64) db1->nreads); + br_wide = Number_Digits((int64) db2->nreads); + ai_wide = Number_Digits((int64) db1->maxlen); + bi_wide = Number_Digits((int64) db2->maxlen); + if (db1->maxlen < db2->maxlen) + { mn_wide = ai_wide; + mx_wide = bi_wide; + if (tspace > 0) + tp_wide = Number_Digits((int64) db1->maxlen/tspace+2); + else + tp_wide = 0; + } + else + { mn_wide = bi_wide; + mx_wide = ai_wide; + if (tspace > 0) + tp_wide = Number_Digits((int64) db2->maxlen/tspace+2); + else + tp_wide = 0; + } + ar_wide += (ar_wide-1)/3; + br_wide += (br_wide-1)/3; + ai_wide += (ai_wide-1)/3; + bi_wide += (bi_wide-1)/3; + mn_wide += (mn_wide-1)/3; + tp_wide += (tp_wide-1)/3; + + if (FLIP) + { int x; + x = ar_wide; ar_wide = br_wide; br_wide = x; + x = ai_wide; ai_wide = bi_wide; bi_wide = x; + } + + // For each record do + + blast = -1; + match = 0; + seen = 0; + lhalf = rhalf = 0; + for (j = 0; j < novl; j++) + + // Read it in + + { Read_Overlap(input,ovl); + if (ovl->path.tlen > tmax) + { tmax = ((int) 1.2*ovl->path.tlen) + 100; + trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); + 
if (trace == NULL) + exit (1); + } + ovl->path.trace = (void *) trace; + Read_Trace(input,ovl,tbytes); + + if (ovl->aread >= db1->nreads) + { fprintf(stderr,"%s: A-read is out-of-range of DB %s\n",Prog_Name,argv[1]); + exit (1); + } + if (ovl->bread >= db2->nreads) + { fprintf(stderr,"%s: B-read is out-of-range of DB %s\n",Prog_Name,argv[1+ISTWO]); + exit (1); + } + + // Determine if it should be displayed + + ar = ovl->aread+1; + if (in) + { while (ar > npt) + { npt = pts[idx++]; + if (ar < npt) + { in = 0; + break; + } + npt = pts[idx++]; + } + } + else + { while (ar >= npt) + { npt = pts[idx++]; + if (ar <= npt) + { in = 1; + break; + } + npt = pts[idx++]; + } + } + if (!in) + continue; + + // If -o check display only overlaps + + aln->alen = db1->reads[ovl->aread].rlen; + aln->blen = db2->reads[ovl->bread].rlen; + aln->flags = ovl->flags; + tps = ovl->path.tlen/2; + + if (OVERLAP) + { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) + continue; + if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) + continue; + } + + // If -M option then check the completeness of the implied mapping + + if (MAP) + { while (ovl->bread != blast) + { if (!match && seen && !(lhalf && rhalf)) + { printf("Missing "); + Print_Number((int64) blast+1,br_wide+1,stdout); + printf(" %d ->%lld\n",db2->reads[blast].rlen,db2->reads[blast].coff); + } + match = 0; + seen = 0; + lhalf = rhalf = 0; + blast += 1; + } + seen = 1; + if (ovl->path.abpos == 0) + rhalf = 1; + if (ovl->path.aepos == aln->alen) + lhalf = 1; + if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) + continue; + match = 1; + } + + // Display it + + if (ALIGN || CARTOON || REFERENCE) + printf("\n"); + + if (BEST_CHAIN(ovl->flags)) + printf("> "); + else if (CHAIN_START(ovl->flags)) + printf("+ "); + else if (CHAIN_NEXT(ovl->flags)) + printf(" -"); + + if (FLIP) + { Flip_Alignment(aln,0); + Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); + printf(" "); + Print_Number((int64) 
ovl->aread+1,br_wide+1,stdout); + } + else + { Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); + printf(" "); + Print_Number((int64) ovl->bread+1,br_wide+1,stdout); + } + if (COMP(ovl->flags)) + printf(" c"); + else + printf(" n"); + if (ovl->path.abpos == 0) + printf(" <"); + else + printf(" ["); + Print_Number((int64) ovl->path.abpos,ai_wide,stdout); + printf(".."); + Print_Number((int64) ovl->path.aepos,ai_wide,stdout); + if (ovl->path.aepos == aln->alen) + printf("> x "); + else + printf("] x "); + if (ovl->path.bbpos == 0) + printf("<"); + else + printf("["); + if (COMP(ovl->flags)) + { Print_Number((int64) (aln->blen - ovl->path.bbpos),bi_wide,stdout); + printf(".."); + Print_Number((int64) (aln->blen - ovl->path.bepos),bi_wide,stdout); + } + else + { Print_Number((int64) ovl->path.bbpos,bi_wide,stdout); + printf(".."); + Print_Number((int64) ovl->path.bepos,bi_wide,stdout); + } + if (ovl->path.bepos == aln->blen) + printf(">"); + else + printf("]"); + + if (!CARTOON) + printf(" ~ %5.2f%% ",(200.*ovl->path.diffs) / + ((ovl->path.aepos - ovl->path.abpos) + (ovl->path.bepos - ovl->path.bbpos)) ); + printf(" ("); + Print_Number(aln->alen,ai_wide,stdout); + printf(" x "); + Print_Number(aln->blen,bi_wide,stdout); + printf(" bps,"); + if (CARTOON) + { Print_Number(tps,tp_wide,stdout); + printf(" trace pts)\n\n"); + } + else + { Print_Number((int64) ovl->path.diffs,mn_wide,stdout); + printf(" diffs, "); + Print_Number(tps,tp_wide,stdout); + printf(" trace pts)\n"); + } + + if (ALIGN || CARTOON || REFERENCE) + { if (ALIGN || REFERENCE) + { char *aseq, *bseq; + int amin, amax; + int bmin, bmax; + int self; + + if (FLIP) + Flip_Alignment(aln,0); + if (small) + Decompress_TraceTo16(ovl); + + self = sameDB && (ovl->aread == ovl->bread) && !COMP(ovl->flags); + + amin = ovl->path.abpos - BORDER; + if (amin < 0) amin = 0; + amax = ovl->path.aepos + BORDER; + if (amax > aln->alen) amax = aln->alen; + if (COMP(aln->flags)) + { bmin = (aln->blen-ovl->path.bepos) - 
BORDER; + if (bmin < 0) bmin = 0; + bmax = (aln->blen-ovl->path.bbpos) + BORDER; + if (bmax > aln->blen) bmax = aln->blen; + } + else + { bmin = ovl->path.bbpos - BORDER; + if (bmin < 0) bmin = 0; + bmax = ovl->path.bepos + BORDER; + if (bmax > aln->blen) bmax = aln->blen; + if (self) + { if (bmin < amin) + amin = bmin; + if (bmax > amax) + amax = bmax; + } + } + + aseq = Load_Subread(db1,ovl->aread,amin,amax,abuffer,0); + if (!self) + bseq = Load_Subread(db2,ovl->bread,bmin,bmax,bbuffer,0); + else + bseq = aseq; + + aln->aseq = aseq - amin; + if (COMP(aln->flags)) + { Complement_Seq(bseq,bmax-bmin); + aln->bseq = bseq - (aln->blen - bmax); + } + else if (self) + aln->bseq = aln->aseq; + else + aln->bseq = bseq - bmin; + + if (tspace == 0) + Compute_Trace_IRR(aln,work,GREEDIEST); + else + Compute_Trace_PTS(aln,work,tspace,GREEDIEST); + + if (FLIP) + { if (COMP(aln->flags)) + { Complement_Seq(aseq,amax-amin); + Complement_Seq(bseq,bmax-bmin); + aln->aseq = aseq - (aln->alen - amax); + aln->bseq = bseq - bmin; + } + Flip_Alignment(aln,1); + } + } + if (CARTOON) + Alignment_Cartoon(stdout,aln,INDENT,mx_wide); + if (REFERENCE) + Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); + if (ALIGN) + Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); + } + } + + free(trace); + if (ALIGN) + { free(bbuffer-1); + free(abuffer-1); + Free_Work_Data(work); + } + } + + Close_DB(db1); + if (ISTWO) + Close_DB(db2); + + exit (0); +} diff --git a/LAsort.c b/LAsort.c new file mode 100644 index 0000000..2a7e11f --- /dev/null +++ b/LAsort.c @@ -0,0 +1,413 @@ +/******************************************************************************************* + * + * Load a file U.las of overlaps into memory, sort them all by A,B index, + * and then output the result to U.S.las + * + * Author: Gene Myers + * Date : July 2013 + * + *******************************************************************************************/ + +#include +#include +#include 
+#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = "[-va] ..."; + +#define MEMORY 1000 // How many megabytes for output buffer + +static char *IBLOCK; + +static int SORT_OVL(const void *x, const void *y) +{ int64 l = *((int64 *) x); + int64 r = *((int64 *) y); + + Overlap *ol, *or; + int al, ar; + int bl, br; + int cl, cr; + int pl, pr; + + ol = (Overlap *) (IBLOCK+l); + or = (Overlap *) (IBLOCK+r); + + al = ol->aread; + ar = or->aread; + if (al != ar) + return (al-ar); + + bl = ol->bread; + br = or->bread; + if (bl != br) + return (bl-br); + + cl = COMP(ol->flags); + cr = COMP(or->flags); + if (cl != cr) + return (cl-cr); + + pl = ol->path.abpos; + pr = or->path.abpos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.aepos; + pr = or->path.aepos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.bbpos; + pr = or->path.bbpos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.bepos; + pr = or->path.bepos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.diffs; + pr = or->path.diffs; + if (pl != pr) + return (pl-pr); + + if (ol < or) + return (-1); + else if (ol > or) + return (1); + else + return (0); +} + +static int SORT_MAP(const void *x, const void *y) +{ int64 l = *((int64 *) x); + int64 r = *((int64 *) y); + + Overlap *ol, *or; + int al, ar; + int bl, br; + int cl, cr; + int pl, pr; + + ol = (Overlap *) (IBLOCK+l); + or = (Overlap *) (IBLOCK+r); + + al = ol->aread; + ar = or->aread; + if (al != ar) + return (al-ar); + + pl = ol->path.abpos; + pr = or->path.abpos; + if (pl != pr) + return (pl-pr); + + bl = ol->bread; + br = or->bread; + if (bl != br) + return (bl-br); + + cl = COMP(ol->flags); + cr = COMP(or->flags); + if (cl != cr) + return (cl-cr); + + pl = ol->path.aepos; + pr = or->path.aepos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.bbpos; + pr = or->path.bbpos; + if (pl != pr) + return (pl-pr); + + pl = ol->path.bepos; + pr = or->path.bepos; + if (pl != pr) + return (pl-pr); + + pl = 
ol->path.diffs; + pr = or->path.diffs; + if (pl != pr) + return (pl-pr); + + if (ol < or) + return (-1); + else if (ol > or) + return (1); + else + return (0); +} + +static int EQUAL(Overlap *ol, Overlap *or) +{ int al, ar; + int bl, br; + int cl, cr; + int pl, pr; + + al = ol->aread; + ar = or->aread; + if (al != ar) + return (0); + + bl = ol->bread; + br = or->bread; + if (bl != br) + return (0); + + cl = COMP(ol->flags); + cr = COMP(or->flags); + if (cl != cr) + return (0); + + pl = ol->path.abpos; + pr = or->path.abpos; + if (pl != pr) + return (0); + + pl = ol->path.aepos; + pr = or->path.aepos; + if (pl != pr) + return (0); + + pl = ol->path.bbpos; + pr = or->path.bbpos; + if (pl != pr) + return (0); + + pl = ol->path.bepos; + pr = or->path.bepos; + if (pl != pr) + return (0); + + return (1); +} + +int main(int argc, char *argv[]) +{ char *iblock, *fblock, *iend; + int64 isize, osize; + int64 ovlsize, ptrsize; + int tspace, tbytes; + int i; + + int VERBOSE; + int MAP_ORDER; + + // Process options + + { int j, k; + int flags[128]; + + ARG_INIT("LAsort") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + { ARG_FLAGS("va") } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + MAP_ORDER = flags['a']; + + if (argc <= 1) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," -v: Verbose mode, output statistics as proceed.\n"); + fprintf(stderr," -a: sort .las by A-read,A-position pairs for map usecase\n"); + fprintf(stderr," off => sort .las by A,B-read pairs for overlap piles\n"); + exit (1); + } + } + + // For each file do + + ptrsize = sizeof(void *); + ovlsize = sizeof(Overlap) - ptrsize; + isize = 0; + iblock = NULL; + osize = MEMORY * 1000000ll; + fblock = Malloc(osize,"Allocating LAsort output block"); + + for (i = 1; i < argc; i++) + { int64 *perm; + FILE *input, *foutput; + int64 novl, sov; + Block_Looper *parse; + + parse = Parse_Block_LAS_Arg(argv[i]); + + while ((input = 
Next_Block_Arg(parse)) != NULL) + { + // Read in the entire file and output header + + { int64 size; + struct stat info; + char *root, *path; + + path = Block_Arg_Path(parse); + root = Block_Arg_Root(parse); + + stat(Catenate(path,"/",root,".las"),&info); + size = info.st_size; + + if (fread(&novl,sizeof(int64),1,input) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,input) != 1) + SYSTEM_READ_ERROR + + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + if (VERBOSE) + { printf(" %s: ",root); + Print_Number(novl,0,stdout); + printf(" records "); + Print_Number(size-novl*ovlsize,0,stdout); + printf(" trace bytes\n"); + fflush(stdout); + } + + foutput = Fopen(Catenate(path,"/",root,".S.las"),"w"); + if (foutput == NULL) + exit (1); + + if (fwrite(&novl,sizeof(int64),1,foutput) != 1) + SYSTEM_WRITE_ERROR + if (fwrite(&tspace,sizeof(int),1,foutput) != 1) + SYSTEM_WRITE_ERROR + + if (size > isize) + { if (iblock == NULL) + iblock = Malloc(size+ptrsize,"Allocating LAsort input block"); + else + iblock = Realloc(iblock-ptrsize,size+ptrsize,"Allocating LAsort input block"); + if (iblock == NULL) + exit (1); + iblock += ptrsize; + isize = size; + } + size -= (sizeof(int64) + sizeof(int)); + if (size > 0) + { if (fread(iblock,size,1,input) != 1) + SYSTEM_READ_ERROR + } + fclose(input); + iend = iblock + (size - ptrsize); + + free(root); + free(path); + } + + if (novl == 0) + { fclose(foutput); + continue; + } + + // Set up unsorted permutation array + + perm = (int64 *) Malloc(sizeof(int64)*novl,"Allocating LAsort permutation vector"); + if (perm == NULL) + exit (1); + + { int64 off; + int j; + + if (CHAIN_START(((Overlap *) (iblock-ptrsize))->flags)) + { sov = 0; + off = -ptrsize; + for (j = 0; j < novl; j++) + { if (CHAIN_START(((Overlap *) (iblock+off))->flags)) + perm[sov++] = off; + off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes; + } + } + else + { off = -ptrsize; + for (j = 0; j < novl; 
j++) + { perm[j] = off; + off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes; + } + sov = novl; + } + } + + // Sort permutation array of ptrs to records + + IBLOCK = iblock; + if (MAP_ORDER) + qsort(perm,sov,sizeof(int64),SORT_MAP); + else + qsort(perm,sov,sizeof(int64),SORT_OVL); + + // Output the records in sorted order + + { int j, equal; + Overlap *w, *x, y; + int64 tsize, span; + char *fptr, *ftop, *wo; + + y.aread = ((Overlap *) (iblock+perm[0]))->aread+1; + x = &y; + + fptr = fblock; + ftop = fblock + osize; + for (j = 0; j < sov; j++) + { w = (Overlap *) (wo = iblock+perm[j]); + do + { equal = EQUAL(w,x); + tsize = w->path.tlen*tbytes; + span = ovlsize + tsize; + if (fptr + span > ftop) + { if (fwrite(fblock,1,fptr-fblock,foutput) != (size_t) (fptr-fblock)) + SYSTEM_WRITE_ERROR + fptr = fblock; + } + if (equal) + { fptr += (ovlsize + tsize); + novl -= 1; + } + else + { memmove(fptr,((char *) w)+ptrsize,ovlsize); + fptr += ovlsize; + memmove(fptr,(char *) (w+1),tsize); + fptr += tsize; + } + x = w; + w = (Overlap *) (wo += span); + } + while (wo < iend && CHAIN_NEXT(w->flags)); + } + if (fptr > fblock) + { if (fwrite(fblock,1,fptr-fblock,foutput) != (size_t) (fptr-fblock)) + SYSTEM_WRITE_ERROR + } + } + + rewind(foutput); + if (fwrite(&novl,sizeof(int64),1,foutput) != 1) + SYSTEM_WRITE_ERROR + + free(perm); + fclose(foutput); + } + Free_Block_Arg(parse); + } + + if (iblock != NULL) + free(iblock - ptrsize); + free(fblock); + + exit (0); +} diff --git a/LAsplit.c b/LAsplit.c new file mode 100644 index 0000000..966a0ff --- /dev/null +++ b/LAsplit.c @@ -0,0 +1,229 @@ +/******************************************************************************************* + * + * Split an OVL file arriving from the standard input into 'parts' equal sized .las-files + * .1.las, .2.las ... 
or according to a current partitioning of + * + * Author: Gene Myers + * Date : June 2014 + * + *******************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "DB.h" +#include "align.h" + +static char *Usage = "-v ( | ) < .las"; + +#define MEMORY 1000 // How many megabytes for output buffer + +int main(int argc, char *argv[]) +{ char *iblock, *oblock; + FILE *output; + DAZZ_STUB *stub; + int64 novl, bsize, ovlsize, ptrsize; + int parts, tspace, tbytes; + char *pwd, *root, *root2; + + int VERBOSE; + + // Process options + + { int i, j, k; + int flags[128]; + + ARG_INIT("LAsplit") + + j = 1; + for (i = 1; i < argc; i++) + if (argv[i][0] == '-') + { ARG_FLAGS("v") } + else + argv[j++] = argv[i]; + argc = j; + + VERBOSE = flags['v']; + + if (argc != 3) + { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); + fprintf(stderr,"\n"); + fprintf(stderr," is a template that must have a single %c-sign in it\n", + BLOCK_SYMBOL); + fprintf(stderr," This symbol is replaced by numbers 1 to n = the number of parts\n"); + exit (1); + } + } + + { char *eptr; + + parts = strtol(argv[2],&eptr,10); + if (*eptr != '\0') + { pwd = PathTo(argv[2]); + if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) + { root = Root(argv[2],".dam"); + stub = Read_DB_Stub(Catenate(pwd,"/",root,".dam"),DB_STUB_BLOCKS); + parts = stub->nblocks; + } + else + { root = Root(argv[2],".db"); + stub = Read_DB_Stub(Catenate(pwd,"/",root,".db"),DB_STUB_BLOCKS); + parts = stub->nblocks; + } + free(pwd); + free(root); + } + else + { stub = NULL; + if (parts <= 0) + { fprintf(stderr,"%s: Number of parts is not positive\n",Prog_Name); + exit (1); + } + } + } + + ptrsize = sizeof(void *); + ovlsize = sizeof(Overlap) - ptrsize; + bsize = MEMORY * 1000000ll; + oblock = (char *) Malloc(bsize,"Allocating output block"); + iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); + if (oblock == NULL || 
iblock == NULL) + exit (1); + iblock += ptrsize; + + pwd = PathTo(argv[1]); + root = Root(argv[1],".las"); + + root2 = index(root,BLOCK_SYMBOL); + if (root2 == NULL) + { fprintf(stderr,"%s: No %c-sign in source name '%s'\n",Prog_Name,BLOCK_SYMBOL,root); + exit (1); + } + if (index(root2+1,BLOCK_SYMBOL) != NULL) + { fprintf(stderr,"%s: Two or more occurences of %c-sign in source name '%s'\n", + Prog_Name,BLOCK_SYMBOL,root); + exit (1); + } + *root2++ = '\0'; + + if (fread(&novl,sizeof(int64),1,stdin) != 1) + SYSTEM_READ_ERROR + if (fread(&tspace,sizeof(int),1,stdin) != 1) + SYSTEM_READ_ERROR + if (tspace <= TRACE_XOVR && tspace != 0) + tbytes = sizeof(uint8); + else + tbytes = sizeof(uint16); + + if (VERBOSE) + { printf(" Distributing %lld la\'s\n",novl); + fflush(stdout); + } + + { int i; + Overlap *w; + int64 j, low, hgh, last; + int64 tsize, povl; + char *iptr, *itop; + char *optr, *otop; + + iptr = iblock; + itop = iblock + fread(iblock,1,bsize,stdin); + + hgh = 0; + for (i = 0; i < parts; i++) + { output = Fopen(Catenate(pwd,"/",Numbered_Suffix(root,i+1,root2),".las"),"w"); + if (output == NULL) + exit (1); + + low = hgh; + if (stub != NULL) + { last = stub->tblocks[i+1]; + hgh = 0; + } + else + { last = 0; + hgh = (novl*(i+1))/parts; + } + + povl = 0; + fwrite(&povl,sizeof(int64),1,output); + fwrite(&tspace,sizeof(int),1,output); + + optr = oblock; + otop = oblock + bsize; + + for (j = low; j < novl; j++) + { if (iptr + ovlsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,stdin); + } + + w = (Overlap *) (iptr-ptrsize); + if (stub == NULL) + { if (j >= hgh && w->aread > last) + break; + last = w->aread; + } + else + { if (w->aread >= last) + break; + } + + tsize = w->path.tlen*tbytes; + if (optr + ovlsize + tsize > otop) + { fwrite(oblock,1,optr-oblock,output); + optr = oblock; + } + + memmove(optr,iptr,ovlsize); + optr += ovlsize; + 
iptr += ovlsize; + + if (iptr + tsize > itop) + { int64 remains = itop-iptr; + if (remains > 0) + memmove(iblock,iptr,remains); + iptr = iblock; + itop = iblock + remains; + itop += fread(itop,1,bsize-remains,stdin); + } + memmove(optr,iptr,tsize); + optr += tsize; + iptr += tsize; + } + hgh = j; + + if (optr > oblock) + fwrite(oblock,1,optr-oblock,output); + + rewind(output); + povl = hgh-low; + fwrite(&povl,sizeof(int64),1,output); + + if (VERBOSE) + { printf(" Split off %s: %lld la\'s\n",Numbered_Suffix(root,i+1,root2),povl); + fflush(stdout); + } + + fclose(output); + } + } + + free(pwd); + free(root); + Free_DB_Stub(stub); + free(iblock-ptrsize); + free(oblock); + + exit (0); +} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9aa819c --- /dev/null +++ b/LICENSE @@ -0,0 +1,34 @@ + + Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + · Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + · Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + + · The name of EWM may not be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + For any issues regarding this software and its use, contact EWM at: + + Eugene W. Myers Jr. + Bautzner Str. 122e + 01099 Dresden + GERMANY + Email: gene.myers@gmail.com + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ba9cae7 --- /dev/null +++ b/Makefile @@ -0,0 +1,52 @@ +DEST_DIR = ~/bin + +# CFLAGS = -O0 -g -Wall -Wextra -Wno-unused-result -fno-strict-aliasing -fsanitize=address -fsanitize=undefined +# Above is for debug out of bound addresses, must compile with -lASAN -lUBSAN if gcc instead of clang + +CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing + +ALL = daligner HPC.daligner LAsort LAmerge LAsplit LAcat LAshow LA2ONE LAcheck ONE2LA + +all: $(ALL) + +daligner: daligner.c filter.c filter.h lsd.sort.c lsd.sort.h align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o daligner daligner.c filter.c lsd.sort.c align.c DB.c QV.c -lpthread -lm + +HPC.daligner: HPC.daligner.c DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o HPC.daligner HPC.daligner.c DB.c QV.c -lm + +LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm + +LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm + +LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm + +LA2ONE: LA2ONE.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h + gcc $(CFLAGS) -o LA2ONE LA2ONE.c align.c DB.c QV.c ONElib.c -lm + +LAcat: LAcat.c align.h DB.c DB.h QV.c 
QV.h + gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm + +LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm + +LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h + gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm + +ONE2LA: ONE2LA.c align.c align.h DB.c DB.h QV.c QV.h ONElib.c ONElib.h + gcc $(CFLAGS) -o ONE2LA ONE2LA.c align.c DB.c QV.c ONElib.c -lm + +clean: + rm -f $(ALL) + rm -fr *.dSYM + rm -f daligner.tar.gz + +install: + cp $(ALL) $(DEST_DIR) + +package: + make clean + tar -zcf daligner.tar.gz README.md Makefile *.h *.c diff --git a/ONE2LA.c b/ONE2LA.c new file mode 100644 index 0000000..77b4bae --- /dev/null +++ b/ONE2LA.c @@ -0,0 +1,275 @@ +#include +#include +#include + +#include "DB.h" +#include "align.h" +#include "ONElib.h" + +static char *One_Schema = + "P 3 dal\n" + "D X 1 3 INT\n" // Data prolog: trace spacing + "O P 2 3 INT 8 INT_LIST\n" // A-read and B-read list + // All per B-read + "D O 1 6 STRING\n" // orientation [+-] + "D C 1 6 STRING\n" // chain directive [>+-.] 
+ "D A 1 8 INT_LIST\n" // (ab,ae) + "D B 1 8 INT_LIST\n" // (be,be) + "D L 2 3 INT 8 INT_LIST\n" // la and then each lb + "D D 1 8 INT_LIST\n" // diff + // One line per B-read + "D T 1 8 INT_LIST\n" // trace segment length + "D Q 1 8 INT_LIST\n"; // trace segment diffs + +int main(int argc, char *argv[]) +{ int64 novls, Pmax, Tmax, psize; + Overlap *ovls, *otop, *o; + void *trace, *ttop; + int aread; + int has[128]; + int tspace, small, tbytes; + + OneFile *file1; + OneSchema *schema; + int64 *list; + char *string, *command; + + int t, i, j, k; + + // Process arguments and capture command line for provenance + + { int n, t; + char *c; + + n = 0; + for (t = 1; t < argc; t++) + n += strlen(argv[t])+1; + + command = Malloc(n+1,"Allocating command string"); + if (command == NULL) + exit (1); + + c = command; + if (argc >= 1) + { c += sprintf(c,"%s",argv[1]); + for (t = 2; t < argc; t++) + c += sprintf(c," %s",argv[t]); + } + *c = '\0'; + } + + if (argc != 2) + { fprintf(stderr,"Usage: ONE2LA > (.las)\n"); + exit (1); + } + + { char *pwd, *root, *path; + FILE *output; + + pwd = PathTo(argv[1]); + root = Root(argv[1],".dal"); + path = Catenate(pwd,"/",root,".dal"); + if ((output = fopen(path,"r")) == NULL) + { free(root); + root = Root(argv[1],".1dal"); + path = Catenate(pwd,"/",root,".1dal"); + if ((output = fopen(path,"r")) == NULL) + { fprintf(stderr,"ONE2LA: Cannot open %s for reading\n",argv[1]); + exit (1); + } + } + fclose(output); + free(root); + free(pwd); + + schema = oneSchemaCreateFromText(One_Schema); + + file1 = oneFileOpenRead(path,schema,"dal",1); + + oneAddProvenance(file1,"ONE2LA","1.0","%s >?.las",command); + } + + if (file1->info['A']->given.count == 0) + { fprintf(stderr,"ONE2LA: .dal file does not contatin coordinate information\n"); + exit (1); + } + if (file1->info['T']->given.count == 0) + { fprintf(stderr,"ONE2LA: .dal file does not contatin trace information\n"); + exit (1); + } + + t = oneReadLine(file1); + if (t == 0 || t != 'X') + { 
fprintf(stderr,"ONE2LA: .dal data segment does not begine with an 'X'-line\n"); + exit (1); + } + + tspace = oneInt(file1,0); + if (tspace <= TRACE_XOVR && tspace != 0) + { small = 1; + tbytes = 1; + } + else + { small = 0; + tbytes = 2; + } + + Pmax = file1->info['P']->given.max; + Tmax = file1->info['T']->given.max; + + trace = Malloc(sizeof(uint16)*Tmax*Pmax,"Allocating trace buffer"); + ovls = Malloc(sizeof(Overlap)*Pmax,"Allocating overlap vector"); + list = Malloc(sizeof(int64)*(Pmax+Tmax),"Allocating integer list"); + string = Malloc(Pmax+1,"Allocating max string"); + + novls = file1->info['P']->given.total; + fwrite(&novls,sizeof(int64),1,stdout); + fwrite(&tspace,sizeof(int),1,stdout); + + while ((t = oneReadLine(file1)) != 0) + { if (t != 'P') + { fprintf(stderr,"ONE2LA: Pile data does not begin with a P-line\n"); + exit(1); + } + psize = oneLen(file1); + list = oneIntList(file1); + aread = oneInt(file1,0)-1; + for (i = 0; i < psize; i++) + { ovls[i].aread = aread; + ovls[i].bread = list[i]-1; + } + + ttop = trace; + otop = ovls + psize; + has['O'] = has['C'] = has['A'] = has['B'] = has['L'] = has['D'] = has['T'] = has['Q'] = 0; + for (o = ovls; o < otop; o++) + { o->flags = 0; + o->path.tlen = -1; + } + + + for (j = 0; j < 6+2*psize; j++) + { t = oneReadLine(file1); + + if (t == 0) + { fprintf(stderr,"ONE2LA: Pile object not followed by sufficient auxilliary lines\n"); + exit (1); + } + if (has[t] > 0 && t != 'T' && t != 'Q') + { fprintf(stderr,"ONE2LA: Pile has more than one '%c' line\n",t); + exit (1); + } + has[t] += 1; + if (t == 'A' || t == 'B') + { if (oneLen(file1) != 2*psize) + { fprintf(stderr,"ONE2LA: %c-line has incorrect list length\n",t); + exit (1); + } + } + else if (t != 'T' && t != 'Q') + { if (oneLen(file1) != psize) + { fprintf(stderr,"ONE2LA: %c-line has incorrect list length\n",t); + exit (1); + } + } + else + { if (has[t] > psize) + { fprintf(stderr,"ONE2LA: Too many %c-lines for pile\n",t); + exit (1); + } + } + + switch (t) + { 
case 'O': + string = oneString(file1); + i = 0; + for (o = ovls; o < otop; o++) + if (string[i++] == 'c') + o->flags |= COMP_FLAG; + break; + case 'C': + string = oneString(file1); + i = 0; + for (o = ovls; o < otop; o++) + if (string[i] == '-') + o->flags |= NEXT_FLAG; + else if (string[i] == '>') + o->flags |= BEST_FLAG; + else if (string[i] == '+') + o->flags |= START_FLAG; + break; + case 'A': + list = oneIntList(file1); + i = 0; + for (o = ovls; o < otop; o++) + { o->path.abpos = list[i++]; + o->path.aepos = list[i++]; + } + break; + case 'B': + list = oneIntList(file1); + i = 0; + for (o = ovls; o < otop; o++) + { o->path.bbpos = list[i++]; + o->path.bepos = list[i++]; + } + break; + case 'L': + break; + case 'D': + list = oneIntList(file1); + i = 0; + for (o = ovls; o < otop; o++) + o->path.diffs = list[i++]; + break; + case 'T': + case 'Q': + list = oneIntList(file1); + o = ovls + (has[t]-1); + if (o->path.tlen >= 0) + { if (o->path.tlen != 2*oneLen(file1)) + { fprintf(stderr,"LA2ONE: T and Q line lengths do not correspond\n"); + exit (1); + } + } + else + { o->path.tlen = 2*oneLen(file1); + o->path.trace = ttop; + ttop += o->path.tlen*tbytes; + } + if (t == 'Q') + k = 0; + else + k = 1; + if (tbytes == 1) + { uint8 *t8 = (uint8 *) o->path.trace; + for (i = 0; k < o->path.tlen; k += 2) + t8[k] = list[i++]; + } + else + { uint16 *t16 = (uint16 *) o->path.trace; + for (i = 0; k < o->path.tlen; k += 2) + t16[k] = list[i++]; + } + break; + default: + fprintf(stderr,"LA2ONE: Unrecognized line type '%c'\n",t); + exit (1); + } + } + + if (has['T'] != psize || has['Q'] != psize) + { fprintf(stderr,"ONE2LA: # of pile traces != pile size\n"); + exit (1); + } + + for (o = ovls; o < otop; o++) + Write_Overlap(stdout,o,tbytes); + } + + oneSchemaDestroy(schema); + + free(command); + + exit (0); +} diff --git a/ONElib.c b/ONElib.c new file mode 100644 index 0000000..a47e3bd --- /dev/null +++ b/ONElib.c @@ -0,0 +1,3924 @@ 
+/***************************************************************************************** + * + * file: ONElib.c + * implementation for ONElib.h + * + * Author: Richard Durbin (rd109@cam.ac.uk) + * Copyright (C) Richard Durbin, Cambridge University and Eugene Myers 2019- + * + * HISTORY: + * Last edited: Dec 4 23:57 2022 (rd109) + * * Apr 23 00:31 2020 (rd109): global rename of VGP to ONE, Vgp to One, vgp to one + * * Apr 20 11:27 2020 (rd109): added VgpSchema to make schema dynamic + * * Dec 27 09:46 2019 (gene): style edits + compactify code + * * Jul 8 04:28 2019 (rd109): refactored to use info[] + * * Created: Thu Feb 21 22:40:28 2019 (rd109) + * + ****************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +#include +#else +#define assert(x) 0 +#endif + +#include "ONElib.h" + +// set major and minor code versions + +#define MAJOR 1 +#define MINOR 1 + +// utilities with implementation at the end of the file + +static void die(char *format, ...); // print message to stderr and exit -1 +static void *myalloc(size_t size); // allocate block, die if malloc fails +static void *mycalloc(size_t number, size_t size); // allocate & zero # objects of size +#define new(n,type) (type *) myalloc((n)*sizeof(type)) // actually use these not myalloc +#define new0(n,type) (type *) mycalloc((n),sizeof(type)) + +// global required for parallelisation + +static pthread_mutex_t mutexInit = PTHREAD_MUTEX_INITIALIZER; + +// forward declarations of serialisation functions lower in the file +// RD 220818: I think that many of int below should be I64, e.g. for len, ilen etc. 
+ +OneCodec *vcCreate(); +void vcAddToTable(OneCodec *vc, int len, char *bytes); +void vcAddHistogram(OneCodec *vc, OneCodec *vh); +void vcCreateCodec(OneCodec *vc, int partial); +void vcDestroy(OneCodec *vc); +int vcMaxSerialSize(); +int vcSerialize(OneCodec *vc, void *out); +OneCodec *vcDeserialize(void *in); +int vcEncode(OneCodec *vc, int ilen, char *ibytes, char *obytes); +int vcDecode(OneCodec *vc, int ilen, char *ibytes, char *obytes); + +// forward declarations of 64-bit integer encoding/decoding + +static inline int ltfWrite (I64 x, FILE *f) ; +static inline I64 ltfRead (FILE *f) ; + +/*********************************************************************************** + * + * ONE_FILE CREATION & DESTRUCTION + * + **********************************************************************************/ + +char* oneTypeString[] = { 0, "INT", "REAL", "CHAR", "STRING", + "INT_LIST", "REAL_LIST", "STRING_LIST", "DNA" } ; + +/******************* OneInfo ********************/ + +static OneInfo *infoCreate (int nField) +{ OneInfo *vi = new0 (1, OneInfo) ; + vi->nField = nField ; + if (nField) vi->fieldType = new (nField, OneType) ; + return vi; +} + +static OneInfo *infoDeepCopy (OneInfo *vi0) +{ OneInfo *vi = new (1, OneInfo) ; + *vi = *vi0 ; + if (vi->nField) + { vi->fieldType = new (vi->nField, OneType) ; + memcpy (vi->fieldType, vi0->fieldType, vi->nField*sizeof(OneType)) ; + } + if (vi->listCodec && vi->listCodec != DNAcodec) vi->listCodec = vcCreate() ; + if (vi->comment) vi->comment = strdup (vi0->comment) ; + return vi ; +} + +static bool infoCheckFields (OneInfo *vi, OneFile *vf) +{ // check field types against the STRING_LIST in vf + char *s = oneString(vf) ; + int i ; + if (vi->nField != oneLen(vf)) return false ; + for (i = 0 ; i < vi->nField ; ++i, s = oneNextString(vf,s)) + if (strcmp (oneTypeString[vi->fieldType[i]], s)) return false ; + return true ; +} + +static void infoDestroy (OneInfo *vi) +{ if (vi->buffer && ! 
vi->isUserBuf) free (vi->buffer) ; + if (vi->listCodec) vcDestroy (vi->listCodec) ; + if (vi->fieldType) free (vi->fieldType) ; + if (vi->comment) free (vi->comment) ; + free (vi); +} + +/******************* OneSchema ********************/ + +// a utility to set the OneInfo list information + +static int listEltSize[9] = { 0, 0, 0, 0, 1, sizeof(I64), sizeof(double), 1, 1 } ; + +static void schemaAddInfoFromArray (OneSchema *vs, int n, OneType *a, char t, char type) +{ + // use during the bootstrap, while parsing .def files, and while parsing ~ lines in other files + + if (vs->info[(int) t]) + die ("duplicate schema specification for linetype %c in filetype %s", t, vs->primary) ; + if (isalpha(t) && type == 'G') + { if (vs->groupType) die ("second group type in schema for filetype %s", vs->primary) ; + vs->groupType = t ; + } + else if (isalpha(t) && type == 'O') + { if (vs->objectType) die ("second object type in schema for filetype %s", vs->primary) ; + vs->objectType = t ; + } + else if (vs->primary && (type != 'D' || !isalpha(t))) // allow non-alphabetic lines in header + die ("non-alphabetic linetype %c (ascii %d) in schema for filetype %s",t,t,vs->primary) ; + + if (n > vs->nFieldMax) vs->nFieldMax = n ; + + OneInfo *vi = infoCreate (n) ; + + memcpy (vi->fieldType, a, n*sizeof(OneType)) ; + int i ; + for (i = 0 ; i < n ; ++i) + if (a[i] >= oneSTRING) + { if (vi->listEltSize) + die ("OneFile schema error; multiple list types for linetype definition %c", t) ; + vi->listEltSize = listEltSize[vi->fieldType[i]] ; + vi->listField = i ; + if (a[i] == oneDNA) + { vi->listCodec = DNAcodec ; vi->isUseListCodec = true ; } + else + vi->listCodec = vcCreate () ; // always make a listCodec for any list type + } + + if (t >= 'A' && t <= 'Z') vi->binaryTypePack = ((t-'A') << 1) | (char) 0x80 ; + else if (t >= 'a' && t <= 'z') vi->binaryTypePack = ((26+t-'a') << 1) | (char) 0x80 ; + else if (t == ';') vi->binaryTypePack = (52 << 2) | (char) 0x80 ; + else if (t == '&') 
vi->binaryTypePack = (53 << 2) | (char) 0x80 ; + else if (t == '*') vi->binaryTypePack = (54 << 2) | (char) 0x80 ; + else if (t == '/') vi->binaryTypePack = (55 << 2) | (char) 0x80 ; + else if (t == '.') vi->binaryTypePack = (56 << 2) | (char) 0x80 ; + + vs->info[(int)t] = vi ; +} + +static void schemaAddInfoFromLine (OneSchema *vs, OneFile *vf, char t, char type) +{ // assumes field specification is in the STRING_LIST of the current vf line + // need to set vi->comment separately + + static OneType a[32] ; + int i ; + OneType j ; + char *s = oneString(vf) ; + int n = oneLen(vf) ; + + if (n > 32) + die ("line specification %d fields too long - need to recompile", n) ; + + for (i = 0 ; i < n ; ++i, s = oneNextString(vf,s)) + { a[i] = 0 ; + for (j = oneINT ; j <= oneDNA ; ++j) + if (!strcmp (s, oneTypeString[j])) a[i] = j ; + if (!a[i]) + die ("ONE schema error: bad field %d of %d type %s in line %d type %c", + i, n, s, vf->line, t) ; + } + + schemaAddInfoFromArray (vs, n, a, t, type) ; + + if (oneReadComment (vf)) + vs->info[(int)t]->comment = strdup (oneReadComment(vf)) ; +} + +static OneSchema *schemaLoadRecord (OneSchema *vs, OneFile *vf) +{ char *s; + + // parse a schema specfication line from vf and add into vs + // return value is vs unless a new primary type is declared, in which case vs->nxt + + switch (vf->lineType) + { + case '.': // ignore - blank or comment line in schema file + break ; + case 'P': + if (vs->primary && !vs->objectType) + die ("schema: file type %s has no object type", vs->primary) ; + if (oneLen(vf) == 0) die ("schema: primary name must have at least one letter") ; + OneSchema *vsNxt = new0 (1, OneSchema) ; + vs->nxt = vsNxt ; + vs = vsNxt ; + s = oneString(vf); + vs->primary = new (oneLen(vf)+1, char) ; + strcpy (vs->primary, s) ; + vs->nFieldMax = 4 ; // needed for header + break ; + case 'S': + if (oneLen(vf) == 0) die ("schema: secondary name must have at least one letter") ; + if (vs->nSecondary) + { char **temp = vs->secondary ; + 
vs->secondary = new (vs->nSecondary+1, char*) ; + memcpy (vs->secondary, temp, vs->nSecondary*sizeof(char*)) ; + free (temp) ; + } + else + vs->secondary = new (1, char*) ; + s = oneString(vf); + vs->secondary[vs->nSecondary] = new0 (oneLen(vf)+1, char) ; + strcpy (vs->secondary[vs->nSecondary++], s) ; + break ; + case 'G': // group type + case 'O': // object type + case 'D': // standard record type + schemaAddInfoFromLine (vs, vf, oneChar(vf,0), vf->lineType) ; + break ; + default: + die ("unrecognized schema line %d starting with %c", vf->line, vf->lineType) ; + } + + return vs ; +} + +static void oneFileDestroy (OneFile *vf) ; // need a forward declaration here + +OneSchema *oneSchemaCreateFromFile (char *filename) +{ + FILE *fs = fopen (filename, "r") ; + if (!fs) return 0 ; + fclose(fs); + + OneSchema *vs = new0 (1, OneSchema) ; + + OneFile *vf = new0 (1, OneFile) ; // shell object to support bootstrap + // bootstrap specification of linetypes to read schemas + { OneInfo *vi ; + vi = vf->info['P'] = infoCreate (1) ; // to define the schema for parsing a .def file + vi->fieldType[0] = oneSTRING ; vi->listEltSize = 1 ; vi->listField = 0 ; + vi = vf->info['O'] = infoCreate (2) ; // object type specification + vi->fieldType[0] = oneCHAR ; + vi->fieldType[1] = oneSTRING_LIST ; vi->listEltSize = 1 ; vi->listField = 1 ; + vi = vf->info['D'] = infoCreate (2) ; // line type specification + vi->fieldType[0] = oneCHAR ; + vi->fieldType[1] = oneSTRING_LIST ; vi->listEltSize = 1 ; vi->listField = 1 ; + vf->info['/'] = infoCreate (0) ; // to store comments + vf->field = new (2, OneField) ; + } + + // first load the universal header and footer (non-alphabetic) line types + // do this by writing their schema into a temporary file and parsing it into the base schema + { errno = 0 ; + static char template[64] ; +#define VALGRIND_MACOS +#ifdef VALGRIND_MACOS // MacOS valgrind is missing functions to make temp files it seems + sprintf (template, "/tmp/OneSchema.%d", getpid()) ; + 
vf->f = fopen (template, "w+") ; + if (errno) die ("failed to open temporary file %s errno %d\n", template, errno) ; +#else + strcpy (template, "/tmp/OneSchema.XXXXXX") ; + int fd = mkstemp (template) ; + if (errno) die ("failed to open temporary file %s errno %d\n", template, errno) ; + vf->f = fdopen (fd, "w+") ; + if (errno) die ("failed to assign temporary file to stream: errno %d\n", errno) ; +#endif + unlink (template) ; // this ensures that the file is removed on closure + if (errno) die ("failed to remove temporary file %s errno %d\n", template, errno) ; + } + + // NB if you change the header spec and add a record with more than 4 fields, + // change the assignment of ->nFieldMax in the 'P' section of schemaLoadRecord() above + + fprintf (vf->f, "D 1 3 6 STRING 3 INT 3 INT first line: 3-letter type, major, minor version\n") ; + fprintf (vf->f, "D 2 1 6 STRING subtype: 3-letter subtype\n") ; + fprintf (vf->f, "D # 2 4 CHAR 3 INT linetype, count\n") ; + fprintf (vf->f, "D @ 2 4 CHAR 3 INT linetype, list max\n") ; + fprintf (vf->f, "D + 2 4 CHAR 3 INT linetype, list total\n") ; + fprintf (vf->f, "D %% 4 4 CHAR 4 CHAR 4 CHAR 3 INT group, #/+, linetype, value\n") ; + fprintf (vf->f, "D ! 1 11 STRING_LIST provenance: program, version, command, date\n") ; + fprintf (vf->f, "D < 2 6 STRING 3 INT reference: filename, object count\n") ; + fprintf (vf->f, "D > 1 6 STRING deferred: filename\n") ; + fprintf (vf->f, "D ~ 3 4 CHAR 4 CHAR 11 STRING_LIST embedded schema linetype definition\n") ; + fprintf (vf->f, "D . 
0 blank line, anywhere in file\n") ; + fprintf (vf->f, "D $ 1 3 INT binary file - goto footer: isBigEndian\n") ; + fprintf (vf->f, "D ^ 0 binary file: end of footer designation\n") ; + fprintf (vf->f, "D - 1 3 INT binary file: offset of start of footer\n") ; + fprintf (vf->f, "D & 1 8 INT_LIST binary file: object index\n") ; + fprintf (vf->f, "D * 1 8 INT_LIST binary file: group index\n") ; + fprintf (vf->f, "D ; 2 4 CHAR 6 STRING binary file: list codec\n") ; + fprintf (vf->f, "D / 1 6 STRING binary file: comment\n") ; + if (fseek (vf->f, 0, SEEK_SET)) die ("ONE schema failure: cannot rewind tmp file") ; + while (oneReadLine (vf)) + schemaLoadRecord (vs, vf) ; + + // next reuse the temp file to load the schema for reading schemas + if (fseek (vf->f, 0, SEEK_SET)) die ("ONE schema failure: cannot rewind tmp file") ; + fprintf (vf->f, "P 3 def this is the primary file type for schemas\n") ; + fprintf (vf->f, "O P 1 6 STRING primary type name\n") ; + fprintf (vf->f, "D S 1 6 STRING secondary type name\n") ; + fprintf (vf->f, "D G 2 4 CHAR 11 STRING_LIST define linetype for groupType\n") ; + fprintf (vf->f, "D O 2 4 CHAR 11 STRING_LIST define linetype for objectType\n") ; + fprintf (vf->f, "D D 2 4 CHAR 11 STRING_LIST define linetype for other records\n") ; + fprintf (vf->f, "\n") ; // terminator + if (fseek (vf->f, 0, SEEK_SET)) die ("ONE schema failure: cannot rewind tmp file") ; + OneSchema *vs0 = vs ; // need this because loadInfo() updates vs on reading P lines + vf->line = 0 ; + while (oneReadLine (vf)) + vs = schemaLoadRecord (vs, vf) ; + OneSchema *vsDef = vs ; // will need this to destroy it once the true schema is read + oneFileDestroy (vf) ; // this will also effectively remove the temp file on closing + + // finally read the schema itself + if (!(vf = oneFileOpenRead (filename, vs0, "def", 1))) + return 0 ; + vs = vs0 ; // set back to vs0, so next filetype spec will replace vsDef + vs->nxt = 0 ; + oneSchemaDestroy (vsDef) ; // no longer need this, and can 
// Return a newly allocated copy of text in which every two-character sequence
//   backslash + 'n' is replaced by a single newline character.  A backslash that is not
//   followed by 'n' is copied unchanged.  The caller owns the result and must free() it.
//   Returns NULL if text is NULL or if the copy cannot be allocated.

static char *schemaFixNewlines (const char *text)
{ const char *s ;
  char       *newText, *t ;

  if (text == NULL)
    return NULL ;

  // the result is never longer than the input, so one buffer of the input's size will do
  newText = malloc (strlen (text) + 1) ;
  if (newText == NULL)
    return NULL ;

  s = text ;
  t = newText ;
  while (*s)
    if (s[0] == '\\' && s[1] == 'n')
      { *t++ = '\n' ; s += 2 ; }
    else
      *t++ = *s++ ;
  *t = '\0' ;

  return newText ;
}
} +} + +/*************************************/ + +static inline void setCodecBuffer (OneInfo *vi) +{ + vi->bufSize = vcMaxSerialSize() + 1; // +1 for added but unused 0-terminator + vi->buffer = new (vi->bufSize, void); +} + +static OneFile *oneFileCreate (OneSchema **vsp, char *type) +{ // searches through the linked list of vs to find type, either as primary or a secondary + // if found fills and returns vf, else returns 0 + + int i, j ; + OneFile *vf = new0 (1, OneFile) ; + char *secondary = 0 ; + OneSchema *vs = *vsp ; + + // fprintf (stderr, "oneFileCreate vs %lx type %s\n", (unsigned long)vs, type) ; + + // transfer header info + for (i = 0 ; i < 128 ; ++i) + if (vs->info[i]) vf->info[i] = infoDeepCopy (vs->info[i]) ; + + // find type in schema + while ((vs = vs->nxt)) + if (!strcmp (type, vs->primary)) + break ; + else if (vs->nSecondary) + { for (j = 0 ; j < vs->nSecondary ; ++j) + if (!strcmp (type, vs->secondary[j])) break ; + if (j < vs->nSecondary) { secondary = vs->secondary[j] ; break ; } + } + if (!vs) + { oneFileDestroy (vf) ; + return 0 ; // failed to find a match + } + + // transfer info from matched schema + for (i = 0 ; i < 128 ; ++i) + if (vs->info[i]) vf->info[i] = infoDeepCopy (vs->info[i]) ; + + // build binaryTypeUnpack[] + for (i = 0 ; i < 128 ; ++i) + if (vf->info[i] && vf->info[i]->binaryTypePack) + { U8 x = vf->info[i]->binaryTypePack ; + vf->binaryTypeUnpack[x] = i ; + vf->binaryTypeUnpack[x+1] = i ; + } + + // set other information + vf->objectType = vs->objectType ; + vf->groupType = vs->groupType ; + vf->fileType = new (strlen(vs->primary)+1, char); + strcpy (vf->fileType, vs->primary) ; + if (secondary) + { vf->subType = new (strlen(secondary)+1, char); + strcpy (vf->subType, secondary) ; + } + vf->nFieldMax = vs->nFieldMax ; + vf->field = new (vf->nFieldMax, OneField) ; + + // setup for compression + + vf->codecTrainingSize = 100000; + setCodecBuffer (vf->info[';']) ; + + // determine endian of machine + { int t = 1; + char *b = 
(char *) (&t); + vf->isBig = (*b == 0); + } + + *vsp = vs ; + return vf ; +} + +static void provRefDefCleanup (OneFile *vf) +{ int n ; + + if (vf->provenance) + { OneProvenance *p = vf->provenance ; + for (n = vf->info['!']->accum.count ; n-- ; p++) + { free (p->program) ; + free (p->version) ; + free (p->command) ; + free (p->date) ; + } + free (vf->provenance) ; + } + if (vf->reference) + { OneReference *r = vf->reference ; + for (n = vf->info['<']->accum.count ; n-- ; r++) + free (r->filename) ; + free (vf->reference) ; + } + if (vf->deferred) + { OneReference *r = vf->deferred ; + for (n = vf->info['>']->accum.count ; n-- ; r++) + free (r->filename) ; + free (vf->deferred) ; + } +} + +static void oneFileDestroy (OneFile *vf) +{ int i, j; + OneInfo *li, *lx; + + if (vf->share) + { for (i = 0; i < 128 ; i++) + { lx = vf->info[i]; + if (lx != NULL) + { for (j = 1; j < vf->share; j++) + { li = vf[j].info[i]; + if (li != lx) // the index OneInfos are shared + { if (li->listCodec == lx->listCodec) li->listCodec = NULL; + infoDestroy(li); + } + } + } + } + + for (j = 1; j < vf->share; j++) + { provRefDefCleanup (&vf[j]) ; + if (vf[j].codecBuf != NULL) free (vf[j].codecBuf); + if (vf[j].f != NULL) fclose (vf[j].f); + } + } + + provRefDefCleanup (vf) ; + if (vf->codecBuf != NULL) free (vf->codecBuf); + if (vf->f != NULL && vf->f != stdout) fclose (vf->f); + + for (i = 0; i < 128 ; i++) + if (vf->info[i] != NULL) + infoDestroy (vf->info[i]); + + if (vf->field) free (vf->field) ; + + if (vf->headerText) + { OneHeaderText *t = vf->headerText ; + while (t) + { free (t->text) ; + { OneHeaderText *nxt = t->nxt ; free (t) ; t = nxt ; } + } + } + + free(vf->fileType); + free(vf->subType); + + free (vf) ; +} + +/*********************************************************************************** + * + * ASCII PARSING UTILITIES: error reporting, lexical level + * + **********************************************************************************/ + +void parseError (OneFile *vf, 
char *format, ...) +{ va_list args; + + fprintf (stderr, "ONE PARSE ERROR "); + + va_start (args, format); + vfprintf (stderr, format, args); + va_end (args); + + vf->lineBuf[vf->linePos] = '\0'; + fprintf (stderr, ", line %" PRId64 ": %s\n", vf->line, vf->lineBuf); + + exit (1); +} + +static inline char vfGetc(OneFile *vf) +{ char c = getc(vf->f); + if (vf->linePos < 127) + vf->lineBuf[vf->linePos++] = c; + return c; +} + +static inline void eatWhite (OneFile *vf) +{ char x = vfGetc(vf); + if (x == ' ') // 200414: removed option to have tab instead of space + return; + parseError (vf, "failed to find expected space separation character"); +} + +static inline char readChar(OneFile *vf) +{ eatWhite(vf); + return vfGetc(vf); +} + +static inline char *readBuf(OneFile *vf) +{ char x, *cp, *endBuf; + + eatWhite (vf); + endBuf = vf->numberBuf + 32; + for (cp = vf->numberBuf; cp < endBuf ; cp++) + { x = vfGetc(vf); + if (isspace(x) || x == '\0' || x == EOF) + break; + *cp = x; + } + if (cp >= endBuf) + { cp[-1] = 0; + parseError (vf, "overlong item %s", vf->numberBuf); + } + else + { ungetc (x, vf->f); + vf->linePos -= 1; + *cp = 0; + } + return vf->numberBuf; +} + +static inline I64 readInt(OneFile *vf) +{ char *e, *b; + I64 x; + + b = readBuf(vf); + x = strtoll(b, &e, 10); + if (e == b) + parseError (vf, "empty int field"); + if (*e != '\0') + parseError (vf, "bad int"); + return x; +} + +static inline double readReal(OneFile *vf) +{ char *e, *b; + double x; + + b = readBuf(vf); + x = strtod (b, &e); + if (e == b) + parseError (vf, "empty real field"); + if (*e != '\0') + parseError (vf, "bad real"); + return (x); +} + +static inline void readString(OneFile *vf, char *buf, I64 n) +{ eatWhite (vf); + if (vf->isCheckString) + { char *cp = buf; + --cp; + while (n-- && (*++cp = vfGetc (vf))) + if (*cp == '\n' || *cp == EOF) + break; + if (++n) + parseError (vf, "line too short %d", buf); + *++cp = 0; + } + else + { if ((I64) fread (buf, 1, n, vf->f) != n) + die ("ONE parse 
error: failed to read %d byte string", n); + buf[n] = 0 ; + } +} + +static inline void readFlush (OneFile *vf) // reads to the end of the line and stores as comment +{ char x; + int n = 0; + OneInfo *li = vf->info['/'] ; + + // check the first character - if it is newline then done + x = getc (vf->f) ; + if (x == '\n') + return ; + else if (x != ' ') + parseError (vf, "comment not separated by a space") ; + + // else the remainder of the line is a comment + if (!li->bufSize) + { li->bufSize = 1024 ; + li->buffer = new (li->bufSize, char) ; + } + while ((x = getc (vf->f)) && x != '\n') + if (x == EOF) + parseError (vf, "premature end of file"); + else + { if ((n+1) >= li->bufSize) + { char *s = new (2*li->bufSize, char) ; + memcpy (s, li->buffer, li->bufSize) ; + free (li->buffer) ; + li->buffer = s ; + li->bufSize *= 2 ; + } + ((char*)li->buffer)[n] = x ; + ++n ; + } + ((char*)li->buffer)[n] = 0 ; // string terminator +} + + +/*********************************************************************************** + * + * LIST BUFFER & COUNT MANAGEMENT: error reporting, lexical level + * + **********************************************************************************/ + + // Ensure line type t buffer can handles size+nStrings, and accumulate counts + +static inline void updateCountsAndBuffer (OneFile *vf, char t, I64 size, I64 nStrings) +{ OneInfo *li; + + li = vf->info[(int) t]; + li->accum.total += size; + if (size > li->accum.max) + li->accum.max = size; + size += nStrings; // need to allocate space for terminal 0s + if ( ! 
li->isUserBuf && size > li->bufSize) // expand buffer + { if (li->buffer != NULL) free (li->buffer); + li->bufSize = size; + li->buffer = new (size*li->listEltSize, void); + } +} + + // Called when a new group starts or eof, accumulate group counts since last group start + +static inline void updateGroupCount(OneFile *vf, bool isGroupLine) +{ int i; + OneInfo *li; + OneCounts *ci; + + for (i = 'A'; i <= 'Z' ; i++) + { li = vf->info[i]; + if (li != NULL) + { ci = &(li->accum); + if (vf->inGroup) + { if (ci->groupCount < ci->count - li->gCount) + ci->groupCount = ci->count - li->gCount; + if (ci->groupTotal < ci->total - li->gTotal) + ci->groupTotal = ci->total - li->gTotal; + } + else + { li->oCount = ci->count; + li->oTotal = ci->total; + } + li->gCount = ci->count; + li->gTotal = ci->total; + } + } + if (isGroupLine) + { vf->group += 1; + vf->inGroup = true; + } +} + +/*********************************************************************************** + * + * BINARY INT LIST COMPACTION & UNCOMPACTION + * + **********************************************************************************/ + +static char *compactIntList (OneFile *vf, OneInfo *li, I64 len, char *buf, int *usedBytes) +{ char *y; + int d, k; + I64 z, i, mask, *ibuf; + + ibuf = (I64 *) buf; + + for (i = len-1; i > 0; i--) // convert to differences - often a big win, else harmless + ibuf[i] -= ibuf[i-1]; + + mask = 0; // find how many top bytes can be skipped + for (i = 1; i < len; i++) + if (ibuf[i] >= 0) + mask |= ibuf[i]; + else + mask |= -(ibuf[i]+1); + + k = sizeof(I64) ; + mask >>= 7; + for (d = 1; d < k; d++) + { if (mask == 0) + break; + mask >>= 8; + } + *usedBytes = d ; + + z = k - d; // number of 0 bytes + if (z == 0) return (char*)&ibuf[1] ; + + if (buf != li->buffer && !li->isUserBuf && (I64) (li->bufSize*sizeof(I64)) < d*len) + { if (li->buffer != NULL) + free (li->buffer); + li->bufSize = ((d*len) / sizeof(I64)) + 1; + li->buffer = new (li->bufSize * sizeof(I64), void); + } + + y = 
li->buffer ; + buf += sizeof(I64) ; --len ; // don't record the first element of buf, which is not a diff + if (vf->isBig) // copy d bytes per I64, ignoring z before or after depending on isBig + while (len--) + { buf += z; + for (k = 0; k < d; k++) + *y++ = *buf++; + } + else + while (len--) + { for (k = 0; k < d; k++) + *y++ = *buf++; + buf += z; + } + + return li->buffer ; +} + +static void decompactIntList (OneFile *vf, I64 len, char *buf, int usedBytes) +{ int d, z, k; + char *s, *t; + + z = sizeof(I64) - usedBytes ; + + if (z > 0) // decompacts in place + { buf += sizeof(I64) ; --len ; // don't decompact 0th element + d = usedBytes; + s = buf + d*len; + t = s + z*len; + if (vf->isBig) + while (s > buf) + { for (k = 0; k < d; k++) + *--t = *--s; + if (*s & 0x80) + for (k = 0; k < z; k++) + *--t = 0xff; + else + for (k = 0; k < z; k++) + *--t = 0x0; + } + else + while (s > buf) + { if (s[-1] & 0x80) + for (k = 0; k < z; k++) + *--t = 0xff; + else + for (k = 0; k < z; k++) + *--t = 0; + for (k = 0; k < d; k++) + *--t = *--s; + } + buf -= sizeof(I64) ; ++len ; + } + + { I64 i, *x = (I64 *) buf; // revert differencing + for (i = 1; i < len; i++) + x[i] += x[i-1]; + } +} + +// read and write compressed fields + +static inline int writeCompressedFields (FILE *f, OneField *field, OneInfo *li) +{ + int i, n = 0 ; + + for (i = 0 ; i < li->nField ; ++i) + switch (li->fieldType[i]) + { + case oneREAL: fwrite (&field[i].r, 8, 1, f) ; n += 8 ; break ; + case oneCHAR: putc (field[i].c, f) ; ++n ; break ; + default: // includes INT and all the LISTs, which store their length in field as an INT + n += ltfWrite (field[i].i, f) ; + } + + return n ; +} + +static inline void readCompressedFields (FILE *f, OneField *field, OneInfo *li) +{ + int i ; + + for (i = 0 ; i < li->nField ; ++i) + switch (li->fieldType[i]) + { + case oneREAL: fread (&field[i].r, 8, 1, f) ; break ; + case oneCHAR: field[i].c = fgetc (f) ; break ; + default: // includes INT and all the LISTs, which store 
their length in field as an INT + field[i].i = ltfRead (f) ; + } +} + +/*********************************************************************************** + * + * ONE_READ_LINE: + * Reads the next line and returns false at end of file or on error. The line is + * parsed according to its linetype and contents accessed by macros that follow. + * The top bit of the first character determines whether the line is binary or ascii + * + **********************************************************************************/ + + // Read a string list, first into new allocs, then into sized line buffer. + // Annoyingly inefficient, but we don't use it very much. + +static void readStringList(OneFile *vf, char t, I64 len) +{ int j; + I64 totLen, sLen; + char **string, *buf; + + totLen = 0; + string = new (len, char *); + for (j = 0; j < len ; ++j) + { sLen = readInt (vf); + totLen += sLen; + string[j] = new (sLen+1, char); + readString (vf, string[j], sLen); + } + + updateCountsAndBuffer (vf, t, totLen, len); + + buf = (char *) vf->info[(int) t]->buffer; + for (j = 0; j < len ; ++j) + { strcpy (buf, string[j]); + buf += strlen(buf) + 1; + free (string[j]); + } + free (string); +} + +static bool addProvenance(OneFile *vf, OneProvenance *from, int n) ; // need forward declaration + +char oneReadLine (OneFile *vf) +{ bool isAscii; + U8 x; + char t; + OneInfo *li; + + assert (!vf->isWrite) ; + assert (!vf->isFinal) ; + + vf->linePos = 0; // must come before first vfGetc() + x = vfGetc (vf); // read first char + if (feof (vf->f) || x == '\n') // blank line (x=='\n') is end of records marker before footer + { vf->lineType = 0 ; // additional marker of end of file + return 0; + } + + vf->line += 1; // otherwise assume this is a good line, and die if not + if (x & 0x80) + { isAscii = false; + t = vf->binaryTypeUnpack[x]; + } + else + { isAscii = true; + t = x; + } + vf->lineType = t; + + li = vf->info[(int) t]; + if (li == NULL) + parseError (vf, "unknown line type %c(%d was %d) line 
%d", t, t, x, (int)vf->line); + li->accum.count += 1; + if (t == vf->objectType) + vf->object += 1; + if (t == vf->groupType) + updateGroupCount (vf, true); + + // fprintf (stderr, "reading line %" PRId64 " type %c nField %d listElt %d\n", vf->line, t, li->nField, li->listEltSize) ; + + if (vf->info['/']->bufSize) // clear the comment buffer + *(char*)(vf->info['/']->buffer) = 0 ; + + vf->nBits = 0 ; // will use for any compressed data read in + + if (isAscii) // read field by field according to ascii spec + { int i, j; + I64 *ilst, len; + double *rlst; + + for (i = 0; i < li->nField; i++) + switch (li->fieldType[i]) + { + case oneINT: + vf->field[i].i = readInt (vf); + // printf (" field %d int %d\n", i, (int)oneInt(vf,i)) ; + break; + case oneREAL: + vf->field[i].r = readReal (vf); + break; + case oneCHAR: + vf->field[i].c = readChar (vf); + // printf (" field %d char %c\n", i, (int)oneChar(vf,i)) ; + break; + case oneSTRING: + case oneDNA: + len = readInt (vf); + vf->field[i].len = len; + updateCountsAndBuffer (vf, t, len, 1); + readString (vf, (char*) li->buffer, len); + break; + case oneINT_LIST: + len = readInt (vf); + vf->field[i].len = len; + updateCountsAndBuffer (vf, t, len, 0); + ilst = (I64 *) li->buffer; + for (j = 0; j < len; ++j) + ilst[j] = readInt(vf); + break; + case oneREAL_LIST: + len = readInt (vf); + vf->field[i].len = len; + updateCountsAndBuffer (vf, t, len, 0); + rlst = (double *) li->buffer; + for (j = 0; j < len; ++j) + rlst[j] = readReal (vf); + break; + case oneSTRING_LIST: // STRING_LIST - inefficient for now - also used for binary + len = readInt (vf); + vf->field[i].len = len; + // printf (" field %d string list len %d\n", i, (int)oneLen(vf)) ; + readStringList (vf, t, len); + break; + } + readFlush (vf); + } + + else // binary - block read fields and list, potentially compressed + { + // read the fields + + if (li->nField > 0) + readCompressedFields (vf->f, vf->field, li) ; + + if (t == vf->groupType) // must follow reading the 
fields + { I64 *groupIndex = (I64 *) vf->info['*']->buffer; + oneInt(vf,0) = groupIndex[vf->group] - groupIndex[vf->group-1]; + } + + // read the list if there is one + + if (li->listEltSize > 0) + { I64 listLen = oneLen(vf); + + if (listLen > 0) + { li->accum.total += listLen; + if (listLen > li->accum.max) + li->accum.max = listLen; + + if (li->fieldType[li->listField] == oneINT_LIST) + { *(I64*)li->buffer = ltfRead (vf->f) ; + if (listLen == 1) goto doneLine ; + vf->intListBytes = getc(vf->f) ; + } + + if (li->fieldType[li->listField] == oneSTRING_LIST) // handle as ASCII + readStringList (vf, t, listLen); + else if (x & 0x1) // list is compressed + { vf->nBits = ltfRead (vf->f) ; + if (fread (vf->codecBuf, ((vf->nBits+7) >> 3), 1, vf->f) != 1) + die ("ONE read error: fail to read compressed list"); + } + else if (li->fieldType[li->listField] == oneINT_LIST) + { I64 listSize = (listLen-1) * vf->intListBytes ; + if ((I64) fread (&(((I64*)li->buffer)[1]), 1, listSize, vf->f) != listSize) + die ("ONE read error: failed to read list size %" PRId64 "", listSize); + decompactIntList (vf, listLen, li->buffer, vf->intListBytes); + } + else + { I64 listSize = listLen * li->listEltSize ; + if ((I64) fread (li->buffer, 1, listSize, vf->f) != listSize) + die ("ONE read error: failed to read list size %" PRId64 "", listSize); + } + } + + if (li->fieldType[li->listField] == oneSTRING) + ((char *) li->buffer)[listLen] = '\0'; // 0 terminate + } + + doneLine: + + { U8 peek = getc(vf->f) ; // check if next line is a comment - if so then read it + ungetc(peek, vf->f) ; + if (peek & 0x80) + peek = vf->binaryTypeUnpack[peek]; + if (peek == '/') // a comment + { OneField keepField0 = vf->field[0] ; + oneReadLine (vf) ; // read comment line into vf->info['/']->buffer + vf->lineType = t ; + vf->field[0] = keepField0 ; + } + } + } + + return t; +} + +char *oneReadComment (OneFile *vf) +{ char *comment = (char*)(vf->info['/']->buffer) ; + + if (comment && *comment != 0) + return comment 
; + else + return 0 ; +} + +void *_oneList (OneFile *vf) +{ + OneInfo *li = vf->info[(int) vf->lineType] ; + + if (vf->nBits) + { if (li->fieldType[li->listField] == oneINT_LIST) // first elt is already in buffer + { vcDecode (li->listCodec, vf->nBits, vf->codecBuf, (char*)&(((I64*)li->buffer)[1])) ; + decompactIntList (vf, oneLen(vf), li->buffer, vf->intListBytes) ; + } + else + vcDecode (li->listCodec, vf->nBits, vf->codecBuf, li->buffer) ; + vf->nBits = 0 ; // so we don't do it again + } + + return li->buffer ; +} + +void *_oneCompressedList (OneFile *vf) +{ + OneInfo *li = vf->info[(int) vf->lineType] ; + + if (!vf->nBits && oneLen(vf) > 0) // need to compress + vcEncode (li->listCodec, oneLen(vf), vf->info[(int) vf->lineType]->buffer, vf->codecBuf); + + return (void*) vf->codecBuf ; +} + +/*********************************************************************************** + * + * ONE_FILE_OPEN_READ: + * Opens file for reading and reads all lines of header if it exists. + * If there is no header then fileType must be given (i.e. non-zero), + * otherwise if fileType is non-zero then it must match the type in the header. + * If the header contains a $-line then it is binary and the routine reads + * the footer that includes decompressors and indexes. 
+ * + **********************************************************************************/ + +OneFile *oneFileOpenRead (const char *path, OneSchema *vs, char *fileType, int nthreads) +{ + OneFile *vf ; + off_t startOff = 0, footOff; + OneSchema *vs0 = vs ; + bool isDynamic = false ; // if we are making the schema from the header + + assert (fileType == NULL || strlen(fileType) > 0) ; + + // first open the file, read first header line if it exists, and create the OneFile object + + { FILE *f ; + char *name ; + int curLine = 0 ; + U8 c ; + + if (strcmp (path, "-") == 0) + f = stdin; + else + { f = fopen (path, "r"); + if (f == NULL) + return NULL; + } + +#define OPEN_ERROR1(x) { fprintf (stderr,"ONE file error %s: %s\n", path, x) ; \ + fclose(f) ; return NULL; } +#define OPEN_ERROR3(x,y,z) { fprintf (stderr,"ONE file error %s: ", path) ; \ + fprintf (stderr,x,y,z) ; fprintf (stderr, "\n") ; fclose(f) ; return NULL ; } + + c = getc(f); + if (feof(f)) + OPEN_ERROR1("file is empty") ; + + if (c == '1') + { int major, minor, slen; + + if (fscanf (f, " %d", &slen) != 1) + OPEN_ERROR1("line 1: failed to read type name length") ; + if (slen == 0) + OPEN_ERROR1("line 1: type name is empty string") ; + name = new0 (slen+1, char); + if (fscanf (f, " %s %d %d", name, &major, &minor) != 3) + OPEN_ERROR1("line 1: failed to read remainder of line") ; + while (getc(f) != '\n') + if (feof(f)) + OPEN_ERROR1("end of file before end of line 1") ; + ++curLine ; + if (major != MAJOR) + OPEN_ERROR3("major version file %d > code %d", major, MAJOR) ; + if (minor > MINOR) + OPEN_ERROR3("minor version file %d > code %d", minor, MINOR) ; + } + else + { ungetc (c, f) ; + if (!fileType) + OPEN_ERROR1("attempting to open a file without the type being defined") ; + name = new0 (strlen(fileType)+1, char); + strcpy (name, fileType) ; + } + + if (!vs) // create a shell schema, which can be filled from the header + { vs0 = vs = oneSchemaCreateDynamic (name, 0) ; + isDynamic = true ; + } + + vf = 
oneFileCreate (&vs, name) ; + if (!vf) + OPEN_ERROR1("failed to create OneFile object") ; + if (fileType && strcmp (fileType, vf->fileType) && strcmp (fileType, vf->subType)) + { oneFileDestroy (vf) ; + OPEN_ERROR3("fileType mismatch file %s != requested %s", vf->fileType, fileType) ; + } + + free(name); + + vf->f = f; + vf->line = curLine; + } + + // read header and (optionally) footer + // recognise end of header by peeking at the first char to check if alphabetic + + vf->isCheckString = true; // always check strings while reading header + while (true) + { U8 peek = getc(vf->f); + + if (feof(vf->f)) // loop exit at end of file + break; + ungetc(peek, vf->f); + + if (peek & 0x80) + peek = vf->binaryTypeUnpack[peek]; + + if (isalpha(peek)) + break; // loop exit at standard data line + + oneReadLine(vf); // can't fail because we checked file eof already + + switch (vf->lineType) + { + case '1': + parseError(vf, "1 should be first line in header"); + break; + + case '2': + if (oneLen(vf) != 3) + parseError (vf, "secondary subType must have length 3") ; + if (isDynamic) + { char *s = oneString(vf); + vf->subType = new (oneLen(vf)+1, char); + strcpy (vf->subType, s) ; + } + else + { char *sub = oneString(vf) ; + int i ; + for (i = 0 ; i < vs->nSecondary ; ++i) + if (!strcmp (sub, vs->secondary[i])) break ; + if (i < vs->nSecondary) + { vf->subType = new (strlen(sub)+1, char); + strcpy (vf->subType, sub) ; + } + else + parseError (vf, "subtype %s not compatible with primary type %s", + sub, vf->fileType); + } + break; + + case '.': // blank line for spacing and header text + { char *text = oneReadComment (vf) ; + if (text) + { OneHeaderText **t = &vf->headerText ; + while (*t) t = &((*t)->nxt) ; + *t = new0 (1, OneHeaderText) ; + (*t)->text = strdup (text) ; + } + break ; + } + + case '~': // schema definition line + { char t = oneChar(vf,1) ; + OneInfo *vi = vs->info[(int)t] ; + if (vi) + { if (!infoCheckFields (vi, vf)) + { fprintf (stderr, "ONE file error %s: schema 
mismatch line %" PRId64 " linetype %c\n", + path, vf->line, t) ; + oneFileDestroy (vf) ; + return NULL ; + } + } + else if (isDynamic) + { int oldMax = vf->nFieldMax ; + schemaAddInfoFromLine (vs, vf, t, oneChar(vf,0)) ; + if (oneChar(vf,0) == 'G') vf->groupType = vs->groupType ; + if (oneChar(vf,0) == 'O') vf->objectType = vs->objectType ; + vi = vs->info[(int)t] ; + vf->info[(int)t] = infoDeepCopy (vi) ; + if (vi->binaryTypePack) + { U8 x = vi->binaryTypePack ; + vf->binaryTypeUnpack[x] = t ; + vf->binaryTypeUnpack[x+1] = t ; + } + if (vs->nFieldMax > oldMax) + { free (vf->field) ; + vf->nFieldMax = vs->nFieldMax ; + vf->field = new (vf->nFieldMax, OneField) ; + } + } + } + break ; + + case '#': // count information + case '@': + case '+': + case '%': + { char c = oneChar(vf,0); + OneInfo *li = vf->info[(int) c]; + + if (li == NULL) + parseError (vf, "unknown line type %c", c); + switch (vf->lineType) + { case '#': + li->given.count = oneInt(vf,1); + if (c == vf->objectType && vf->isBinary) // allocate space for object index + { vf->info['&']->bufSize = li->given.count; + vf->info['&']->buffer = new (li->given.count, I64); + } + if (c == vf->groupType && vf->isBinary) // allocate space for group index + { vf->info['*']->bufSize = li->given.count+1; // +1 for end value + vf->info['*']->buffer = new (vf->info['*']->bufSize, I64); + } + break; + case '@': + li->given.max = oneInt(vf,1); + li->bufSize = li->given.max + 1; // allow for string terminators + li->buffer = new (li->bufSize*li->listEltSize, void); + break; + case '+': + li->given.total = oneInt(vf,1); + break; + case '%': + c = oneChar(vf,2); + li = vf->info[(int) c]; + if (li == NULL) + parseError (vf, "unknown line type %c", c); + c = oneChar(vf,1); + if (c == '#') + li->given.groupCount = oneInt(vf,3); + else if (c == '+') + li->given.groupTotal = oneInt(vf,3); + else + parseError (vf, "unrecognised symbol %c", c); + break; + } + } + break; + + case '!': // NB need to copy the strings + { OneProvenance 
p ; + p.program = oneString(vf) ; + p.version = p.program + strlen(p.program) + 1 ; + p.command = p.version + strlen(p.version) + 1 ; + p.date = p.command + strlen(p.command) + 1 ; + vf->info['!']->accum.count -= 1; // to avoid double counting + addProvenance (vf, &p, 1) ; + } + break; + + case '<': + vf->info['<']->accum.count -= 1; // to avoid double counting + oneAddReference (vf, oneString(vf), oneInt(vf,1)); + break; + + case '>': + vf->info['>']->accum.count -= 1; // to avoid double counting + oneAddDeferred (vf, oneString(vf)); + break; + + // Below here are binary file header types - requires given.count/given.max first + + case '$': // read footer - goto end, find offset to start of footer and go there + if (oneInt(vf,0) != vf->isBig) + die ("ONE file error: endian mismatch - convert file to ascii"); + vf->isBinary = true; + + startOff = ftello (vf->f); + if (fseek (vf->f, -sizeof(off_t), SEEK_END) != 0) + die ("ONE file error: can't seek to final line"); + + if (fread (&footOff, sizeof(off_t), 1, vf->f) != 1) + die ("ONE file error: can't read footer offset"); + + if (fseeko (vf->f, footOff, SEEK_SET) != 0) + die ("ONE file error: can't seek to start of footer"); + break; + + case '^': // end of footer - return to where we jumped from header + if (fseeko (vf->f, startOff, SEEK_SET) != 0) + die ("ONE file error: can't seek back"); + break; + + case '&': + vf->isIndexIn = true; + break; + + case '*': + break; + + case ';': + vf->info[(int) oneChar(vf,0)]->listCodec = vcDeserialize (oneString(vf)); + break; + + default: + parseError (vf, "unknown header line type %c", vf->lineType); + break; + } + } + vf->isCheckString = false; // user can set this back to true if they wish + + if (!vf->objectType) // failed to get a schema from function call or from file + { fprintf (stderr, "ONEfile error %s: no schema available\n", path) ; + oneFileDestroy (vf) ; + return NULL ; + } + + // allocate codec buffer - always allocate enough to handle fields of all line types + 
+ { I64 size = vf->nFieldMax * sizeof(OneField) ; + int i ; + + for (i = 0; i < 128; ++i) + if (vf->info[i]) + { OneInfo *li = vf->info[i]; + if (li->listCodec && size < li->given.max * li->listEltSize) + size = li->given.max * li->listEltSize; + } + vf->codecBufSize = size+1; + vf->codecBuf = new (vf->codecBufSize, void); // add one for worst case codec usage + } + + // if parallel, allocate a OneFile array for parallel thread objects, switch vf to head of array + + if (nthreads > 1) + { int i, j ; + + if (strcmp (path, "-") == 0) + die ("ONE error: parallel input incompatible with stdin as input"); + + { OneFile *vf0 = vf ; + vf = new (nthreads, OneFile); + vf[0] = *vf0 ; + vf->share = nthreads ; + free (vf0) ; // NB free() not oneFileDestroy because don't want deep destroy + } + + startOff = ftello (vf->f) ; + for (i = 1; i < nthreads; i++) + { vs = vs0 ; // needed because vs will have changed to map to the relevant page + OneFile *v = oneFileCreate(&vs, vf->fileType); // need to do this after header is read + vf[i] = *v ; + free (v) ; + v = vf+i; + + v->share = -i ; // so this slave knows its own identity + + v->f = fopen (path, "r") ; // need an independent file handle + if (fseeko (v->f, startOff, SEEK_SET) != 0) + die ("ONE file error: can't seek to start of data"); + + for (j = 0; j < 128; j++) + { OneInfo *li = v->info[j]; + if (li != NULL) + { OneInfo *l0 = vf->info[j]; + if (li->listCodec) vcDestroy (li->listCodec) ; + li->listCodec = l0->listCodec; + if (li->listEltSize > 0) + { li->bufSize = l0->bufSize; + if (li->buffer) free (li->buffer) ; + li->buffer = new (l0->bufSize*l0->listEltSize, void); + } + li->given = l0->given; + } + } + + v->codecBufSize = vf->codecBufSize; + if (v->codecBuf) free (v->codecBuf) ; + v->codecBuf = new (v->codecBufSize, void); + + v->info['&']->listCodec = 0 ; + infoDestroy (v->info['&']); + v->info['*']->listCodec = 0 ; + infoDestroy (v->info['*']); + v->info['&'] = vf->info['&']; + v->info['*'] = vf->info['*']; + + 
v->isIndexIn = vf->isIndexIn; + if (vf->subType != NULL) + { v->subType = new (strlen(vf->subType)+1, char); + strcpy (v->subType, vf->subType) ; + } + else + v->subType = NULL; + } + } // end of parallel threads block + + if (isDynamic) + oneSchemaDestroy (vs0) ; + + return vf; +} + +/*********************************************************************************** + * + * ONE_USER_BUFFER / CLOSE / GOTO + * + **********************************************************************************/ + + // This lets the user reassign the buffer that lists in a particular line type are read into. + // If this is not set, a default buffer is provided. If buffer == NULL then the package + // reverts to the default buffer. This routine can be called repeatedly. + // NB the package doesn't check the size of a user supplied buffer - the user must allocate + // enough memory for all forthcoming list data. + +void oneUserBuffer (OneFile *vf, char lineType, void *buffer) +{ OneInfo *li; + + li = vf->info[(int) lineType]; + if (buffer != NULL) + { if ( ! 
li->isUserBuf && li->buffer != NULL) + { free (li->buffer); + li->bufSize = 0; + } + li->buffer = buffer; + li->isUserBuf = true; + } + else + { if (li->isUserBuf) + { li->bufSize = li->given.max + 1; + li->buffer = new (li->given.max*li->listEltSize, void); + } + li->isUserBuf = false; + } +} + +bool oneGotoObject (OneFile *vf, I64 i) +{ if (vf != NULL && vf->isIndexIn && vf->objectType) + if (0 <= i && i < vf->info[(int) vf->objectType]->given.count) + if (fseek (vf->f, ((I64 *) vf->info['&']->buffer)[i], SEEK_SET) == 0) + { vf->object = i; + return true ; + } + return false ; +} + +I64 oneGotoGroup (OneFile *vf, I64 i) +{ if (vf != NULL && vf->isIndexIn && vf->groupType) + if (0 <= i && i < vf->info[(int) vf->groupType]->given.count) + { I64 *groupIndex = (I64 *) vf->info['*']->buffer; + if (!oneGotoObject(vf,groupIndex[i])) + return 0 ; + return (groupIndex[i+1] - groupIndex[i]); + } + return 0 ; +} + +/*********************************************************************************** + * + * ONE_OPEN_WRITE_(NEW | FROM) + * + **********************************************************************************/ + +OneFile *oneFileOpenWriteNew (const char *path, OneSchema *vs, char *fileType, + bool isBinary, int nthreads) +{ OneFile *vf ; + FILE *f ; + OneSchema *vs0 = vs ; + + if (strcmp (path, "-") == 0) + f = stdout; + else + { f = fopen (path, "w"); + if (f == NULL) + return NULL ; + } + + vf = oneFileCreate (&vs, fileType) ; + if (!vf) + return NULL ; + + vf->f = f; + vf->isWrite = true; + vf->isBinary = isBinary; + vf->isLastLineBinary = true; // we don't want to add a newline before the first true line + + vf->codecBufSize = vf->nFieldMax*sizeof(OneField) + 1; + vf->codecBuf = new (vf->codecBufSize, void); + + if (nthreads > 1) + { OneFile *v, *vf0 = vf ; + int i ; + char name[100] ; + int pid = getpid() ; + + vf->share = nthreads ; + vf->fieldLock = mutexInit; + vf->listLock = mutexInit; + vf = new (nthreads, OneFile); + vf[0] = *vf0 ; + free (vf0) ; // 
NB free() not oneFileDestroy because don't want deep destroy + + for (i = 1; i < nthreads; i++) + { vs = vs0 ; // needed because vs will have changed in prevous oneFileCreate call + v = oneFileCreate (&vs, fileType); + + v->isWrite = true; + v->isBinary = isBinary; + v->isLastLineBinary = isBinary; + + v->codecBufSize = vf->codecBufSize; + v->codecBuf = new (v->codecBufSize, void); + v->codecTrainingSize /= 3*nthreads; + v->share = -i; + + sprintf(name,".part.%d.%d",pid,i); + f = fopen (name, "w"); + if (f == NULL) + die ("ONE file error: cannot create temporary file %d for parallel write", i); + v->f = f; + + vf[i] = *v; + free (v); + } + } + + return vf; +} + +static inline void infoCopy (OneSchema *vs, OneFile *vfIn, char t, char type) +{ + OneInfo *vi = vfIn->info[(int)t] ; + schemaAddInfoFromArray (vs, vi->nField, vi->fieldType, t, type) ; + if (vi->comment) vs->info[(int)t]->comment = strdup (vi->comment) ; +} + +OneFile *oneFileOpenWriteFrom (const char *path, OneFile *vfIn, bool isBinary, int nthreads) +{ + // first build a schema from vfIn + OneSchema *vs0 = oneSchemaCreateDynamic (vfIn->fileType, vfIn->subType) ; + OneSchema *vs = vs0->nxt ; // this is the actual schema - vs0 is for the header + + if (vfIn->groupType) infoCopy (vs, vfIn, vfIn->groupType, 'G') ; // first the group + infoCopy (vs, vfIn, vfIn->objectType, 'O') ; // next the object + int i ; // then the rest of the record lines + for (i = 'A' ; i <= 'z' ; ++i) + if (isalpha(i) && vfIn->info[i] && i != vfIn->groupType && i != vfIn->objectType) + infoCopy (vs, vfIn, (char)i, 'D') ; + // use it to open the file + OneFile *vf = oneFileOpenWriteNew (path, vs0, vfIn->subType ? 
vfIn->subType : vfIn->fileType, + isBinary, nthreads); + oneSchemaDestroy (vs0) ; + if (!vf) + return NULL ; + + oneInheritProvenance (vf, vfIn); + oneInheritReference (vf, vfIn); + oneInheritDeferred (vf, vfIn); + + // set info[]->given, and resize codecBuf accordingly + I64 size = vf->codecBufSize; + for (i = 0; i < 128 ; ++i) + if (vf->info[i]) + { OneInfo *vi = vf->info[i]; + vi->given = vfIn->info[i]->given ; + if (vi->listCodec) + { I64 sz = vi->given.max * vi->listEltSize; + if (sz >= size) + size = sz+1; + } + } + if (size > vf->codecBufSize) + for (i = 0 ; i < nthreads ; ++i) + { OneFile *v = &(vf[i]) ; + if (v->codecBuf) free (v->codecBuf) ; + v->codecBufSize = size; + v->codecBuf = new (size, void); + } + + return vf ; +} + +bool oneFileCheckSchema (OneFile *vf, char *textSchema) +{ + char *fixedText = schemaFixNewlines (textSchema) ; + OneSchema *vs = oneSchemaCreateFromText (fixedText) ; + free (fixedText) ; + OneSchema *vs0 = vs ; // need to keep the root to destroy the full schema + + if (vs->nxt) // the textSchema contained at least one 'P' line to define a file type + { while (vs && strcmp (vs->primary, vf->fileType)) vs = vs->nxt ; + if (!vs) + { fprintf (stderr, "OneSchema mismatch: file type %s not found in schema\n", + vf->fileType) ; + oneSchemaDestroy (vs0) ; + return false ; + } + } + + bool isMatch = true ; + int i, j ; + + for (i = 'A' ; i <= 'z' ; ++i) + if (vs->info[i]) + { OneInfo *vis = vs->info[i] ; + OneInfo *vif = vf->info[i] ; + if (!vif) + { fprintf (stderr, "OneSchema mismatch: record type %c missing in file schema\n", i) ; + isMatch = false ; + } + else if (vif->nField != vis->nField) + { fprintf (stderr, "OneSchema mismatch: number of fields for type %c file %d != %d\n", + i, vif->nField, vis->nField) ; + isMatch = false ; + } + else + for (j = 0 ; j < vif->nField ; ++j) + if (vif->fieldType[j] != vis->fieldType[j]) + { fprintf (stderr, "OneSchema mismatch: field %d for type %c file %s != %s\n", + 
j,i,oneTypeString[vif->fieldType[j]],oneTypeString[vis->fieldType[j]]); + isMatch = false ; + } + } + + oneSchemaDestroy (vs0) ; + return isMatch ; +} + +/*********************************************************************************** + * + * SETTING UP PROVENANCE, REFERENCES, & DEFERRALS + * + **********************************************************************************/ + +static bool addProvenance(OneFile *vf, OneProvenance *from, int n) +{ I64 i ; + OneInfo *l = vf->info['!']; + I64 o = l->accum.count; + OneProvenance *p; + + if (n == 0) + return (false); + assert (!vf->isHeaderOut) ; + + l->accum.count += n; + + p = new(o+n, OneProvenance); + if (o > 0) + memcpy (p, vf->provenance, o*sizeof(OneProvenance)); + memcpy (p+o, from, n*sizeof(OneProvenance)); + free (vf->provenance); + vf->provenance = p; + + // finally create self-owned copy of all fields + + p = p+o ; + for (i = 0 ; i < n ; ++i, ++p) + { p->program = strdup(p->program) ; + p->version = strdup(p->version) ; + p->command = strdup(p->command) ; + p->date = strdup(p->date) ; + } + + return (true); +} + +bool oneInheritProvenance(OneFile *vf, OneFile *source) +{ return (addProvenance(vf, source->provenance, source->info['!']->accum.count)); } + +bool oneAddProvenance(OneFile *vf, char *prog, char *version, char *format, ...) 
+{ va_list args ; + OneProvenance p; + time_t t = time(NULL); + + p.program = prog; + p.version = version; + va_start (args, format) ; vasprintf (&p.command, format, args) ; va_end (args) ; + p.date = new (20, char); + strftime(p.date, 20, "%F_%T", localtime(&t)); + addProvenance (vf, &p, 1); + free (p.command) ; + free (p.date) ; + return true ; // always added something +} + +static bool addReference(OneFile *vf, OneReference *from, int n, bool isDeferred) +{ I64 o; + OneInfo *l; + OneReference *r, **t; + I64 i ; + + if (n == 0) + return false; + assert (!vf->isHeaderOut) ; + + if (isDeferred) + { l = vf->info['>']; + t = &(vf->deferred); + } + else + { l = vf->info['<']; + t = &(vf->reference); + } + o = l->accum.count; + l->accum.count += n; + + r = new (o+n, OneReference); + if (o > 0) + memcpy (r, *t, o*sizeof(OneReference)); + memcpy (r+o, from, n*sizeof(OneReference)); + free (*t); + *t = r; + + r += o ; // make self-owned copy of filename strings + for (i = 0 ; i < n ; ++i, ++r) + r->filename = strdup (r->filename) ; + + return true; +} + +bool oneInheritReference(OneFile *vf, OneFile *source) +{ return (addReference(vf, source->reference, source->info['<']->accum.count, false)); } + +bool oneAddReference(OneFile *vf, char *filename, I64 count) +{ OneReference ref; + ref.filename = filename; + ref.count = count; + return (addReference(vf, &ref, 1, false)); +} + +bool oneInheritDeferred (OneFile *vf, OneFile *source) +{ return (addReference (vf, source->deferred, source->info['>']->accum.count, true)); } + +bool oneAddDeferred (OneFile *vf, char *filename) +{ OneReference ref; + ref.filename = filename; + return (addReference (vf, &ref, 1, true)); +} + +/*********************************************************************************** + * + * ONE_WRITE_HEADER / FOOTER + * + **********************************************************************************/ + +static void writeInfoSpec (OneFile *vf, char ci) +{ + int i ; + OneInfo *vi = vf->info[(int) ci] 
; + + if (ci == vf->groupType) + fprintf (vf->f, "\n~ G %c %d", ci, vi->nField) ; + else if (ci == vf->objectType) + fprintf (vf->f, "\n~ O %c %d", ci, vi->nField) ; + else + fprintf (vf->f, "\n~ D %c %d", ci, vi->nField) ; + for (i = 0 ; i < vi->nField ; ++i) + fprintf (vf->f, " %d %s", + (int)strlen(oneTypeString[vi->fieldType[i]]), oneTypeString[vi->fieldType[i]]) ; + if (vi->comment) + oneWriteComment (vf, "%s", vi->comment) ; +} + +static void writeHeader (OneFile *vf) +{ int i,n; + OneInfo *li; + + assert (vf->isWrite) ; + assert (vf->line == 0) ; + assert (vf->share >= 0) ; + + vf->isLastLineBinary = false; // header is in ASCII + + fprintf (vf->f, "1 %lu %s %d %d", strlen(vf->fileType), vf->fileType, MAJOR, MINOR); + vf->line += 1; + if (vf->subType) + { fprintf (vf->f, "\n2 %lu %s", strlen(vf->subType), vf->subType); + vf->line += 1; + } + + // provenance + if (vf->info['!']->accum.count) + { OneProvenance *p = vf->provenance; + n = vf->info['!']->accum.count; + for (i = 0; i < n; i++, p++) + { fprintf (vf->f, "\n! 4 %lu %s %lu %s %lu %s %lu %s", + strlen(p->program), p->program, strlen(p->version), p->version, + strlen(p->command), p->command, strlen(p->date), p->date); + vf->line += 1; + } + } + + fprintf (vf->f, "\n.") ; // always have a spacer after this + + // reference and deferred + if (vf->info['<']->accum.count || vf->info['>']->accum.count) + { OneReference *r = vf->reference; + n = vf->info['<']->accum.count; + for (i = 0; i < n; i++, r++) + { fprintf (vf->f, "\n< %lu %s %" PRId64 "", strlen(r->filename), r->filename, r->count); + vf->line += 1; + } + + r = vf->deferred; + n = vf->info['>']->accum.count; + for (i = 0; i < n; i++, r++) + { fprintf (vf->f, "\n> %lu %s", strlen(r->filename), r->filename); + vf->line += 1; + } + fprintf (vf->f, "\n.") ; + } + + // write the schema into the header - no need for file type, version etc. 
since already given + if (vf->groupType) writeInfoSpec (vf, vf->groupType) ; + if (vf->objectType) writeInfoSpec (vf, vf->objectType) ; + for (i = 'A' ; i <= 'z' ; ++i) + if (isalnum(i) && vf->info[i] && i != vf->objectType && i != vf->groupType) + writeInfoSpec (vf, i) ; + + // any header text on '.' lines + if (vf->headerText) + { OneHeaderText *t = vf->headerText ; + while (t) + { fprintf (vf->f, "\n. %s", t->text) ; + t = t->nxt ; + } + fprintf (vf->f, "\n.") ; + } + + if (vf->isBinary) // defer writing rest of header + { fprintf (vf->f, "\n$ %d", vf->isBig); + vf->line += 1; + } + else // write counts based on those supplied in input header + { fprintf (vf->f, "\n.") ; + bool isCountWritten = false ; + for (i = 'A'; i <= 'Z'+1 ; i++) + { if (i == 'Z'+1) + { if (vf->groupType) // NB group types are all lower case so > 'Z'+1 + i = vf->groupType ; + else + break ; + } + li = vf->info[i]; + if (li != NULL && li->given.count > 0) + { isCountWritten = true ; + fprintf (vf->f, "\n# %c %" PRId64 "", i, li->given.count); + vf->line += 1; + if (li->given.max > 0) + { fprintf (vf->f, "\n@ %c %" PRId64 "", i, li->given.max); + vf->line += 1; + } + if (li->given.total > 0) + { fprintf (vf->f, "\n+ %c %" PRId64 "", i, li->given.total); + vf->line += 1; + } + if (li->given.groupCount > 0) + { fprintf (vf->f, "\n%% %c # %c %" PRId64 "", vf->groupType, i, li->given.groupCount); + vf->line += 1; + } + if (li->given.groupTotal > 0) + { fprintf (vf->f, "\n%% %c + %c %" PRId64 "", vf->groupType, i, li->given.groupTotal); + vf->line += 1; + } + } + } + if (isCountWritten) + fprintf (vf->f, "\n.") ; + } + fflush (vf->f); + + vf->isHeaderOut = true; +} + +/*********************************************************************************** + * + * ONE_WRITE_LINE + * + **********************************************************************************/ + +static int writeStringList (OneFile *vf, char t, int len, char *buf) +{ OneInfo *li; + int j, nByteWritten = 0; + I64 sLen, totLen; + 
+ totLen = 0; + for (j = 0; j < len; j++) + { sLen = strlen (buf); + totLen += sLen; + nByteWritten += fprintf (vf->f, " %" PRId64 " %s", sLen, buf); + buf += sLen + 1; + } + + li = vf->info[(int) t]; + li->accum.total += totLen; + if (li->accum.max < totLen) + li->accum.max = totLen; + + return nByteWritten ; +} + +// process is to fill fields by assigning to macros, then call - list contents are in buf +// NB in ASCII mode adds '\n' before writing line not after, so oneWriteComment() can add to line +// first call will write initial header + +void oneWriteLine (OneFile *vf, char t, I64 listLen, void *listBuf) +{ I64 i, j; + OneInfo *li; + + // fprintf (stderr, "write line %d type %c char %c\n", vf->line, t, oneChar(vf,0)) ; + + assert (vf->isWrite) ; + assert (!vf->isFinal || !isalpha(t)) ; + + li = vf->info[(int) t]; + assert (li) ; + + vf->line += 1; + li->accum.count += 1; + if (t == vf->groupType) updateGroupCount(vf, true); + + if (li->listEltSize > 0) // need to write the list + { assert (listLen >= 0) ; + vf->field[li->listField].len = listLen ; + if (listBuf == NULL) listBuf = li->buffer; + } + + // BINARY - block write and optionally compress + + if (vf->isBinary) + { U8 x; + + if (!vf->isHeaderOut && vf->share >= 0) writeHeader (vf) ; // no header on slaves + + if (!vf->isLastLineBinary) + { fputc ('\n', vf->f) ; + vf->byte = ftello (vf->f) ; + } + + if (t == vf->objectType) // update index and increment object count + { OneInfo *lx = vf->info['&']; + + if (vf->object >= lx->bufSize) // first ensure enough space + { I64 ns = (lx->bufSize << 1) + 0x20000; + I64 *nb = new (ns, I64); + + memcpy(nb, lx->buffer, lx->bufSize*sizeof(I64)); + free (lx->buffer); + lx->buffer = nb; + lx->bufSize = ns; + } + ((I64 *) lx->buffer)[vf->object] = vf->byte; + // assert (ftello (vf->f) == vf->byte) ; // beware - very costly + + ++vf->object ; + } + if (t == vf->groupType) + { OneInfo *lx = vf->info['*']; + + if (vf->group >= lx->bufSize) // still room for final value 
because one ahead here + { I64 ns, *nb; + + ns = (lx->bufSize << 1) + 0x20000; + nb = new (ns, I64); + memcpy(nb, lx->buffer, lx->bufSize*sizeof(I64)); + free (lx->buffer); + lx->buffer = nb; + lx->bufSize = ns; + } + + ((I64 *) lx->buffer)[vf->group-1] = vf->object; // group # already advanced + } + + // write the line character + + x = li->binaryTypePack; // Binary line code + compression flags + if (li->isUseListCodec) + x |= 0x01; + fputc (x, vf->f); + ++vf->byte ; + + // write the fields + + if (li->nField > 0) + vf->byte += writeCompressedFields (vf->f, vf->field, li) ; + + // write the list if there is one + + if (li->listEltSize && listLen > 0) + { I64 nBits, listSize; + int listBytes ; + + li->accum.total += listLen; + if (listLen > li->accum.max) + li->accum.max = listLen; + + if (li->fieldType[li->listField] == oneINT_LIST) + { vf->byte += ltfWrite (*(I64*)listBuf, vf->f) ; + if (listLen == 1) goto doneLine ; // finish writing this line here + listBuf = compactIntList (vf, li, listLen, listBuf, &listBytes) ; + --listLen ; + fputc ((char)listBytes, vf->f) ; + vf->byte++ ; + } + else + listBytes = li->listEltSize ; + listSize = listLen * listBytes; + + if (li->fieldType[li->listField] == oneSTRING_LIST) // handle as ASCII + vf->byte += writeStringList (vf, t, listLen, listBuf); + else if (x & 0x1) + { if (listSize >= vf->codecBufSize) + { free (vf->codecBuf); + vf->codecBufSize = listSize+1; + vf->codecBuf = new (vf->codecBufSize, void); + } + nBits = vcEncode (li->listCodec, listSize, listBuf, vf->codecBuf); + vf->byte += ltfWrite (nBits, vf->f) ; + if (fwrite (vf->codecBuf, ((nBits+7) >> 3), 1, vf->f) != 1) + die ("ONE write error: failed to write compressed list"); + vf->byte += ((nBits+7) >> 3) ; + } + else + { if (fwrite (listBuf, listSize, 1, vf->f) != 1) + die ("ONE write error line %" PRId64 ": failed to write list field %d listLen %" PRId64 " listSize %" PRId64 " listBuf %lx", + vf->line, li->listField, listLen, listSize, listBuf); + vf->byte += 
listSize; + if (li->listCodec != NULL) + { vcAddToTable (li->listCodec, listSize, listBuf); + li->listTack += listSize; + + if (li->listTack > vf->codecTrainingSize) + { if (vf->share == 0) + { vcCreateCodec (li->listCodec, 1); + li->isUseListCodec = true; + } + else + { OneFile *ms; + OneInfo *lx; + + if (vf->share < 0) + { ms = vf + vf->share; + lx = ms->info[(int) t]; + } + else + { ms = vf; + lx = li; + } + + pthread_mutex_lock(&ms->listLock); + + if ( ! li->isUseListCodec) + + { if (vf->share < 0) + { lx->listTack += li->listTack; + li->listTack = 0; + } + if (lx->listTack > ms->codecTrainingSize) + { for (i = 1; i < ms->share; i++) + vcAddHistogram (lx->listCodec, + ms[i].info[(int) t]->listCodec); + vcCreateCodec (lx->listCodec, 1); + for (i = 1; i < ms->share; i++) + { OneCodec *m = ms[i].info[(int) t]->listCodec; + ms[i].info[(int) t]->listCodec = lx->listCodec; + vcDestroy (m); + } + lx->isUseListCodec = true; + for (i = 1; i < ms->share; i++) + ms[i].info[(int) t]->isUseListCodec = true; + } + } + + pthread_mutex_unlock(&ms->listLock); + } + } + } + } + } + + doneLine: + + vf->isLastLineBinary = true; + } + + // ASCII - write field by field + + else + { if (!vf->isHeaderOut && !vf->isNoAsciiHeader) writeHeader (vf) ; + + if (!vf->isLastLineBinary) // terminate previous ascii line + fputc ('\n', vf->f); + + fputc (t, vf->f); + + for (i = 0; i < li->nField; i++) + switch (li->fieldType[i]) + { + case oneINT: + fprintf (vf->f, " %" PRId64 "", vf->field[i].i); + break; + case oneREAL: + fprintf (vf->f, " %f", vf->field[i].r); + break; + case oneCHAR: + fprintf (vf->f, " %c", vf->field[i].c); + break; + case oneSTRING: + case oneDNA: + case oneINT_LIST: + case oneREAL_LIST: + case oneSTRING_LIST: + li->accum.total += listLen; + if (listLen > li->accum.max) + li->accum.max = listLen; + + fprintf (vf->f, " %" PRId64 "", listLen); + if (li->fieldType[i] == oneSTRING || li->fieldType[i] == oneDNA) + { if (listLen > INT_MAX) + die ("ONE write error: string length 
%" PRId64 " > current max %d", listLen, INT_MAX); + fprintf (vf->f, " %.*s", (int) listLen, (char *) listBuf); + } + else if (li->fieldType[i] == oneINT_LIST) + { I64 *b = (I64 *) listBuf; + for (j = 0; j < listLen ; ++j) + fprintf (vf->f, " %" PRId64 "", b[j]); + } + else if (li->fieldType[i] == oneREAL_LIST) + { double *b = (double *) listBuf; + for (j = 0; j < listLen ; ++j) + fprintf (vf->f, " %f", b[j]); + } + else // vSTRING_LIST + writeStringList (vf, t, listLen, listBuf); + break; + } + vf->isLastLineBinary = false; + } +} + +void oneWriteLineDNA2bit (OneFile *vf, char lineType, I64 listLen, U8 *dnaBuf) +{ die ("not written yet") ; + oneWriteLine (vf, lineType, listLen, dnaBuf) ; +} + +void oneWriteComment (OneFile *vf, char *format, ...) +{ + va_list args ; + + if (vf->isCheckString) // then check no newlines in format + { char *s = format ; + while (*s) if (*s++ == '\n') die ("newline in comment format string: %s", format) ; + } + + va_start (args, format) ; + if (vf->isLastLineBinary) // write a comment line + { char *comment ; + vasprintf (&comment, format, args) ; + oneWriteLine (vf, '/', strlen(comment), comment) ; + free (comment) ; + } + else // write on same line after space + { fputc (' ', vf->f) ; + vfprintf (vf->f, format, args) ; + } + va_end (args) ; +} + +/*********************************************************************************** + * + * MERGING, FOOTER HANDLING, AND CLOSE + * + **********************************************************************************/ + +static void oneWriteFooter (OneFile *vf) +{ int i,n; + off_t footOff; + OneInfo *li; + char *codecBuf ; + + footOff = ftello (vf->f); + if (footOff < 0) + die ("ONE write error: failed footer ftell"); + + // first the per-linetype information + codecBuf = new (vcMaxSerialSize()+1, char) ; // +1 for added up unused 0-terminator + for (i = 'A'; i <= 'Z'+1 ; i++) + { if (i == 'Z'+1) + { if (vf->groupType) // NB group types are all lower case so > 'Z'+1 + i = vf->groupType ; + 
else + break ; + } + li = vf->info[i]; + if (li != NULL && li->accum.count > 0) + { fprintf (vf->f, "# %c %" PRId64 "\n", i, li->accum.count); + if (li->listEltSize) + { fprintf (vf->f, "@ %c %" PRId64 "\n", i, li->accum.max); + fprintf (vf->f, "+ %c %" PRId64 "\n", i, li->accum.total); + } + if (vf->groupType && i != vf->groupType && vf->group > 0) + { fprintf (vf->f, "%% %c # %c %" PRId64 "\n", vf->groupType, i, li->accum.groupCount); + if (li->listEltSize) + fprintf (vf->f, "%% %c + %c %" PRId64 "\n", vf->groupType, i, li->accum.groupTotal); + } + if (li->isUseListCodec && li->listCodec != DNAcodec) + { oneChar(vf,0) = i; + n = vcSerialize (li->listCodec, codecBuf); + oneWriteLine (vf, ';', n, codecBuf); + } + } + } + + li = vf->info['/'] ; // may need to write list codec for comments + if (li->isUseListCodec) + { oneChar(vf,0) = '/' ; + n = vcSerialize (li->listCodec, codecBuf); + oneWriteLine (vf, ';', n, codecBuf); + } + + free (codecBuf) ; + + oneWriteLine (vf, '&', vf->object, NULL); // number of objects in file = length of index + // NB NULL here and below for '*' defaults writing info->buffer, which contains the index + + if (vf->groupType > 0 && vf->group > 0) + { ((I64 *) vf->info['*']->buffer)[vf->group] = vf->object; + oneWriteLine (vf, '*', vf->group+1, NULL); // number of groups in file + 1 = length of index + } + + fprintf (vf->f, "^\n"); // end of footer marker + + if (fwrite (&footOff, sizeof(off_t), 1, vf->f) != 1) + die ("ONE write error: failed writing footer offset"); +} + + // After all input has been read, or all data has been written, this routine will finish + // accumulating counts/statistics for the file and merge thread stats into those for + // the master file (if a parallel OneFile). 
+ +void oneFinalizeCounts(OneFile *vf) +{ int i, j, n, k, len; + OneInfo *li, *ln; + + if (vf->share < 0) + die ("ONE write error: cannot call oneFileClose on a slave OneFile"); + + vf->isFinal = true; + + if (vf->share == 0) + { updateGroupCount(vf,false); + return; + } + + len = vf->share; + + // Close current groups at the end of each part (if any) + + if (vf->groupType > 0) + for (i = 'A'; i <= 'Z'; i++) + if (vf->info[i] != NULL) + for (j = 0; j < len; j++) + if (vf[j].inGroup) + { I64 oc, ot; + + ot = oc = 0; + for (k = j+1; k < len; k++) + if (vf[k].inGroup) + { oc += vf[k].info[i]->oCount; + ot += vf[k].info[i]->oTotal; + break; + } + else + { oc += vf[k].info[i]->accum.count; + ot += vf[k].info[i]->accum.total; + } + + li = vf[j].info[i]; + if ((li->accum.count - li->gCount) + oc > li->accum.groupCount) + li->accum.groupCount = (li->accum.count - li->gCount) + oc; + if ((li->accum.total - li->gTotal) + ot > li->accum.groupTotal) + li->accum.groupTotal = (li->accum.total - li->gTotal) + ot; + } + + // first the per-linetype information + + n = vf->groupType; + if (n == 0) + n = 'Z'; + + for (i = 'A'; i <= n; i++) + { ln = vf->info[i]; + for (j = 1; j < len; j++) + { li = (vf+j)->info[i]; + if (li != NULL && li->accum.count > 0) + { ln->accum.count += li->accum.count; + if (li->accum.max > ln->accum.max) + ln->accum.max = li->accum.max; + ln->accum.total += li->accum.total; + if (li->accum.groupCount > ln->accum.groupCount) + ln->accum.groupCount = li->accum.groupCount; + if (li->accum.groupTotal > ln->accum.groupTotal) + ln->accum.groupTotal = li->accum.groupTotal; + } + } + } + + if ( ! 
vf->isBinary) + return; + + // Stitch the group index together + + if (vf->groupType > 0) + { I64 *gb, *gi, off; + int ns; + + ns = 0; + for (j = 0; j < len; j++) + ns += vf[j].group; + gb = new (ns+1, I64); + + ns = 0; + off = 0; + for (j = 0; j < len; j++) + { li = vf[j].info['*']; + gi = (I64 *) (li->buffer); + for (i = 0; i < vf[j].group; i++) + gb[ns++] = gi[i] + off; + off += vf[j].object; + } + gb[ns] = off; + li = vf->info['*']; + free(li->buffer); + li->buffer = gb; + li->bufSize = ns+1; + vf->group = ns; + } + + // Stitch the object index together + + { int ns; + I64 *gb, *gi, off; + + ns = 0; + for (j = 0; j < len; j++) + ns += vf[j].object; + gb = new (ns, I64); + + ns = 0; + off = 0; + for (j = 0; j < len; j++) + { li = vf[j].info['&']; + gi = (I64 *) (li->buffer); + for (i = 0; i < vf[j].object; i++) + gb[ns++] = gi[i] + off; + off += ftello(vf[j].f); + } + + li = vf->info['&']; + free(li->buffer); + li->buffer = gb; + li->bufSize = ns; + vf->object = ns; + } +} + +// automatically rewrites header if allowed when writing + +void oneFileClose (OneFile *vf) +{ + assert (vf->share >= 0) ; + + if (vf->isWrite) + { + if (!vf->isFinal) // RD moved this here from above - surely only needed if isWrite + oneFinalizeCounts (vf); + + if (!vf->isHeaderOut && (vf->isBinary || !vf->isNoAsciiHeader)) writeHeader (vf) ; + + if (vf->share > 0) + { int i, pid, fid, nread; + char name[100], *buf; + + buf = new (10000000, char); + pid = getpid(); + for (i = 1; i < vf->share; i++) + { fclose (vf[i].f); + vf[i].f = NULL; + sprintf(name,".part.%d.%d",pid,i); + fid = open(name,O_RDONLY); + while ((nread = read(fid,buf,10000000)) > 0) + if ((int) fwrite(buf,1,nread,vf->f) != nread) + die ("ONE write error: while cat'ing thread bits (oneFileClose)"); + if (unlink(name) < 0) + die ("ONE write error: could not delete thread file %s", name); + } + free(buf); + } + + fputc ('\n', vf->f); // end of file if ascii, end of data marker if binary + if (vf->isBinary) // write the footer 
+ oneWriteFooter (vf); + } + + oneFileDestroy (vf); +} + +/*********************************************************************************** + * + * Length limited Huffman Compressor/decompressor with special 2-bit compressor for DNA + * Author: Gene Myers + * Creation date: June 27, 2019 + * + * inline both compression.h and compression.c here + * + **********************************************************************************/ + +#undef DEBUG +#undef TEST + + // To create a compressor, get an initially empty object with vcCreate, then + // add a significant corpus of the byte data to be compressed with vcAddToTable, + // and finally create a Huffman codec based on this corpus by calling + // vcCreateCodec. The parameter "partial" should be set if not all the data + // to be compressed has been scanned. At this point you have a compressor ready + // to operate. You can destroy/free it with vcDestroy. + +OneCodec *vcCreate(); +void vcAddToTable(OneCodec *vc, int len, char *bytes); +void vcCreateCodec(OneCodec *vc, int partial); +void vcDestroy(OneCodec *vc); + + // In the instance of accumulating data over multiple threads, vcAddHistogram, will + // add the counts in the table for vh, to the table for vc. + +void vcAddHistogram(OneCodec *vc, OneCodec *vh); + + // A diagnostic routine: shows you the compression scheme and if the distribution + // of the scanned corpus is available, it shows you that too. Output to file 'to'. + +void vcPrint(OneCodec *vc, FILE *to); + + // You can encode and decode where ibytes/ilen are the input and the output + // is placed at obytes and the length of the compressed/decompressed result + // is returned as the value of the function. For vcEncode, ilen is the size + // of the uncompressed input in bytes, and the return value is the size of + // the compressed output in **bits**. 
// The converse is true for vcDecode, i.e.
  //   ilen is the number of bits in the compressed input, and the return value
  //   is the number of bytes in the uncompressed output.  The routines are endian safe.

int vcEncode(OneCodec *vc, int ilen, char *ibytes, char *obytes);
int vcDecode(OneCodec *vc, int ilen, char *ibytes, char *obytes);

  // Rather than directly reading or writing an encoding of a compressor, the routines
  //   below serialize or deserialize the compressor into/out of a user-supplied buffer.
  //   vcMaxSerialSize gives the maximum size of a serialized compressor so the user
  //   can arrange a buffer of the appropriate size.  vcSerialize serializes vc into
  //   buffer 'out' and returns the # of bytes in the encoding.  vcDeserialize will reverse
  //   the process given a serialization.  The routines are endian-safe.

int       vcMaxSerialSize();
int       vcSerialize(OneCodec *vc, void *out);
OneCodec *vcDeserialize(void *in);

  // Short local names for the fixed-width unsigned types used by the codec

typedef uint64_t uint64;
typedef uint32_t uint32;
typedef uint16_t uint16;
typedef uint8_t uint8;

#define HUFF_CUTOFF 12 // This cannot be larger than 16 !
+ + // Endian flipping macros + +#define FLIP64(p) \ +{ uint8 x = p[0]; \ + p[0] = p[7]; \ + p[7] = x; \ + x = p[1]; \ + p[1] = p[6]; \ + p[6] = x; \ + x = p[2]; \ + p[2] = p[5]; \ + p[5] = x; \ + x = p[3]; \ + p[3] = p[4]; \ + p[4] = x; \ +} + +#define FLIP32(p) \ +{ uint8 x = p[0]; \ + p[0] = p[3]; \ + p[3] = x; \ + x = p[1]; \ + p[1] = p[2]; \ + p[2] = x; \ +} + +#define FLIP16(p) \ +{ uint8 x = p[0]; \ + p[0] = p[1]; \ + p[1] = x; \ +} + +/******************************************************************************************* + * + * Routines for computing a length-limited Huffman Encoding Scheme + * + ********************************************************************************************/ + +#define EMPTY 0 // Compressor just created, histogram zero'd +#define FILLED 1 // Compressor histogram being filled, no codec +#define CODED_WITH 2 // Compressor has a codec (can no longer accumulate histogram) +#define CODED_READ 3 // Compressor has codec but no histogram as was created by read + +typedef struct + { int state; // 1 of the 4 states immediately above + int isbig; // endian of the current machine + uint16 codebits[256]; // Code esc_code is the special code for + uint8 codelens[256]; // non-Huffman exceptions + char lookup[0x10000]; // Lookup table (just for decoding) + int esc_code; // The special escape code (-1 if not partial) + int esc_len; // The length in bits of the special code (if present) + uint64 hist[256]; // Byte distribution for codec + } _OneCodec; + + // The special "predefined" DNA compressor + +static _OneCodec _DNAcodec = { .state = CODED_READ }; +OneCodec *DNAcodec = (OneCodec *) &_DNAcodec; + + // Create an EMPTY compressor object with zero'd histogram and determine machine endian + +OneCodec *vcCreate() +{ _OneCodec *v; + int i; + + v = (_OneCodec *) malloc(sizeof(_OneCodec)); + if (v == NULL) + { fprintf(stderr,"vcCreate: Could not allocate compressor\n"); + exit (1); + } + + v->state = EMPTY; + for (i = 0; i < 256; i++) + 
v->hist[i] = 0; + + { uint32 t; + uint8 *b; + + t = 1; + b = (uint8 *) (&t); + v->isbig = (b[0] == 0); + } + + return ((OneCodec *) v); +} + + // Free a compressor object + +void vcDestroy(OneCodec *vc) +{ _OneCodec *v = (_OneCodec *) vc; + if (vc != DNAcodec) + free(v); +} + + // Add the frequencies of bytes in bytes[0..len) to vc's histogram + // State becomes FILLED + +void vcAddToTable(OneCodec *vc, int len, char *bytes) +{ _OneCodec *v = (_OneCodec *) vc; + uint8 *data = (uint8 *) bytes; + int i; + + for (i = 0; i < len; i++) + v->hist[(int) data[i]] += 1; + if (v->state < FILLED) + v->state = FILLED; +} + + // Add the frequencies of bytes in bytes[0..len) to vc's histogram + // State becomes FILLED + +void vcAddHistogram(OneCodec *vc, OneCodec *vh) +{ _OneCodec *v = (_OneCodec *) vc; + _OneCodec *h = (_OneCodec *) vh; + int i; + + if (v->state >= CODED_WITH) + { fprintf(stderr,"vcAddHistogram: Compressor already has a codec\n"); + exit (1); + } + if (h->state == CODED_READ) + { fprintf(stderr,"vcAddHistogram: Source compressor doesn't have a histogram\n"); + exit (1); + } + + for (i = 0; i < 256; i++) + v->hist[i] += h->hist[i]; + v->state = FILLED; +} + + // Check vc has a non-empty distribution histogram and if so then build + // length-limited Huffman tables for the bytes that occur in the histogram, + // plus a special escape code if partial is set and there is at least one byte + // with a zero count in the histogram. The algorithm is by Larmore & Hirschberg, + // JACM 73, 3 (1990). 
+ +uint64 *HIST; + +int HSORT(const void *l, const void *r) +{ int x = *((int *) l); + int y = *((int *) r); + return (HIST[x] - HIST[y]); +} + +void vcCreateCodec(OneCodec *vc, int partial) +{ _OneCodec *v = (_OneCodec *) vc; + + uint64 *hist; + char *look; + uint8 *lens; + uint16 *bitv; + + int code[256]; + int leng[256]; + uint16 bits[256]; + int ncode, dcode, ecode; + + int i; + + if (v->state >= CODED_WITH) + { fprintf(stderr,"vcCreateCoder: Compressor already has a codec\n"); + exit (1); + } + if (v->state == EMPTY) + { fprintf(stderr,"vcCreateCoder: Compressor has no byte distribution data\n"); + exit (1); + } + + hist = v->hist; + look = v->lookup; + lens = v->codelens; + bitv = v->codebits; + + ecode = -partial; + ncode = 0; + for (i = 0; i < 256; i++) + if (hist[i] > 0) + code[ncode++] = i; + else if (ecode < 0) + { ecode = i; + code[ncode++] = i; + } + dcode = 2*ncode; + + if (ecode < 0) + partial = 0; + + HIST = hist; + qsort(code,ncode,sizeof(int),HSORT); + +#ifdef DEBUG + fprintf(stderr,"\nSorted Codes %d:\n",ncode); + for (i = 0; i < ncode; i++) + fprintf(stderr," %3d: %3d %10llu\n",i,code[i],hist[code[i]]); +#endif + + { uint8 matrix[HUFF_CUTOFF][dcode]; + uint64 count1[dcode], count2[dcode], countb[ncode]; + uint64 *lcnt, *ccnt, *swp; + int llen, span; + int j, k, n, L; + + for (n = 0; n < ncode; n++) + { count1[n] = countb[n] = hist[code[n]]; + leng[n] = 0; + } + +#ifdef DEBUG + fprintf(stderr,"\nCoin Filter:\n"); + fprintf(stderr," Row %2d:",HUFF_CUTOFF); + for (n = 0; n < ncode; n++) + fprintf(stderr," %" PRId64 "*",countb[n]); + fprintf(stderr,"\n"); +#endif + + lcnt = count1; + ccnt = count2; + llen = ncode-1; + for (L = HUFF_CUTOFF-1; L > 0; L--) + { j = 0; + k = 0; + for (n = 0; j < ncode || k < llen; n++) + { if (k >= llen || (j < ncode && countb[j] <= lcnt[k] + lcnt[k+1])) + { ccnt[n] = countb[j]; + matrix[L][n] = 1; + j += 1; + } + else + { ccnt[n] = lcnt[k] + lcnt[k+1]; + matrix[L][n] = 0; + k += 2; + } + } + llen = n-1; + swp = lcnt; + 
lcnt = ccnt; + ccnt = swp; + +#ifdef DEBUG + fprintf(stderr," Row %2d:",L); + for (n = 0; n <= llen; n++) + fprintf(stderr," %" PRId64 "%c",lcnt[n],matrix[L][n]?'*':'+'); + fprintf(stderr,"\n"); +#endif + } + + span = 2*(ncode-1); + for (L = 1; L < HUFF_CUTOFF; L++) + { j = 0; + for (n = 0; n < span; n++) + { if (matrix[L][n]) + leng[j++] += 1; + } + span = 2*(span-j); + } + for (n = 0; n < span; n++) + leng[n] += 1; + +#ifdef DEBUG + fprintf(stderr,"\nBack Trace:\n"); + span = 2*(ncode-1); + for (L = 1; L < HUFF_CUTOFF; L++) + { j = 0; + fprintf(stderr," Row %2d:",L); + for (n = 0; n < span; n++) + { if (matrix[L][n]) + j += 1; + fprintf(stderr," %c",matrix[L][n]?'*':'+'); + } + fprintf(stderr,"\n"); + span = 2*(span-j); + } + fprintf(stderr," Length:"); + for (n = 0; n < ncode; n++) + fprintf(stderr," %d",leng[n]); + fprintf(stderr,"\n"); +#endif + } + + { int n, llen; + uint16 lbits; + + llen = leng[0]; + lbits = bits[0] = (1 << llen) - 1; + for (n = 1; n < ncode; n++) + { while ((lbits & 0x1) == 0) + { lbits >>= 1; + llen -= 1; + } + lbits -= 1; + while (llen < leng[n]) + { lbits = (lbits << 1) | 0x1; + llen += 1; + } + bits[n] = lbits; + } + +#ifdef DEBUG + { int j; + + fprintf(stderr,"\nCodes:\n"); + for (n = 0; n < ncode; n++) + { fprintf(stderr," %3d: %2d ",code[n],leng[n]); + for (j = leng[n]-1; j >= 0; j--) + fprintf(stderr,"%x",(bits[n]>>j)&0x1); + fprintf(stderr,"\n"); + } + } +#endif + } + + for (i = 0; i < 256; i++) + { lens[i] = 0; + bitv[i] = 0; + } + + for (i = 0; i < ncode; i++) + { lens[code[i]] = leng[i]; + bitv[code[i]] = bits[i]; + } + + { int j, powr; // Fill in a decoder table giving the next Huffman code + uint16 base; // that is a prefix of the next 16 bits + + for (i = 0; i < 256; i++) + { if (lens[i] > 0) + { base = (bitv[i] << (16-lens[i])); + powr = (1 << (16-lens[i])); + for (j = 0; j < powr; j++) + look[base+j] = i; + } + } + } + + if (partial) + { v->esc_code = ecode; + v->esc_len = lens[ecode]; + lens[ecode] = 0; + } + else + 
v->esc_code = -1; + v->state = CODED_WITH; +} + + // For debug, give a nice print out of the distribution histogram (if present) + // and the Huffman codec + +void vcPrint(OneCodec *vc, FILE *to) +{ _OneCodec *v = (_OneCodec *) vc; + + uint64 total_bits, ucomp_bits, count; + uint16 mask, code, *bits; + uint64 *hist; + uint8 *lens; + int clen; + int hashist; + int i, k; + + if (vc == DNAcodec) + { fprintf(to," DNAcompressor\n"); + return; + } + + if (v->state < CODED_WITH) + { fprintf(stderr,"vcPrint: Compressor has no codec\n"); + exit (1); + } + hashist = (v->state == CODED_WITH); + + bits = v->codebits; + lens = v->codelens; + hist = v->hist; // only needed if hashist, but compiler warning if assignment is conditional + + if (hashist) + { total_bits = 0; + ucomp_bits = 0; + + count = 0; + for (i = 0; i < 256; i++) + count += hist[i]; + + fprintf(to,"\nHistogram:\n"); + for (i = 0; i < 256; i++) + if (hist[i] > 0) + { if (isprint(i)) + fprintf(to," %c: %12" PRIu64 " %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); + else + fprintf(to," %3d: %12" PRIu64 " %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); + } + } + + fprintf(to,"\nCode Table:\n"); + for (i = 0; i < 256; i++) + { clen = lens[i]; + if (i == v->esc_code) + clen = v->esc_len; + if (clen > 0) + { mask = (1 << clen); + code = bits[i]; + if (isprint(i)) + fprintf(to," %c: %2d ",i,clen); + else + fprintf(to," %3d: %2d ",i,clen); + for (k = 0; k < clen; k++) + { mask >>= 1; + if (code & mask) + fprintf(to,"1"); + else + fprintf(to,"0"); + } + if (i == v->esc_code) + fprintf(to," ***\n"); + else + { fprintf(to,"\n"); + if (hashist) + { total_bits += clen*hist[i]; + ucomp_bits += (hist[i]<<3); + } + } + } + } + if (hashist) + fprintf(to,"\nTotal Bytes = %" PRIu64 " (%.2f%%)\n",(total_bits-1)/8+1,(100.*total_bits)/ucomp_bits); +} + + +/******************************************************************************************* + * + * Read and Write Huffman Schemes (actually just (de)serialize) + * + 
********************************************************************************************/ + + // Maximum # of bytes in a serialized compressor code + +int vcMaxSerialSize() +{ return (257 + 2*sizeof(int) + 256*sizeof(uint16)); } + + // Code the compressor into blob 'out' and return number of bytes in the code + +int vcSerialize(OneCodec *vc, void *out) +{ _OneCodec *v = (_OneCodec *) vc; + + int i; + uint16 *bits; + uint8 *lens, *o; + + if (vc == DNAcodec) + return (0); + + if (v->state < CODED_WITH) + { fprintf(stderr,"vcWrite: Compressor does not have a codec\n"); + exit (1); + } + + lens = v->codelens; + bits = v->codebits; + o = (uint8 *) out; + + // Only need to record endian, escape code, code lengths, and codes for those + // with non-zero length + + *o++ = v->isbig; + memcpy(o,&(v->esc_code),sizeof(int)); + o += sizeof(int); + memcpy(o,&(v->esc_len),sizeof(int)); + o += sizeof(int); + for (i = 0; i < 256; i++) + { *o++ = lens[i]; + if (lens[i] > 0 || i == v->esc_code) + { memcpy(o,bits+i,sizeof(uint16)); + o += sizeof(uint16); + } + } + return (o - (uint8 *) out); +} + + // Create a compressor object from the serialized code in blob 'in'. + // The compressor does not have the original histogram from which + // its codec was created. If the endian of the current machine and + // the one that serialized the compressor don't match, then all relevant + // items are byte-flipped. 
+ +OneCodec *vcDeserialize(void *in) +{ _OneCodec *v; + + char *look; + uint8 *lens, *ip; + uint16 *bits, base; + int i, j, powr; + + v = (_OneCodec *) malloc(sizeof(_OneCodec)); + if (v == NULL) + { fprintf(stderr,"vcRead: Could not allocate compressor\n"); + exit (1); + } + + v->state = CODED_READ; + lens = v->codelens; + bits = v->codebits; + look = v->lookup; + ip = (uint8 *) in; + + { uint32 t; + uint8 *b; + + t = 1; + b = (uint8 *) (&t); + v->isbig = (b[0] == 0); + } + + if (v->isbig != *ip++) // If endians out and in don't match then flip item bytes as needed + { FLIP32(ip) + memcpy(&(v->esc_code),ip,sizeof(int)); + ip += sizeof(int); + FLIP32(ip) + memcpy(&(v->esc_len),ip,sizeof(int)); + ip += sizeof(int); + for (i = 0; i < 256; i++) + { lens[i] = *ip++; + if (lens[i] > 0 || i == v->esc_code) + { FLIP16(ip) + memcpy(bits+i,ip,sizeof(uint16)); + ip += sizeof(uint16); + } + else + bits[i] = 0; + } + } + else + { memcpy(&(v->esc_code),ip,sizeof(int)); + ip += sizeof(int); + memcpy(&(v->esc_len),ip,sizeof(int)); + ip += sizeof(int); + for (i = 0; i < 256; i++) + { lens[i] = *ip++; + if (lens[i] > 0 || i == v->esc_code) + { memcpy(bits+i,ip,sizeof(uint16)); + ip += sizeof(uint16); + } + else + bits[i] = 0; + } + } + + if (v->esc_code >= 0) + lens[v->esc_code] = v->esc_len; + for (i = 0; i < 256; i++) + { if (lens[i] > 0) + { base = (bits[i] << (16-lens[i])); + powr = (1 << (16-lens[i])); + for (j = 0; j < powr; j++) + look[base+j] = i; + } + } + if (v->esc_code >= 0) + lens[v->esc_code] = 0; + + return ((OneCodec *) v); +} + + +/******************************************************************************************* + * + * Encoders and Decoders + * + ********************************************************************************************/ + +static uint8 Number[128] = + { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 
0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + + // Compress DNA into 2-bits per base + +int Compress_DNA(int len, char *s, char *t) +{ int i, j; + uint8 *s0, *s1, *s2, *s3; + + s0 = (uint8 *) s; + s1 = s0+1; + s2 = s1+1; + s3 = s2+1; + + len -= 3; + for (i = j = 0; i < len; i += 4) + t[j++] = (Number[s0[i]] << 6) | (Number[s1[i]] << 4) | (Number[s2[i]] << 2) | Number[s3[i]]; + switch (i-len) + { case 0: + t[j++] = (Number[s0[i]] << 6) | (Number[s1[i]] << 4) | (Number[s2[i]] << 2); + break; + case 1: + t[j++] = (Number[s0[i]] << 6) | (Number[s1[i]] << 4); + break; + case 2: + t[j++] = (Number[s0[i]] << 6); + break; + default: + break; + } + + return ((len+3)<<1); +} + + // Encode ibytes[0..ilen) according to compressor vc and place in obytes + // Return the # of bits used. + +int vcEncode(OneCodec *vc, int ilen, char *ibytes, char *obytes) +{ _OneCodec *v = (_OneCodec *) vc; + + uint64 c, ocode, *ob; + int n, k, rem, tbits, ibits, esc, elen; + uint8 *clens, x, *bcode, *bb; + uint16 *cbits; + + if (vc == DNAcodec) + return (Compress_DNA(ilen,ibytes,obytes)); + + if (v->state < CODED_WITH) + { fprintf(stderr,"vcEncode: Compressor does not have a codec\n"); + exit (1); + } + + esc = v->esc_code; + elen = v->esc_len; + clens = v->codelens; + cbits = v->codebits; + ibits = (ilen << 3); + bcode = (uint8 *) &ocode; + +#define OCODE(L,C) \ +{ rem -= L; \ + if (rem <= 0) \ + { ocode |= (C >> (-rem)); \ + *ob++ = ocode; \ + if (rem < 0) \ + { rem += 64; \ + ocode = (C << rem); \ + } \ + else \ + { rem = 64; \ + ocode = 0; \ + } \ + } \ + else \ + ocode |= (C << rem); \ +} + + ob = (uint64 *) obytes; + tbits = 2; + rem = 62; + if (v->isbig) + ocode = 0x4000000000000000llu; + else + ocode = 0; + for (k = 0; k < ilen; k++) + { x = ibytes[k]; + n = clens[x]; + if (n == 0) + { if (esc < 0) + { 
fprintf(stderr,"Compression lib: No code for %c(%x) and no escape code\n",x,x); + exit (1); + } + c = cbits[esc]; + tbits += 8+elen; + if (tbits > ibits) + break; + OCODE(elen,c); + c = x; + OCODE(8,c); + } + else + { tbits += n; + if (tbits > ibits) + break; + c = cbits[x]; + OCODE(n,c); + } + } + + if (k < ilen) + { *obytes = 0xff; + memcpy(obytes+1,ibytes,ilen); + return (ibits+8); + } + + bb = (uint8 *) ob; + if (v->isbig) + { rem = ((71-rem)>>3); + for (k = 0; k < rem; k++) + *bb++ = bcode[k]; + } + else + { rem = 7 - ((63-rem)>>3); + for (k = 7; k >= rem; k--) + *bb++ = bcode[k]; + } + + if (tbits >= 64 && !v->isbig) + { x = obytes[7]; + obytes[7] = obytes[0]; + obytes[0] = x; + } + + return (tbits); +} + + // Uncompress read from 2-bits per base into [0-3] per byte representation + +static char Base[4] = { 'a', 'c', 'g', 't' }; + +int Uncompress_DNA(char *s, int len, char *t) +{ int i, tlen, byte; + char *t0, *t1, *t2, *t3; + + t0 = t; + t1 = t0+1; + t2 = t1+1; + t3 = t2+1; + + tlen = len-3; + for (i = 0; i < tlen; i += 4) + { byte = *s++; + t0[i] = Base[(byte >> 6) & 0x3]; + t1[i] = Base[(byte >> 4) & 0x3]; + t2[i] = Base[(byte >> 2) & 0x3]; + t3[i] = Base[byte & 0x3]; + } + + switch (i-tlen) + { case 0: + byte = *s++; + t0[i] = Base[(byte >> 6) & 0x3]; + t1[i] = Base[(byte >> 4) & 0x3]; + t2[i] = Base[(byte >> 2) & 0x3]; + break; + case 1: + byte = *s++; + t0[i] = Base[(byte >> 6) & 0x3]; + t1[i] = Base[(byte >> 4) & 0x3]; + break; + case 2: + byte = *s++; + t0[i] = Base[(byte >> 6) & 0x3]; + break; + default: + break; + } + + return (len); +} + + // Decode ilen bits in ibytes, into obytes according to vc's codec + // Return the number of bytes decoded. 
+ +int vcDecode(OneCodec *vc, int ilen, char *ibytes, char *obytes) +{ _OneCodec *v = (_OneCodec *) vc; + + char *look; + uint8 *lens, *q; + uint64 icode, ncode, *p; + int rem, nem; + uint8 c, *o; + int n, k, elen, inbig, esc; + + if (vc == DNAcodec) + return (Uncompress_DNA(ibytes,ilen>>1,obytes)); + + if (v->state < CODED_WITH) + { fprintf(stderr,"vcDecode: Compressor does not have a codec\n"); + exit (1); + } + + if (*((uint8 *) ibytes) == 0xff) + { int olen = (ilen>>3)-1; + memcpy(obytes,ibytes+1,olen); + return (olen); + } + + p = (uint64 *) ibytes; + + inbig = (*ibytes & 0x40); + if (!inbig && ilen >= 64) + { uint8 x = ibytes[7]; + ibytes[7] = ibytes[0]; + ibytes[0] = x; + } + + if (inbig != v->isbig) + { q = (uint8 *) ibytes; + for (k = 64; k <= ilen; k += 64) + { FLIP64(q) + q += 8; + } + } + + lens = v->codelens; + look = v->lookup; + esc = v->esc_code; + elen = v->esc_len; + +#define GET(n) \ + ilen -= n; \ + icode <<= n; \ + rem -= n; \ + while (rem < 16) \ + { int z = 64-rem; \ + icode |= (ncode >> rem); \ + if (nem > z) \ + { nem -= z; \ + ncode <<= z; \ + rem = 64; \ + break; \ + } \ + else \ + { rem += nem; \ + if (rem >= ilen) \ + break; \ + else if (ilen-rem < 64) \ + { nem = ilen-rem; \ + q = (uint8 *) p; \ + ncode = 0; \ + for (k = 0; k < nem; k += 8) \ + ncode |= (((uint64) (*q++)) << (56-k)); \ + } \ + else \ + { ncode = *p++; \ + nem = 64; \ + } \ + } \ + } + + if (ilen < 64) + { q = (uint8 *) ibytes; + icode = 0; + for (k = 0; k < ilen; k += 8) + icode |= (((uint64) (*q++)) << (56-k)); + } + else + icode = *p++; + o = (uint8 *) obytes; + icode <<= 2; + ilen -= 2; + rem = 62; + if (rem > ilen) + rem = ilen; + ncode = 0; + nem = 0; + while (ilen > 0) + { c = look[icode >> 48]; + if (c == esc) + { GET(elen) + c = (icode >> 56); + GET(8); + } + else + { n = lens[(int) c]; + GET(n) + } + *o++ = c; + } + + return (o - (uint8 *) obytes); +} + +////////////////////////////////////////////////////////////////////////////////////// +// +// integer 
compression for write/read of fields +// +// top bit of first byte: number is negative +// second bit: one-byte: next six bits give number (make negative if top bit set) +// third bit: two-byte: next 13 bits give number (make negative if top bit set) +// if second and third bits are not set, remaining 5 bits give number of bytes to read + +static inline int intGet (unsigned char *u, I64 *pval) +{ + switch (u[0] >> 5) + { + case 2: case 3: // single byte positive + *pval = (I64) (u[0] & 0x3f) ; return 1 ; + case 6: case 7: // single byte negative + *pval = (I64) u[0] | 0xffffffffffffff00 ; return 1 ; + case 1: // two bytes positive + *pval = (I64) (u[0] & 0x1f) << 8 | (I64)u[1] ; return 2 ; + *pval = - ((I64) (u[0] & 0x1f) << 8 | (I64)u[1]) ; return 2 ; + case 0: + switch (u[0] & 0x07) + { + case 0: die ("int packing error") ; + case 1: *pval = *(I64*)(u+1) & 0x0000000000ffff ; return 3 ; + case 2: *pval = *(I64*)(u+1) & 0x00000000ffffff ; return 4 ; + case 3: *pval = *(I64*)(u+1) & 0x000000ffffffff ; return 5 ; + case 4: *pval = *(I64*)(u+1) & 0x0000ffffffffff ; return 6 ; + case 5: *pval = *(I64*)(u+1) & 0x00ffffffffffff ; return 7 ; + case 6: *pval = *(I64*)(u+1) & 0xffffffffffffff ; return 8 ; + case 7: *pval = *(I64*)(u+1) ; return 9 ; + } + case 4: + switch (u[0] & 0x07) + { + case 0: die ("int packing error") ; + case 1: *pval = *(I64*)(u+1) | 0xffffffffffff0000 ; return 3 ; + case 2: *pval = *(I64*)(u+1) | 0xffffffffff000000 ; return 4 ; + case 3: *pval = *(I64*)(u+1) | 0xffffffff00000000 ; return 5 ; + case 4: *pval = *(I64*)(u+1) | 0xffffff0000000000 ; return 6 ; + case 5: *pval = *(I64*)(u+1) | 0xffff000000000000 ; return 7 ; + case 6: *pval = *(I64*)(u+1) | 0xff00000000000000 ; return 8 ; + case 7: *pval = *(I64*)(u+1) ; return 9 ; + } + } + return 0 ; // shouldn't get here, but needed for compiler happiness +} + +static inline int intPut (unsigned char *u, I64 val) +{ + if (val >= 0) + { if ( !(val & 0xffffffffffffffc0)) { *u = val | 0x40 ; return 1 ; } 
+ else if (!(val & 0xffffffffffffe000)) { *u++ = (val >> 8) | 0x20 ; *u = val & 0xff ; return 2 ; } + else if (!(val & 0xffffffffffff0000)) { *u++ = 1 ; *(I64*)u = val ; return 3 ; } + else if (!(val & 0xffffffffff000000)) { *u++ = 2 ; *(I64*)u = val ; return 4 ; } + else if (!(val & 0xffffffff00000000)) { *u++ = 3 ; *(I64*)u = val ; return 5 ; } + else if (!(val & 0xffffff0000000000)) { *u++ = 4 ; *(I64*)u = val ; return 6 ; } + else if (!(val & 0xffff000000000000)) { *u++ = 5 ; *(I64*)u = val ; return 7 ; } + else if (!(val & 0xff00000000000000)) { *u++ = 6 ; *(I64*)u = val ; return 8 ; } + else { *u++ = 7 ; *(I64*)u = val ; return 9 ; } + } + else + { if ( !(~val & 0xffffffffffffffc0)) { *u = val | 0x40 ; return 1 ; } + // else if (!(~val & 0xffffffffffffe000)) { *u++ = (val >> 8) | 0x20 ; *u = val & 0xff ; return 2 ; } + else if (!(~val & 0xffffffffffff0000)) { *u++ = 0x81 ; *(I64*)u = val ; return 3 ; } + else if (!(~val & 0xffffffffff000000)) { *u++ = 0x82 ; *(I64*)u = val ; return 4 ; } + else if (!(~val & 0xffffffff00000000)) { *u++ = 0x83 ; *(I64*)u = val ; return 5 ; } + else if (!(~val & 0xffffff0000000000)) { *u++ = 0x84 ; *(I64*)u = val ; return 6 ; } + else if (!(~val & 0xffff000000000000)) { *u++ = 0x85 ; *(I64*)u = val ; return 7 ; } + else if (!(~val & 0xff00000000000000)) { *u++ = 0x86 ; *(I64*)u = val ; return 8 ; } + else { *u++ = 0x87 ; *(I64*)u = val ; return 9 ; } + } +} + +static inline I64 ltfRead (FILE *f) +{ + unsigned char u[16] ; + I64 val ; + + u[0] = getc (f) ; + if (u[0] & 0x40) + { intGet (u, &val) ; + // printf ("read %d n 1 u %02x\n", (int)val, u[0]) ; + } + else if (u[0] & 0x20) + { u[1] = getc (f) ; intGet (u, &val) ; + // printf ("read %d n 2 u %02x %02x\n", (int)val, u[0], u[1]) ; + } + else + { int n = 1 + (u[0] & 0x0f) ; + unsigned char *v = &u[1] ; + while (n--) *v++ = getc(f) ; + n = intGet (u, &val) ; + // printf ("read %d n %d u", (int)val, n) ; + // { int i ; for (i = 0 ; i< n ; ++i) printf (" %02x", u[i]) ; putchar 
('\n') ; } + } + + return val ; +} + +static inline int ltfWrite (I64 x, FILE *f) +{ + unsigned char u[16] ; + int n = intPut (u, x) ; + + // printf ("write %d n %d u", (int)x, n) ; + // { int i ; for (i = 0 ; i< n ; ++i) printf (" %02x", u[i]) ; putchar ('\n') ; } + + fwrite (u, 1, n, f) ; + return n ; +} + +#if defined(TEST_LTF) || defined(TEST_INT) + +// these are the original routines from James Bonfield on which this is based +// incorporated here for credit and for comparisons in the TEST functionality + +/*********************************************************************************** + * + * LTF encoding for integers + * adapted from htslib/cram/cram_io.h with copyright statement: + +Copyright (c) 2012-2019 Genome Research Ltd. +Author: James Bonfield + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + + 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger +Institute nor the names of its contributors may be used to endorse or promote +products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + * + **********************************************************************************/ + +/* 64-bit itf8 variant */ + +static inline int ltf8_put(char *cp, int64_t val) { + unsigned char *up = (unsigned char *)cp; + if (!(val & ~((1LL<<7)-1))) { + *up = val; + return 1; + } else if (!(val & ~((1LL<<(6+8))-1))) { + *up++ = (val >> 8 ) | 0x80; + *up = val & 0xff; + return 2; + } else if (!(val & ~((1LL<<(5+2*8))-1))) { + *up++ = (val >> 16) | 0xc0; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 3; + } else if (!(val & ~((1LL<<(4+3*8))-1))) { + *up++ = (val >> 24) | 0xe0; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 4; + } else if (!(val & ~((1LL<<(3+4*8))-1))) { + *up++ = (val >> 32) | 0xf0; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 5; + } else if (!(val & ~((1LL<<(2+5*8))-1))) { + *up++ = (val >> 40) | 0xf8; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 6; + } else if (!(val & ~((1LL<<(1+6*8))-1))) { + *up++ = (val >> 48) | 0xfc; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 7; + } else if (!(val & ~((1LL<<(7*8))-1))) { + *up++ = (val >> 56) | 0xfe; + *up++ = (val >> 48) & 0xff; + *up++ = 
(val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 8; + } else { + *up++ = 0xff; + *up++ = (val >> 56) & 0xff; + *up++ = (val >> 48) & 0xff; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 9; + } +} + +static inline int ltf8_get(char *cp, int64_t *val_p) { + unsigned char *up = (unsigned char *)cp; + + if (up[0] < 0x80) { + *val_p = up[0]; + return 1; + } else if (up[0] < 0xc0) { + *val_p = (((uint64_t)up[0]<< 8) | + (uint64_t)up[1]) & (((1LL<<(6+8)))-1); + return 2; + } else if (up[0] < 0xe0) { + *val_p = (((uint64_t)up[0]<<16) | + ((uint64_t)up[1]<< 8) | + (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); + return 3; + } else if (up[0] < 0xf0) { + *val_p = (((uint64_t)up[0]<<24) | + ((uint64_t)up[1]<<16) | + ((uint64_t)up[2]<< 8) | + (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); + return 4; + } else if (up[0] < 0xf8) { + *val_p = (((uint64_t)up[0]<<32) | + ((uint64_t)up[1]<<24) | + ((uint64_t)up[2]<<16) | + ((uint64_t)up[3]<< 8) | + (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); + return 5; + } else if (up[0] < 0xfc) { + *val_p = (((uint64_t)up[0]<<40) | + ((uint64_t)up[1]<<32) | + ((uint64_t)up[2]<<24) | + ((uint64_t)up[3]<<16) | + ((uint64_t)up[4]<< 8) | + (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); + return 6; + } else if (up[0] < 0xfe) { + *val_p = (((uint64_t)up[0]<<48) | + ((uint64_t)up[1]<<40) | + ((uint64_t)up[2]<<32) | + ((uint64_t)up[3]<<24) | + ((uint64_t)up[4]<<16) | + ((uint64_t)up[5]<< 8) | + (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); + return 7; + } else if (up[0] < 0xff) { + *val_p = (((uint64_t)up[1]<<48) | + ((uint64_t)up[2]<<40) | + ((uint64_t)up[3]<<32) | + ((uint64_t)up[4]<<24) | + ((uint64_t)up[5]<<16) | + ((uint64_t)up[6]<< 8) | + (uint64_t)up[7]) & ((1LL<<(7*8))-1); + return 8; + } else { + *val_p = (((uint64_t)up[1]<<56) | + 
((uint64_t)up[2]<<48) | + ((uint64_t)up[3]<<40) | + ((uint64_t)up[4]<<32) | + ((uint64_t)up[5]<<24) | + ((uint64_t)up[6]<<16) | + ((uint64_t)up[7]<< 8) | + (uint64_t)up[8]); + return 9; + } +} + +#include +#ifndef RUSAGE_SELF /* to prevent "RUSAGE_SELF redefined" gcc warning, fixme if this is more intricate */ +#define RUSAGE_SELF 0 +#endif + +void timeUpdate (FILE *f) +{ + static bool isFirst = 1 ; + static struct rusage rOld, rFirst ; + struct rusage rNew ; + int secs, usecs ; + + getrusage (RUSAGE_SELF, &rNew) ; + if (!isFirst) + { secs = rNew.ru_utime.tv_sec - rOld.ru_utime.tv_sec ; + usecs = rNew.ru_utime.tv_usec - rOld.ru_utime.tv_usec ; + if (usecs < 0) { usecs += 1000000 ; secs -= 1 ; } + fprintf (f, "user\t%d.%06d", secs, usecs) ; + secs = rNew.ru_stime.tv_sec - rOld.ru_stime.tv_sec ; + usecs = rNew.ru_stime.tv_usec - rOld.ru_stime.tv_usec ; + if (usecs < 0) { usecs += 1000000 ; secs -= 1 ; } + fprintf (f, "\tsystem\t%d.%06d", secs, usecs) ; + fprintf (f, "\tmax_RSS\t%ld", rNew.ru_maxrss - rOld.ru_maxrss) ; + fputc ('\n', f) ; + } + else + { rFirst = rNew ; + isFirst = false ; + } + + rOld = rNew ; +} + +int main (int argc, char *argv[]) +{ + I64 i, j, x, n, tot, mod ; + FILE *f ; + static unsigned char buffer[9*(1<<20)] ; + + if (argc < 3) die ("usage: ./test [mod]") ; + +// { int t = 1; +// char *b = (char *) (&t); +// if (*b == 0) printf ("bigEndian\n") ; else printf ("smallEndian\n") ; +// } + + timeUpdate (0) ; + + x = atoi(argv[1]) ; + n = atoi(argv[2]) ; + if (argc == 4) mod = atoi(argv[3]) ; else mod = 0 ; + tot = 0 ; +#ifdef TEST_INT + f = fopen ("int.test", "w") ; +#endif +#ifdef TEST_LTF + f = fopen ("ltf.test", "w") ; +#endif + if (argc > 4) + for (i = 0 ; i < n ; ++i) { tot += ltfWrite (x++, f) ; if (x == mod) x = 0 ; } + else + { while (n) + { int m = (n > 1<<20) ? 
1<<20 : n ; + unsigned char *u = buffer ; + for (i = 0 ; i < m ; ++i) + { +#ifdef TEST_INT + u += intPut (u, x++) ; +#endif +#ifdef TEST_LTF + u += ltf8_put ((char*)u, x++) ; +#endif + if (x == mod) x = 0 ; + } + tot += (u-buffer) ; + fwrite (buffer, 1, (u-buffer), f) ; + n -= m ; + } + } + fclose (f) ; + printf ("wrote %" PRId64 " bytes: ", tot) ; + timeUpdate (stdout) ; + + x = atoi(argv[1]) ; + n = atoi(argv[2]) ; + tot = 0 ; +#ifdef TEST_INT + f = fopen ("int.test", "r") ; +#endif +#ifdef TEST_LTF + f = fopen ("ltf.test", "r") ; +#endif + if (argc > 4) + { for (j = 0 ; j < n ; ++j) + if ((i = ltfRead (f)) != j) + die ("ltf wrote %d read %d", (int)j, (int)i) ; + } + else + { unsigned char *u = buffer, *v = buffer ; // u is current pos, v is end of section read in + while (n) + { int m = (n > 1<<20) ? 1<<20 : n ; +// printf ("attempting to read %d chars: n %d (v-u) %d\n", (int)(9*m - (v-u)), m, (int)(v-u)) ; + unsigned char *v0 = v, *u0 = u ; + v += fread (v, 1, 9*m - (v-u), f) ; +// printf (" read %d chars\n", (int)(v-v0)) ; + for (i = 0 ; i < m ; ++i) +#ifdef TEST_INT + u += intGet (u, &j) ; +#endif +#ifdef TEST_LTF + u += ltf8_get ((char*)u, &j) ; +#endif +// printf (" intGet m %d ints used %d chars\n", m, (int)(u-u0)) ; + n -= m ; + m = v - u ; +// printf (" memmove %d\n", m) ; + memmove (buffer, u, m) ; + tot += (u-buffer) ; + v -= (u-buffer) ; u = buffer ; + } + } + fclose (f) ; + + printf ("read %" PRId64 " bytes: ", tot) ; + timeUpdate (stdout) ; +} +#endif // TEST_LTF + +/*********************************************************************************** + * + * UTILITIES: memory allocation, file opening, timer + * adapted from Richard's utilities + * + **********************************************************************************/ + +static void die(char *format, ...) 
+{ va_list args; + + va_start (args, format); + fprintf (stderr, "FATAL ERROR: "); + vfprintf (stderr, format, args); + fprintf (stderr, "\n"); + va_end (args); + exit (-1); +} + +static I64 nAlloc = 0; +static I64 totalAlloc = 0; + +static void *myalloc(size_t size) +{ void *p; + + p = malloc(size); + if (p == NULL) die("myalloc failure requesting %d bytes", size); + nAlloc += 1; + totalAlloc += size; + return (p); +} + +static void *mycalloc(size_t number, size_t size) +{ void *p; + + p = calloc(number,size); + if (p == NULL) die("mycalloc failure requesting %d objects of size %d", number, size); + nAlloc += 1; + totalAlloc += size*number; + return p; +} + +/********************* end of file ***********************/ diff --git a/ONElib.h b/ONElib.h new file mode 100644 index 0000000..319ef6b --- /dev/null +++ b/ONElib.h @@ -0,0 +1,410 @@ +/****************************************************************************************** + * + * File: ONElib.h + * Header for ONE file reading and writing + * + * Authors: Richard Durbin (rd109@cam.ac.uk), Gene Myers (myers@mpi-cbg.de) + * Copyright (C) Richard Durbin, Gene Myers, 2019- + * + * HISTORY: + * Last edited: Dec 3 06:08 2022 (rd109) + * * Dec 3 06:01 2022 (rd109): remove oneWriteHeader(), switch to stdarg for oneWriteComment etc. + * * Dec 27 09:46 2019 (gene): style edits + * * Created: Sat Feb 23 10:12:43 2019 (rd109) + * + *****************************************************************************************/ + +#ifndef ONE_DEFINED +#define ONE_DEFINED + +#include // for FILE etc. +#include // for formatted writing in oneWriteComment(), oneAddProvenance() +#include // for standard size int types and their PRI print macros +#include // for standard bool types +#include // for INT_MAX etc. 
+#include + +/*********************************************************************************** + * + * DATA TYPES + * + **********************************************************************************/ + +// Basic Types +#ifndef U8_DEFINED +#define U8_DEFINED + +typedef int8_t I8; +typedef int16_t I16; +typedef int32_t I32; +typedef int64_t I64; +typedef unsigned char U8; + +#endif // U8_DEFINED + +typedef enum { oneINT = 1, oneREAL, oneCHAR, oneSTRING, + oneINT_LIST, oneREAL_LIST, oneSTRING_LIST, oneDNA } OneType; +extern char* oneTypeString[] ; +// = { 0, "INT", "REAL", "CHAR", "STRING", "INT_LIST", "REAL_LIST", "STRING_LIST", "DNA" } ; + +typedef union + { I64 i; + double r; + char c; + I64 len; // For lists : top 8 bits encode excess bytes, low 56 length + } OneField; + +typedef struct + { char *program; + char *version; + char *command; + char *date; + } OneProvenance; + +typedef struct + { char *filename; + I64 count; + } OneReference; + +typedef struct + { I64 count; + I64 max; + I64 total; + I64 groupCount; + I64 groupTotal; + } OneCounts; + + // OneCodecs are a private package for binary one file compression + +typedef void OneCodec; // forward declaration of opaque type for compression codecs + + // DNAcodec is a special pre-existing compressor one should use for DNA. + // It compresses every base to 2-bits, where any non-ACGT letter is + // effectively converted to an A. Compression is case insensitive, + // but decompression always delivers lower-case. + +extern OneCodec *DNAcodec; + + // Record for a particular line type. There is at most one list element. 
+ +typedef struct + { OneCounts accum; // counts read or written to this moment + OneCounts given; // counts read from header + I64 gCount; // used internally to calculate groupCount and groupTotal + I64 gTotal; + I64 oCount; // # of objects in prefix before first group (if any) + I64 oTotal; // + of objects in prefix (these 2 are for thread parallel apps) + + int nField; // number of fields + OneType *fieldType; // type of each field + int listEltSize; // size of list field elements (if present, else 0) + int listField; // field index of list + char *comment; // the comment on the definition line in the schema + + bool isUserBuf; // flag for whether buffer is owned by user + I64 bufSize; // system buffer and size if not user supplied + void *buffer; + + OneCodec *listCodec; // compression codec and flags + bool isUseListCodec; // on once enough data collected to train associated codec + char binaryTypePack; // binary code for line type, bit 8 set. + // bit 0: list compressed + I64 listTack; // accumulated training data for this threads codeCodec (master) + } OneInfo; + + // the schema type - the first record is the header spec, then a linked list of primary classes + +typedef struct OneSchema + { + char *primary ; + int nSecondary ; + char **secondary ; + OneInfo *info[128] ; + int nFieldMax ; + char objectType ; + char groupType ; + struct OneSchema *nxt ; + } OneSchema ; + +typedef struct OneHeaderText + { char *text ; + struct OneHeaderText *nxt ; + } OneHeaderText ; + + // The main OneFile type - this is the primary handle used by the end user + +typedef struct + { + // this field may be set by the user + + bool isCheckString; // set if want to validate string char by char + + // these fields may be read by user - but don't change them! 
+ + char *fileType; + char *subType; + char lineType; // current lineType + char objectType; // line designation character for primary objects + char groupType; // line designation character for groups (optional) + I64 line; // current line number + I64 byte; // current byte position when writing binary + I64 object; // current object - incremented when object line read + I64 group; // current group - incremented when group line read + OneProvenance *provenance; // if non-zero then count['!'] entries + OneReference *reference; // if non-zero then count['<'] entries + OneReference *deferred; // if non-zero then count['>'] entries + OneField *field; // used to hold the current line - accessed by macros + OneInfo *info[128]; // all the per-linetype information + I64 codecTrainingSize; // amount of data to see before building codec + + // fields below here are private to the package + + FILE *f; + + bool isWrite; // true if open for writing + bool isHeaderOut; // true if header already written + bool isBinary; // true if writing a binary file + bool inGroup; // set once inside a group + bool isLastLineBinary; // needed to deal with newlines on ascii files + bool isIndexIn; // index read in + bool isBig; // are we on a big-endian machine? + bool isNoAsciiHeader; // backdoor for ONEview to avoid writing header in ascii + + char lineBuf[128]; // working buffers + char numberBuf[32]; + int nFieldMax; + I64 codecBufSize; + char *codecBuf; + I64 nBits; // number of bits of list currently in codecBuf + I64 intListBytes; // number of bytes per integer in the compacted INT_LIST + I64 linePos; // current line position + OneHeaderText *headerText; // arbitrary descriptive text that goes with the header + + char binaryTypeUnpack[256]; // invert binary line code to ASCII line character. 
+ int share; // index if slave of threaded write, +nthreads > 0 if master + int isFinal; // oneFinalizeCounts has been called on file + pthread_mutex_t fieldLock; // Mutexs to protect training accumumulation stats when threadded + pthread_mutex_t listLock; + } OneFile; // the footer will be in the concatenated result. + + +/*********************************************************************************** + * + * ROUTINES FOR READING & WRITING ONE FILES IN BOTH ASCII & BINARY (TRANSPARENTLY) + * + **********************************************************************************/ + +// CREATING AND DESTROYING SCHEMAS + +OneSchema *oneSchemaCreateFromFile (char *path) ; +OneSchema *oneSchemaCreateFromText (char *text) ; + + // These functions create a schema handle that can be used to open One-code data files + // for reading and writing. A schema file is itself a One-code file, consisting of + // a set of objects, one per primary file type. Valid lines in this file are: + // P // a short string + // S // a short string - any number of these + // O // definition of object type + // G // definition of group type - first field must be an int + // D // definition of line + // must be a lower or upper case letter. + // is a list of field types from: + // CHAR, INT, REAL, STRING, INT_LIST, REAL_LIST, STRING_LIST, DNA + // Only one list type (STRING, *_LIST or DNA) is allowed per line type. + // All the D lines following an O line apply to that object type. + // By convention comments on each line explain the definition. 
+ // Example, with lists and strings preceded by their length in OneCode style + // P 3 seq this is a sequence file + // O S 1 3 DNA the DNA sequence - each S line starts an object + // D Q 1 6 STRING the phred encoded quality score + ASCII 33 + // D N 4 4 REAL 4 REAL 4 REAL 4 REAL signal to noise ratio in A, C, G, T channels + // G g 2 3 INT 6 STRING group designator: number of objects, name + // The ...FromText() alternative writes the text to a temp file and reads it with + // oneSchemaCreateFromFile(). This allows code to set the schema. + // Internally a schema is a linked list of OneSchema objects, with the first holding + // the (hard-coded) schema for the header and footer, and the remainder each + // corresponding to one primary file type. + +void oneSchemaDestroy (OneSchema *schema) ; + +// READING ONE FILES: + +OneFile *oneFileOpenRead (const char *path, OneSchema *schema, char *type, int nthreads) ; + + // Open ONE file 'path', either binary or ascii encoded, for reading. + // If the file doesn't have a header, then 'type' must be specified, + // otherwise, if 'type' is non-zero it must match the header type. + // All header information (if present) is read. + // 'schema' is also optional. If it is NULL then the file must contain its own schema. + // If 'schema' is present then it must support 'type', and if the file contains its + // own schema, then that must be a subset of the one for this type in 'schema'. + // If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer + // to the first, called the master, is returned. The other nthreads-1 files are + // called slaves. The package routines are aware of when a OneFile argument is a + // slave or master in a parallel group. The master receives provenance, counts, etc. + // The slaves only read data and have the virtue of sharing indices and codecs with + // the master if relevant.
+ +bool oneFileCheckSchema (OneFile *vf, char *textSchema) ; + + // Checks if file schema is consistent with text schema. Mismatches are reported to stderr. + // Filetype and all linetypes in text must match. File schema can contain additional linetypes. + // e.g. if (! oneFileCheckSchema (vf, "P 3 seq\nD S 1 3 DNA\nD Q 1 6 STRING\nD P 0\n")) die () ; + // This is provided to enable a program to ensure that its assumptions about data layout + // are satisfied. + +char oneReadLine (OneFile *vf) ; + + // Read the next ONE formatted line returning the line type of the line, or 0 + // if at the end of the data section. The content macros immediately below are + // used to access the information of the line most recently read. + +void *_oneList (OneFile *vf) ; // lazy codec decompression if required +void *_oneCompressedList (OneFile *vf) ; // lazy codec compression if required + +#define oneInt(vf,x) ((vf)->field[x].i) +#define oneReal(vf,x) ((vf)->field[x].r) +#define oneChar(vf,x) ((vf)->field[x].c) +#define _LF(vf) ((vf)->info[(int)(vf)->lineType]->listField) +#define oneLen(vf) ((vf)->field[_LF(vf)].len & 0xffffffffffffffll) +#define oneString(vf) ((char *) _oneList(vf)) +#define oneDNAchar(vf) ((char *) _oneList(vf)) +#define oneDNA2bit(vf) ((U8 *) _oneCompressedList(vf)) +#define oneIntList(vf) ((I64 *) _oneList(vf)) +#define oneRealList(vf) ((double *) _oneList(vf)) +#define oneNextString(vf,s) ((s) + strlen(s) + 1) + + // Access field information. The index x of a list object is not required as there is + // only one list per line, stored in ->buffer. + // The list macros are fully parenthesized so that postfix use, e.g. oneIntList(vf)[i], + // applies to the cast result rather than to the void * returned by _oneList.
+ // A "string list" is implicitly supported, get the first string with oneString, and + // subsequent strings sequentially with oneNextString, e.g.: + // + // char *s = oneString(vf); + // for (i = 0; i < oneLen(vf); i++) + // { // do something with i'th string + // s = oneNextString(vf,s); + // } + +char *oneReadComment (OneFile *vf); + + // Can be called after oneReadLine() to read any optional comment text after the fixed fields. + // Returns NULL if there is no comment. + +// WRITING ONE FILES: + +OneFile *oneFileOpenWriteNew (const char *path, OneSchema *schema, char *type, + bool isBinary, int nthreads); +OneFile *oneFileOpenWriteFrom (const char *path, OneFile *vfIn, + bool isBinary, int nthreads); + + // Create a new oneFile that will be written to 'path'. For the 'New' variant supply + // the file type, subtype (if non-zero), and whether it should be binary or ASCII. + // For the 'From' variant, specify binary or ASCII; schema and all other header + // information is inherited from 'vfIn', where the count stats are from vfIn's + // accumulation (assumes vfIn has been fully read or written) if 'useAccum' is true, + // and from vfIn's header otherwise. + // NOTE(review): neither prototype above declares a 'useAccum' parameter - confirm + // which count source oneFileOpenWriteFrom actually uses. + // If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer + // to the first, called the master, is returned. The other nthreads-1 files are + // called slaves. The package routines are aware of when a OneFile argument is a + // slave or master in a parallel group. The slaves are expected to only write data + // lines, with the master adding provenance, producing the header, and then some + // segment of the initial data lines. Upon close the final result is effectively + // the concatenation of the master, followed by the output of each slave in sequence.
+ +bool oneInheritProvenance (OneFile *vf, OneFile *source); +bool oneInheritReference (OneFile *vf, OneFile *source); +bool oneInheritDeferred (OneFile *vf, OneFile *source); + + // Add all provenance/reference/deferred entries in source to header of vf. Must be + // called before first call to oneWriteLine. + +bool oneAddProvenance (OneFile *vf, char *prog, char *version, char *format, ...); +bool oneAddReference (OneFile *vf, char *filename, I64 count); +bool oneAddDeferred (OneFile *vf, char *filename); + + // Append provenance/reference/deferred to header information. Must be called before + // first call to oneWriteLine. + + // For ASCII output, if you want the header to contain count information then you must + // create and fill the relevant OneCounts objects before the first call to oneWriteLine. + // For BINARY output, the OneCounts information is accumulated and written automatically. + +void oneWriteLine (OneFile *vf, char lineType, I64 listLen, void *listBuf); + + // Set up a line for output just as it would be returned by oneReadLine and then call + // this routine to output the line (ASCII or binary). + // Use the macros above on the l.h.s. of assignments to fill fields (e.g. oneInt(vf,2) = 3). + // For lists, give the length in the listLen argument, and either place the list data in your + // own buffer and give it as listBuf, or put it in the line's buffer and set listBuf == NULL. + +void oneWriteLineFrom (OneFile *vf, OneFile *source) ; // copies a line from source into vf +void oneWriteLineDNA2bit (OneFile *vf, char lineType, I64 listLen, U8 *dnaBuf); + +// Minor variants of oneWriteLine(). +// Use oneWriteLineDNA2bit for DNA lists if your DNA is already 2-bit compressed. + +void oneWriteComment (OneFile *vf, char *format, ...); // cannot include newline \n chars + + // Adds a comment to the current line. Extends line in ascii, adds special line type in binary.
+ +// CLOSING FILES (FOR BOTH READ & WRITE) + +void oneFileClose (OneFile *vf); + + // Close vf (opened either for reading or writing). Finalizes counts, merges threaded files, + // and writes footer if binary. Frees all non-user memory associated with vf. + +// GOTO & BUFFER MANAGEMENT + +void oneUserBuffer (OneFile *vf, char lineType, void *buffer); + + // A buffer is used to capture the list element of each line type that has one. + // This routine allows you to reassign the buffer to one you've allocated, or + // to revert to a default system buffer if 'buffer' = NULL. The previous buffer + // (if any) is freed. The user must ensure that a buffer they supply is large + // enough. BTW, this buffer is overwritten with each new line read of the given type. + +bool oneGotoObject (OneFile *vf, I64 i); + + // Goto i'th object in the file. This only works on binary files, which have an index. + +I64 oneGotoGroup (OneFile *vf, I64 i); + + // Goto the first object in group i. Return the size (in objects) of the group, or 0 + // if an error (i out of range or vf has no group type). Only works for binary files. + +/*********************************************************************************** + * + * A BIT ABOUT THE FORMAT OF BINARY FILES + * + **********************************************************************************/ + + // <- <$-line>