/* filter.c -- filter expressions. Copyright (C) 2013-2014 Genome Research Ltd. Author: Petr Danecek Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include "filter.h" #include "bcftools.h" #include #include typedef struct _token_t { // read-only values, same for all VCF lines int tok_type; // one of the TOK_* keys below char *key; // set only for string constants, otherwise NULL char *tag; // for debugging and printout only, VCF tag name float threshold; // filtering threshold int hdr_id; // BCF header lookup ID int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*]) void (*setter)(filter_t *, bcf1_t *, struct _token_t *); int (*comparator)(struct _token_t *, struct _token_t *, int op_type, bcf1_t *); void *hash; // test presence of str value in the hash via comparator regex_t *regex; // precompiled regex for string comparison // modified on filter evaluation at each VCF line float *values; // In case str_value is set, values[0] is one sample's string length char *str_value; // and values[0]*nsamples gives the total length; int is_str; int pass_site; // -1 not applicable, 0 fails, >0 pass uint8_t *pass_samples; // status of individual samples int nsamples; // number of samples int nvalues, mvalues; // number of used values, n=0 for missing values, n=1 for scalars // for strings, total length of str_value } token_t; struct _filter_t { bcf_hdr_t *hdr; char *str; int nfilters; token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack int32_t *tmpi; int max_unpack, mtmpi, nsamples; }; #define TOK_VAL 0 #define TOK_LFT 1 // ( #define TOK_RGT 2 // ) #define TOK_LE 3 // less or equal #define TOK_LT 4 // less than #define TOK_EQ 5 // equal #define TOK_BT 6 // bigger than #define TOK_BE 7 // bigger or equal #define TOK_NE 8 // not equal #define TOK_OR 9 // | #define TOK_AND 10 // & #define TOK_ADD 11 // + #define TOK_SUB 12 // - #define TOK_MULT 13 // * #define TOK_DIV 14 // / #define TOK_MAX 15 #define TOK_MIN 16 #define TOK_AVG 17 #define TOK_AND_VEC 18 // && (operator applied in samples) #define TOK_OR_VEC 19 // || (operator applied in samples) #define TOK_LIKE 20 // ~ regular expression #define TOK_NLIKE 21 // !~ regular expression #define TOK_SUM 22 #define TOK_ABS 23 #define TOK_LEN 24 #define TOK_FUNC 25 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 // ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8}; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f" static int filters_next_token(char **str, int *len) { char *tmp = *str; while ( *tmp && isspace(*tmp) ) tmp++; *str = tmp; *len = 0; // test for doubles: d.ddde[+-]dd if ( isdigit(*str[0]) || *str[0]=='.' ) // strtod would eat +/- { double HTS_UNUSED v = strtod(*str, &tmp); if ( *str!=tmp && (!tmp[0] || !isalnum(tmp[0])) ) { *len = tmp - (*str); return TOK_VAL; } tmp = *str; } if ( !strncasecmp(tmp,"MAX(",4) ) { (*str) += 3; return TOK_MAX; } if ( !strncasecmp(tmp,"MIN(",4) ) { (*str) += 3; return TOK_MIN; } if ( !strncasecmp(tmp,"AVG(",4) ) { (*str) += 3; return TOK_AVG; } if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; } if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; } if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; } if ( !strncasecmp(tmp,"%MAX(",5) ) { (*str) += 4; return TOK_MAX; } // for backward compatibility if ( !strncasecmp(tmp,"%MIN(",5) ) { (*str) += 4; return TOK_MIN; } // for backward compatibility if ( !strncasecmp(tmp,"%AVG(",5) ) { (*str) += 4; return TOK_AVG; } // for backward compatibility if ( !strncasecmp(tmp,"%SUM(",5) ) { (*str) += 4; return TOK_SUM; } // for backward compatibility if ( !strncasecmp(tmp,"INFO/",5) ) tmp += 5; if ( !strncasecmp(tmp,"FORMAT/",7) ) tmp += 7; if ( !strncasecmp(tmp,"FMT/",4) ) tmp += 4; if ( tmp[0]=='@' ) // file name { while ( *tmp && !isspace(*tmp) && *tmp!='=' && *tmp!='!' ) tmp++; *len = tmp - (*str); return TOK_VAL; } while ( tmp[0] ) { if ( tmp[0]=='"' ) break; if ( tmp[0]=='\'' ) break; if ( isspace(tmp[0]) ) break; if ( tmp[0]=='<' ) break; if ( tmp[0]=='>' ) break; if ( tmp[0]=='=' ) break; if ( tmp[0]=='!' ) break; if ( tmp[0]=='&' ) break; if ( tmp[0]=='|' ) break; if ( tmp[0]=='(' ) break; if ( tmp[0]==')' ) break; if ( tmp[0]=='+' ) break; // hacky: so that [*] is not split, the tokenizer does not recognise square brackets [] if ( tmp[0]=='*' && (tmp==*str || tmp[-1]!='[') ) break; if ( tmp[0]=='-' ) break; if ( tmp[0]=='/' ) break; if ( tmp[0]=='~' ) break; tmp++; } if ( tmp > *str ) { *len = tmp - (*str); return TOK_VAL; } if ( tmp[0]=='"' || tmp[0]=='\'' ) { int quote = tmp[0]; tmp++; while ( *tmp && tmp[0]!=quote ) tmp++; if ( !*tmp ) return -1; // missing quotes *len = tmp - (*str) + 1; return TOK_VAL; } if ( tmp[0]=='!' ) { if ( tmp[1]=='=' ) { (*str) += 2; return TOK_NE; } if ( tmp[1]=='~' ) { (*str) += 2; return TOK_NLIKE; } } if ( tmp[0]=='<' ) { if ( tmp[1]=='=' ) { (*str) += 2; return TOK_LE; } (*str) += 1; return TOK_LT; } if ( tmp[0]=='>' ) { if ( tmp[1]=='=' ) { (*str) += 2; return TOK_BE; } (*str) += 1; return TOK_BT; } if ( tmp[0]=='=' ) { if ( tmp[1]=='=' ) { (*str) += 2; return TOK_EQ; } (*str) += 1; return TOK_EQ; } if ( tmp[0]=='(' ) { (*str) += 1; return TOK_LFT; } if ( tmp[0]==')' ) { (*str) += 1; return TOK_RGT; } if ( tmp[0]=='&' && tmp[1]=='&' ) { (*str) += 2; return TOK_AND_VEC; } if ( tmp[0]=='|' && tmp[1]=='|' ) { (*str) += 2; return TOK_OR_VEC; } if ( tmp[0]=='&' ) { (*str) += 1; return TOK_AND; } if ( tmp[0]=='|' ) { (*str) += 1; return TOK_OR; } if ( tmp[0]=='+' ) { (*str) += 1; return TOK_ADD; } if ( tmp[0]=='-' ) { (*str) += 1; return TOK_SUB; } if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; } if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; } if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; } *len = tmp - (*str); return TOK_VAL; } static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok) { float *ptr = &line->qual; if ( bcf_float_is_missing(*ptr) ) tok->nvalues = 0; else { tok->values[0] = line->qual; tok->nvalues = 1; } } static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok) { tok->values[0] = bcf_get_variant_types(line); tok->nvalues = 1; } static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok) { assert( tok->hdr_id >=0 ); int i; for (i=0; in_info; i++) if ( line->d.info[i].key == tok->hdr_id ) break; if ( i==line->n_info ) tok->nvalues = 0; else if ( line->d.info[i].type==BCF_BT_CHAR ) { int n = line->d.info[i].len; int m = (int)tok->values[0]; hts_expand(char,n+1,m,tok->str_value); memcpy(tok->str_value,line->d.info[i].vptr,n); tok->str_value[n] = 0; tok->values[0] = m; tok->nvalues = n; } else if ( line->d.info[i].type==BCF_BT_FLOAT ) { tok->values[0] = line->d.info[i].v1.f; tok->str_value = NULL; tok->nvalues = 1; } else { tok->values[0] = line->d.info[i].v1.i; tok->str_value = NULL; tok->nvalues = 1; } } static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line) { int i; if ( op_type==TOK_NE ) // AND logic: none of the filters can match { if ( !line->d.n_flt ) { if ( atok->hdr_id==-1 ) return 0; // missing value return 1; // no filter present, eval to true } for (i=0; id.n_flt; i++) if ( atok->hdr_id==line->d.flt[i] ) return 0; return 1; } // TOK_EQ with OR logic: at least one of the filters must match if ( !line->d.n_flt ) { if ( atok->hdr_id==-1 ) return 1; return 0; // no filter present, eval to false } for (i=0; id.n_flt; i++) if ( atok->hdr_id==line->d.flt[i] ) return 1; return 0; } static int filters_cmp_id(token_t *atok, token_t *btok, int op_type, bcf1_t *line) { // multiple IDs not supported yet (easy to add though) if ( btok->hash ) { token_t *tmp = atok; atok = btok; btok = tmp; } if ( atok->hash ) { int ret = khash_str2int_has_key(atok->hash, line->d.id); if ( op_type==TOK_EQ ) return ret; return ret ? 0 : 1; } if ( op_type==TOK_EQ ) return strcmp(btok->str_value,line->d.id) ? 0 : 1; return strcmp(btok->str_value,line->d.id) ? 1 : 0; } /** * bcf_get_info_value() - get single INFO value, int or float * @line: BCF line * @info_id: tag ID, as returned by bcf_hdr_id2int * @ivec: 0-based index to retrieve, -1 when single value is expected * @vptr: pointer to memory location of sufficient size to accomodate * info_id's type * * The returned value is -1 if tag is not present, 0 if present but * values is missing or ivec is out of range, and 1 on success. */ static int bcf_get_info_value(bcf1_t *line, int info_id, int ivec, void *value) { int j; for (j=0; jn_info; j++) if ( line->d.info[j].key == info_id ) break; if ( j==line->n_info ) return -1; bcf_info_t *info = &line->d.info[j]; if ( info->len == 1 ) { if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f; else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i; return 1; } if ( ivec<0 ) ivec = 0; #define BRANCH(type_t, is_missing, is_vector_end, out_type_t) { \ type_t *p = (type_t *) info->vptr; \ for (j=0; jlen; j++) \ { \ if ( is_vector_end ) return 0; \ } \ if ( is_missing ) return 0; \ *((out_type_t*)value) = p[j]; \ return 1; \ } switch (info->type) { case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break; case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break; case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break; case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break; default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break; } #undef BRANCH return -1; // this shouldn't happen } static void filters_set_info_int(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi); tok->nvalues = n; hts_expand(float,n,tok->mvalues,tok->values); for (i=0; ivalues[i] = flt->tmpi[i]; } else { int32_t value; if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 ) tok->nvalues = 0; else { tok->values[0] = value; tok->nvalues = 1; } } } static void filters_set_info_float(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->idx==-2 ) { tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues); if ( tok->nvalues<0 ) tok->nvalues = 0; } else { float value; if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 ) tok->nvalues = 0; else { tok->values[0] = value; tok->nvalues = 1; } } } static void filters_set_info_string(filter_t *flt, bcf1_t *line, token_t *tok) { int m = (int)tok->values[0]; int n = bcf_get_info_string(flt->hdr,line,tok->tag,&tok->str_value,&m); if ( n<0 ) { tok->nvalues = 0; return; } tok->values[0] = m; // allocated length if ( tok->idx>=0 ) { // get ith field (i=tok->idx) int i = 0; char *ss = tok->str_value, *se = tok->str_value + n; while ( ssidx ) { if ( *ss==',' ) i++; ss++; } if ( ss==se || i!=tok->idx ) { tok->nvalues = 0; return; } se = ss; while ( se-tok->str_valuestr_value ) *se = 0; else { memmove(tok->str_value,ss,se-ss); tok->str_value[se-ss] = 0; } tok->nvalues = se-ss; } else if ( tok->idx==-2 ) tok->nvalues = n; } static void filters_set_info_flag(filter_t *flt, bcf1_t *line, token_t *tok) { int j; for (j=0; jn_info; j++) if ( line->d.info[j].key == tok->hdr_id ) break; tok->values[0] = j==line->n_info ? 0 : 1; tok->nvalues = 1; } static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok) { int i; if ( (tok->nvalues=bcf_get_format_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi))<0 ) tok->nvalues = 0; else { int is_missing = 1; hts_expand(float,tok->nvalues,tok->mvalues,tok->values); for (i=0; invalues; i++) { if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end ) bcf_float_set_missing(tok->values[i]); else { tok->values[i] = flt->tmpi[i]; is_missing = 0; } } if ( is_missing ) tok->nvalues = 0; else if ( tok->idx >= 0 ) { int nsmpl = bcf_hdr_nsamples(flt->hdr); int nvals = tok->nvalues / nsmpl; if ( tok->idx >= nvals ) tok->nvalues = 0; // the index is too big else { for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; tok->nvalues = nsmpl; } } } tok->nsamples = tok->nvalues; } static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) { if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 ) tok->nvalues = tok->nsamples = 0; // missing values else if ( tok->idx >= 0 ) { int i, nsmpl, nvals; nsmpl = bcf_hdr_nsamples(flt->hdr); nvals = tok->nvalues / nsmpl; if ( tok->idx >= nvals ) tok->nsamples = tok->nvalues = 0; // the index is too big else { for (i=0; ivalues[i] = tok->values[i*nvals+tok->idx]; tok->nsamples = tok->nvalues = nsmpl; } } } static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok) { int ndim = tok->nsamples * (int)tok->values[0]; int ret = bcf_get_format_char(flt->hdr,line,tok->tag,&tok->str_value,&ndim); int nsmpl = bcf_hdr_nsamples(flt->hdr); ndim /= nsmpl; tok->values[0] = ndim; if ( ret<=0 ) { tok->nvalues = 0; return; } if ( tok->idx < 0 ) // scalar { tok->nvalues = tok->nsamples = nsmpl; return; } // vector int i; for (i=0; istr_value + i*ndim; int is = 0, ivec = 0; while ( ivecidx && isidx || is==ndim || !ss[is] ) { ss[0] = '.'; ss[1] = 0; continue; } int ie = is; while ( ienvalues = 0; return; } tok->nvalues = ret; tok->nsamples = nsmpl; } static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { bcf_fmt_t *fmt = bcf_get_fmt(flt->hdr, line, "GT"); if ( !fmt ) { tok->nvalues = tok->nsamples = 0; return; } int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr); kstring_t str; str.s = tok->str_value; str.m = tok->values[0] * nsmpl; str.l = 0; for (i=0; invalues = str.l; tok->nsamples = nsmpl; tok->values[0] = blen; tok->str_value = str.s; } static void filters_set_ref_string(filter_t *flt, bcf1_t *line, token_t *tok) { kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0; kputs(line->d.allele[0], &str); tok->nvalues = str.l; tok->values[0] = str.m; tok->str_value = str.s; } static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok) { kstring_t str; str.s = tok->str_value; str.m = tok->values[0]; str.l = 0; if ( tok->idx>=0 ) { if ( line->n_allele >= tok->idx ) kputs(line->d.allele[tok->idx], &str); else kputc('.', &str); } else { kputs(line->d.allele[1], &str); int i; for (i=2; in_allele; i++) { kputc(',', &str); kputs(line->d.allele[i], &str); } } tok->nvalues = str.l; tok->values[0] = str.m; tok->str_value = str.s; } static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok) { tok->nvalues = 1; tok->values[0] = line->n_allele - 1; } static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok) { hts_expand(int32_t, line->n_allele, flt->mtmpi, flt->tmpi); if ( !bcf_calc_ac(flt->hdr, line, flt->tmpi, BCF_UN_INFO|BCF_UN_FMT) ) { tok->nvalues = 0; return; } int i, an = flt->tmpi[0]; for (i=1; in_allele; i++) an += flt->tmpi[i]; if ( !an ) { tok->nvalues = 0; return; } flt->tmpi[0] = an; // for filters_set_[mac|af|maf] if ( tok->idx>=0 ) { tok->nvalues = 1; tok->values[0] = flt->tmpi[tok->idx+1]; } else { hts_expand(float,line->n_allele,tok->mvalues,tok->values); for (i=1; in_allele; i++) tok->values[i-1] = flt->tmpi[i]; tok->nvalues = line->n_allele - 1; } } static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok) { filters_set_ac(flt,line,tok); tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0; tok->nvalues = 1; } static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok) { filters_set_ac(flt,line,tok); if ( !tok->nvalues ) return; int i, an = flt->tmpi[0]; for (i=0; invalues; i++) if ( tok->values[i] > an*0.5 ) tok->values[i] = an - tok->values[i]; } static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok) { filters_set_ac(flt,line,tok); if ( !tok->nvalues ) return; int i, an = flt->tmpi[0]; for (i=0; invalues; i++) tok->values[i] /= (float)an; } static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok) { filters_set_ac(flt,line,tok); if ( !tok->nvalues ) return; int i, an = flt->tmpi[0]; for (i=0; invalues; i++) { tok->values[i] /= (float)an; if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i]; } } static void set_max(filter_t *flt, bcf1_t *line, token_t *tok) { float val = -HUGE_VAL; int i; for (i=0; invalues; i++) { if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i]; } tok->values[0] = val; tok->nvalues = 1; tok->nsamples = 0; } static void set_min(filter_t *flt, bcf1_t *line, token_t *tok) { float val = HUGE_VAL; int i; for (i=0; invalues; i++) if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i]; tok->values[0] = val; tok->nvalues = 1; tok->nsamples = 0; } static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok) { float val = 0; int i, n = 0; for (i=0; invalues; i++) if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } tok->values[0] = n ? val / n : 0; tok->nvalues = 1; tok->nsamples = 0; } static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok) { float val = 0; int i, n = 0; for (i=0; invalues; i++) if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; } tok->values[0] = val; tok->nvalues = 1; tok->nsamples = 0; } static void set_abs(filter_t *flt, bcf1_t *line, token_t *tok) { if ( tok->is_str ) error("ABS() can be applied only on numeric values\n"); int i; for (i=0; invalues; i++) tok->values[i] = fabs(tok->values[i]); } static void set_strlen(filter_t *flt, bcf1_t *line, token_t *tok) { tok->is_str = 0; if ( !tok->nvalues ) return; if ( tok->idx==-2 ) { int i = 0; char *ss = tok->str_value; while ( *ss ) { char *se = ss; while ( *se && *se!=',' ) se++; if ( !*se ) tok->values[i] = strlen(ss); else { *se = 0; tok->values[i] = strlen(ss); *se = ','; } ss = *se ? se + 1 : se; i++; } tok->nvalues = i; } else { tok->values[0] = strlen(tok->str_value); tok->nvalues = 1; } } #define VECTOR_ARITHMETICS(atok,btok,AOP) \ { \ int i, has_values = 0; \ if ( !(atok)->nvalues || !(btok)->nvalues ) /* missing values */ \ { \ (atok)->nvalues = 0; (atok)->nsamples = 0; \ } \ else \ { \ if ( ((atok)->nsamples && (btok)->nsamples) || (!(atok)->nsamples && !(btok)->nsamples)) \ { \ for (i=0; i<(atok)->nvalues; i++) \ { \ if ( bcf_float_is_missing((atok)->values[i]) ) continue; \ if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \ has_values = 1; \ (atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \ } \ } \ else if ( (btok)->nsamples ) \ { \ hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \ for (i=0; i<(btok)->nvalues; i++) \ { \ if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \ { \ bcf_float_set_missing((atok)->values[i]); \ continue; \ } \ has_values = 1; \ (atok)->values[i] = (atok)->values[0] AOP (btok)->values[i]; \ } \ (atok)->nvalues = (btok)->nvalues; \ (atok)->nsamples = (btok)->nsamples; \ } \ else if ( (atok)->nsamples ) \ { \ for (i=0; i<(atok)->nvalues; i++) \ { \ if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \ { \ bcf_float_set_missing((atok)->values[i]); \ continue; \ } \ has_values = 1; \ (atok)->values[i] = (atok)->values[i] AOP (btok)->values[0]; \ } \ } \ } \ if ( !has_values ) { (atok)->nvalues = 0; (atok)->nsamples = 0; } \ } static int vector_logic_and(token_t *atok, token_t *btok) { // We are comparing either two scalars (result of INFO tag vs a threshold), two vectors (two FORMAT fields), // or a vector and a scalar (FORMAT field vs threshold) int i, pass_site = 0; if ( !atok->nvalues || !btok->nvalues ) { atok->nvalues = atok->nsamples = 0; return 0; } if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site && btok->pass_site; if ( atok->nsamples && btok->nsamples ) { for (i=0; insamples; i++) { atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_samples[i]; if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; } return pass_site; } if ( btok->nsamples ) { for (i=0; insamples; i++) { atok->pass_samples[i] = atok->pass_site && btok->pass_samples[i]; if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; } atok->nsamples = btok->nsamples; return pass_site; } /* atok->nsamples!=0 */ for (i=0; insamples; i++) { atok->pass_samples[i] = atok->pass_samples[i] && btok->pass_site; if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; } return pass_site; } static int vector_logic_or(token_t *atok, token_t *btok, int or_type) { int i, pass_site = 0; if ( !atok->nvalues && !btok->nvalues ) // missing sites in both { atok->nvalues = atok->nsamples = 0; return 0; } if ( !atok->nvalues ) // missing value in a { for (i=0; insamples; i++) atok->pass_samples[i] = btok->pass_samples[i]; atok->nsamples = btok->nsamples; return btok->pass_site; } if ( !btok->nvalues ) // missing value in b return atok->pass_site; if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site; if ( !atok->nsamples ) { if ( or_type==TOK_OR ) { for (i=0; insamples; i++) { atok->pass_samples[i] = btok->pass_samples[i]; if ( atok->pass_site || atok->pass_samples[i] ) pass_site = 1; } } else { for (i=0; insamples; i++) { atok->pass_samples[i] = atok->pass_site || btok->pass_samples[i]; if ( atok->pass_samples[i] ) pass_site = 1; } } atok->nsamples = btok->nsamples; return pass_site; } if ( !btok->nsamples ) // vector vs site { if ( or_type==TOK_OR ) { for (i=0; insamples; i++) if ( btok->pass_site || atok->pass_samples[i] ) pass_site = 1; } else { for (i=0; insamples; i++) { atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_site; if ( atok->pass_samples[i] ) pass_site = 1; } } return pass_site; } for (i=0; insamples; i++) { atok->pass_samples[i] = atok->pass_samples[i] || btok->pass_samples[i]; if ( !pass_site && atok->pass_samples[i] ) pass_site = 1; } return pass_site; } #define CMP_VECTORS(atok,btok,CMP_OP,ret) \ { \ int i, j, has_values = 0, pass_site = 0; \ if ( !(atok)->nvalues || !(btok)->nvalues ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ else \ { \ if ( (atok)->nsamples && (btok)->nsamples ) \ { \ for (i=0; i<(atok)->nsamples; i++) \ { \ if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ has_values = 1; \ if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ else (atok)->pass_samples[i] = 0; \ } \ if ( !has_values ) (atok)->nvalues = 0; \ } \ else if ( (atok)->nsamples ) \ { \ if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ else \ { \ for (i=0; i<(atok)->nsamples; i++) \ { \ if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ has_values = 1; \ if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ else (atok)->pass_samples[i] = 0; \ } \ } \ if ( !has_values ) (atok)->nvalues = 0; \ } \ else if ( (btok)->nsamples ) \ { \ if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \ else \ { \ for (i=0; i<(btok)->nsamples; i++) \ { \ if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \ has_values = 1; \ if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \ else (atok)->pass_samples[i] = 0; \ } \ (atok)->nvalues = (btok)->nvalues; \ (atok)->nsamples = (btok)->nsamples; \ } \ if ( !has_values ) (atok)->nvalues = 0; \ } \ else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \ { \ /* any field can match: [*] */ \ for (i=0; i<(atok)->nvalues; i++) \ { \ for (j=0; j<(btok)->nvalues; j++) \ if ( (atok)->values[i] CMP_OP (btok)->values[j] ) { pass_site = 1; i = (atok)->nvalues; break; } \ } \ } \ else \ { \ if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[0]) ) \ { \ (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; \ } \ else if ( (atok)->values[0] CMP_OP (btok)->values[0] ) { pass_site = 1; } \ } \ /*fprintf(stderr,"pass=%d\n", pass_site);*/ \ (ret) = pass_site; \ } \ } static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // logic: TOK_EQ or TOK_NE { if ( !atok->nvalues ) { return 0; } if ( !btok->nvalues ) { atok->nvalues = 0; return 0; } int i, pass_site = 0; if ( atok->nsamples && atok->nsamples==btok->nsamples ) { for (i=0; insamples; i++) { char *astr = atok->str_value + i*(int)atok->values[0]; char *bstr = btok->str_value + i*(int)btok->values[0]; char *aend = astr + (int)atok->values[0], *a = astr; while ( avalues[0], *b = bstr; while ( bpass_samples[i] = 0; else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0; if ( logic!=TOK_EQ ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; pass_site |= atok->pass_samples[i]; } if ( !atok->nsamples ) atok->nsamples = btok->nsamples; } else if ( !atok->nsamples && !btok->nsamples ) { if ( atok->idx==-2 || btok->idx==-2 ) { // any field can match: [*] if ( atok->idx==-2 && btok->idx==-2 ) error("fixme: Expected at least one scalar value [%s %s %s]\n", atok->tag ? atok->tag : btok->tag, atok->str_value,btok->str_value); token_t *xtok, *ytok; // xtok is scalar, ytok array if ( btok->idx==-2 ) { xtok = atok; ytok = btok; } else { xtok = btok; ytok = atok; } char *xstr = xtok->str_value, *xend = xstr + xtok->nvalues; char *ystr = ytok->str_value, *yend = ystr + ytok->nvalues, *y = ystr; while ( y<=yend ) { if ( y==yend || *y==',' ) { if ( y-ystr==xend-xstr && !strncmp(xstr,ystr,xend-xstr) ) { pass_site = 1; break; } ystr = y+1; } y++; } } else pass_site = strcmp(atok->str_value,btok->str_value) ? 0 : 1; if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1; } else { token_t *xtok, *ytok; if ( !atok->nsamples ) { xtok = atok; ytok = btok; } else { xtok = btok; ytok = atok; } char *xstr = xtok->str_value; char *xend = xstr + (int)xtok->values[0], *x = xstr; while ( xnsamples; i++) { char *ystr = ytok->str_value + i*(int)ytok->values[0]; char *yend = ystr + (int)ytok->values[0], *y = ystr; while ( ypass_samples[i] = 0; else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0; if ( logic!=TOK_EQ ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1; pass_site |= atok->pass_samples[i]; } if ( !atok->nsamples ) atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set } return pass_site; } static int regex_vector_strings(token_t *atok, token_t *btok) { int ret = regexec(btok->regex, atok->str_value, 0,NULL,0); return ret==0 ? 1 : 0; } static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { tok->tok_type = TOK_VAL; tok->hdr_id = -1; tok->pass_site = -1; tok->idx = -1; // is this a string constant? if ( str[0]=='"' || str[0]=='\'' ) { int quote = str[0]; if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str); tok->key = (char*) calloc(len-1,sizeof(char)); hts_expand(float,1,tok->mvalues,tok->values); tok->values[0] = len-2; memcpy(tok->key,str+1,len-2); tok->key[len-2] = 0; tok->is_str = 1; tok->nvalues = len-2; return 0; } // is it a file? if ( str[0]=='@' ) { tok->tag = (char*) calloc(len+1,sizeof(char)); memcpy(tok->tag,str,len); tok->tag[len] = 0; wordexp_t wexp; wordexp(tok->tag+1, &wexp, 0); if ( !wexp.we_wordc ) error("No such file: %s\n", tok->tag+1); int i, n; char **list = hts_readlist(wexp.we_wordv[0], 1, &n); if ( !list ) error("Could not read: %s\n", wexp.we_wordv[0]); wordfree(&wexp); tok->hash = khash_str2int_init(); for (i=0; ihash,list[i]) ) khash_str2int_inc(tok->hash,list[i]); else free(list[i]); } free(list); return 0; } int is_fmt = -1; if ( !strncasecmp(str,"FMT/",4) ) { str += 4; len -= 4; is_fmt = 1; } else if ( !strncasecmp(str,"FORMAT/",7) ) { str += 7; len -= 7; is_fmt = 1; } else { if ( !strncasecmp(str,"INFO/",5) ) { is_fmt = 0; str += 5; len -= 5; } else if ( !strncasecmp(str,"QUAL",len) || !strncmp(str,"%QUAL",len) /* for backward compatibility */ ) { tok->setter = filters_set_qual; tok->tag = strdup("QUAL"); return 0; } else if ( !strncasecmp(str,"TYPE",len) || !strncmp(str,"%TYPE",len) /* for backward compatibility */ ) { tok->setter = filters_set_type; tok->tag = strdup("TYPE"); return 0; } else if ( !strncasecmp(str,"FILTER",len) || !strncmp(str,"%FILTER",len) /* for backward compatibility */ ) { tok->comparator = filters_cmp_filter; tok->tag = strdup("FILTER"); filter->max_unpack |= BCF_UN_FLT; return 0; } else if ( !strncasecmp(str,"ID",len) || !strncasecmp(str,"%ID",len) /* for backward compatibility */ ) { tok->comparator = filters_cmp_id; tok->tag = strdup("ID"); return 0; } else if ( !strncasecmp(str,"REF",len) ) { tok->setter = &filters_set_ref_string; tok->is_str = 1; tok->tag = strdup("REF"); return 0; } else if ( !strncasecmp(str,"ALT",len) ) { tok->setter = &filters_set_alt_string; tok->is_str = 1; tok->tag = strdup("ALT"); return 0; } else if ( !strncasecmp(str,"N_ALT",len) ) { tok->setter = &filters_set_nalt; tok->tag = strdup("N_ALT"); return 0; } else if ( !strncasecmp(str,"N_SAMPLES",len) ) { tok->tok_type = TOK_VAL; tok->threshold = bcf_hdr_nsamples(filter->hdr); return 0; } } // does it have array subscript? int is_array = 0; kstring_t tmp = {0,0,0}; kputsn(str, len, &tmp); if ( tmp.s[tmp.l-1] == ']' ) { int i; for (i=0; iidx = -2; // tag[*] .. any field else { char *end; tok->idx = strtol(tmp.s+is_array, &end, 10); if ( *end!=']' ) error("Could not parse the index: %s[%s\n", tmp.s,tmp.s+is_array); } } } tok->hdr_id = bcf_hdr_id2int(filter->hdr,BCF_DT_ID,tmp.s); if ( is_fmt==-1 ) { if ( tok->hdr_id >=0 ) { if ( bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ) is_fmt = 0; else if ( bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) is_fmt = 1; } if ( is_fmt==-1 ) is_fmt = 0; } if ( is_fmt ) filter->max_unpack |= BCF_UN_FMT; if ( tok->hdr_id>=0 ) { if ( is_fmt && !strcmp("GT",tmp.s) ) { tok->setter = &filters_set_genotype_string; tok->is_str = 1; } else if ( is_fmt ) { if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) error("No such FORMAT field: %s\n", tmp.s); if ( bcf_hdr_id2number(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=1 && !is_array ) error("Error: FORMAT vectors must be subscripted, e.g. %s[0] or %s[*]\n", tmp.s, tmp.s); switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id) ) { case BCF_HT_INT: tok->setter = &filters_set_format_int; break; case BCF_HT_REAL: tok->setter = &filters_set_format_float; break; case BCF_HT_STR: tok->setter = &filters_set_format_string; tok->is_str = 1; break; default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__); } } else if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ) error("No such INFO field: %s\n", tmp.s); else { if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG ) tok->setter = filters_set_info_flag; else { if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_STR ) tok->is_str = 1; if ( bcf_hdr_id2number(filter->hdr,BCF_HL_INFO,tok->hdr_id)==1 ) tok->setter = filters_set_info; else { switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) ) { case BCF_HT_INT: tok->setter = &filters_set_info_int; break; case BCF_HT_REAL: tok->setter = &filters_set_info_float; break; case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break; default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__); } //tok->idx = -2; } } filter->max_unpack |= BCF_UN_INFO; } tok->tag = strdup(tmp.s); if ( tmp.s ) free(tmp.s); return 0; } else if ( !strcasecmp(tmp.s,"ALT") ) { tok->setter = &filters_set_alt_string; tok->is_str = 1; tok->tag = strdup(tmp.s); free(tmp.s); return 0; } else if ( !strcasecmp(tmp.s,"AN") ) { tok->setter = &filters_set_an; tok->tag = strdup("AN"); free(tmp.s); return 0; } else if ( !strcasecmp(tmp.s,"AC") ) { tok->setter = &filters_set_ac; tok->tag = strdup("AC"); free(tmp.s); return 0; } else if ( !strcasecmp(tmp.s,"MAC") ) { tok->setter = &filters_set_mac; tok->tag = strdup("MAC"); free(tmp.s); return 0; } else if ( !strcasecmp(tmp.s,"AF") ) { tok->setter = &filters_set_af; tok->tag = strdup("AF"); free(tmp.s); return 0; } else if ( !strcasecmp(tmp.s,"MAF") ) { tok->setter = &filters_set_maf; tok->tag = strdup("MAF"); free(tmp.s); return 0; } // is it a value? char *end; errno = 0; tok->threshold = strtod(tmp.s, &end); if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); if ( tmp.s ) free(tmp.s); return 0; } static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks) { int i; for (i=0; itok_type==TOK_VAL ) { if ( tok->key ) fprintf(stderr,"%s", tok->key); else if ( tok->tag ) fprintf(stderr,"%s", tok->tag); else fprintf(stderr,"%e", tok->threshold); } else fprintf(stderr,"%c", TOKEN_STRING[tok->tok_type]); if ( tok->setter ) fprintf(stderr,"\t[setter %p]", tok->setter); fprintf(stderr,"\n"); } } // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm filter_t *filter_init(bcf_hdr_t *hdr, const char *str) { filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t)); filter->str = strdup(str); filter->hdr = hdr; filter->max_unpack |= BCF_UN_STR; int nops = 0, mops = 0, *ops = NULL; // operators stack int nout = 0, mout = 0; // filter tokens, RPN token_t *out = NULL; char *tmp = filter->str; int last_op = -1; while ( *tmp ) { int len, ret; ret = filters_next_token(&tmp, &len); if ( ret==-1 ) error("Missing quotes in: %s\n", str); //fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len); //int i; for (i=0; i0 && ops[nops-1]!=TOK_LFT ) { nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } if ( nops<=0 ) error("Could not parse: %s\n", str); nops--; } else if ( ret!=TOK_VAL ) // one of the operators { // detect unary minus: replace -value with -1*(value) if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT ) { nout++; hts_expand0(token_t, nout, mout, out); token_t *tok = &out[nout-1]; tok->tok_type = TOK_VAL; tok->hdr_id = -1; tok->pass_site = -1; tok->threshold = -1.0; ret = TOK_MULT; } else { while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] ) { nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } } nops++; hts_expand(int, nops, mops, ops); ops[nops-1] = ret; } else if ( !len ) { if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str); break; // all tokens read } else // annotation name or filtering value { nout++; hts_expand0(token_t, nout, mout, out); filters_init1(filter, tmp, len, &out[nout-1]); tmp += len; } last_op = ret; } while ( nops>0 ) { if ( ops[nops-1]==TOK_LFT || ops[nops-1]==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str); nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } // In the special cases of TYPE and FILTER the BCF header IDs are yet unknown. Walk through the // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be // just before or after the FILTER token and they must be followed with a comparison operator. // At this point we also initialize regex expressions which, in RPN, must preceed the LIKE/NLIKE operator. // This code is fragile: improve me. int i; for (i=0; istr); out[j].regex = (regex_t *) malloc(sizeof(regex_t)); if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) ) error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str); } if ( out[i].tok_type!=TOK_VAL ) continue; if ( !out[i].tag ) continue; if ( !strcmp(out[i].tag,"TYPE") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int j = i+1; if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; } else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; } else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; } else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; } else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; } else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str); out[j].tag = out[j].key; out[j].key = NULL; i = j; continue; } if ( !strcmp(out[i].tag,"FILTER") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int j = i+1; if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE || out[j].tok_type==TOK_LIKE ) j = i - 1; if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( strcmp(".",out[j].key) ) { out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key); if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) ) error("The filter \"%s\" not present in the VCF header\n", out[j].key); } else out[j].hdr_id = -1; out[j].tag = out[j].key; out[j].key = NULL; out[i].hdr_id = out[j].hdr_id; i = j; continue; } } filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0; for (i=0; insamples ) { out[i].pass_samples = (uint8_t*)malloc(filter->nsamples); int j; for (j=0; jnsamples; j++) out[i].pass_samples[j] = 1; } } if (0) filter_debug_print(out, NULL, nout); if ( mops ) free(ops); filter->filters = out; filter->nfilters = nout; filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout); return filter; } void filter_destroy(filter_t *filter) { int i; for (i=0; infilters; i++) { //if ( filter->filters[i].key ) free(filter->filters[i].key); free(filter->filters[i].str_value); free(filter->filters[i].tag); free(filter->filters[i].values); free(filter->filters[i].pass_samples); if (filter->filters[i].hash) khash_str2int_destroy_free(filter->filters[i].hash); if (filter->filters[i].regex) { regfree(filter->filters[i].regex); free(filter->filters[i].regex); } } free(filter->filters); free(filter->flt_stack); free(filter->str); free(filter->tmpi); free(filter); } int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) { bcf_unpack(line, filter->max_unpack); int i, nstack = 0; for (i=0; infilters; i++) { filter->filters[i].nsamples = 0; filter->filters[i].nvalues = 0; filter->filters[i].pass_site = -1; if ( filter->filters[i].tok_type == TOK_VAL ) { if ( filter->filters[i].setter ) // variable, query the VCF line filter->filters[i].setter(filter, line, &filter->filters[i]); else if ( filter->filters[i].key ) // string constant { filter->filters[i].str_value = filter->filters[i].key; filter->filters[i].values[0] = filter->filters[i].values[0]; filter->filters[i].nvalues = strlen(filter->filters[i].key); } else // numeric constant { filter->filters[i].values[0] = filter->filters[i].threshold; filter->filters[i].nvalues = 1; } filter->flt_stack[nstack++] = &filter->filters[i]; continue; } else if ( filter->filters[i].tok_type == TOK_FUNC ) // all functions take only one argument { filter->filters[i].setter(filter, line, filter->flt_stack[nstack-1]); continue; } if ( nstack<2 ) error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack); // too few values left on the stack int is_str = filter->flt_stack[nstack-1]->is_str + filter->flt_stack[nstack-2]->is_str; if ( filter->filters[i].tok_type == TOK_OR || filter->filters[i].tok_type == TOK_OR_VEC ) { if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 ) error("Error occurred while processing the filter \"%s\" (%d %d OR)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site); filter->flt_stack[nstack-2]->pass_site = vector_logic_or(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type); nstack--; continue; } if ( filter->filters[i].tok_type == TOK_AND || filter->filters[i].tok_type == TOK_AND_VEC ) { if ( filter->flt_stack[nstack-1]->pass_site<0 || filter->flt_stack[nstack-2]->pass_site<0 ) error("Error occurred while processing the filter \"%s\" (%d %d AND)\n", filter->str,filter->flt_stack[nstack-2]->pass_site,filter->flt_stack[nstack-1]->pass_site); filter->flt_stack[nstack-2]->pass_site = vector_logic_and(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]); nstack--; continue; } if ( filter->filters[i].tok_type == TOK_ADD ) { VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],+); nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_SUB ) { VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],-); nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_MULT ) { VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],*); nstack--; continue; } else if ( filter->filters[i].tok_type == TOK_DIV ) { VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],/); nstack--; continue; } int is_true = 0; if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues ) { filter->flt_stack[nstack-2]->nvalues = filter->flt_stack[nstack-2]->nsamples = 0; } else if ( filter->filters[i].tok_type == TOK_EQ ) { if ( filter->flt_stack[nstack-1]->comparator ) is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_EQ,line); else if ( filter->flt_stack[nstack-2]->comparator ) is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ,line); else if ( is_str==2 ) // both are strings is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_EQ); else if ( is_str==1 ) error("Comparing string to numeric value: %s\n", filter->str); else CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],==,is_true); } else if ( filter->filters[i].tok_type == TOK_NE ) { if ( filter->flt_stack[nstack-1]->comparator ) is_true = filter->flt_stack[nstack-1]->comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],TOK_NE,line); else if ( filter->flt_stack[nstack-2]->comparator ) is_true = filter->flt_stack[nstack-2]->comparator(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE,line); else if ( is_str==2 ) is_true = cmp_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],TOK_NE); else if ( is_str==1 ) error("Comparing string to numeric value: %s\n", filter->str); else CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],!=,is_true); } else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE ) { if ( is_str==2 ) { is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]); if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1; } else error("The regex operator can be used on strings only: %s\n", filter->str); } else if ( is_str>0 ) error("Wrong operator in string comparison: %s [%s,%s]\n", filter->str, filter->flt_stack[nstack-1]->str_value, filter->flt_stack[nstack-2]->str_value); else if ( filter->filters[i].tok_type == TOK_LE ) CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<=,is_true) else if ( filter->filters[i].tok_type == TOK_LT ) CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],<,is_true) else if ( filter->filters[i].tok_type == TOK_BT ) CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>,is_true) else if ( filter->filters[i].tok_type == TOK_BE ) CMP_VECTORS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],>=,is_true) else error("FIXME: did not expect this .. tok_type %d = %d\n", i, filter->filters[i].tok_type); filter->flt_stack[nstack-2]->pass_site = is_true; nstack--; } if ( nstack>1 ) error("Error occurred while processing the filter \"%s\" (2:%d)\n", filter->str,nstack); // too few values left on the stack if ( samples ) { *samples = filter->max_unpack&BCF_UN_FMT ? filter->flt_stack[0]->pass_samples : NULL; if ( *samples && !filter->flt_stack[0]->nsamples ) { for (i=0; insamples; i++) filter->flt_stack[0]->pass_samples[i] = filter->flt_stack[0]->pass_site; } } return filter->flt_stack[0]->pass_site; }