diff options
Diffstat (limited to 'qdbm/odeum.c')
-rw-r--r-- | qdbm/odeum.c | 2090 |
1 files changed, 2090 insertions, 0 deletions
diff --git a/qdbm/odeum.c b/qdbm/odeum.c new file mode 100644 index 00000000..15395224 --- /dev/null +++ b/qdbm/odeum.c @@ -0,0 +1,2090 @@ +/************************************************************************************************* + * Implementation of Odeum + * Copyright (C) 2000-2007 Mikio Hirabayashi + * This file is part of QDBM, Quick Database Manager. + * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU + * Lesser General Public License as published by the Free Software Foundation; either version + * 2.1 of the License or any later version. QDBM is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * You should have received a copy of the GNU Lesser General Public License along with QDBM; if + * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA. + *************************************************************************************************/ + + +#define QDBM_INTERNAL 1 + +#include "odeum.h" +#include "myconf.h" + +#define OD_NAMEMAX 256 /* max size of a database name */ +#define OD_DIRMODE 00755 /* permission of a creating directory */ +#define OD_PATHBUFSIZ 1024 /* size of a path buffer */ +#define OD_NUMBUFSIZ 32 /* size of a buffer for a number */ +#define OD_MAPPBNUM 127 /* bucket size of a petit map handle */ +#define OD_DOCSNAME "docs" /* name of the database for documents */ +#define OD_INDEXNAME "index" /* name of the database for inverted index */ +#define OD_RDOCSNAME "rdocs" /* name of the database for reverse dictionary */ +#define OD_DOCSBNUM 2039 /* initial bucket number of document database */ +#define OD_DOCSDNUM 17 /* division number of document database */ +#define OD_DOCSALIGN -4 /* alignment of document database */ +#define OD_DOCSFBP 32 /* size of free block pool of document database */ +#define OD_INDEXBNUM 32749 /* initial bucket number of inverted index */ +#define OD_INDEXDNUM 7 /* division number of inverted index */ +#define OD_INDEXALIGN -2 /* alignment of inverted index */ +#define OD_INDEXFBP 32 /* size of free block pool of inverted index */ +#define OD_RDOCSLRM 81 /* records in a leaf node of reverse dictionary */ +#define OD_RDOCSNIM 192 /* records in a non-leaf node of reverse dictionary */ +#define OD_RDOCSLCN 128 /* number of leaf cache of reverse dictionary */ +#define OD_RDOCSNCN 32 /* number of non-leaf cache of reverse dictionary */ +#define OD_CACHEBNUM 262139 /* number of buckets for dirty buffers */ +#define OD_CACHESIZ 8388608 /* max bytes to use memory for dirty buffers */ +#define OD_CFLIVERAT 0.8 /* ratio of usable cache region */ +#define OD_CFBEGSIZ 2048 /* beginning size of flushing frequent words */ +#define OD_CFENDSIZ 64 /* lower limit of flushing frequent words */ +#define OD_CFRFRAT 0.2 /* ratio of flushing rare words a time */ +#define OD_OTCBBUFSIZ 1024 /* size of a buffer for call back functions */ +#define OD_OTPERWORDS 10000 /* frequency of call back in merging index */ +#define OD_OTPERDOCS 1000 /* frequency of call back in merging docs */ +#define OD_MDBRATIO 2.5 /* ratio of bucket number and document number */ +#define OD_MIBRATIO 1.5 /* ratio of bucket number and word number */ +#define OD_MIARATIO 0.75 /* ratio of alignment to the first words */ +#define OD_MIWUNIT 32 /* writing unit of merging inverted index */ +#define OD_DMAXEXPR "dmax" /* key of max number of the document ID */ +#define OD_DNUMEXPR "dnum" /* key of number of the documents */ +#define OD_URIEXPR "1" /* map key of URI */ +#define OD_ATTRSEXPR "2" /* map key of attributes */ +#define OD_NWORDSEXPR "3" /* map key of normal words */ +#define OD_AWORDSEXPR "4" /* map key of as-is words */ +#define OD_WTOPRATE 0.1 /* ratio of top words */ +#define OD_WTOPBONUS 5000 /* bonus points of top words */ +#define OD_KEYCRATIO 1.75 /* ratio of number to max of keyword candidates */ +#define OD_WOCCRPOINT 10000 /* points per occurence */ +#define OD_SPACECHARS "\t\n\v\f\r " /* space characters */ +#define OD_DELIMCHARS "!\"#$%&'()*/<=>?[\\]^`{|}~" /* delimiter characters */ +#define OD_GLUECHARS "+,-.:;@" /* glueing characters */ +#define OD_MAXWORDLEN 48 /* max length of a word */ + +typedef struct { /* type of structure for word counting */ + const char *word; /* pointer to the word */ + int num; /* frequency of the word */ +} ODWORD; + +enum { /* enumeration for events binded to each character */ + OD_EVWORD, /* word */ + OD_EVSPACE, /* space */ + OD_EVDELIM, /* delimiter */ + OD_EVGLUE /* glue */ +}; + + +/* private global variables */ +int odindexbnum = OD_INDEXBNUM; +int odindexdnum = OD_INDEXDNUM; +int odcachebnum = OD_CACHEBNUM; +int odcachesiz = OD_CACHESIZ; +void (*odotcb)(const char *, ODEUM *, const char *) = NULL; + + +/* private function prototypes */ +static ODEUM *odopendb(const char *name, int omode, int docsbnum, int indexbnum, + const char *fname); +static int odcacheflush(ODEUM *odeum, const char *fname); +static int odcacheflushfreq(ODEUM *odeum, const char *fname, int min); +static int odcacheflushrare(ODEUM *odeum, const char *fname, double ratio); +static int odsortindex(ODEUM *odeum, const char *fname); +static int odsortcompare(const void *a, const void *b); +static int odpurgeindex(ODEUM *odeum, const char *fname); +static CBMAP *odpairsmap(const ODPAIR *pairs, int num); +static int odwordcompare(const void *a, const void *b); +static int odmatchoperator(ODEUM *odeum, CBLIST *tokens); +static ODPAIR *odparsesubexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np, + CBLIST *errors); +static ODPAIR *odparseexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np, + CBLIST *errors); +static void odfixtokens(ODEUM *odeum, CBLIST *tokens); +static void odcleannormalized(ODEUM *odeum, CBLIST *nwords); + + + +/************************************************************************************************* + * public objects + *************************************************************************************************/ + + +/* Get a database handle. */ +ODEUM *odopen(const char *name, int omode){ + assert(name); + return odopendb(name, omode, OD_DOCSBNUM, odindexbnum, "odopen"); +} + + +/* Close a database handle. */ +int odclose(ODEUM *odeum){ + char numbuf[OD_NUMBUFSIZ]; + int err; + assert(odeum); + err = FALSE; + if(odotcb) odotcb("odclose", odeum, "closing the connection"); + if(odeum->wmode){ + if(odotcb) odotcb("odclose", odeum, "writing meta information"); + sprintf(numbuf, "%d", odeum->dmax); + if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)) err = TRUE; + sprintf(numbuf, "%d", odeum->dnum); + if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)) err = TRUE; + if(!odcacheflushfreq(odeum, "odclose", OD_CFENDSIZ)) err = TRUE; + if(!odcacheflushrare(odeum, "odclose", OD_CFRFRAT)) err = TRUE; + if(!odcacheflush(odeum, "odclose")) err = TRUE; + if(!odsortindex(odeum, "odclose")) err = TRUE; + cbmapclose(odeum->cachemap); + cbmapclose(odeum->sortmap); + } + if(!vlclose(odeum->rdocsdb)) err = TRUE; + if(!crclose(odeum->indexdb)) err = TRUE; + if(!crclose(odeum->docsdb)) err = TRUE; + free(odeum->name); + free(odeum); + return err ? FALSE : TRUE; +} + + +/* Store a document. */ +int odput(ODEUM *odeum, ODDOC *doc, int wmax, int over){ + char *tmp, *zbuf; + const char *word, *ctmp; + int i, docid, tsiz, wsiz, wnum, tmax, num, zsiz; + double ival; + ODPAIR pair; + CBMAP *map; + CBLIST *tlist; + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return FALSE; + } + if(!odeum->wmode){ + dpecodeset(DP_EMODE, __FILE__, __LINE__); + return FALSE; + } + if((tmp = vlget(odeum->rdocsdb, doc->uri, -1, &tsiz)) != NULL){ + if(!over){ + free(tmp); + dpecodeset(DP_EKEEP, __FILE__, __LINE__); + return FALSE; + } + if(tsiz != sizeof(int) || !odoutbyid(odeum, *(int *)tmp)){ + free(tmp); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return FALSE; + } + free(tmp); + } + odeum->dmax++; + odeum->dnum++; + docid = odeum->dmax; + map = cbmapopen(); + cbmapput(map, OD_URIEXPR, sizeof(OD_URIEXPR), doc->uri, -1, TRUE); + tmp = cbmapdump(doc->attrs, &tsiz); + cbmapput(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), tmp, tsiz, TRUE); + free(tmp); + if(wmax < 0 || wmax > cblistnum(doc->nwords)) wmax = cblistnum(doc->nwords); + tlist = cblistopen(); + for(i = 0; i < wmax; i++){ + ctmp = cblistval(doc->nwords, i, &wsiz); + cblistpush(tlist, ctmp, wsiz); + } + tmp = cblistdump(tlist, &tsiz); + cbmapput(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), tmp, tsiz, TRUE); + free(tmp); + cblistclose(tlist); + tlist = cblistopen(); + for(i = 0; i < wmax; i++){ + ctmp = cblistval(doc->awords, i, &wsiz); + if(strcmp(ctmp, cblistval(doc->nwords, i, NULL))){ + cblistpush(tlist, ctmp, wsiz); + } else { + cblistpush(tlist, "\0", 1); + } + } + tmp = cblistdump(tlist, &tsiz); + cbmapput(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), tmp, tsiz, TRUE); + free(tmp); + cblistclose(tlist); + tmp = cbmapdump(map, &tsiz); + cbmapclose(map); + if(_qdbm_deflate){ + if(!(zbuf = _qdbm_deflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){ + free(tmp); + dpecodeset(DP_EMISC, __FILE__, __LINE__); + odeum->fatal = TRUE; + return FALSE; + } + free(tmp); + tmp = zbuf; + tsiz = zsiz; + } + if(!crput(odeum->docsdb, (char *)&docid, sizeof(int), tmp, tsiz, CR_DKEEP)){ + free(tmp); + if(dpecode == DP_EKEEP) dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return FALSE; + } + free(tmp); + if(!vlput(odeum->rdocsdb, doc->uri, -1, (char *)&docid, sizeof(int), VL_DOVER)){ + odeum->fatal = TRUE; + return FALSE; + } + map = cbmapopen(); + wnum = cblistnum(doc->nwords); + tmax = (int)(wnum * OD_WTOPRATE); + for(i = 0; i < wnum; i++){ + word = cblistval(doc->nwords, i, &wsiz); + if(wsiz < 1) continue; + if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){ + num = *(int *)ctmp + OD_WOCCRPOINT; + } else { + num = i <= tmax ? OD_WTOPBONUS + OD_WOCCRPOINT : OD_WOCCRPOINT; + } + cbmapput(map, word, wsiz, (char *)&num, sizeof(int), TRUE); + } + ival = odlogarithm(wnum); + ival = (ival * ival * ival) / 8.0; + if(ival < 8.0) ival = 8.0; + cbmapiterinit(map); + while((word = cbmapiternext(map, &wsiz)) != NULL){ + pair.id = docid; + pair.score = (int)(*(int *)cbmapget(map, word, wsiz, NULL) / ival); + cbmapputcat(odeum->cachemap, word, wsiz, (char *)&pair, sizeof(pair)); + cbmapmove(odeum->cachemap, word, wsiz, FALSE); + odeum->cacheasiz += sizeof(pair); + cbmapput(odeum->sortmap, word, wsiz, "", 0, FALSE); + } + cbmapclose(map); + if(odeum->cacheasiz > odcachesiz){ + for(i = OD_CFBEGSIZ; odeum->cacheasiz > odcachesiz * OD_CFLIVERAT && i >= OD_CFENDSIZ; + i /= 2){ + if(!odcacheflushfreq(odeum, "odput", i)) return FALSE; + } + while(odeum->cacheasiz > odcachesiz * OD_CFLIVERAT){ + if(!odcacheflushrare(odeum, "odput", OD_CFRFRAT)) return FALSE; + } + } + doc->id = docid; + odeum->ldid = docid; + return TRUE; +} + + +/* Delete a document by a URL. */ +int odout(ODEUM *odeum, const char *uri){ + char *tmp; + int tsiz, docid; + assert(odeum && uri); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return FALSE; + } + if(!odeum->wmode){ + dpecodeset(DP_EMODE, __FILE__, __LINE__); + return FALSE; + } + if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){ + if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; + return FALSE; + } + if(tsiz != sizeof(int)){ + free(tmp); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return FALSE; + } + docid = *(int *)tmp; + free(tmp); + return odoutbyid(odeum, docid); +} + + +/* Delete a document specified by an ID number. */ +int odoutbyid(ODEUM *odeum, int id){ + char *tmp, *zbuf; + const char *uritmp; + int tsiz, uritsiz, zsiz; + CBMAP *map; + assert(odeum && id > 0); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return FALSE; + } + if(!odeum->wmode){ + dpecodeset(DP_EMODE, __FILE__, __LINE__); + return FALSE; + } + if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){ + if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; + return FALSE; + } + if(_qdbm_inflate){ + if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){ + free(tmp); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return FALSE; + } + free(tmp); + tmp = zbuf; + tsiz = zsiz; + } + map = cbmapload(tmp, tsiz); + free(tmp); + uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz); + if(!uritmp || !vlout(odeum->rdocsdb, uritmp, uritsiz)){ + cbmapclose(map); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return FALSE; + } + cbmapclose(map); + if(!crout(odeum->docsdb, (char *)&id, sizeof(int))){ + odeum->fatal = TRUE; + return FALSE; + } + odeum->dnum--; + return TRUE; +} + + +/* Retrieve a document by a URI. */ +ODDOC *odget(ODEUM *odeum, const char *uri){ + char *tmp; + int tsiz, docid; + assert(odeum && uri); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return NULL; + } + if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){ + if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; + return NULL; + } + if(tsiz != sizeof(int)){ + free(tmp); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return NULL; + } + docid = *(int *)tmp; + free(tmp); + return odgetbyid(odeum, docid); +} + + +/* Retrieve a document by an ID number. */ +ODDOC *odgetbyid(ODEUM *odeum, int id){ + char *tmp, *zbuf; + const char *uritmp, *attrstmp, *nwordstmp, *awordstmp, *asis, *normal; + int i, tsiz, uritsiz, attrstsiz, nwordstsiz, awordstsiz, zsiz, asiz, nsiz; + ODDOC *doc; + CBMAP *map; + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return NULL; + } + if(id < 1){ + dpecodeset(DP_ENOITEM, __FILE__, __LINE__); + return NULL; + } + if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){ + if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; + return NULL; + } + if(_qdbm_inflate){ + if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){ + free(tmp); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return NULL; + } + free(tmp); + tmp = zbuf; + tsiz = zsiz; + } + map = cbmapload(tmp, tsiz); + free(tmp); + uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz); + attrstmp = cbmapget(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), &attrstsiz); + nwordstmp = cbmapget(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), &nwordstsiz); + awordstmp = cbmapget(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), &awordstsiz); + if(!uritmp || !attrstmp || !nwordstmp || !awordstmp){ + cbmapclose(map); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return NULL; + } + doc = cbmalloc(sizeof(ODDOC)); + doc->id = id; + doc->uri = cbmemdup(uritmp, uritsiz); + doc->attrs = cbmapload(attrstmp, attrstsiz); + doc->nwords = cblistload(nwordstmp, nwordstsiz); + doc->awords = cblistload(awordstmp, awordstsiz); + cbmapclose(map); + for(i = 0; i < cblistnum(doc->awords); i++){ + asis = cblistval(doc->awords, i, &asiz); + if(asiz == 1 && asis[0] == '\0'){ + normal = cblistval(doc->nwords, i, &nsiz); + cblistover(doc->awords, i, normal, nsiz); + } + } + return doc; +} + + +/* Retrieve the ID of the document specified by a URI. */ +int odgetidbyuri(ODEUM *odeum, const char *uri){ + char *tmp; + int tsiz, docid; + assert(odeum && uri); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return -1; + } + if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){ + if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; + return -1; + } + if(tsiz != sizeof(int)){ + free(tmp); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return -1; + } + docid = *(int *)tmp; + free(tmp); + return docid; +} + + +/* Check whether the document specified by an ID number exists. */ +int odcheck(ODEUM *odeum, int id){ + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return FALSE; + } + if(id < 1){ + dpecodeset(DP_ENOITEM, __FILE__, __LINE__); + return FALSE; + } + return crvsiz(odeum->docsdb, (char *)&id, sizeof(int)) != -1; +} + + +/* Search the inverted index for documents including a word. */ +ODPAIR *odsearch(ODEUM *odeum, const char *word, int max, int *np){ + char *tmp; + int tsiz; + assert(odeum && word && np); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return NULL; + } + if(odeum->wmode && cbmaprnum(odeum->sortmap) > 0 && + (!odcacheflush(odeum, "odsearch") || !odsortindex(odeum, "odsearch"))){ + odeum->fatal = TRUE; + return NULL; + } + max = max < 0 ? -1 : max * sizeof(ODPAIR); + if(!(tmp = crget(odeum->indexdb, word, -1, 0, max, &tsiz))){ + if(dpecode != DP_ENOITEM){ + odeum->fatal = TRUE; + return NULL; + } + *np = 0; + return cbmalloc(1); + } + *np = tsiz / sizeof(ODPAIR); + return (ODPAIR *)tmp; +} + + +/* Get the number of documents including a word. */ +int odsearchdnum(ODEUM *odeum, const char *word){ + int rv; + assert(odeum && word); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return -1; + } + rv = crvsiz(odeum->indexdb, word, -1); + return rv < 0 ? -1 : rv / sizeof(ODPAIR); +} + + +/* Initialize the iterator of a database handle. */ +int oditerinit(ODEUM *odeum){ + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return FALSE; + } + return criterinit(odeum->docsdb); +} + + +/* Get the next key of the iterator. */ +ODDOC *oditernext(ODEUM *odeum){ + char *tmp; + int tsiz, docsid; + ODDOC *doc; + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return NULL; + } + doc = NULL; + while(TRUE){ + if(!(tmp = criternext(odeum->docsdb, &tsiz))){ + if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; + return NULL; + } + if(tsiz != sizeof(int)){ + free(tmp); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + odeum->fatal = TRUE; + return NULL; + } + docsid = *(int *)tmp; + free(tmp); + if((doc = odgetbyid(odeum, docsid)) != NULL) break; + if(dpecode != DP_ENOITEM){ + odeum->fatal = TRUE; + return NULL; + } + } + return doc; +} + + +/* Synchronize updating contents with the files and the devices. */ +int odsync(ODEUM *odeum){ + char numbuf[OD_NUMBUFSIZ]; + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return FALSE; + } + if(!odeum->wmode){ + dpecodeset(DP_EMODE, __FILE__, __LINE__); + return FALSE; + } + if(odotcb) odotcb("odsync", odeum, "writing meta information"); + sprintf(numbuf, "%d", odeum->dmax); + if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)){ + odeum->fatal = TRUE; + return FALSE; + } + sprintf(numbuf, "%d", odeum->dnum); + if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)){ + odeum->fatal = TRUE; + return FALSE; + } + if(!odcacheflush(odeum, "odsync")){ + odeum->fatal = TRUE; + return FALSE; + } + if(!odsortindex(odeum, "odsync")){ + odeum->fatal = TRUE; + return FALSE; + } + if(odotcb) odotcb("odsync", odeum, "synchronizing the document database"); + if(!crsync(odeum->docsdb)){ + odeum->fatal = TRUE; + return FALSE; + } + if(odotcb) odotcb("odsync", odeum, "synchronizing the inverted index"); + if(!crsync(odeum->indexdb)){ + odeum->fatal = TRUE; + return FALSE; + } + if(odotcb) odotcb("odsync", odeum, "synchronizing the reverse dictionary"); + if(!vlsync(odeum->rdocsdb)){ + odeum->fatal = TRUE; + return FALSE; + } + return TRUE; +} + + +/* Optimize a database. */ +int odoptimize(ODEUM *odeum){ + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return FALSE; + } + if(!odeum->wmode){ + dpecodeset(DP_EMODE, __FILE__, __LINE__); + return FALSE; + } + if(!odcacheflush(odeum, "odoptimize")){ + odeum->fatal = TRUE; + return FALSE; + } + if(odeum->ldid < 1 || odeum->ldid != odeum->dnum){ + if(!odpurgeindex(odeum, "odoptimize")){ + odeum->fatal = TRUE; + return FALSE; + } + } + if(odeum->ldid > 0){ + if(!odsortindex(odeum, "odoptimize")){ + odeum->fatal = TRUE; + return FALSE; + } + } + if(odotcb) odotcb("odoptimize", odeum, "optimizing the document database"); + if(!croptimize(odeum->docsdb, -1)){ + odeum->fatal = TRUE; + return FALSE; + } + if(odotcb) odotcb("odoptimize", odeum, "optimizing the inverted index"); + if(!croptimize(odeum->indexdb, -1)){ + odeum->fatal = TRUE; + return FALSE; + } + if(odotcb) odotcb("odoptimize", odeum, "optimizing the reverse dictionary"); + if(!vloptimize(odeum->rdocsdb)){ + odeum->fatal = TRUE; + return FALSE; + } + return TRUE; +} + + +/* Get the name of a database. */ +char *odname(ODEUM *odeum){ + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return NULL; + } + return cbmemdup(odeum->name, -1); +} + + +/* Get the total size of database files. */ +double odfsiz(ODEUM *odeum){ + double fsiz, rv; + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return -1; + } + fsiz = 0; + if((rv = crfsizd(odeum->docsdb)) < 0) return -1.0; + fsiz += rv; + if((rv = crfsizd(odeum->indexdb)) < 0) return -1.0; + fsiz += rv; + if((rv = vlfsiz(odeum->rdocsdb)) == -1) return -1.0; + fsiz += rv; + return fsiz; +} + + +/* Get the total number of the elements of the bucket arrays for the inverted index. */ +int odbnum(ODEUM *odeum){ + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return -1; + } + return crbnum(odeum->indexdb); +} + + +/* Get the total number of the used elements of the bucket arrays in the inverted index. */ +int odbusenum(ODEUM *odeum){ + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return -1; + } + return crbusenum(odeum->indexdb); +} + + +/* Get the number of the documents stored in a database. */ +int oddnum(ODEUM *odeum){ + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return -1; + } + return odeum->dnum; +} + + +/* Get the number of the words stored in a database. */ +int odwnum(ODEUM *odeum){ + assert(odeum); + if(odeum->fatal){ + dpecodeset(DP_EFATAL, __FILE__, __LINE__); + return -1; + } + return crrnum(odeum->indexdb); +} + + +/* Check whether a database handle is a writer or not. */ +int odwritable(ODEUM *odeum){ + assert(odeum); + return odeum->wmode; +} + + +/* Check whether a database has a fatal error or not. */ +int odfatalerror(ODEUM *odeum){ + assert(odeum); + return odeum->fatal; +} + + +/* Get the inode number of a database directory. */ +int odinode(ODEUM *odeum){ + assert(odeum); + return odeum->inode; +} + + +/* Get the last modified time of a database. */ +time_t odmtime(ODEUM *odeum){ + assert(odeum); + return crmtime(odeum->indexdb); +} + + +/* Merge plural database directories. */ +int odmerge(const char *name, const CBLIST *elemnames){ + ODEUM *odeum, **elems; + CURIA *curia, *ecuria; + VILLA *villa, *evilla; + ODPAIR *pairs; + char *word, *kbuf, *vbuf, *dbuf, otmsg[OD_OTCBBUFSIZ]; + char *wpunit[OD_MIWUNIT], *vpunit[OD_MIWUNIT]; + int i, j, k, num, dnum, wnum, dbnum, ibnum, tnum, wsunit[OD_MIWUNIT], vsunit[OD_MIWUNIT]; + int err, *bases, sum, max, wsiz, ksiz, vsiz, uend, unum, pnum, align, id, nid, dsiz; + assert(name && elemnames); + num = cblistnum(elemnames); + elems = cbmalloc(num * sizeof(ODEUM *) + 1); + dnum = 0; + wnum = 0; + for(i = 0; i < num; i++){ + if(!(elems[i] = odopen(cblistval(elemnames, i, NULL), OD_OREADER))){ + for(i -= 1; i >= 0; i--){ + odclose(elems[i]); + } + free(elems); + return FALSE; + } + dnum += oddnum(elems[i]); + wnum += odwnum(elems[i]); + } + dbnum = (int)(dnum * OD_MDBRATIO / OD_DOCSDNUM); + ibnum = (int)(wnum * OD_MIBRATIO / odindexdnum); + if(!(odeum = odopendb(name, OD_OWRITER | OD_OCREAT | OD_OTRUNC, dbnum, ibnum, "odmerge"))){ + for(i = 0; i < num; i++){ + odclose(elems[i]); + } + free(elems); + return FALSE; + } + err = FALSE; + if(odotcb) odotcb("odmerge", odeum, "calculating the base ID numbers"); + bases = cbmalloc(num * sizeof(int) + 1); + sum = 0; + for(i = 0; i < num; i++){ + ecuria = elems[i]->docsdb; + max = 0; + if(!criterinit(ecuria) && dpecode != DP_ENOITEM) err = TRUE; + while((kbuf = criternext(ecuria, &ksiz)) != NULL){ + if(ksiz == sizeof(int)){ + if(*(int *)kbuf > max) max = *(int *)kbuf; + } + free(kbuf); + } + bases[i] = sum; + sum += max; + } + curia = odeum->indexdb; + for(i = 0; i < num; i++){ + if(odotcb){ + sprintf(otmsg, "merging the inverted index (%d/%d)", i + 1, num); + odotcb("odmerge", odeum, otmsg); + } + ecuria = elems[i]->indexdb; + tnum = 0; + uend = FALSE; + if(!criterinit(ecuria) && dpecode != DP_ENOITEM) err = TRUE; + while(!uend){ + for(unum = 0; unum < OD_MIWUNIT; unum++){ + if(!(word = criternext(ecuria, &wsiz))){ + uend = TRUE; + break; + } + if(!(vbuf = crget(ecuria, word, wsiz, 0, -1, &vsiz))){ + err = TRUE; + free(word); + break; + } + wpunit[unum] = word; + wsunit[unum] = wsiz; + vpunit[unum] = vbuf; + vsunit[unum] = vsiz; + } + for(j = 0; j < unum; j++){ + word = wpunit[j]; + wsiz = wsunit[j]; + vbuf = vpunit[j]; + vsiz = vsunit[j]; + pairs = (ODPAIR *)vbuf; + pnum = vsiz / sizeof(ODPAIR); + for(k = 0; k < pnum; k++){ + pairs[k].id += bases[i]; + } + align = (int)(i < num - 1 ? vsiz * (num - i) * OD_MIARATIO : OD_INDEXALIGN); + if(!crsetalign(curia, align)) err = TRUE; + if(!crput(curia, word, wsiz, vbuf, vsiz, CR_DCAT)) err = TRUE; + free(vbuf); + free(word); + if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){ + sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(ecuria)); + odotcb("odmerge", odeum, otmsg); + } + tnum++; + } + } + } + if(odotcb) odotcb("odmerge", odeum, "sorting the inverted index"); + tnum = 0; + if(!criterinit(curia) && dpecode != DP_ENOITEM) err = TRUE; + while((word = criternext(curia, &wsiz)) != NULL){ + if((vbuf = crget(curia, word, wsiz, 0, -1, &vsiz)) != NULL){ + if(vsiz > sizeof(ODPAIR)){ + pairs = (ODPAIR *)vbuf; + pnum = vsiz / sizeof(ODPAIR); + qsort(pairs, pnum, sizeof(ODPAIR), odsortcompare); + if(!crput(curia, word, wsiz, vbuf, vsiz, CR_DOVER)) err = TRUE; + } + free(vbuf); + } + free(word); + if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){ + sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(curia)); + odotcb("odmerge", odeum, otmsg); + } + tnum++; + } + if(odotcb) odotcb("odmerge", odeum, "synchronizing the inverted index"); + if(!crsync(curia)) err = TRUE; + dnum = 0; + curia = odeum->docsdb; + villa = odeum->rdocsdb; + for(i = 0; i < num; i++){ + if(odotcb){ + sprintf(otmsg, "merging the document database (%d/%d)", i + 1, num); + odotcb("odmerge", odeum, otmsg); + } + evilla = elems[i]->rdocsdb; + ecuria = elems[i]->docsdb; + tnum = 0; + if(!vlcurfirst(evilla) && dpecode != DP_ENOITEM) err = TRUE; + while(TRUE){ + if(!(kbuf = vlcurkey(evilla, &ksiz))) break; + if((ksiz == sizeof(OD_DMAXEXPR) && !memcmp(kbuf, OD_DMAXEXPR, ksiz)) || + (ksiz == sizeof(OD_DNUMEXPR) && !memcmp(kbuf, OD_DNUMEXPR, ksiz))){ + free(kbuf); + if(!vlcurnext(evilla)) break; + continue; + } + if(!(vbuf = vlcurval(evilla, &vsiz))){ + free(kbuf); + if(!vlcurnext(evilla)) break; + continue; + } + if(vsiz != sizeof(int)){ + free(vbuf); + free(kbuf); + if(!vlcurnext(evilla)) break; + continue; + } + id = *(int *)vbuf; + nid = id + bases[i]; + if(vlput(villa, kbuf, ksiz, (char *)&nid, sizeof(int), VL_DKEEP)){ + if((dbuf = crget(ecuria, (char *)&id, sizeof(int), 0, -1, &dsiz)) != NULL){ + if(crput(curia, (char *)&nid, sizeof(int), dbuf, dsiz, CR_DKEEP)){ + dnum++; + } else { + err = TRUE; + } + free(dbuf); + } else { + err = TRUE; + } + } else if(dpecode != DP_EKEEP){ + err = TRUE; + } + free(vbuf); + free(kbuf); + odeum->dnum++; + if(odotcb && (tnum + 1) % OD_OTPERDOCS == 0){ + sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(ecuria)); + odotcb("odmerge", odeum, otmsg); + } + tnum++; + if(!vlcurnext(evilla)) break; + } + } + odeum->dnum = dnum; + odeum->dmax = dnum; + free(bases); + if(odotcb) odotcb("odmerge", odeum, "synchronizing the document index"); + if(!crsync(curia)) err = TRUE; + if(!odclose(odeum)) err = TRUE; + for(i = 0; i < num; i++){ + if(!odclose(elems[i])) err = TRUE; + } + free(elems); + return err ? FALSE : TRUE; +} + + +/* Remove a database directory. */ +int odremove(const char *name){ + char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ]; + char path[OD_PATHBUFSIZ]; + const char *file; + struct stat sbuf; + CBLIST *list; + int i; + assert(name); + sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME); + sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME); + sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME); + if(lstat(name, &sbuf) == -1){ + dpecodeset(DP_ESTAT, __FILE__, __LINE__); + return FALSE; + } + if(lstat(docsname, &sbuf) != -1 && !crremove(docsname)) return FALSE; + if(lstat(indexname, &sbuf) != -1 && !crremove(indexname)) return FALSE; + if(lstat(rdocsname, &sbuf) != -1 && !vlremove(rdocsname)) return FALSE; + if((list = cbdirlist(name)) != NULL){ + for(i = 0; i < cblistnum(list); i++){ + file = cblistval(list, i, NULL); + if(!strcmp(file, MYCDIRSTR) || !strcmp(file, MYPDIRSTR)) continue; + sprintf(path, "%s%c%s", name, MYPATHCHR, file); + if(lstat(path, &sbuf) == -1) continue; + if(S_ISDIR(sbuf.st_mode)){ + if(!crremove(path)) return FALSE; + } else { + if(!dpremove(path)) return FALSE; + } + } + cblistclose(list); + } + if(rmdir(name) == -1){ + dpecodeset(DP_ERMDIR, __FILE__, __LINE__); + return FALSE; + } + return TRUE; +} + + +/* Get a document handle. */ +ODDOC *oddocopen(const char *uri){ + ODDOC *doc; + assert(uri); + doc = cbmalloc(sizeof(ODDOC)); + doc->id = -1; + doc->uri = cbmemdup(uri, -1); + doc->attrs = cbmapopenex(OD_MAPPBNUM); + doc->nwords = cblistopen(); + doc->awords = cblistopen(); + return doc; +} + + +/* Close a document handle. */ +void oddocclose(ODDOC *doc){ + assert(doc); + cblistclose(doc->awords); + cblistclose(doc->nwords); + cbmapclose(doc->attrs); + free(doc->uri); + free(doc); +} + + +/* Add an attribute to a document. */ +void oddocaddattr(ODDOC *doc, const char *name, const char *value){ + assert(doc && name && value); + cbmapput(doc->attrs, name, -1, value, -1, TRUE); +} + + +/* Add a word to a document. */ +void oddocaddword(ODDOC *doc, const char *normal, const char *asis){ + assert(doc && normal && asis); + cblistpush(doc->nwords, normal, -1); + cblistpush(doc->awords, asis, -1); +} + + +/* Get the ID number of a document. */ +int oddocid(const ODDOC *doc){ + assert(doc); + return doc->id; +} + + +/* Get the URI of a document. */ +const char *oddocuri(const ODDOC *doc){ + assert(doc); + return doc->uri; +} + + +/* Get the value of an attribute of a document. */ +const char *oddocgetattr(const ODDOC *doc, const char *name){ + assert(doc && name); + return cbmapget(doc->attrs, name, -1, NULL); +} + + +/* Get the list handle contains words in normalized form of a document. */ +const CBLIST *oddocnwords(const ODDOC *doc){ + assert(doc); + return doc->nwords; +} + + +/* Get the list handle contains words in appearance form of a document. */ +const CBLIST *oddocawords(const ODDOC *doc){ + assert(doc); + return doc->awords; +} + + +/* Get the map handle contains keywords in normalized form and their scores. */ +CBMAP *oddocscores(const ODDOC *doc, int max, ODEUM *odeum){ + const CBLIST *nwords; + CBMAP *map, *kwmap; + const char *word, *ctmp; + char numbuf[OD_NUMBUFSIZ]; + ODWORD *owords; + int i, wsiz, wnum, hnum, mnum, nbsiz; + double ival; + assert(doc && max >= 0); + map = cbmapopen(); + nwords = oddocnwords(doc); + for(i = 0; i < cblistnum(nwords); i++){ + word = cblistval(nwords, i, &wsiz); + if(wsiz < 1) continue; + if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){ + wnum = *(int *)ctmp + OD_WOCCRPOINT; + } else { + wnum = OD_WOCCRPOINT; + } + cbmapput(map, word, wsiz, (char *)&wnum, sizeof(int), TRUE); + } + mnum = cbmaprnum(map); + owords = cbmalloc(mnum * sizeof(ODWORD) + 1); + cbmapiterinit(map); + for(i = 0; (word = cbmapiternext(map, &wsiz)) != NULL; i++){ + owords[i].word = word; + owords[i].num = *(int *)cbmapget(map, word, wsiz, NULL); + } + qsort(owords, mnum, sizeof(ODWORD), odwordcompare); + if(odeum){ + if(mnum > max * OD_KEYCRATIO) mnum = (int)(max * OD_KEYCRATIO); + for(i = 0; i < mnum; i++){ + if((hnum = odsearchdnum(odeum, owords[i].word)) < 0) hnum = 0; + ival = odlogarithm(hnum); + ival = (ival * ival * ival) / 8.0; + if(ival < 8.0) ival = 8.0; + owords[i].num = (int)(owords[i].num / ival); + } + qsort(owords, mnum, sizeof(ODWORD), odwordcompare); + } + if(mnum > max) mnum = max; + kwmap = cbmapopenex(OD_MAPPBNUM); + for(i = 0; i < mnum; i++){ + nbsiz = sprintf(numbuf, "%d", owords[i].num); + cbmapput(kwmap, owords[i].word, -1, numbuf, nbsiz, TRUE); + } + free(owords); + cbmapclose(map); + return kwmap; +} + + +/* Break a text into words in appearance form. */ +CBLIST *odbreaktext(const char *text){ + const char *word; + CBLIST *elems, *words; + int i, j, dif, wsiz, pv, delim; + assert(text); + words = cblistopen(); + elems = cbsplit(text, -1, OD_SPACECHARS); + for(i = 0; i < cblistnum(elems); i++){ + word = cblistval(elems, i, &wsiz); + delim = FALSE; + j = 0; + pv = 0; + while(TRUE){ + dif = j - pv; + if(j >= wsiz){ + if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); + break; + } + if(delim){ + if(!strchr(OD_DELIMCHARS, word[j])){ + if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); + pv = j; + delim = FALSE; + } + } else { + if(strchr(OD_DELIMCHARS, word[j])){ + if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); + pv = j; + delim = TRUE; + } + } + j++; + } + } + cblistclose(elems); + return words; +} + + +/* Make the normalized form of a word. */ +char *odnormalizeword(const char *asis){ + char *nword; + int i; + assert(asis); + for(i = 0; asis[i] != '\0'; i++){ + if(!strchr(OD_DELIMCHARS, asis[i])) break; + } + if(asis[i] == '\0') return cbmemdup("", 0); + nword = cbmemdup(asis, -1); + for(i = 0; nword[i] != '\0'; i++){ + if(nword[i] >= 'A' && nword[i] <= 'Z') nword[i] += 'a' - 'A'; + } + while(i >= 0){ + if(strchr(OD_GLUECHARS, nword[i])){ + nword[i] = '\0'; + } else { + break; + } + i--; + } + return nword; +} + + +/* Get the common elements of two sets of documents. */ +ODPAIR *odpairsand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ + CBMAP *map; + ODPAIR *result; + const char *tmp; + int i, rnum; + assert(apairs && anum >= 0 && bpairs && bnum >= 0); + map = odpairsmap(bpairs, bnum); + result = cbmalloc(sizeof(ODPAIR) * anum + 1); + rnum = 0; + for(i = 0; i < anum; i++){ + if(!(tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL))) continue; + result[rnum].id = apairs[i].id; + result[rnum].score = apairs[i].score + *(int *)tmp; + rnum++; + } + cbmapclose(map); + qsort(result, rnum, sizeof(ODPAIR), odsortcompare); + *np = rnum; + return result; +} + + +/* Get the sum of elements of two sets of documents. */ +ODPAIR *odpairsor(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ + CBMAP *map; + ODPAIR *result; + const char *tmp; + int i, score, rnum; + assert(apairs && anum >= 0 && bpairs && bnum >= 0); + map = odpairsmap(bpairs, bnum); + for(i = 0; i < anum; i++){ + score = 0; + if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) + score = *(int *)tmp; + score += apairs[i].score; + cbmapput(map, (char *)&(apairs[i].id), sizeof(int), + (char *)&score, sizeof(int), TRUE); + } + rnum = cbmaprnum(map); + result = cbmalloc(rnum * sizeof(ODPAIR) + 1); + cbmapiterinit(map); + for(i = 0; (tmp = cbmapiternext(map, NULL)) != NULL; i++){ + result[i].id = *(int *)tmp; + result[i].score = *(int *)cbmapget(map, tmp, sizeof(int), NULL); + } + cbmapclose(map); + qsort(result, rnum, sizeof(ODPAIR), odsortcompare); + *np = rnum; + return result; +} + + +/* Get the difference set of documents. */ +ODPAIR *odpairsnotand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ + CBMAP *map; + ODPAIR *result; + const char *tmp; + int i, rnum; + assert(apairs && anum >= 0 && bpairs && bnum >= 0); + map = odpairsmap(bpairs, bnum); + result = cbmalloc(sizeof(ODPAIR) * anum + 1); + rnum = 0; + for(i = 0; i < anum; i++){ + if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) continue; + result[rnum].id = apairs[i].id; + result[rnum].score = apairs[i].score; + rnum++; + } + cbmapclose(map); + qsort(result, rnum, sizeof(ODPAIR), odsortcompare); + *np = rnum; + return result; +} + + +/* Sort a set of documents in descending order of scores. */ +void odpairssort(ODPAIR *pairs, int pnum){ + assert(pairs && pnum >= 0); + qsort(pairs, pnum, sizeof(ODPAIR), odsortcompare); +} + + +/* Get the natural logarithm of a number. */ +double odlogarithm(double x){ + int i; + if(x <= 1.0) return 0.0; + x = x * x * x * x * x * x * x * x * x * x; + for(i = 0; x > 1.0; i++){ + x /= 2.718281828459; + } + return (double)i / 10.0; +} + + +/* Get the cosine of the angle of two vectors. */ +double odvectorcosine(const int *avec, const int *bvec, int vnum){ + double rv; + assert(avec && bvec && vnum >= 0); + rv = odvecinnerproduct(avec, bvec, vnum) / + ((odvecabsolute(avec, vnum) * odvecabsolute(bvec, vnum))); + return rv > 0.0 ? rv : 0.0; +} + + +/* Set the global tuning parameters. */ +void odsettuning(int ibnum, int idnum, int cbnum, int csiz){ + if(ibnum > 0) odindexbnum = ibnum; + if(idnum > 0) odindexdnum = idnum; + if(cbnum > 0) odcachebnum = dpprimenum(cbnum); + if(csiz > 0) odcachesiz = csiz; +} + + +/* Break a text into words and store appearance forms and normalized form into lists. */ +void odanalyzetext(ODEUM *odeum, const char *text, CBLIST *awords, CBLIST *nwords){ + char aword[OD_MAXWORDLEN+1], *wp; + int lev, wsiz; + assert(odeum && text && awords); + lev = OD_EVSPACE; + wsiz = 0; + for(; *text != '\0'; text++){ + switch(odeum->statechars[*(unsigned char *)text]){ + case OD_EVWORD: + if(wsiz > 0 && lev == OD_EVDELIM){ + cblistpush(awords, aword, wsiz); + if(nwords) cblistpush(nwords, "", 0); + wsiz = 0; + } + if(wsiz <= OD_MAXWORDLEN){ + aword[wsiz++] = *text; + } + lev = OD_EVWORD; + break; + case OD_EVGLUE: + if(wsiz > 0 && lev == OD_EVDELIM){ + cblistpush(awords, aword, wsiz); + if(nwords) cblistpush(nwords, "", 0); + wsiz = 0; + } + if(wsiz <= OD_MAXWORDLEN){ + aword[wsiz++] = *text; + } + lev = OD_EVGLUE; + break; + case OD_EVDELIM: + if(wsiz > 0 && lev != OD_EVDELIM){ + cblistpush(awords, aword, wsiz); + if(nwords){ + wp = aword; + aword[wsiz] = '\0'; + while(*wp != '\0'){ + if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A'; + wp++; + } + wp--; + while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){ + wsiz--; + wp--; + } + cblistpush(nwords, aword, wsiz); + } + wsiz = 0; + } + if(wsiz <= OD_MAXWORDLEN){ + aword[wsiz++] = *text; + } + lev = OD_EVDELIM; + break; + default: + if(wsiz > 0){ + cblistpush(awords, aword, wsiz); + if(nwords){ + if(lev == OD_EVDELIM){ + cblistpush(nwords, "", 0); + } else { + wp = aword; + aword[wsiz] = '\0'; + while(*wp != '\0'){ + if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A'; + wp++; + } + wp--; + while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){ + wsiz--; + wp--; + } + cblistpush(nwords, aword, wsiz); + } + } + wsiz = 0; + } + lev = OD_EVSPACE; + break; + } + } + if(wsiz > 0){ + cblistpush(awords, aword, wsiz); + if(nwords){ + if(lev == OD_EVDELIM){ + cblistpush(nwords, "", 0); + } else { + wp = aword; + aword[wsiz] = '\0'; + while(*wp != '\0'){ + if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A'; + wp++; + } + wp--; + while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){ + wsiz--; + wp--; + } + cblistpush(nwords, aword, wsiz); + } + } + wsiz = 0; + } +} + + +/* Set the classes of characters used by `odanalyzetext'. */ +void odsetcharclass(ODEUM *odeum, const char *spacechars, const char *delimchars, + const char *gluechars){ + assert(odeum && spacechars && delimchars && gluechars); + memset(odeum->statechars, OD_EVWORD, sizeof(odeum->statechars)); + for(; *spacechars != '\0'; spacechars++){ + odeum->statechars[*(unsigned char *)spacechars] = OD_EVSPACE; + } + for(; *delimchars != '\0'; delimchars++){ + odeum->statechars[*(unsigned char *)delimchars] = OD_EVDELIM; + } + for(; *gluechars != '\0'; gluechars++){ + odeum->statechars[*(unsigned char *)gluechars] = OD_EVGLUE; + } +} + + +/* Query a database using a small boolean query language. */ +ODPAIR *odquery(ODEUM *odeum, const char *query, int *np, CBLIST *errors){ + CBLIST *tokens = cblistopen(); + CBLIST *nwords = cblistopen(); + ODPAIR *results = NULL; + assert(odeum && query && np); + odanalyzetext(odeum, query, tokens, nwords); + odcleannormalized(odeum, nwords); + odfixtokens(odeum, tokens); + results = odparseexpr(odeum, tokens, nwords, np, errors); + cblistclose(tokens); + cblistclose(nwords); + return results; +} + + + +/************************************************************************************************* + * features for experts + *************************************************************************************************/ + + +/* Get the internal database handle for documents. */ +CURIA *odidbdocs(ODEUM *odeum){ + assert(odeum); + return odeum->docsdb; +} + + +/* Get the internal database handle for the inverted index. */ +CURIA *odidbindex(ODEUM *odeum){ + assert(odeum); + return odeum->indexdb; +} + + +/* Get the internal database handle for the reverse dictionary. */ +VILLA *odidbrdocs(ODEUM *odeum){ + assert(odeum); + return odeum->rdocsdb; +} + + +/* Set the call back function called in merging. */ +void odsetotcb(void (*otcb)(const char *, ODEUM *, const char *)){ + odotcb = otcb; +} + + +/* Get the positive one of square roots of a number. */ +double odsquareroot(double x){ + double c, rv; + if(x <= 0.0) return 0.0; + c = x > 1.0 ? x : 1; + do { + rv = c; + c = (x / c + c) * 0.5; + } while(c < rv); + return rv; +} + + +/* Get the absolute of a vector. */ +double odvecabsolute(const int *vec, int vnum){ + double rv; + int i; + assert(vec && vnum >= 0); + rv = 0; + for(i = 0; i < vnum; i++){ + rv += (double)vec[i] * (double)vec[i]; + } + return odsquareroot(rv); +} + + +/* Get the inner product of two vectors. */ +double odvecinnerproduct(const int *avec, const int *bvec, int vnum){ + double rv; + int i; + assert(avec && bvec && vnum >= 0); + rv = 0; + for(i = 0; i < vnum; i++){ + rv += (double)avec[i] * (double)bvec[i]; + } + return rv; +} + + + +/************************************************************************************************* + * private objects + *************************************************************************************************/ + + +/* Get a database handle. + `name' specifies the name of a database directory. + `omode' specifies the connection mode. + `docsbnum` specifies the number of buckets of the document database. + `indexbnum` specifies the number of buckets of the index database. + `fname' specifies the name of caller function. + The return value is the database handle or `NULL' if it is not successful. */ +static ODEUM *odopendb(const char *name, int omode, int docsbnum, int indexbnum, + const char *fname){ + int cromode, vlomode, inode, dmax, dnum; + char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ], *tmp; + struct stat sbuf; + CURIA *docsdb, *indexdb; + VILLA *rdocsdb; + CBMAP *cachemap; + CBMAP *sortmap; + ODEUM *odeum; + assert(name); + if(strlen(name) > OD_NAMEMAX){ + dpecodeset(DP_EMISC, __FILE__, __LINE__); + return NULL; + } + cromode = CR_OREADER; + vlomode = VL_OREADER; + if(omode & OD_OWRITER){ + cromode = CR_OWRITER; + vlomode = VL_OWRITER | VL_OZCOMP | VL_OYCOMP; + if(omode & OD_OCREAT){ + cromode |= CR_OCREAT; + vlomode |= VL_OCREAT; + } + if(omode & OD_OTRUNC){ + cromode |= CR_OTRUNC; + vlomode |= VL_OTRUNC; + } + } + if(omode & OD_ONOLCK){ + cromode |= CR_ONOLCK; + vlomode |= VL_ONOLCK; + } + if(omode & OD_OLCKNB){ + cromode |= CR_OLCKNB; + vlomode |= VL_OLCKNB; + } + sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME); + sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME); + sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME); + docsdb = NULL; + indexdb = NULL; + rdocsdb = NULL; + if((omode & OD_OWRITER) && (omode & OD_OCREAT)){ + if(mkdir(name, OD_DIRMODE) == -1 && errno != EEXIST){ + dpecodeset(DP_EMKDIR, __FILE__, __LINE__); + return NULL; + } + } + if(lstat(name, &sbuf) == -1){ + dpecodeset(DP_ESTAT, __FILE__, __LINE__); + return NULL; + } + inode = sbuf.st_ino; + if(!(docsdb = cropen(docsname, cromode, docsbnum, OD_DOCSDNUM))) return NULL; + if(!(indexdb = cropen(indexname, cromode, indexbnum, odindexdnum))){ + crclose(docsdb); + return NULL; + } + if(omode & OD_OWRITER){ + if(!crsetalign(docsdb, OD_DOCSALIGN) || !crsetfbpsiz(docsdb, OD_DOCSFBP) || + !crsetalign(indexdb, OD_INDEXALIGN) || !crsetfbpsiz(indexdb, OD_INDEXFBP)){ + crclose(indexdb); + crclose(docsdb); + return NULL; + } + } + if(!(rdocsdb = vlopen(rdocsname, vlomode, VL_CMPLEX))){ + crclose(indexdb); + crclose(docsdb); + return NULL; + } + vlsettuning(rdocsdb, OD_RDOCSLRM, OD_RDOCSNIM, OD_RDOCSLCN, OD_RDOCSNCN); + if(omode & OD_OWRITER){ + cachemap = cbmapopenex(odcachebnum); + sortmap = cbmapopenex(odcachebnum); + } else { + cachemap = NULL; + sortmap = NULL; + } + if(vlrnum(rdocsdb) > 0){ + dmax = -1; + dnum = -1; + if((tmp = vlget(rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), NULL)) != NULL){ + dmax = atoi(tmp); + free(tmp); + } + if((tmp = vlget(rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), NULL)) != NULL){ + dnum = atoi(tmp); + free(tmp); + } + if(dmax < 0 || dnum < 0){ + if(sortmap) cbmapclose(sortmap); + if(cachemap) cbmapclose(cachemap); + vlclose(rdocsdb); + crclose(indexdb); + crclose(docsdb); + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + return NULL; + } + } else { + dmax = 0; + dnum = 0; + } + odeum = cbmalloc(sizeof(ODEUM)); + odeum->name = cbmemdup(name, -1); + odeum->wmode = omode & OD_OWRITER; + odeum->fatal = FALSE; + odeum->inode = inode; + odeum->docsdb = docsdb; + odeum->indexdb = indexdb; + odeum->rdocsdb = rdocsdb; + odeum->cachemap = cachemap; + odeum->cacheasiz = 0; + odeum->sortmap = sortmap; + odeum->dmax = dmax; + odeum->dnum = dnum; + odeum->ldid = -1; + odsetcharclass(odeum, OD_SPACECHARS, OD_DELIMCHARS, OD_GLUECHARS); + if(odotcb) odotcb(fname, odeum, "the connection was established"); + return odeum; +} + + +/* Flush the cache for dirty buffer of words. + `odeum' specifies a database handle. + `fname' specifies the name of caller function. + If successful, the return value is true, else, it is false. */ +static int odcacheflush(ODEUM *odeum, const char *fname){ + const char *kbuf, *vbuf; + char otmsg[OD_OTCBBUFSIZ]; + int i, rnum, ksiz, vsiz; + assert(odeum); + if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE; + if(odotcb) odotcb(fname, odeum, "flushing caches"); + cbmapiterinit(odeum->cachemap); + for(i = 0; (kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL; i++){ + vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz); + if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){ + odeum->fatal = TRUE; + return FALSE; + } + if(odotcb && (i + 1) % OD_OTPERWORDS == 0){ + sprintf(otmsg, "... (%d/%d)", i + 1, rnum); + odotcb(fname, odeum, otmsg); + } + } + cbmapclose(odeum->cachemap); + odeum->cachemap = cbmapopenex(odcachebnum); + odeum->cacheasiz = 0; + return TRUE; +} + + +/* Flush all frequent words in the cache for dirty buffer of words. + `odeum' specifies a database handle. + `fname' specifies the name of caller function. + `min' specifies the minimum size of frequent words. + If successful, the return value is true, else, it is false. */ +static int odcacheflushfreq(ODEUM *odeum, const char *fname, int min){ + const char *kbuf, *vbuf; + char otmsg[OD_OTCBBUFSIZ]; + int rnum, ksiz, vsiz; + assert(odeum); + if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE; + if(odotcb){ + sprintf(otmsg, "flushing frequent words: min=%d asiz=%d rnum=%d)", + min, odeum->cacheasiz, rnum); + odotcb(fname, odeum, otmsg); + } + cbmapiterinit(odeum->cachemap); + while((kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL){ + vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz); + if(vsiz >= sizeof(ODPAIR) * min){ + if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){ + odeum->fatal = TRUE; + return FALSE; + } + cbmapout(odeum->cachemap, kbuf, ksiz); + odeum->cacheasiz -= vsiz; + } + } + if(odotcb){ + sprintf(otmsg, "... (done): min=%d asiz=%d rnum=%d)", + min, odeum->cacheasiz, cbmaprnum(odeum->cachemap)); + odotcb(fname, odeum, otmsg); + } + return TRUE; +} + + +/* Flush the half of rare words in the cache for dirty buffer of words. + `odeum' specifies a database handle. + `fname' specifies the name of caller function. + `ratio' specifies the ratio of rare words. + If successful, the return value is true, else, it is false. */ +static int odcacheflushrare(ODEUM *odeum, const char *fname, double ratio){ + const char *kbuf, *vbuf; + char otmsg[OD_OTCBBUFSIZ]; + int i, rnum, limit, ksiz, vsiz; + assert(odeum); + if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE; + if(odotcb){ + sprintf(otmsg, "flushing rare words: ratio=%.2f asiz=%d rnum=%d)", + ratio, odeum->cacheasiz, rnum); + odotcb(fname, odeum, otmsg); + } + cbmapiterinit(odeum->cachemap); + limit = (int)(rnum * ratio); + for(i = 0; i < limit && (kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL; i++){ + vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz); + if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){ + odeum->fatal = TRUE; + return FALSE; + } + cbmapout(odeum->cachemap, kbuf, ksiz); + odeum->cacheasiz -= vsiz; + } + if(odotcb){ + sprintf(otmsg, "... (done): ratio=%.2f asiz=%d rnum=%d)", + ratio, odeum->cacheasiz, cbmaprnum(odeum->cachemap)); + odotcb(fname, odeum, otmsg); + } + return TRUE; +} + + +/* Sort the records of inverted index. + `odeum' specifies a database handle. + `fname' specifies the name of caller function. + If successful, the return value is true, else, it is false. */ +static int odsortindex(ODEUM *odeum, const char *fname){ + const char *word; + char *tmp, otmsg[OD_OTCBBUFSIZ]; + int i, rnum, wsiz, tsiz; + ODPAIR *pairs; + assert(odeum); + if((rnum = cbmaprnum(odeum->sortmap)) < 1) return TRUE; + if(odotcb) odotcb(fname, odeum, "sorting the inverted index"); + cbmapiterinit(odeum->sortmap); + for(i = 0; (word = cbmapiternext(odeum->sortmap, &wsiz)) != NULL; i++){ + if((tmp = crget(odeum->indexdb, word, wsiz, 0, -1, &tsiz)) != NULL){ + if(tsiz > sizeof(ODPAIR)){ + pairs = (ODPAIR *)tmp; + qsort(pairs, tsiz / sizeof(ODPAIR), sizeof(ODPAIR), odsortcompare); + if(!crput(odeum->indexdb, word, wsiz, tmp, tsiz, CR_DOVER)){ + free(tmp); + return FALSE; + } + } + free(tmp); + } else if(dpecode != DP_ENOITEM){ + return FALSE; + } + if(odotcb && (i + 1) % OD_OTPERWORDS == 0){ + sprintf(otmsg, "... (%d/%d)", i + 1, rnum); + odotcb(fname, odeum, otmsg); + } + } + cbmapclose(odeum->sortmap); + odeum->sortmap = cbmapopenex(odcachebnum); + return TRUE; +} + + +/* Compare two pairs of structures of a search result. + `a' specifies the pointer to the region of one pair. + `b' specifies the pointer to the region of the other pair. + The return value is positive if the former is big, negative if the latter is big, 0 if both + are equivalent. */ +static int odsortcompare(const void *a, const void *b){ + ODPAIR *ap, *bp; + int rv; + assert(a && b); + ap = (ODPAIR *)a; + bp = (ODPAIR *)b; + rv = bp->score - ap->score; + if(rv != 0) return rv; + return ap->id - bp->id; +} + + +/* Purge the elements of the deleted documents from the inverted index. + `odeum' specifies a database handle. + `fname' specifies the name of caller function. + If successful, the return value is true, else, it is false. */ +static int odpurgeindex(ODEUM *odeum, const char *fname){ + ODPAIR *pairs; + char *kbuf, *vbuf, otmsg[OD_OTCBBUFSIZ]; + int i, rnum, tnum, ksiz, vsiz, pnum, wi; + assert(odeum); + if((rnum = crrnum(odeum->indexdb)) < 1) return TRUE; + if(odotcb) odotcb(fname, odeum, "purging dispensable regions"); + if(!criterinit(odeum->indexdb)) return FALSE; + tnum = 0; + while(TRUE){ + if(!(kbuf = criternext(odeum->indexdb, &ksiz))){ + if(dpecode != DP_ENOITEM) return FALSE; + break; + } + if(!(vbuf = crget(odeum->indexdb, kbuf, ksiz, 0, -1, &vsiz))){ + dpecodeset(DP_EBROKEN, __FILE__, __LINE__); + free(kbuf); + return FALSE; + } + pairs = (ODPAIR *)vbuf; + pnum = vsiz / sizeof(ODPAIR); + wi = 0; + for(i = 0; i < pnum; i++){ + if(crvsiz(odeum->docsdb, (char *)&(pairs[i].id), sizeof(int)) != -1){ + pairs[wi++] = pairs[i]; + } + } + if(wi > 0){ + if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, wi * sizeof(ODPAIR), CR_DOVER)){ + free(vbuf); + free(kbuf); + return FALSE; + } + } else { + if(!crout(odeum->indexdb, kbuf, ksiz)){ + free(vbuf); + free(kbuf); + return FALSE; + } + } + free(vbuf); + free(kbuf); + if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){ + sprintf(otmsg, "... (%d/%d)", tnum + 1, rnum); + odotcb(fname, odeum, otmsg); + } + tnum++; + } + return TRUE; +} + + +/* Create a map of a document array. + `pairs' specifies the pointer to a document array. + `num' specifies the number of elements of the array. + The return value is a map of the document array. */ +static CBMAP *odpairsmap(const ODPAIR *pairs, int num){ + CBMAP *map; + int i; + assert(pairs && num >= 0); + map = cbmapopen(); + for(i = 0; i < num; i++){ + cbmapput(map, (char *)&(pairs[i].id), sizeof(int), + (char *)&(pairs[i].score), sizeof(int), TRUE); + } + return map; +} + + +/* compare two pairs of structures of words in a document. + `a' specifies the pointer to the region of one word. + `b' specifies the pointer to the region of the other word. + The return value is positive if the former is big, negative if the latter is big, 0 if both + are equivalent. */ +static int odwordcompare(const void *a, const void *b){ + ODWORD *ap, *bp; + int rv; + assert(a && b); + ap = (ODWORD *)a; + bp = (ODWORD *)b; + if((rv = bp->num - ap->num) != 0) return rv; + if((rv = strlen(bp->word) - strlen(ap->word)) != 0) return rv; + return strcmp(ap->word, bp->word); +} + + +/* Match an operator without taking it off the token list. + `odeum' specifies a database handle. + `tokens' specifies a list handle of tokens. + The return value is whether the next token is an operator. */ +static int odmatchoperator(ODEUM *odeum, CBLIST *tokens){ + const char *tk = NULL; + int tk_len = 0; + tk = cblistval(tokens, 0, &tk_len); + if(tk && (tk[0] == '&' || tk[0] == '|' || tk[0] == '!')) return 1; + return 0; +} + + +/* Implements the subexpr part of the grammar. + `odeum' specifies a database handle. + `tokens' specifies a list handle of tokens. + `nwords' specifies a list handle of normalized words. + `np' specifies the pointer to a variable to which the number of the elements of the return + value is assigned. + `errors' specifies a list handle into which error messages are stored. + The return value is the pointer to an array of document IDs. */ +static ODPAIR *odparsesubexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np, + CBLIST *errors){ + char *tk = NULL; + int tk_len = 0; + char *nword = NULL; /* used to do the actual search, should match with tokens */ + ODPAIR *result = NULL; + int result_num = 0; + int i; + double ival; + if((tk = cblistshift(tokens, &tk_len)) != NULL){ + assert(tk != NULL); + if(tk[0] == '('){ + free(tk); + /* recurse into expr */ + result = odparseexpr(odeum, tokens, nwords, &result_num, errors); + /* match right token RPAREN */ + tk = cblistshift(tokens, &tk_len); + /* print an error if either we didn't get anything or we didn't get a ) */ + if(tk == NULL){ + if(errors) cblistpush(errors, "Expression ended without closing ')'", -1); + } else if(tk[0] != ')'){ + if(errors) cblistpush(errors, "Un-balanced parenthesis.", -1); + } + } else if(odeum->statechars[*(unsigned char *)tk] == 0){ + /* Perform odsearch with the next norm word that isn't an operator. */ + nword = cblistshift(nwords, NULL); + assert(nword != NULL); + if((result = odsearch(odeum, nword, -1, &result_num)) != NULL){ + /* TF-IDF tuning */ + ival = odlogarithm(result_num); + ival = (ival * ival) / 4.0; + if(ival < 4.0) ival = 4.0; + for(i = 0; i < result_num; i++){ + result[i].score = (int)(result[i].score / ival); + } + } + free(nword); + } else { + if(errors) cblistpush(errors, "Invalid sub-expression. Expected '(' or WORD.", -1); + result = cbmalloc(1); + result_num = 0; + } + /* done with the token */ + free(tk); + } + *np = result_num; + return result; +} + + +/* Implements the actual recursive decent parser for the mini query language. + `odeum' specifies a database handle. + `tokens' specifies a list handle of tokens. + `nwords' specifies a list handle of normalized words. + `np' specifies the pointer to a variable to which the number of the elements of the return + value is assigned. + `errors' specifies a list handle into which error messages are stored. + The return value is the pointer to an array of document IDs. + It simply parses an initial subexpr, and then loops over as many (operator subexpr) + sequences as it can find. The odmatchoperator function handles injecting a default & + between consecutive words. */ +static ODPAIR *odparseexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np, + CBLIST *errors){ + ODPAIR *left = NULL; + ODPAIR *right = NULL; + ODPAIR *temp = NULL; + int left_num = 0; + int right_num = 0; + int temp_num = 0; + char *op = NULL; + int op_len = 0; + if(!(left = odparsesubexpr(odeum, tokens, nwords, &left_num, errors))) return NULL; + /* expr ::= subexpr ( op subexpr )* */ + while(odmatchoperator(odeum, tokens)){ + op = cblistshift(tokens, &op_len); + if(!(right = odparsesubexpr(odeum, tokens, nwords, &right_num, errors))){ + free(op); + free(left); + return NULL; + } + switch(op[0]){ + case '&': + temp = odpairsand(left, left_num, right, right_num, &temp_num); + break; + case '|': + temp = odpairsor(left, left_num, right, right_num, &temp_num); + break; + case '!': + temp = odpairsnotand(left, left_num, right, right_num, &temp_num); + break; + default: + if(errors) cblistpush(errors, "Invalid operator. Expected '&', '|', or '!'.", -1); + break; + } + if(temp){ + /* an operator was done so we must swap it with the left */ + free(left); left = NULL; + left = temp; + left_num = temp_num; + } + free(op); + if(right) free(right); + } + *np = left_num; + return left; +} + + +/* Processes the tokens in order to break them up further. + `odeum' specifies a database handle. + `tokens' specifies a list handle of tokens. */ +static void odfixtokens(ODEUM *odeum, CBLIST *tokens){ + const char *tk = NULL; + int tk_len = 0; + int i = 0; + int lastword = 0; + for(i = 0; i < cblistnum(tokens); i++){ + tk = cblistval(tokens, i, &tk_len); + assert(tk); + if(tk[0] == '&' || tk[0] == '|' || tk[0] == '!' || tk[0] == '(' || tk[0] == ')'){ + lastword = 0; + if(tk_len > 1){ + /* need to break it up for the next loop around */ + tk = cblistremove(tokens, i, &tk_len); + cblistinsert(tokens, i, tk, 1); + cblistinsert(tokens, i+1, tk+1, tk_len-1); + free((char *)tk); + } + } else if(odeum->statechars[*(unsigned char *)tk] == 0){ + /* if the last one was a word and this is a word then we need a default & between them */ + if(lastword){ + cblistinsert(tokens, i, "&", 1); + i++; + } + lastword = 1; + } + } +} + + +/* Cleans out the parts of the normalized word list that are not considered words. + `odeum' specifies a database handle. + `tokens' specifies a list handle of tokens. */ +static void odcleannormalized(ODEUM *odeum, CBLIST *nwords){ + char *tk = NULL; + int tk_len = 0; + int i = 0; + for(i = 0; i < cblistnum(nwords); i++){ + tk = (char *)cblistval(nwords, i, &tk_len); + if(tk_len == 0 || (!odeum->statechars[*(unsigned char *)tk] == 0)){ + /* not a word so delete it */ + tk = cblistremove(nwords, i, &tk_len); + free(tk); + i--; + } + } +} + + + +/* END OF FILE */ |