diff options
Diffstat (limited to 'qdbm/odeum.c')
-rw-r--r-- | qdbm/odeum.c | 2090 |
1 files changed, 0 insertions, 2090 deletions
diff --git a/qdbm/odeum.c b/qdbm/odeum.c deleted file mode 100644 index 15395224..00000000 --- a/qdbm/odeum.c +++ /dev/null @@ -1,2090 +0,0 @@ -/************************************************************************************************* - * Implementation of Odeum - * Copyright (C) 2000-2007 Mikio Hirabayashi - * This file is part of QDBM, Quick Database Manager. - * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU - * Lesser General Public License as published by the Free Software Foundation; either version - * 2.1 of the License or any later version. QDBM is distributed in the hope that it will be - * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more - * details. - * You should have received a copy of the GNU Lesser General Public License along with QDBM; if - * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - * 02111-1307 USA. 
- *************************************************************************************************/ - - -#define QDBM_INTERNAL 1 - -#include "odeum.h" -#include "myconf.h" - -#define OD_NAMEMAX 256 /* max size of a database name */ -#define OD_DIRMODE 00755 /* permission of a creating directory */ -#define OD_PATHBUFSIZ 1024 /* size of a path buffer */ -#define OD_NUMBUFSIZ 32 /* size of a buffer for a number */ -#define OD_MAPPBNUM 127 /* bucket size of a petit map handle */ -#define OD_DOCSNAME "docs" /* name of the database for documents */ -#define OD_INDEXNAME "index" /* name of the database for inverted index */ -#define OD_RDOCSNAME "rdocs" /* name of the database for reverse dictionary */ -#define OD_DOCSBNUM 2039 /* initial bucket number of document database */ -#define OD_DOCSDNUM 17 /* division number of document database */ -#define OD_DOCSALIGN -4 /* alignment of document database */ -#define OD_DOCSFBP 32 /* size of free block pool of document database */ -#define OD_INDEXBNUM 32749 /* initial bucket number of inverted index */ -#define OD_INDEXDNUM 7 /* division number of inverted index */ -#define OD_INDEXALIGN -2 /* alignment of inverted index */ -#define OD_INDEXFBP 32 /* size of free block pool of inverted index */ -#define OD_RDOCSLRM 81 /* records in a leaf node of reverse dictionary */ -#define OD_RDOCSNIM 192 /* records in a non-leaf node of reverse dictionary */ -#define OD_RDOCSLCN 128 /* number of leaf cache of reverse dictionary */ -#define OD_RDOCSNCN 32 /* number of non-leaf cache of reverse dictionary */ -#define OD_CACHEBNUM 262139 /* number of buckets for dirty buffers */ -#define OD_CACHESIZ 8388608 /* max bytes to use memory for dirty buffers */ -#define OD_CFLIVERAT 0.8 /* ratio of usable cache region */ -#define OD_CFBEGSIZ 2048 /* beginning size of flushing frequent words */ -#define OD_CFENDSIZ 64 /* lower limit of flushing frequent words */ -#define OD_CFRFRAT 0.2 /* ratio of flushing rare words a time */ -#define 
OD_OTCBBUFSIZ 1024 /* size of a buffer for call back functions */ -#define OD_OTPERWORDS 10000 /* frequency of call back in merging index */ -#define OD_OTPERDOCS 1000 /* frequency of call back in merging docs */ -#define OD_MDBRATIO 2.5 /* ratio of bucket number and document number */ -#define OD_MIBRATIO 1.5 /* ratio of bucket number and word number */ -#define OD_MIARATIO 0.75 /* ratio of alignment to the first words */ -#define OD_MIWUNIT 32 /* writing unit of merging inverted index */ -#define OD_DMAXEXPR "dmax" /* key of max number of the document ID */ -#define OD_DNUMEXPR "dnum" /* key of number of the documents */ -#define OD_URIEXPR "1" /* map key of URI */ -#define OD_ATTRSEXPR "2" /* map key of attributes */ -#define OD_NWORDSEXPR "3" /* map key of normal words */ -#define OD_AWORDSEXPR "4" /* map key of as-is words */ -#define OD_WTOPRATE 0.1 /* ratio of top words */ -#define OD_WTOPBONUS 5000 /* bonus points of top words */ -#define OD_KEYCRATIO 1.75 /* ratio of number to max of keyword candidates */ -#define OD_WOCCRPOINT 10000 /* points per occurence */ -#define OD_SPACECHARS "\t\n\v\f\r " /* space characters */ -#define OD_DELIMCHARS "!\"#$%&'()*/<=>?[\\]^`{|}~" /* delimiter characters */ -#define OD_GLUECHARS "+,-.:;@" /* glueing characters */ -#define OD_MAXWORDLEN 48 /* max length of a word */ - -typedef struct { /* type of structure for word counting */ - const char *word; /* pointer to the word */ - int num; /* frequency of the word */ -} ODWORD; - -enum { /* enumeration for events binded to each character */ - OD_EVWORD, /* word */ - OD_EVSPACE, /* space */ - OD_EVDELIM, /* delimiter */ - OD_EVGLUE /* glue */ -}; - - -/* private global variables */ -int odindexbnum = OD_INDEXBNUM; -int odindexdnum = OD_INDEXDNUM; -int odcachebnum = OD_CACHEBNUM; -int odcachesiz = OD_CACHESIZ; -void (*odotcb)(const char *, ODEUM *, const char *) = NULL; - - -/* private function prototypes */ -static ODEUM *odopendb(const char *name, int omode, int docsbnum, 
int indexbnum, - const char *fname); -static int odcacheflush(ODEUM *odeum, const char *fname); -static int odcacheflushfreq(ODEUM *odeum, const char *fname, int min); -static int odcacheflushrare(ODEUM *odeum, const char *fname, double ratio); -static int odsortindex(ODEUM *odeum, const char *fname); -static int odsortcompare(const void *a, const void *b); -static int odpurgeindex(ODEUM *odeum, const char *fname); -static CBMAP *odpairsmap(const ODPAIR *pairs, int num); -static int odwordcompare(const void *a, const void *b); -static int odmatchoperator(ODEUM *odeum, CBLIST *tokens); -static ODPAIR *odparsesubexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np, - CBLIST *errors); -static ODPAIR *odparseexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np, - CBLIST *errors); -static void odfixtokens(ODEUM *odeum, CBLIST *tokens); -static void odcleannormalized(ODEUM *odeum, CBLIST *nwords); - - - -/************************************************************************************************* - * public objects - *************************************************************************************************/ - - -/* Get a database handle. */ -ODEUM *odopen(const char *name, int omode){ - assert(name); - return odopendb(name, omode, OD_DOCSBNUM, odindexbnum, "odopen"); -} - - -/* Close a database handle. 
*/ -int odclose(ODEUM *odeum){ - char numbuf[OD_NUMBUFSIZ]; - int err; - assert(odeum); - err = FALSE; - if(odotcb) odotcb("odclose", odeum, "closing the connection"); - if(odeum->wmode){ - if(odotcb) odotcb("odclose", odeum, "writing meta information"); - sprintf(numbuf, "%d", odeum->dmax); - if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)) err = TRUE; - sprintf(numbuf, "%d", odeum->dnum); - if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)) err = TRUE; - if(!odcacheflushfreq(odeum, "odclose", OD_CFENDSIZ)) err = TRUE; - if(!odcacheflushrare(odeum, "odclose", OD_CFRFRAT)) err = TRUE; - if(!odcacheflush(odeum, "odclose")) err = TRUE; - if(!odsortindex(odeum, "odclose")) err = TRUE; - cbmapclose(odeum->cachemap); - cbmapclose(odeum->sortmap); - } - if(!vlclose(odeum->rdocsdb)) err = TRUE; - if(!crclose(odeum->indexdb)) err = TRUE; - if(!crclose(odeum->docsdb)) err = TRUE; - free(odeum->name); - free(odeum); - return err ? FALSE : TRUE; -} - - -/* Store a document. 
*/ -int odput(ODEUM *odeum, ODDOC *doc, int wmax, int over){ - char *tmp, *zbuf; - const char *word, *ctmp; - int i, docid, tsiz, wsiz, wnum, tmax, num, zsiz; - double ival; - ODPAIR pair; - CBMAP *map; - CBLIST *tlist; - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return FALSE; - } - if(!odeum->wmode){ - dpecodeset(DP_EMODE, __FILE__, __LINE__); - return FALSE; - } - if((tmp = vlget(odeum->rdocsdb, doc->uri, -1, &tsiz)) != NULL){ - if(!over){ - free(tmp); - dpecodeset(DP_EKEEP, __FILE__, __LINE__); - return FALSE; - } - if(tsiz != sizeof(int) || !odoutbyid(odeum, *(int *)tmp)){ - free(tmp); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return FALSE; - } - free(tmp); - } - odeum->dmax++; - odeum->dnum++; - docid = odeum->dmax; - map = cbmapopen(); - cbmapput(map, OD_URIEXPR, sizeof(OD_URIEXPR), doc->uri, -1, TRUE); - tmp = cbmapdump(doc->attrs, &tsiz); - cbmapput(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), tmp, tsiz, TRUE); - free(tmp); - if(wmax < 0 || wmax > cblistnum(doc->nwords)) wmax = cblistnum(doc->nwords); - tlist = cblistopen(); - for(i = 0; i < wmax; i++){ - ctmp = cblistval(doc->nwords, i, &wsiz); - cblistpush(tlist, ctmp, wsiz); - } - tmp = cblistdump(tlist, &tsiz); - cbmapput(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), tmp, tsiz, TRUE); - free(tmp); - cblistclose(tlist); - tlist = cblistopen(); - for(i = 0; i < wmax; i++){ - ctmp = cblistval(doc->awords, i, &wsiz); - if(strcmp(ctmp, cblistval(doc->nwords, i, NULL))){ - cblistpush(tlist, ctmp, wsiz); - } else { - cblistpush(tlist, "\0", 1); - } - } - tmp = cblistdump(tlist, &tsiz); - cbmapput(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), tmp, tsiz, TRUE); - free(tmp); - cblistclose(tlist); - tmp = cbmapdump(map, &tsiz); - cbmapclose(map); - if(_qdbm_deflate){ - if(!(zbuf = _qdbm_deflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){ - free(tmp); - dpecodeset(DP_EMISC, __FILE__, __LINE__); - odeum->fatal = TRUE; - return FALSE; - } - free(tmp); - tmp = zbuf; - 
tsiz = zsiz; - } - if(!crput(odeum->docsdb, (char *)&docid, sizeof(int), tmp, tsiz, CR_DKEEP)){ - free(tmp); - if(dpecode == DP_EKEEP) dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return FALSE; - } - free(tmp); - if(!vlput(odeum->rdocsdb, doc->uri, -1, (char *)&docid, sizeof(int), VL_DOVER)){ - odeum->fatal = TRUE; - return FALSE; - } - map = cbmapopen(); - wnum = cblistnum(doc->nwords); - tmax = (int)(wnum * OD_WTOPRATE); - for(i = 0; i < wnum; i++){ - word = cblistval(doc->nwords, i, &wsiz); - if(wsiz < 1) continue; - if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){ - num = *(int *)ctmp + OD_WOCCRPOINT; - } else { - num = i <= tmax ? OD_WTOPBONUS + OD_WOCCRPOINT : OD_WOCCRPOINT; - } - cbmapput(map, word, wsiz, (char *)&num, sizeof(int), TRUE); - } - ival = odlogarithm(wnum); - ival = (ival * ival * ival) / 8.0; - if(ival < 8.0) ival = 8.0; - cbmapiterinit(map); - while((word = cbmapiternext(map, &wsiz)) != NULL){ - pair.id = docid; - pair.score = (int)(*(int *)cbmapget(map, word, wsiz, NULL) / ival); - cbmapputcat(odeum->cachemap, word, wsiz, (char *)&pair, sizeof(pair)); - cbmapmove(odeum->cachemap, word, wsiz, FALSE); - odeum->cacheasiz += sizeof(pair); - cbmapput(odeum->sortmap, word, wsiz, "", 0, FALSE); - } - cbmapclose(map); - if(odeum->cacheasiz > odcachesiz){ - for(i = OD_CFBEGSIZ; odeum->cacheasiz > odcachesiz * OD_CFLIVERAT && i >= OD_CFENDSIZ; - i /= 2){ - if(!odcacheflushfreq(odeum, "odput", i)) return FALSE; - } - while(odeum->cacheasiz > odcachesiz * OD_CFLIVERAT){ - if(!odcacheflushrare(odeum, "odput", OD_CFRFRAT)) return FALSE; - } - } - doc->id = docid; - odeum->ldid = docid; - return TRUE; -} - - -/* Delete a document by a URL. 
*/ -int odout(ODEUM *odeum, const char *uri){ - char *tmp; - int tsiz, docid; - assert(odeum && uri); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return FALSE; - } - if(!odeum->wmode){ - dpecodeset(DP_EMODE, __FILE__, __LINE__); - return FALSE; - } - if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){ - if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; - return FALSE; - } - if(tsiz != sizeof(int)){ - free(tmp); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return FALSE; - } - docid = *(int *)tmp; - free(tmp); - return odoutbyid(odeum, docid); -} - - -/* Delete a document specified by an ID number. */ -int odoutbyid(ODEUM *odeum, int id){ - char *tmp, *zbuf; - const char *uritmp; - int tsiz, uritsiz, zsiz; - CBMAP *map; - assert(odeum && id > 0); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return FALSE; - } - if(!odeum->wmode){ - dpecodeset(DP_EMODE, __FILE__, __LINE__); - return FALSE; - } - if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){ - if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; - return FALSE; - } - if(_qdbm_inflate){ - if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){ - free(tmp); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return FALSE; - } - free(tmp); - tmp = zbuf; - tsiz = zsiz; - } - map = cbmapload(tmp, tsiz); - free(tmp); - uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz); - if(!uritmp || !vlout(odeum->rdocsdb, uritmp, uritsiz)){ - cbmapclose(map); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return FALSE; - } - cbmapclose(map); - if(!crout(odeum->docsdb, (char *)&id, sizeof(int))){ - odeum->fatal = TRUE; - return FALSE; - } - odeum->dnum--; - return TRUE; -} - - -/* Retrieve a document by a URI. 
*/ -ODDOC *odget(ODEUM *odeum, const char *uri){ - char *tmp; - int tsiz, docid; - assert(odeum && uri); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return NULL; - } - if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){ - if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; - return NULL; - } - if(tsiz != sizeof(int)){ - free(tmp); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return NULL; - } - docid = *(int *)tmp; - free(tmp); - return odgetbyid(odeum, docid); -} - - -/* Retrieve a document by an ID number. */ -ODDOC *odgetbyid(ODEUM *odeum, int id){ - char *tmp, *zbuf; - const char *uritmp, *attrstmp, *nwordstmp, *awordstmp, *asis, *normal; - int i, tsiz, uritsiz, attrstsiz, nwordstsiz, awordstsiz, zsiz, asiz, nsiz; - ODDOC *doc; - CBMAP *map; - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return NULL; - } - if(id < 1){ - dpecodeset(DP_ENOITEM, __FILE__, __LINE__); - return NULL; - } - if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){ - if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; - return NULL; - } - if(_qdbm_inflate){ - if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){ - free(tmp); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return NULL; - } - free(tmp); - tmp = zbuf; - tsiz = zsiz; - } - map = cbmapload(tmp, tsiz); - free(tmp); - uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz); - attrstmp = cbmapget(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), &attrstsiz); - nwordstmp = cbmapget(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), &nwordstsiz); - awordstmp = cbmapget(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), &awordstsiz); - if(!uritmp || !attrstmp || !nwordstmp || !awordstmp){ - cbmapclose(map); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return NULL; - } - doc = cbmalloc(sizeof(ODDOC)); - doc->id = id; - doc->uri = cbmemdup(uritmp, uritsiz); - doc->attrs = cbmapload(attrstmp, attrstsiz); 
- doc->nwords = cblistload(nwordstmp, nwordstsiz); - doc->awords = cblistload(awordstmp, awordstsiz); - cbmapclose(map); - for(i = 0; i < cblistnum(doc->awords); i++){ - asis = cblistval(doc->awords, i, &asiz); - if(asiz == 1 && asis[0] == '\0'){ - normal = cblistval(doc->nwords, i, &nsiz); - cblistover(doc->awords, i, normal, nsiz); - } - } - return doc; -} - - -/* Retrieve the ID of the document specified by a URI. */ -int odgetidbyuri(ODEUM *odeum, const char *uri){ - char *tmp; - int tsiz, docid; - assert(odeum && uri); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return -1; - } - if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){ - if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; - return -1; - } - if(tsiz != sizeof(int)){ - free(tmp); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return -1; - } - docid = *(int *)tmp; - free(tmp); - return docid; -} - - -/* Check whether the document specified by an ID number exists. */ -int odcheck(ODEUM *odeum, int id){ - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return FALSE; - } - if(id < 1){ - dpecodeset(DP_ENOITEM, __FILE__, __LINE__); - return FALSE; - } - return crvsiz(odeum->docsdb, (char *)&id, sizeof(int)) != -1; -} - - -/* Search the inverted index for documents including a word. */ -ODPAIR *odsearch(ODEUM *odeum, const char *word, int max, int *np){ - char *tmp; - int tsiz; - assert(odeum && word && np); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return NULL; - } - if(odeum->wmode && cbmaprnum(odeum->sortmap) > 0 && - (!odcacheflush(odeum, "odsearch") || !odsortindex(odeum, "odsearch"))){ - odeum->fatal = TRUE; - return NULL; - } - max = max < 0 ? 
-1 : max * sizeof(ODPAIR); - if(!(tmp = crget(odeum->indexdb, word, -1, 0, max, &tsiz))){ - if(dpecode != DP_ENOITEM){ - odeum->fatal = TRUE; - return NULL; - } - *np = 0; - return cbmalloc(1); - } - *np = tsiz / sizeof(ODPAIR); - return (ODPAIR *)tmp; -} - - -/* Get the number of documents including a word. */ -int odsearchdnum(ODEUM *odeum, const char *word){ - int rv; - assert(odeum && word); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return -1; - } - rv = crvsiz(odeum->indexdb, word, -1); - return rv < 0 ? -1 : rv / sizeof(ODPAIR); -} - - -/* Initialize the iterator of a database handle. */ -int oditerinit(ODEUM *odeum){ - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return FALSE; - } - return criterinit(odeum->docsdb); -} - - -/* Get the next key of the iterator. */ -ODDOC *oditernext(ODEUM *odeum){ - char *tmp; - int tsiz, docsid; - ODDOC *doc; - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return NULL; - } - doc = NULL; - while(TRUE){ - if(!(tmp = criternext(odeum->docsdb, &tsiz))){ - if(dpecode != DP_ENOITEM) odeum->fatal = TRUE; - return NULL; - } - if(tsiz != sizeof(int)){ - free(tmp); - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - odeum->fatal = TRUE; - return NULL; - } - docsid = *(int *)tmp; - free(tmp); - if((doc = odgetbyid(odeum, docsid)) != NULL) break; - if(dpecode != DP_ENOITEM){ - odeum->fatal = TRUE; - return NULL; - } - } - return doc; -} - - -/* Synchronize updating contents with the files and the devices. 
*/ -int odsync(ODEUM *odeum){ - char numbuf[OD_NUMBUFSIZ]; - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return FALSE; - } - if(!odeum->wmode){ - dpecodeset(DP_EMODE, __FILE__, __LINE__); - return FALSE; - } - if(odotcb) odotcb("odsync", odeum, "writing meta information"); - sprintf(numbuf, "%d", odeum->dmax); - if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)){ - odeum->fatal = TRUE; - return FALSE; - } - sprintf(numbuf, "%d", odeum->dnum); - if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)){ - odeum->fatal = TRUE; - return FALSE; - } - if(!odcacheflush(odeum, "odsync")){ - odeum->fatal = TRUE; - return FALSE; - } - if(!odsortindex(odeum, "odsync")){ - odeum->fatal = TRUE; - return FALSE; - } - if(odotcb) odotcb("odsync", odeum, "synchronizing the document database"); - if(!crsync(odeum->docsdb)){ - odeum->fatal = TRUE; - return FALSE; - } - if(odotcb) odotcb("odsync", odeum, "synchronizing the inverted index"); - if(!crsync(odeum->indexdb)){ - odeum->fatal = TRUE; - return FALSE; - } - if(odotcb) odotcb("odsync", odeum, "synchronizing the reverse dictionary"); - if(!vlsync(odeum->rdocsdb)){ - odeum->fatal = TRUE; - return FALSE; - } - return TRUE; -} - - -/* Optimize a database. 
*/ -int odoptimize(ODEUM *odeum){ - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return FALSE; - } - if(!odeum->wmode){ - dpecodeset(DP_EMODE, __FILE__, __LINE__); - return FALSE; - } - if(!odcacheflush(odeum, "odoptimize")){ - odeum->fatal = TRUE; - return FALSE; - } - if(odeum->ldid < 1 || odeum->ldid != odeum->dnum){ - if(!odpurgeindex(odeum, "odoptimize")){ - odeum->fatal = TRUE; - return FALSE; - } - } - if(odeum->ldid > 0){ - if(!odsortindex(odeum, "odoptimize")){ - odeum->fatal = TRUE; - return FALSE; - } - } - if(odotcb) odotcb("odoptimize", odeum, "optimizing the document database"); - if(!croptimize(odeum->docsdb, -1)){ - odeum->fatal = TRUE; - return FALSE; - } - if(odotcb) odotcb("odoptimize", odeum, "optimizing the inverted index"); - if(!croptimize(odeum->indexdb, -1)){ - odeum->fatal = TRUE; - return FALSE; - } - if(odotcb) odotcb("odoptimize", odeum, "optimizing the reverse dictionary"); - if(!vloptimize(odeum->rdocsdb)){ - odeum->fatal = TRUE; - return FALSE; - } - return TRUE; -} - - -/* Get the name of a database. */ -char *odname(ODEUM *odeum){ - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return NULL; - } - return cbmemdup(odeum->name, -1); -} - - -/* Get the total size of database files. */ -double odfsiz(ODEUM *odeum){ - double fsiz, rv; - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return -1; - } - fsiz = 0; - if((rv = crfsizd(odeum->docsdb)) < 0) return -1.0; - fsiz += rv; - if((rv = crfsizd(odeum->indexdb)) < 0) return -1.0; - fsiz += rv; - if((rv = vlfsiz(odeum->rdocsdb)) == -1) return -1.0; - fsiz += rv; - return fsiz; -} - - -/* Get the total number of the elements of the bucket arrays for the inverted index. 
*/ -int odbnum(ODEUM *odeum){ - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return -1; - } - return crbnum(odeum->indexdb); -} - - -/* Get the total number of the used elements of the bucket arrays in the inverted index. */ -int odbusenum(ODEUM *odeum){ - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return -1; - } - return crbusenum(odeum->indexdb); -} - - -/* Get the number of the documents stored in a database. */ -int oddnum(ODEUM *odeum){ - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return -1; - } - return odeum->dnum; -} - - -/* Get the number of the words stored in a database. */ -int odwnum(ODEUM *odeum){ - assert(odeum); - if(odeum->fatal){ - dpecodeset(DP_EFATAL, __FILE__, __LINE__); - return -1; - } - return crrnum(odeum->indexdb); -} - - -/* Check whether a database handle is a writer or not. */ -int odwritable(ODEUM *odeum){ - assert(odeum); - return odeum->wmode; -} - - -/* Check whether a database has a fatal error or not. */ -int odfatalerror(ODEUM *odeum){ - assert(odeum); - return odeum->fatal; -} - - -/* Get the inode number of a database directory. */ -int odinode(ODEUM *odeum){ - assert(odeum); - return odeum->inode; -} - - -/* Get the last modified time of a database. */ -time_t odmtime(ODEUM *odeum){ - assert(odeum); - return crmtime(odeum->indexdb); -} - - -/* Merge plural database directories. 
*/ -int odmerge(const char *name, const CBLIST *elemnames){ - ODEUM *odeum, **elems; - CURIA *curia, *ecuria; - VILLA *villa, *evilla; - ODPAIR *pairs; - char *word, *kbuf, *vbuf, *dbuf, otmsg[OD_OTCBBUFSIZ]; - char *wpunit[OD_MIWUNIT], *vpunit[OD_MIWUNIT]; - int i, j, k, num, dnum, wnum, dbnum, ibnum, tnum, wsunit[OD_MIWUNIT], vsunit[OD_MIWUNIT]; - int err, *bases, sum, max, wsiz, ksiz, vsiz, uend, unum, pnum, align, id, nid, dsiz; - assert(name && elemnames); - num = cblistnum(elemnames); - elems = cbmalloc(num * sizeof(ODEUM *) + 1); - dnum = 0; - wnum = 0; - for(i = 0; i < num; i++){ - if(!(elems[i] = odopen(cblistval(elemnames, i, NULL), OD_OREADER))){ - for(i -= 1; i >= 0; i--){ - odclose(elems[i]); - } - free(elems); - return FALSE; - } - dnum += oddnum(elems[i]); - wnum += odwnum(elems[i]); - } - dbnum = (int)(dnum * OD_MDBRATIO / OD_DOCSDNUM); - ibnum = (int)(wnum * OD_MIBRATIO / odindexdnum); - if(!(odeum = odopendb(name, OD_OWRITER | OD_OCREAT | OD_OTRUNC, dbnum, ibnum, "odmerge"))){ - for(i = 0; i < num; i++){ - odclose(elems[i]); - } - free(elems); - return FALSE; - } - err = FALSE; - if(odotcb) odotcb("odmerge", odeum, "calculating the base ID numbers"); - bases = cbmalloc(num * sizeof(int) + 1); - sum = 0; - for(i = 0; i < num; i++){ - ecuria = elems[i]->docsdb; - max = 0; - if(!criterinit(ecuria) && dpecode != DP_ENOITEM) err = TRUE; - while((kbuf = criternext(ecuria, &ksiz)) != NULL){ - if(ksiz == sizeof(int)){ - if(*(int *)kbuf > max) max = *(int *)kbuf; - } - free(kbuf); - } - bases[i] = sum; - sum += max; - } - curia = odeum->indexdb; - for(i = 0; i < num; i++){ - if(odotcb){ - sprintf(otmsg, "merging the inverted index (%d/%d)", i + 1, num); - odotcb("odmerge", odeum, otmsg); - } - ecuria = elems[i]->indexdb; - tnum = 0; - uend = FALSE; - if(!criterinit(ecuria) && dpecode != DP_ENOITEM) err = TRUE; - while(!uend){ - for(unum = 0; unum < OD_MIWUNIT; unum++){ - if(!(word = criternext(ecuria, &wsiz))){ - uend = TRUE; - break; - } - if(!(vbuf = 
crget(ecuria, word, wsiz, 0, -1, &vsiz))){ - err = TRUE; - free(word); - break; - } - wpunit[unum] = word; - wsunit[unum] = wsiz; - vpunit[unum] = vbuf; - vsunit[unum] = vsiz; - } - for(j = 0; j < unum; j++){ - word = wpunit[j]; - wsiz = wsunit[j]; - vbuf = vpunit[j]; - vsiz = vsunit[j]; - pairs = (ODPAIR *)vbuf; - pnum = vsiz / sizeof(ODPAIR); - for(k = 0; k < pnum; k++){ - pairs[k].id += bases[i]; - } - align = (int)(i < num - 1 ? vsiz * (num - i) * OD_MIARATIO : OD_INDEXALIGN); - if(!crsetalign(curia, align)) err = TRUE; - if(!crput(curia, word, wsiz, vbuf, vsiz, CR_DCAT)) err = TRUE; - free(vbuf); - free(word); - if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){ - sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(ecuria)); - odotcb("odmerge", odeum, otmsg); - } - tnum++; - } - } - } - if(odotcb) odotcb("odmerge", odeum, "sorting the inverted index"); - tnum = 0; - if(!criterinit(curia) && dpecode != DP_ENOITEM) err = TRUE; - while((word = criternext(curia, &wsiz)) != NULL){ - if((vbuf = crget(curia, word, wsiz, 0, -1, &vsiz)) != NULL){ - if(vsiz > sizeof(ODPAIR)){ - pairs = (ODPAIR *)vbuf; - pnum = vsiz / sizeof(ODPAIR); - qsort(pairs, pnum, sizeof(ODPAIR), odsortcompare); - if(!crput(curia, word, wsiz, vbuf, vsiz, CR_DOVER)) err = TRUE; - } - free(vbuf); - } - free(word); - if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){ - sprintf(otmsg, "... 
(%d/%d)", tnum + 1, crrnum(curia)); - odotcb("odmerge", odeum, otmsg); - } - tnum++; - } - if(odotcb) odotcb("odmerge", odeum, "synchronizing the inverted index"); - if(!crsync(curia)) err = TRUE; - dnum = 0; - curia = odeum->docsdb; - villa = odeum->rdocsdb; - for(i = 0; i < num; i++){ - if(odotcb){ - sprintf(otmsg, "merging the document database (%d/%d)", i + 1, num); - odotcb("odmerge", odeum, otmsg); - } - evilla = elems[i]->rdocsdb; - ecuria = elems[i]->docsdb; - tnum = 0; - if(!vlcurfirst(evilla) && dpecode != DP_ENOITEM) err = TRUE; - while(TRUE){ - if(!(kbuf = vlcurkey(evilla, &ksiz))) break; - if((ksiz == sizeof(OD_DMAXEXPR) && !memcmp(kbuf, OD_DMAXEXPR, ksiz)) || - (ksiz == sizeof(OD_DNUMEXPR) && !memcmp(kbuf, OD_DNUMEXPR, ksiz))){ - free(kbuf); - if(!vlcurnext(evilla)) break; - continue; - } - if(!(vbuf = vlcurval(evilla, &vsiz))){ - free(kbuf); - if(!vlcurnext(evilla)) break; - continue; - } - if(vsiz != sizeof(int)){ - free(vbuf); - free(kbuf); - if(!vlcurnext(evilla)) break; - continue; - } - id = *(int *)vbuf; - nid = id + bases[i]; - if(vlput(villa, kbuf, ksiz, (char *)&nid, sizeof(int), VL_DKEEP)){ - if((dbuf = crget(ecuria, (char *)&id, sizeof(int), 0, -1, &dsiz)) != NULL){ - if(crput(curia, (char *)&nid, sizeof(int), dbuf, dsiz, CR_DKEEP)){ - dnum++; - } else { - err = TRUE; - } - free(dbuf); - } else { - err = TRUE; - } - } else if(dpecode != DP_EKEEP){ - err = TRUE; - } - free(vbuf); - free(kbuf); - odeum->dnum++; - if(odotcb && (tnum + 1) % OD_OTPERDOCS == 0){ - sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(ecuria)); - odotcb("odmerge", odeum, otmsg); - } - tnum++; - if(!vlcurnext(evilla)) break; - } - } - odeum->dnum = dnum; - odeum->dmax = dnum; - free(bases); - if(odotcb) odotcb("odmerge", odeum, "synchronizing the document index"); - if(!crsync(curia)) err = TRUE; - if(!odclose(odeum)) err = TRUE; - for(i = 0; i < num; i++){ - if(!odclose(elems[i])) err = TRUE; - } - free(elems); - return err ? 
FALSE : TRUE; -} - - -/* Remove a database directory. */ -int odremove(const char *name){ - char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ]; - char path[OD_PATHBUFSIZ]; - const char *file; - struct stat sbuf; - CBLIST *list; - int i; - assert(name); - sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME); - sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME); - sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME); - if(lstat(name, &sbuf) == -1){ - dpecodeset(DP_ESTAT, __FILE__, __LINE__); - return FALSE; - } - if(lstat(docsname, &sbuf) != -1 && !crremove(docsname)) return FALSE; - if(lstat(indexname, &sbuf) != -1 && !crremove(indexname)) return FALSE; - if(lstat(rdocsname, &sbuf) != -1 && !vlremove(rdocsname)) return FALSE; - if((list = cbdirlist(name)) != NULL){ - for(i = 0; i < cblistnum(list); i++){ - file = cblistval(list, i, NULL); - if(!strcmp(file, MYCDIRSTR) || !strcmp(file, MYPDIRSTR)) continue; - sprintf(path, "%s%c%s", name, MYPATHCHR, file); - if(lstat(path, &sbuf) == -1) continue; - if(S_ISDIR(sbuf.st_mode)){ - if(!crremove(path)) return FALSE; - } else { - if(!dpremove(path)) return FALSE; - } - } - cblistclose(list); - } - if(rmdir(name) == -1){ - dpecodeset(DP_ERMDIR, __FILE__, __LINE__); - return FALSE; - } - return TRUE; -} - - -/* Get a document handle. */ -ODDOC *oddocopen(const char *uri){ - ODDOC *doc; - assert(uri); - doc = cbmalloc(sizeof(ODDOC)); - doc->id = -1; - doc->uri = cbmemdup(uri, -1); - doc->attrs = cbmapopenex(OD_MAPPBNUM); - doc->nwords = cblistopen(); - doc->awords = cblistopen(); - return doc; -} - - -/* Close a document handle. */ -void oddocclose(ODDOC *doc){ - assert(doc); - cblistclose(doc->awords); - cblistclose(doc->nwords); - cbmapclose(doc->attrs); - free(doc->uri); - free(doc); -} - - -/* Add an attribute to a document. 
*/ -void oddocaddattr(ODDOC *doc, const char *name, const char *value){ - assert(doc && name && value); - cbmapput(doc->attrs, name, -1, value, -1, TRUE); -} - - -/* Add a word to a document. */ -void oddocaddword(ODDOC *doc, const char *normal, const char *asis){ - assert(doc && normal && asis); - cblistpush(doc->nwords, normal, -1); - cblistpush(doc->awords, asis, -1); -} - - -/* Get the ID number of a document. */ -int oddocid(const ODDOC *doc){ - assert(doc); - return doc->id; -} - - -/* Get the URI of a document. */ -const char *oddocuri(const ODDOC *doc){ - assert(doc); - return doc->uri; -} - - -/* Get the value of an attribute of a document. */ -const char *oddocgetattr(const ODDOC *doc, const char *name){ - assert(doc && name); - return cbmapget(doc->attrs, name, -1, NULL); -} - - -/* Get the list handle contains words in normalized form of a document. */ -const CBLIST *oddocnwords(const ODDOC *doc){ - assert(doc); - return doc->nwords; -} - - -/* Get the list handle contains words in appearance form of a document. */ -const CBLIST *oddocawords(const ODDOC *doc){ - assert(doc); - return doc->awords; -} - - -/* Get the map handle contains keywords in normalized form and their scores. 
*/ -CBMAP *oddocscores(const ODDOC *doc, int max, ODEUM *odeum){ - const CBLIST *nwords; - CBMAP *map, *kwmap; - const char *word, *ctmp; - char numbuf[OD_NUMBUFSIZ]; - ODWORD *owords; - int i, wsiz, wnum, hnum, mnum, nbsiz; - double ival; - assert(doc && max >= 0); - map = cbmapopen(); - nwords = oddocnwords(doc); - for(i = 0; i < cblistnum(nwords); i++){ - word = cblistval(nwords, i, &wsiz); - if(wsiz < 1) continue; - if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){ - wnum = *(int *)ctmp + OD_WOCCRPOINT; - } else { - wnum = OD_WOCCRPOINT; - } - cbmapput(map, word, wsiz, (char *)&wnum, sizeof(int), TRUE); - } - mnum = cbmaprnum(map); - owords = cbmalloc(mnum * sizeof(ODWORD) + 1); - cbmapiterinit(map); - for(i = 0; (word = cbmapiternext(map, &wsiz)) != NULL; i++){ - owords[i].word = word; - owords[i].num = *(int *)cbmapget(map, word, wsiz, NULL); - } - qsort(owords, mnum, sizeof(ODWORD), odwordcompare); - if(odeum){ - if(mnum > max * OD_KEYCRATIO) mnum = (int)(max * OD_KEYCRATIO); - for(i = 0; i < mnum; i++){ - if((hnum = odsearchdnum(odeum, owords[i].word)) < 0) hnum = 0; - ival = odlogarithm(hnum); - ival = (ival * ival * ival) / 8.0; - if(ival < 8.0) ival = 8.0; - owords[i].num = (int)(owords[i].num / ival); - } - qsort(owords, mnum, sizeof(ODWORD), odwordcompare); - } - if(mnum > max) mnum = max; - kwmap = cbmapopenex(OD_MAPPBNUM); - for(i = 0; i < mnum; i++){ - nbsiz = sprintf(numbuf, "%d", owords[i].num); - cbmapput(kwmap, owords[i].word, -1, numbuf, nbsiz, TRUE); - } - free(owords); - cbmapclose(map); - return kwmap; -} - - -/* Break a text into words in appearance form. 
*/ -CBLIST *odbreaktext(const char *text){ - const char *word; - CBLIST *elems, *words; - int i, j, dif, wsiz, pv, delim; - assert(text); - words = cblistopen(); - elems = cbsplit(text, -1, OD_SPACECHARS); - for(i = 0; i < cblistnum(elems); i++){ - word = cblistval(elems, i, &wsiz); - delim = FALSE; - j = 0; - pv = 0; - while(TRUE){ - dif = j - pv; - if(j >= wsiz){ - if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); - break; - } - if(delim){ - if(!strchr(OD_DELIMCHARS, word[j])){ - if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); - pv = j; - delim = FALSE; - } - } else { - if(strchr(OD_DELIMCHARS, word[j])){ - if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv); - pv = j; - delim = TRUE; - } - } - j++; - } - } - cblistclose(elems); - return words; -} - - -/* Make the normalized form of a word. */ -char *odnormalizeword(const char *asis){ - char *nword; - int i; - assert(asis); - for(i = 0; asis[i] != '\0'; i++){ - if(!strchr(OD_DELIMCHARS, asis[i])) break; - } - if(asis[i] == '\0') return cbmemdup("", 0); - nword = cbmemdup(asis, -1); - for(i = 0; nword[i] != '\0'; i++){ - if(nword[i] >= 'A' && nword[i] <= 'Z') nword[i] += 'a' - 'A'; - } - while(i >= 0){ - if(strchr(OD_GLUECHARS, nword[i])){ - nword[i] = '\0'; - } else { - break; - } - i--; - } - return nword; -} - - -/* Get the common elements of two sets of documents. 
*/ -ODPAIR *odpairsand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ - CBMAP *map; - ODPAIR *result; - const char *tmp; - int i, rnum; - assert(apairs && anum >= 0 && bpairs && bnum >= 0); - map = odpairsmap(bpairs, bnum); - result = cbmalloc(sizeof(ODPAIR) * anum + 1); - rnum = 0; - for(i = 0; i < anum; i++){ - if(!(tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL))) continue; - result[rnum].id = apairs[i].id; - result[rnum].score = apairs[i].score + *(int *)tmp; - rnum++; - } - cbmapclose(map); - qsort(result, rnum, sizeof(ODPAIR), odsortcompare); - *np = rnum; - return result; -} - - -/* Get the sum of elements of two sets of documents. */ -ODPAIR *odpairsor(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ - CBMAP *map; - ODPAIR *result; - const char *tmp; - int i, score, rnum; - assert(apairs && anum >= 0 && bpairs && bnum >= 0); - map = odpairsmap(bpairs, bnum); - for(i = 0; i < anum; i++){ - score = 0; - if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) - score = *(int *)tmp; - score += apairs[i].score; - cbmapput(map, (char *)&(apairs[i].id), sizeof(int), - (char *)&score, sizeof(int), TRUE); - } - rnum = cbmaprnum(map); - result = cbmalloc(rnum * sizeof(ODPAIR) + 1); - cbmapiterinit(map); - for(i = 0; (tmp = cbmapiternext(map, NULL)) != NULL; i++){ - result[i].id = *(int *)tmp; - result[i].score = *(int *)cbmapget(map, tmp, sizeof(int), NULL); - } - cbmapclose(map); - qsort(result, rnum, sizeof(ODPAIR), odsortcompare); - *np = rnum; - return result; -} - - -/* Get the difference set of documents. 
*/ -ODPAIR *odpairsnotand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){ - CBMAP *map; - ODPAIR *result; - const char *tmp; - int i, rnum; - assert(apairs && anum >= 0 && bpairs && bnum >= 0); - map = odpairsmap(bpairs, bnum); - result = cbmalloc(sizeof(ODPAIR) * anum + 1); - rnum = 0; - for(i = 0; i < anum; i++){ - if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) continue; - result[rnum].id = apairs[i].id; - result[rnum].score = apairs[i].score; - rnum++; - } - cbmapclose(map); - qsort(result, rnum, sizeof(ODPAIR), odsortcompare); - *np = rnum; - return result; -} - - -/* Sort a set of documents in descending order of scores. */ -void odpairssort(ODPAIR *pairs, int pnum){ - assert(pairs && pnum >= 0); - qsort(pairs, pnum, sizeof(ODPAIR), odsortcompare); -} - - -/* Get the natural logarithm of a number. */ -double odlogarithm(double x){ - int i; - if(x <= 1.0) return 0.0; - x = x * x * x * x * x * x * x * x * x * x; - for(i = 0; x > 1.0; i++){ - x /= 2.718281828459; - } - return (double)i / 10.0; -} - - -/* Get the cosine of the angle of two vectors. */ -double odvectorcosine(const int *avec, const int *bvec, int vnum){ - double rv; - assert(avec && bvec && vnum >= 0); - rv = odvecinnerproduct(avec, bvec, vnum) / - ((odvecabsolute(avec, vnum) * odvecabsolute(bvec, vnum))); - return rv > 0.0 ? rv : 0.0; -} - - -/* Set the global tuning parameters. */ -void odsettuning(int ibnum, int idnum, int cbnum, int csiz){ - if(ibnum > 0) odindexbnum = ibnum; - if(idnum > 0) odindexdnum = idnum; - if(cbnum > 0) odcachebnum = dpprimenum(cbnum); - if(csiz > 0) odcachesiz = csiz; -} - - -/* Break a text into words and store appearance forms and normalized form into lists. 
*/ -void odanalyzetext(ODEUM *odeum, const char *text, CBLIST *awords, CBLIST *nwords){ - char aword[OD_MAXWORDLEN+1], *wp; - int lev, wsiz; - assert(odeum && text && awords); - lev = OD_EVSPACE; - wsiz = 0; - for(; *text != '\0'; text++){ - switch(odeum->statechars[*(unsigned char *)text]){ - case OD_EVWORD: - if(wsiz > 0 && lev == OD_EVDELIM){ - cblistpush(awords, aword, wsiz); - if(nwords) cblistpush(nwords, "", 0); - wsiz = 0; - } - if(wsiz <= OD_MAXWORDLEN){ - aword[wsiz++] = *text; - } - lev = OD_EVWORD; - break; - case OD_EVGLUE: - if(wsiz > 0 && lev == OD_EVDELIM){ - cblistpush(awords, aword, wsiz); - if(nwords) cblistpush(nwords, "", 0); - wsiz = 0; - } - if(wsiz <= OD_MAXWORDLEN){ - aword[wsiz++] = *text; - } - lev = OD_EVGLUE; - break; - case OD_EVDELIM: - if(wsiz > 0 && lev != OD_EVDELIM){ - cblistpush(awords, aword, wsiz); - if(nwords){ - wp = aword; - aword[wsiz] = '\0'; - while(*wp != '\0'){ - if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A'; - wp++; - } - wp--; - while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){ - wsiz--; - wp--; - } - cblistpush(nwords, aword, wsiz); - } - wsiz = 0; - } - if(wsiz <= OD_MAXWORDLEN){ - aword[wsiz++] = *text; - } - lev = OD_EVDELIM; - break; - default: - if(wsiz > 0){ - cblistpush(awords, aword, wsiz); - if(nwords){ - if(lev == OD_EVDELIM){ - cblistpush(nwords, "", 0); - } else { - wp = aword; - aword[wsiz] = '\0'; - while(*wp != '\0'){ - if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A'; - wp++; - } - wp--; - while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){ - wsiz--; - wp--; - } - cblistpush(nwords, aword, wsiz); - } - } - wsiz = 0; - } - lev = OD_EVSPACE; - break; - } - } - if(wsiz > 0){ - cblistpush(awords, aword, wsiz); - if(nwords){ - if(lev == OD_EVDELIM){ - cblistpush(nwords, "", 0); - } else { - wp = aword; - aword[wsiz] = '\0'; - while(*wp != '\0'){ - if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A'; - wp++; - } - wp--; - while(wp >= aword && 
odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){ - wsiz--; - wp--; - } - cblistpush(nwords, aword, wsiz); - } - } - wsiz = 0; - } -} - - -/* Set the classes of characters used by `odanalyzetext'. */ -void odsetcharclass(ODEUM *odeum, const char *spacechars, const char *delimchars, - const char *gluechars){ - assert(odeum && spacechars && delimchars && gluechars); - memset(odeum->statechars, OD_EVWORD, sizeof(odeum->statechars)); - for(; *spacechars != '\0'; spacechars++){ - odeum->statechars[*(unsigned char *)spacechars] = OD_EVSPACE; - } - for(; *delimchars != '\0'; delimchars++){ - odeum->statechars[*(unsigned char *)delimchars] = OD_EVDELIM; - } - for(; *gluechars != '\0'; gluechars++){ - odeum->statechars[*(unsigned char *)gluechars] = OD_EVGLUE; - } -} - - -/* Query a database using a small boolean query language. */ -ODPAIR *odquery(ODEUM *odeum, const char *query, int *np, CBLIST *errors){ - CBLIST *tokens = cblistopen(); - CBLIST *nwords = cblistopen(); - ODPAIR *results = NULL; - assert(odeum && query && np); - odanalyzetext(odeum, query, tokens, nwords); - odcleannormalized(odeum, nwords); - odfixtokens(odeum, tokens); - results = odparseexpr(odeum, tokens, nwords, np, errors); - cblistclose(tokens); - cblistclose(nwords); - return results; -} - - - -/************************************************************************************************* - * features for experts - *************************************************************************************************/ - - -/* Get the internal database handle for documents. */ -CURIA *odidbdocs(ODEUM *odeum){ - assert(odeum); - return odeum->docsdb; -} - - -/* Get the internal database handle for the inverted index. */ -CURIA *odidbindex(ODEUM *odeum){ - assert(odeum); - return odeum->indexdb; -} - - -/* Get the internal database handle for the reverse dictionary. */ -VILLA *odidbrdocs(ODEUM *odeum){ - assert(odeum); - return odeum->rdocsdb; -} - - -/* Set the call back function called in merging. 
*/ -void odsetotcb(void (*otcb)(const char *, ODEUM *, const char *)){ - odotcb = otcb; -} - - -/* Get the positive one of square roots of a number. */ -double odsquareroot(double x){ - double c, rv; - if(x <= 0.0) return 0.0; - c = x > 1.0 ? x : 1; - do { - rv = c; - c = (x / c + c) * 0.5; - } while(c < rv); - return rv; -} - - -/* Get the absolute of a vector. */ -double odvecabsolute(const int *vec, int vnum){ - double rv; - int i; - assert(vec && vnum >= 0); - rv = 0; - for(i = 0; i < vnum; i++){ - rv += (double)vec[i] * (double)vec[i]; - } - return odsquareroot(rv); -} - - -/* Get the inner product of two vectors. */ -double odvecinnerproduct(const int *avec, const int *bvec, int vnum){ - double rv; - int i; - assert(avec && bvec && vnum >= 0); - rv = 0; - for(i = 0; i < vnum; i++){ - rv += (double)avec[i] * (double)bvec[i]; - } - return rv; -} - - - -/************************************************************************************************* - * private objects - *************************************************************************************************/ - - -/* Get a database handle. - `name' specifies the name of a database directory. - `omode' specifies the connection mode. - `docsbnum` specifies the number of buckets of the document database. - `indexbnum` specifies the number of buckets of the index database. - `fname' specifies the name of caller function. - The return value is the database handle or `NULL' if it is not successful. 
*/
/* Details of `odopendb':
   A name longer than `OD_NAMEMAX' is rejected with `DP_EMISC'.
   The directory is created first if `omode' has both `OD_OWRITER' and `OD_OCREAT'.
   A reverse dictionary which has records but lacks a valid `OD_DMAXEXPR' or `OD_DNUMEXPR'
   record is reported as `DP_EBROKEN'.
   Every database opened so far is closed again before a failure is reported. */
static ODEUM *odopendb(const char *name, int omode, int docsbnum, int indexbnum,
                       const char *fname){
  char docspath[OD_PATHBUFSIZ], indexpath[OD_PATHBUFSIZ], rdocspath[OD_PATHBUFSIZ];
  char *numstr;
  struct stat dirstat;
  CURIA *docsdb = NULL, *indexdb = NULL;
  VILLA *rdocsdb = NULL;
  CBMAP *cachemap = NULL, *sortmap = NULL;
  ODEUM *odeum;
  int writer, cromode, vlomode, inode, dmax, dnum;
  assert(name);
  if(strlen(name) > OD_NAMEMAX){
    dpecodeset(DP_EMISC, __FILE__, __LINE__);
    return NULL;
  }
  /* translate the mode of Odeum into the modes of Curia and Villa; the creation and
     truncation flags are honored for a writer only */
  writer = omode & OD_OWRITER;
  cromode = writer ? CR_OWRITER : CR_OREADER;
  vlomode = writer ? (VL_OWRITER | VL_OZCOMP | VL_OYCOMP) : VL_OREADER;
  if(writer && (omode & OD_OCREAT)){
    cromode |= CR_OCREAT;
    vlomode |= VL_OCREAT;
  }
  if(writer && (omode & OD_OTRUNC)){
    cromode |= CR_OTRUNC;
    vlomode |= VL_OTRUNC;
  }
  if(omode & OD_ONOLCK){
    cromode |= CR_ONOLCK;
    vlomode |= VL_ONOLCK;
  }
  if(omode & OD_OLCKNB){
    cromode |= CR_OLCKNB;
    vlomode |= VL_OLCKNB;
  }
  /* the name was checked against `OD_NAMEMAX' above, so the paths fit into the buffers */
  sprintf(docspath, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME);
  sprintf(indexpath, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME);
  sprintf(rdocspath, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME);
  if(writer && (omode & OD_OCREAT) && mkdir(name, OD_DIRMODE) == -1 && errno != EEXIST){
    dpecodeset(DP_EMKDIR, __FILE__, __LINE__);
    return NULL;
  }
  if(lstat(name, &dirstat) == -1){
    dpecodeset(DP_ESTAT, __FILE__, __LINE__);
    return NULL;
  }
  inode = dirstat.st_ino;
  docsdb = cropen(docspath, cromode, docsbnum, OD_DOCSDNUM);
  if(!docsdb) return NULL;
  indexdb = cropen(indexpath, cromode, indexbnum, odindexdnum);
  if(!indexdb) goto fail_docs;
  if(writer &&
     (!crsetalign(docsdb, OD_DOCSALIGN) || !crsetfbpsiz(docsdb, OD_DOCSFBP) ||
      !crsetalign(indexdb, OD_INDEXALIGN) || !crsetfbpsiz(indexdb, OD_INDEXFBP))){
    goto fail_index;
  }
  rdocsdb = vlopen(rdocspath, vlomode, VL_CMPLEX);
  if(!rdocsdb) goto fail_index;
  vlsettuning(rdocsdb, OD_RDOCSLRM, OD_RDOCSNIM, OD_RDOCSLCN, OD_RDOCSNCN);
  /* only a writer needs the maps for the dirty buffers and for the words to be sorted */
  if(writer){
    cachemap = cbmapopenex(odcachebnum);
    sortmap = cbmapopenex(odcachebnum);
  }
  dmax = 0;
  dnum = 0;
  if(vlrnum(rdocsdb) > 0){
    /* restore the counters; the keys include their terminating NUL character */
    dmax = -1;
    dnum = -1;
    numstr = vlget(rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), NULL);
    if(numstr){
      dmax = atoi(numstr);
      free(numstr);
    }
    numstr = vlget(rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), NULL);
    if(numstr){
      dnum = atoi(numstr);
      free(numstr);
    }
    if(dmax < 0 || dnum < 0){
      if(sortmap) cbmapclose(sortmap);
      if(cachemap) cbmapclose(cachemap);
      vlclose(rdocsdb);
      crclose(indexdb);
      crclose(docsdb);
      /* set after the closing calls so that they can not replace this code */
      dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
      return NULL;
    }
  }
  odeum = cbmalloc(sizeof(ODEUM));
  odeum->name = cbmemdup(name, -1);
  odeum->wmode = writer;
  odeum->fatal = FALSE;
  odeum->inode = inode;
  odeum->docsdb = docsdb;
  odeum->indexdb = indexdb;
  odeum->rdocsdb = rdocsdb;
  odeum->cachemap = cachemap;
  odeum->cacheasiz = 0;
  odeum->sortmap = sortmap;
  odeum->dmax = dmax;
  odeum->dnum = dnum;
  odeum->ldid = -1;
  odsetcharclass(odeum, OD_SPACECHARS, OD_DELIMCHARS, OD_GLUECHARS);
  if(odotcb) odotcb(fname, odeum, "the connection was established");
  return odeum;
fail_index:
  crclose(indexdb);
fail_docs:
  crclose(docsdb);
  return NULL;
}


/* Flush the cache for dirty buffer of words.
   `odeum' specifies a database handle.
   `fname' specifies the name of caller function.
   If successful, the return value is true, else, it is false.
*/ -static int odcacheflush(ODEUM *odeum, const char *fname){ - const char *kbuf, *vbuf; - char otmsg[OD_OTCBBUFSIZ]; - int i, rnum, ksiz, vsiz; - assert(odeum); - if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE; - if(odotcb) odotcb(fname, odeum, "flushing caches"); - cbmapiterinit(odeum->cachemap); - for(i = 0; (kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL; i++){ - vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz); - if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){ - odeum->fatal = TRUE; - return FALSE; - } - if(odotcb && (i + 1) % OD_OTPERWORDS == 0){ - sprintf(otmsg, "... (%d/%d)", i + 1, rnum); - odotcb(fname, odeum, otmsg); - } - } - cbmapclose(odeum->cachemap); - odeum->cachemap = cbmapopenex(odcachebnum); - odeum->cacheasiz = 0; - return TRUE; -} - - -/* Flush all frequent words in the cache for dirty buffer of words. - `odeum' specifies a database handle. - `fname' specifies the name of caller function. - `min' specifies the minimum size of frequent words. - If successful, the return value is true, else, it is false. */ -static int odcacheflushfreq(ODEUM *odeum, const char *fname, int min){ - const char *kbuf, *vbuf; - char otmsg[OD_OTCBBUFSIZ]; - int rnum, ksiz, vsiz; - assert(odeum); - if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE; - if(odotcb){ - sprintf(otmsg, "flushing frequent words: min=%d asiz=%d rnum=%d)", - min, odeum->cacheasiz, rnum); - odotcb(fname, odeum, otmsg); - } - cbmapiterinit(odeum->cachemap); - while((kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL){ - vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz); - if(vsiz >= sizeof(ODPAIR) * min){ - if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){ - odeum->fatal = TRUE; - return FALSE; - } - cbmapout(odeum->cachemap, kbuf, ksiz); - odeum->cacheasiz -= vsiz; - } - } - if(odotcb){ - sprintf(otmsg, "... 
(done): min=%d asiz=%d rnum=%d)", - min, odeum->cacheasiz, cbmaprnum(odeum->cachemap)); - odotcb(fname, odeum, otmsg); - } - return TRUE; -} - - -/* Flush the half of rare words in the cache for dirty buffer of words. - `odeum' specifies a database handle. - `fname' specifies the name of caller function. - `ratio' specifies the ratio of rare words. - If successful, the return value is true, else, it is false. */ -static int odcacheflushrare(ODEUM *odeum, const char *fname, double ratio){ - const char *kbuf, *vbuf; - char otmsg[OD_OTCBBUFSIZ]; - int i, rnum, limit, ksiz, vsiz; - assert(odeum); - if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE; - if(odotcb){ - sprintf(otmsg, "flushing rare words: ratio=%.2f asiz=%d rnum=%d)", - ratio, odeum->cacheasiz, rnum); - odotcb(fname, odeum, otmsg); - } - cbmapiterinit(odeum->cachemap); - limit = (int)(rnum * ratio); - for(i = 0; i < limit && (kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL; i++){ - vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz); - if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){ - odeum->fatal = TRUE; - return FALSE; - } - cbmapout(odeum->cachemap, kbuf, ksiz); - odeum->cacheasiz -= vsiz; - } - if(odotcb){ - sprintf(otmsg, "... (done): ratio=%.2f asiz=%d rnum=%d)", - ratio, odeum->cacheasiz, cbmaprnum(odeum->cachemap)); - odotcb(fname, odeum, otmsg); - } - return TRUE; -} - - -/* Sort the records of inverted index. - `odeum' specifies a database handle. - `fname' specifies the name of caller function. - If successful, the return value is true, else, it is false. 
*/ -static int odsortindex(ODEUM *odeum, const char *fname){ - const char *word; - char *tmp, otmsg[OD_OTCBBUFSIZ]; - int i, rnum, wsiz, tsiz; - ODPAIR *pairs; - assert(odeum); - if((rnum = cbmaprnum(odeum->sortmap)) < 1) return TRUE; - if(odotcb) odotcb(fname, odeum, "sorting the inverted index"); - cbmapiterinit(odeum->sortmap); - for(i = 0; (word = cbmapiternext(odeum->sortmap, &wsiz)) != NULL; i++){ - if((tmp = crget(odeum->indexdb, word, wsiz, 0, -1, &tsiz)) != NULL){ - if(tsiz > sizeof(ODPAIR)){ - pairs = (ODPAIR *)tmp; - qsort(pairs, tsiz / sizeof(ODPAIR), sizeof(ODPAIR), odsortcompare); - if(!crput(odeum->indexdb, word, wsiz, tmp, tsiz, CR_DOVER)){ - free(tmp); - return FALSE; - } - } - free(tmp); - } else if(dpecode != DP_ENOITEM){ - return FALSE; - } - if(odotcb && (i + 1) % OD_OTPERWORDS == 0){ - sprintf(otmsg, "... (%d/%d)", i + 1, rnum); - odotcb(fname, odeum, otmsg); - } - } - cbmapclose(odeum->sortmap); - odeum->sortmap = cbmapopenex(odcachebnum); - return TRUE; -} - - -/* Compare two pairs of structures of a search result. - `a' specifies the pointer to the region of one pair. - `b' specifies the pointer to the region of the other pair. - The return value is positive if the former is big, negative if the latter is big, 0 if both - are equivalent. */ -static int odsortcompare(const void *a, const void *b){ - ODPAIR *ap, *bp; - int rv; - assert(a && b); - ap = (ODPAIR *)a; - bp = (ODPAIR *)b; - rv = bp->score - ap->score; - if(rv != 0) return rv; - return ap->id - bp->id; -} - - -/* Purge the elements of the deleted documents from the inverted index. - `odeum' specifies a database handle. - `fname' specifies the name of caller function. - If successful, the return value is true, else, it is false. 
*/ -static int odpurgeindex(ODEUM *odeum, const char *fname){ - ODPAIR *pairs; - char *kbuf, *vbuf, otmsg[OD_OTCBBUFSIZ]; - int i, rnum, tnum, ksiz, vsiz, pnum, wi; - assert(odeum); - if((rnum = crrnum(odeum->indexdb)) < 1) return TRUE; - if(odotcb) odotcb(fname, odeum, "purging dispensable regions"); - if(!criterinit(odeum->indexdb)) return FALSE; - tnum = 0; - while(TRUE){ - if(!(kbuf = criternext(odeum->indexdb, &ksiz))){ - if(dpecode != DP_ENOITEM) return FALSE; - break; - } - if(!(vbuf = crget(odeum->indexdb, kbuf, ksiz, 0, -1, &vsiz))){ - dpecodeset(DP_EBROKEN, __FILE__, __LINE__); - free(kbuf); - return FALSE; - } - pairs = (ODPAIR *)vbuf; - pnum = vsiz / sizeof(ODPAIR); - wi = 0; - for(i = 0; i < pnum; i++){ - if(crvsiz(odeum->docsdb, (char *)&(pairs[i].id), sizeof(int)) != -1){ - pairs[wi++] = pairs[i]; - } - } - if(wi > 0){ - if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, wi * sizeof(ODPAIR), CR_DOVER)){ - free(vbuf); - free(kbuf); - return FALSE; - } - } else { - if(!crout(odeum->indexdb, kbuf, ksiz)){ - free(vbuf); - free(kbuf); - return FALSE; - } - } - free(vbuf); - free(kbuf); - if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){ - sprintf(otmsg, "... (%d/%d)", tnum + 1, rnum); - odotcb(fname, odeum, otmsg); - } - tnum++; - } - return TRUE; -} - - -/* Create a map of a document array. - `pairs' specifies the pointer to a document array. - `num' specifies the number of elements of the array. - The return value is a map of the document array. */ -static CBMAP *odpairsmap(const ODPAIR *pairs, int num){ - CBMAP *map; - int i; - assert(pairs && num >= 0); - map = cbmapopen(); - for(i = 0; i < num; i++){ - cbmapput(map, (char *)&(pairs[i].id), sizeof(int), - (char *)&(pairs[i].score), sizeof(int), TRUE); - } - return map; -} - - -/* compare two pairs of structures of words in a document. - `a' specifies the pointer to the region of one word. - `b' specifies the pointer to the region of the other word. 
- The return value is positive if the former is big, negative if the latter is big, 0 if both - are equivalent. */ -static int odwordcompare(const void *a, const void *b){ - ODWORD *ap, *bp; - int rv; - assert(a && b); - ap = (ODWORD *)a; - bp = (ODWORD *)b; - if((rv = bp->num - ap->num) != 0) return rv; - if((rv = strlen(bp->word) - strlen(ap->word)) != 0) return rv; - return strcmp(ap->word, bp->word); -} - - -/* Match an operator without taking it off the token list. - `odeum' specifies a database handle. - `tokens' specifies a list handle of tokens. - The return value is whether the next token is an operator. */ -static int odmatchoperator(ODEUM *odeum, CBLIST *tokens){ - const char *tk = NULL; - int tk_len = 0; - tk = cblistval(tokens, 0, &tk_len); - if(tk && (tk[0] == '&' || tk[0] == '|' || tk[0] == '!')) return 1; - return 0; -} - - -/* Implements the subexpr part of the grammar. - `odeum' specifies a database handle. - `tokens' specifies a list handle of tokens. - `nwords' specifies a list handle of normalized words. - `np' specifies the pointer to a variable to which the number of the elements of the return - value is assigned. - `errors' specifies a list handle into which error messages are stored. - The return value is the pointer to an array of document IDs. 
*/
/* Details of `odparsesubexpr':
   subexpr ::= '(' expr ')' | WORD
   One token is taken off `tokens'.  For a word, one element is also taken off `nwords' and
   searched for with `odsearch'.  The scores of the result are then divided by a factor
   derived from the number of hits.
   `errors' may be `NULL', in which case no message is stored.
   The return value is `NULL' if there is no token left or if the search or the nested
   expression yields `NULL'.  For a token which is neither `(' nor a word, the return value
   is an allocated array of no element.
   The list of normalized words running short was guarded by an assertion only.  It is now
   reported as an error and yields an allocated array of no element, so that `odsearch' is
   never called with `NULL' when assertions are disabled. */
static ODPAIR *odparsesubexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
                              CBLIST *errors){
  char *tk = NULL;
  int tk_len = 0;
  char *nword = NULL;   /* used to do the actual search, should match with tokens */
  ODPAIR *result = NULL;
  int result_num = 0;
  int i;
  double ival;
  if(!(tk = cblistshift(tokens, &tk_len))){
    *np = 0;
    return NULL;
  }
  if(tk[0] == '('){
    free(tk);
    /* recurse into expr */
    result = odparseexpr(odeum, tokens, nwords, &result_num, errors);
    /* match right token RPAREN */
    tk = cblistshift(tokens, &tk_len);
    /* report an error if either we didn't get anything or we didn't get a ) */
    if(tk == NULL){
      if(errors) cblistpush(errors, "Expression ended without closing ')'", -1);
    } else if(tk[0] != ')'){
      if(errors) cblistpush(errors, "Un-balanced parenthesis.", -1);
    }
  } else if(odeum->statechars[*(unsigned char *)tk] == 0){
    /* the class 0 stands for a word character here (presumably `OD_EVWORD' -- confirm);
       perform odsearch with the next normalized word */
    nword = cblistshift(nwords, NULL);
    if(!nword){
      if(errors) cblistpush(errors, "Invalid query. No normalized word for WORD.", -1);
      result = cbmalloc(1);
      result_num = 0;
    } else {
      if((result = odsearch(odeum, nword, -1, &result_num)) != NULL){
        /* TF-IDF tuning */
        ival = odlogarithm(result_num);
        ival = (ival * ival) / 4.0;
        if(ival < 4.0) ival = 4.0;
        for(i = 0; i < result_num; i++){
          result[i].score = (int)(result[i].score / ival);
        }
      }
      free(nword);
    }
  } else {
    if(errors) cblistpush(errors, "Invalid sub-expression.  Expected '(' or WORD.", -1);
    result = cbmalloc(1);
    result_num = 0;
  }
  /* done with the token; it is `NULL' if the closing parenthesis was missing */
  free(tk);
  *np = result_num;
  return result;
}


/* Implements the actual recursive decent parser for the mini query language.
   `odeum' specifies a database handle.
   `tokens' specifies a list handle of tokens.
   `nwords' specifies a list handle of normalized words.
   `np' specifies the pointer to a variable to which the number of the elements of the return
   value is assigned.
   `errors' specifies a list handle into which error messages are stored.
- The return value is the pointer to an array of document IDs. - It simply parses an initial subexpr, and then loops over as many (operator subexpr) - sequences as it can find. The odmatchoperator function handles injecting a default & - between consecutive words. */ -static ODPAIR *odparseexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np, - CBLIST *errors){ - ODPAIR *left = NULL; - ODPAIR *right = NULL; - ODPAIR *temp = NULL; - int left_num = 0; - int right_num = 0; - int temp_num = 0; - char *op = NULL; - int op_len = 0; - if(!(left = odparsesubexpr(odeum, tokens, nwords, &left_num, errors))) return NULL; - /* expr ::= subexpr ( op subexpr )* */ - while(odmatchoperator(odeum, tokens)){ - op = cblistshift(tokens, &op_len); - if(!(right = odparsesubexpr(odeum, tokens, nwords, &right_num, errors))){ - free(op); - free(left); - return NULL; - } - switch(op[0]){ - case '&': - temp = odpairsand(left, left_num, right, right_num, &temp_num); - break; - case '|': - temp = odpairsor(left, left_num, right, right_num, &temp_num); - break; - case '!': - temp = odpairsnotand(left, left_num, right, right_num, &temp_num); - break; - default: - if(errors) cblistpush(errors, "Invalid operator. Expected '&', '|', or '!'.", -1); - break; - } - if(temp){ - /* an operator was done so we must swap it with the left */ - free(left); left = NULL; - left = temp; - left_num = temp_num; - } - free(op); - if(right) free(right); - } - *np = left_num; - return left; -} - - -/* Processes the tokens in order to break them up further. - `odeum' specifies a database handle. - `tokens' specifies a list handle of tokens. */ -static void odfixtokens(ODEUM *odeum, CBLIST *tokens){ - const char *tk = NULL; - int tk_len = 0; - int i = 0; - int lastword = 0; - for(i = 0; i < cblistnum(tokens); i++){ - tk = cblistval(tokens, i, &tk_len); - assert(tk); - if(tk[0] == '&' || tk[0] == '|' || tk[0] == '!' 
|| tk[0] == '(' || tk[0] == ')'){ - lastword = 0; - if(tk_len > 1){ - /* need to break it up for the next loop around */ - tk = cblistremove(tokens, i, &tk_len); - cblistinsert(tokens, i, tk, 1); - cblistinsert(tokens, i+1, tk+1, tk_len-1); - free((char *)tk); - } - } else if(odeum->statechars[*(unsigned char *)tk] == 0){ - /* if the last one was a word and this is a word then we need a default & between them */ - if(lastword){ - cblistinsert(tokens, i, "&", 1); - i++; - } - lastword = 1; - } - } -} - - -/* Cleans out the parts of the normalized word list that are not considered words. - `odeum' specifies a database handle. - `tokens' specifies a list handle of tokens. */ -static void odcleannormalized(ODEUM *odeum, CBLIST *nwords){ - char *tk = NULL; - int tk_len = 0; - int i = 0; - for(i = 0; i < cblistnum(nwords); i++){ - tk = (char *)cblistval(nwords, i, &tk_len); - if(tk_len == 0 || (!odeum->statechars[*(unsigned char *)tk] == 0)){ - /* not a word so delete it */ - tk = cblistremove(nwords, i, &tk_len); - free(tk); - i--; - } - } -} - - - -/* END OF FILE */ |