diff options
author | Chris Wilson <chris+github@qwirx.com> | 2010-01-09 16:15:40 +0000 |
---|---|---|
committer | Chris Wilson <chris+github@qwirx.com> | 2010-01-09 16:15:40 +0000 |
commit | 84ce49c7e644e2a7da022f55a703e30498eee388 (patch) | |
tree | b2c2e4ed6560735261652e94ff6bb19532a8e76a /qdbm/odeum.h | |
parent | 059daaf09d915509982cdf472d4f3aa8c78c4d25 (diff) |
Add QDBM 1.8.77 dbm-compatible database library to sources, will
probably be used as a default for the reference tracking implementation
as it's smaller than BDB and apparently faster.
Diffstat (limited to 'qdbm/odeum.h')
-rw-r--r-- | qdbm/odeum.h | 590 |
1 files changed, 590 insertions, 0 deletions
diff --git a/qdbm/odeum.h b/qdbm/odeum.h new file mode 100644 index 00000000..62def9ee --- /dev/null +++ b/qdbm/odeum.h @@ -0,0 +1,590 @@ +/************************************************************************************************* + * The inverted API of QDBM + * Copyright (C) 2000-2007 Mikio Hirabayashi + * This file is part of QDBM, Quick Database Manager. + * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU + * Lesser General Public License as published by the Free Software Foundation; either version + * 2.1 of the License or any later version. QDBM is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * You should have received a copy of the GNU Lesser General Public License along with QDBM; if + * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA. 
+ *************************************************************************************************/ + + +#ifndef _ODEUM_H /* duplication check */ +#define _ODEUM_H + +#if defined(__cplusplus) /* export for C++ */ +extern "C" { +#endif + + +#include <depot.h> +#include <curia.h> +#include <cabin.h> +#include <villa.h> +#include <stdlib.h> +#include <time.h> + + +#if defined(_MSC_VER) && !defined(QDBM_INTERNAL) && !defined(QDBM_STATIC) +#define MYEXTERN extern __declspec(dllimport) +#else +#define MYEXTERN extern +#endif + + + +/************************************************************************************************* + * API + *************************************************************************************************/ + + +typedef struct { /* type of structure for a database handle */ + char *name; /* name of the database directory */ + int wmode; /* whether to be writable */ + int fatal; /* whether a fatal error occurred */ + int inode; /* inode of the database directory */ + CURIA *docsdb; /* database handle for documents */ + CURIA *indexdb; /* database handle for the inverted index */ + VILLA *rdocsdb; /* database handle for the reverse dictionary */ + CBMAP *cachemap; /* cache for dirty buffers of words */ + int cacheasiz; /* total allocated size of dirty buffers */ + CBMAP *sortmap; /* map handle for candidates of sorting */ + int dmax; /* max number of the document ID */ + int dnum; /* number of the documents */ + int ldid; /* ID number of the last registered document */ + char statechars[256]; /* state of single byte characters */ +} ODEUM; + +typedef struct { /* type of structure for a document handle */ + int id; /* ID number */ + char *uri; /* uniform resource identifier */ + CBMAP *attrs; /* map handle for attributes */ + CBLIST *nwords; /* list handle for words in normalized form */ + CBLIST *awords; /* list handle for words in appearance form */ +} ODDOC; + +typedef struct { /* type of structure for an element of search result */ + int id; 
/* ID number of the document */ + int score; /* score of the document */ +} ODPAIR; + +enum { /* enumeration for open modes */ + OD_OREADER = 1 << 0, /* open as a reader */ + OD_OWRITER = 1 << 1, /* open as a writer */ + OD_OCREAT = 1 << 2, /* a writer creating */ + OD_OTRUNC = 1 << 3, /* a writer truncating */ + OD_ONOLCK = 1 << 4, /* open without locking */ + OD_OLCKNB = 1 << 5 /* lock without blocking */ +}; + + +/* Get a database handle. + `name' specifies the name of a database directory. + `omode' specifies the connection mode: `OD_OWRITER' as a writer, `OD_OREADER' as a reader. + If the mode is `OD_OWRITER', the following may be added by bitwise or: `OD_OCREAT', which + means it creates a new database if not exist, `OD_OTRUNC', which means it creates a new + database regardless if one exists. Both of `OD_OREADER' and `OD_OWRITER' can be added to by + bitwise or: `OD_ONOLCK', which means it opens a database directory without file locking, or + `OD_OLCKNB', which means locking is performed without blocking. + The return value is the database handle or `NULL' if it is not successful. + While connecting as a writer, an exclusive lock is invoked to the database directory. + While connecting as a reader, a shared lock is invoked to the database directory. + The thread blocks until the lock is achieved. If `OD_ONOLCK' is used, the application is + responsible for exclusion control. */ +ODEUM *odopen(const char *name, int omode); + + +/* Close a database handle. + `odeum' specifies a database handle. + If successful, the return value is true, else, it is false. + Because the region of a closed handle is released, it becomes impossible to use the handle. + Updating a database is assured to be written when the handle is closed. If a writer opens + a database but does not close it appropriately, the database will be broken. */ +int odclose(ODEUM *odeum); + + +/* Store a document. + `odeum' specifies a database handle connected as a writer. 
+ `doc' specifies a document handle. + `wmax' specifies the max number of words to be stored in the document database. If it is + negative, the number is unlimited. + `over' specifies whether the data of the duplicated document is overwritten or not. If it + is false and the URI of the document is duplicated, the function returns as an error. + If successful, the return value is true, else, it is false. */ +int odput(ODEUM *odeum, ODDOC *doc, int wmax, int over); + + +/* Delete a document specified by a URI. + `odeum' specifies a database handle connected as a writer. + `uri' specifies the string of the URI of a document. + If successful, the return value is true, else, it is false. False is returned when no + document corresponds to the specified URI. */ +int odout(ODEUM *odeum, const char *uri); + + +/* Delete a document specified by an ID number. + `odeum' specifies a database handle connected as a writer. + `id' specifies the ID number of a document. + If successful, the return value is true, else, it is false. False is returned when no + document corresponds to the specified ID number. */ +int odoutbyid(ODEUM *odeum, int id); + + +/* Retrieve a document specified by a URI. + `odeum' specifies a database handle. + `uri' specifies the string the URI of a document. + If successful, the return value is the handle of the corresponding document, else, it is + `NULL'. `NULL' is returned when no document corresponds to the specified URI. + Because the handle of the return value is opened with the function `oddocopen', it should + be closed with the function `oddocclose'. */ +ODDOC *odget(ODEUM *odeum, const char *uri); + + +/* Retrieve a document by an ID number. + `odeum' specifies a database handle. + `id' specifies the ID number of a document. + If successful, the return value is the handle of the corresponding document, else, it is + `NULL'. `NULL' is returned when no document corresponds to the specified ID number. 
+ Because the handle of the return value is opened with the function `oddocopen', it should + be closed with the function `oddocclose'. */ +ODDOC *odgetbyid(ODEUM *odeum, int id); + + +/* Retrieve the ID of the document specified by a URI. + `odeum' specifies a database handle. + `uri' specifies the string of the URI of a document. + If successful, the return value is the ID number of the document, else, it is -1. -1 is + returned when no document corresponds to the specified URI. */ +int odgetidbyuri(ODEUM *odeum, const char *uri); + + +/* Check whether the document specified by an ID number exists. + `odeum' specifies a database handle. + `id' specifies the ID number of a document. + The return value is true if the document exists, else, it is false. */ +int odcheck(ODEUM *odeum, int id); + + +/* Search the inverted index for documents including a particular word. + `odeum' specifies a database handle. + `word' specifies a searching word. + `max' specifies the max number of documents to be retrieved. + `np' specifies the pointer to a variable to which the number of the elements of the return + value is assigned. + If successful, the return value is the pointer to an array, else, it is `NULL'. Each + element of the array is a pair of the ID number and the score of a document, and sorted in + descending order of their scores. Even if no document corresponds to the specified word, + it is not an error but returns a dummy array. + Because the region of the return value is allocated with the `malloc' call, it should be + released with the `free' call if it is no longer in use. Note that each element of the array + of the return value can be data of a deleted document. */ +ODPAIR *odsearch(ODEUM *odeum, const char *word, int max, int *np); + + +/* Get the number of documents including a word. + `odeum' specifies a database handle. + `word' specifies a searching word. + If successful, the return value is the number of documents including the word, else, it is -1. 
+ Because this function does not read the entity of the inverted index, it is faster than + `odsearch'. */ +int odsearchdnum(ODEUM *odeum, const char *word); + + +/* Initialize the iterator of a database handle. + `odeum' specifies a database handle. + If successful, the return value is true, else, it is false. + The iterator is used in order to access every document stored in a database. */ +int oditerinit(ODEUM *odeum); + + +/* Get the next key of the iterator. + `odeum' specifies a database handle. + If successful, the return value is the handle of the next document, else, it is `NULL'. + `NULL' is returned when no document is to be get out of the iterator. + It is possible to access every document by iteration of calling this function. However, + it is not assured if updating the database is occurred while the iteration. Besides, the + order of this traversal access method is arbitrary, so it is not assured that the order of + string matches the one of the traversal access. Because the handle of the return value is + opened with the function `oddocopen', it should be closed with the function `oddocclose'. */ +ODDOC *oditernext(ODEUM *odeum); + + +/* Synchronize updating contents with the files and the devices. + `odeum' specifies a database handle connected as a writer. + If successful, the return value is true, else, it is false. + This function is useful when another process uses the connected database directory. */ +int odsync(ODEUM *odeum); + + +/* Optimize a database. + `odeum' specifies a database handle connected as a writer. + If successful, the return value is true, else, it is false. + Elements of the deleted documents in the inverted index are purged. */ +int odoptimize(ODEUM *odeum); + + +/* Get the name of a database. + `odeum' specifies a database handle. + If successful, the return value is the pointer to the region of the name of the database, + else, it is `NULL'. 
+ Because the region of the return value is allocated with the `malloc' call, it should be + released with the `free' call if it is no longer in use. */ +char *odname(ODEUM *odeum); + + +/* Get the total size of database files. + `odeum' specifies a database handle. + If successful, the return value is the total size of the database files, else, it is -1.0. */ +double odfsiz(ODEUM *odeum); + + +/* Get the total number of the elements of the bucket arrays in the inverted index. + `odeum' specifies a database handle. + If successful, the return value is the total number of the elements of the bucket arrays, + else, it is -1. */ +int odbnum(ODEUM *odeum); + + +/* Get the total number of the used elements of the bucket arrays in the inverted index. + `odeum' specifies a database handle. + If successful, the return value is the total number of the used elements of the bucket + arrays, else, it is -1. */ +int odbusenum(ODEUM *odeum); + + +/* Get the number of the documents stored in a database. + `odeum' specifies a database handle. + If successful, the return value is the number of the documents stored in the database, else, + it is -1. */ +int oddnum(ODEUM *odeum); + + +/* Get the number of the words stored in a database. + `odeum' specifies a database handle. + If successful, the return value is the number of the words stored in the database, else, + it is -1. + Because of the I/O buffer, the return value may be less than the hard number. */ +int odwnum(ODEUM *odeum); + + +/* Check whether a database handle is a writer or not. + `odeum' specifies a database handle. + The return value is true if the handle is a writer, false if not. */ +int odwritable(ODEUM *odeum); + + +/* Check whether a database has a fatal error or not. + `odeum' specifies a database handle. + The return value is true if the database has a fatal error, false if not. */ +int odfatalerror(ODEUM *odeum); + + +/* Get the inode number of a database directory. + `odeum' specifies a database handle. 
+ The return value is the inode number of the database directory. */ +int odinode(ODEUM *odeum); + + +/* Get the last modified time of a database. + `odeum' specifies a database handle. + The return value is the last modified time of the database. */ +time_t odmtime(ODEUM *odeum); + + +/* Merge plural database directories. + `name' specifies the name of a database directory to create. + `elemnames' specifies a list of names of element databases. + If successful, the return value is true, else, it is false. + If two or more documents which have the same URL come in, the first one is adopted and the + others are ignored. */ +int odmerge(const char *name, const CBLIST *elemnames); + + +/* Remove a database directory. + `name' specifies the name of a database directory. + If successful, the return value is true, else, it is false. + A database directory can contain databases of other APIs of QDBM, they are also removed by + this function. */ +int odremove(const char *name); + + +/* Get a document handle. + `uri' specifies the URI of a document. + The return value is a document handle. + The ID number of a new document is not defined. It is defined when the document is stored + in a database. */ +ODDOC *oddocopen(const char *uri); + + +/* Close a document handle. + `doc' specifies a document handle. + Because the region of a closed handle is released, it becomes impossible to use the handle. */ +void oddocclose(ODDOC *doc); + + +/* Add an attribute to a document. + `doc' specifies a document handle. + `name' specifies the string of the name of an attribute. + `value' specifies the string of the value of the attribute. */ +void oddocaddattr(ODDOC *doc, const char *name, const char *value); + + +/* Add a word to a document. + `doc' specifies a document handle. + `normal' specifies the string of the normalized form of a word. Normalized forms are + treated as keys of the inverted index. 
If the normalized form of a word is an empty + string, the word is not reflected in the inverted index. + `asis' specifies the string of the appearance form of the word. Appearance forms are used + after the document is retrieved by an application. */ +void oddocaddword(ODDOC *doc, const char *normal, const char *asis); + + +/* Get the ID number of a document. + `doc' specifies a document handle. + The return value is the ID number of a document. */ +int oddocid(const ODDOC *doc); + + +/* Get the URI of a document. + `doc' specifies a document handle. + The return value is the string of the URI of a document. */ +const char *oddocuri(const ODDOC *doc); + + +/* Get the value of an attribute of a document. + `doc' specifies a document handle. + `name' specifies the string of the name of an attribute. + The return value is the string of the value of the attribute, or `NULL' if no attribute + corresponds. */ +const char *oddocgetattr(const ODDOC *doc, const char *name); + + +/* Get the list handle containing words in normalized form of a document. + `doc' specifies a document handle. + The return value is the list handle containing words in normalized form. */ +const CBLIST *oddocnwords(const ODDOC *doc); + + +/* Get the list handle containing words in appearance form of a document. + `doc' specifies a document handle. + The return value is the list handle containing words in appearance form. */ +const CBLIST *oddocawords(const ODDOC *doc); + + +/* Get the map handle containing keywords in normalized form and their scores. + `doc' specifies a document handle. + `max' specifies the max number of keywords to get. + `odeum' specifies a database handle with which the IDF for weighting is calculated. + If it is `NULL', it is not used. + The return value is the map handle containing keywords and their scores. Scores are expressed + as decimal strings. 
+ Because the handle of the return value is opened with the function `cbmapopen', it should + be closed with the function `cbmapclose' if it is no longer in use. */ +CBMAP *oddocscores(const ODDOC *doc, int max, ODEUM *odeum); + + +/* Break a text into words in appearance form. + `text' specifies the string of a text. + The return value is the list handle containing words in appearance form. + Words are separated with space characters and such delimiters as period, comma and so on. + Because the handle of the return value is opened with the function `cblistopen', it should + be closed with the function `cblistclose' if it is no longer in use. */ +CBLIST *odbreaktext(const char *text); + + +/* Make the normalized form of a word. + `asis' specifies the string of the appearance form of a word. + The return value is the string of the normalized form of the word. + Alphabetic characters of the ASCII code are unified into lower case. Words composed of only delimiters + are treated as empty strings. Because the region of the return value is allocated with the + `malloc' call, it should be released with the `free' call if it is no longer in use. */ +char *odnormalizeword(const char *asis); + + +/* Get the common elements of two sets of documents. + `apairs' specifies the pointer to the former document array. + `anum' specifies the number of the elements of the former document array. + `bpairs' specifies the pointer to the latter document array. + `bnum' specifies the number of the elements of the latter document array. + `np' specifies the pointer to a variable to which the number of the elements of the return + value is assigned. + The return value is the pointer to a new document array whose elements commonly belong to + the specified two sets. + Elements of the array are sorted in descending order of their scores. Because the region of + the return value is allocated with the `malloc' call, it should be released with the `free' + call if it is no longer in use. 
*/ +ODPAIR *odpairsand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np); + + +/* Get the sum of elements of two sets of documents. + `apairs' specifies the pointer to the former document array. + `anum' specifies the number of the elements of the former document array. + `bpairs' specifies the pointer to the latter document array. + `bnum' specifies the number of the elements of the latter document array. + `np' specifies the pointer to a variable to which the number of the elements of the return + value is assigned. + The return value is the pointer to a new document array whose elements belong to both or + either of the specified two sets. + Elements of the array are sorted in descending order of their scores. Because the region of + the return value is allocated with the `malloc' call, it should be released with the `free' + call if it is no longer in use. */ +ODPAIR *odpairsor(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np); + + +/* Get the difference set of documents. + `apairs' specifies the pointer to the former document array. + `anum' specifies the number of the elements of the former document array. + `bpairs' specifies the pointer to the latter document array of the sum of elements. + `bnum' specifies the number of the elements of the latter document array. + `np' specifies the pointer to a variable to which the number of the elements of the return + value is assigned. + The return value is the pointer to a new document array whose elements belong to the former + set but not to the latter set. + Elements of the array are sorted in descending order of their scores. Because the region of + the return value is allocated with the `malloc' call, it should be released with the `free' + call if it is no longer in use. */ +ODPAIR *odpairsnotand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np); + + +/* Sort a set of documents in descending order of scores. + `pairs' specifies the pointer to a document array. 
+ `pnum' specifies the number of the elements of the document array. */ +void odpairssort(ODPAIR *pairs, int pnum); + + +/* Get the natural logarithm of a number. + `x' specifies a number. + The return value is the natural logarithm of the number. If the number is equal to or less + than 1.0, the return value is 0.0. + This function is useful when an application calculates the IDF of search results. */ +double odlogarithm(double x); + + +/* Get the cosine of the angle of two vectors. + `avec' specifies the pointer to one array of numbers. + `bvec' specifies the pointer to the other array of numbers. + `vnum' specifies the number of elements of each array. + The return value is the cosine of the angle of two vectors. + This function is useful when an application calculates similarity of documents. */ +double odvectorcosine(const int *avec, const int *bvec, int vnum); + + +/* Set the global tuning parameters. + `ibnum' specifies the number of buckets for inverted indexes. + `idnum' specifies the division number of inverted index. + `cbnum' specifies the number of buckets for dirty buffers. + `csiz' specifies the maximum number of bytes of memory to use for dirty buffers. + The default setting is equivalent to `odsettuning(32749, 7, 262139, 8388608)'. This function + should be called before opening a handle. */ +void odsettuning(int ibnum, int idnum, int cbnum, int csiz); + + +/* Break a text into words and store appearance forms and normalized forms into lists. + `odeum' specifies a database handle. + `text' specifies the string of a text. + `awords' specifies a list handle into which appearance forms are stored. + `nwords' specifies a list handle into which normalized forms are stored. If it is `NULL', it is + ignored. + Words are separated with space characters and such delimiters as period, comma and so on. */ +void odanalyzetext(ODEUM *odeum, const char *text, CBLIST *awords, CBLIST *nwords); + + +/* Set the classes of characters used by `odanalyzetext'. 
+ `odeum' specifies a database handle. + `spacechars' specifies a string containing space characters. + `delimchars' specifies a string containing delimiter characters. + `gluechars' specifies a string containing glue characters. */ +void odsetcharclass(ODEUM *odeum, const char *spacechars, const char *delimchars, + const char *gluechars); + + +/* Query a database using a small boolean query language. + `odeum' specifies a database handle. + `query' specifies the text of the query. + `np' specifies the pointer to a variable to which the number of the elements of the return + value is assigned. + `errors' specifies a list handle into which error messages are stored. If it is `NULL', it + is ignored. + If successful, the return value is the pointer to an array, else, it is `NULL'. Each + element of the array is a pair of the ID number and the score of a document, and sorted in + descending order of their scores. Even if no document corresponds to the specified condition, + it is not an error but returns a dummy array. + Because the region of the return value is allocated with the `malloc' call, it should be + released with the `free' call if it is no longer in use. Note that each element of the array + of the return value can be data of a deleted document. */ +ODPAIR *odquery(ODEUM *odeum, const char *query, int *np, CBLIST *errors); + + + +/************************************************************************************************* + * features for experts + *************************************************************************************************/ + + +/* Get the internal database handle for documents. + `odeum' specifies a database handle. + The return value is the internal database handle for documents. + Note that the returned handle should not be updated. */ +CURIA *odidbdocs(ODEUM *odeum); + + +/* Get the internal database handle for the inverted index. + `odeum' specifies a database handle. 
+ The return value is the internal database handle for the inverted index. + Note that the returned handle should not be updated. */ +CURIA *odidbindex(ODEUM *odeum); + + +/* Get the internal database handle for the reverse dictionary. + `odeum' specifies a database handle. + The return value is the internal database handle for the reverse dictionary. + Note that the returned handle should not be updated. */ +VILLA *odidbrdocs(ODEUM *odeum); + + +/* Set the call back function called in merging. + `otcb' specifies the pointer to a function to report outturn. Its first argument is the name + of the processing function. Its second argument is the handle of the database being processed. + Its third argument is the string of a log message. If it is `NULL', the call back function is + cleared. */ +void odsetotcb(void (*otcb)(const char *, ODEUM *, const char *)); + + +/* Get the positive one of square roots of a number. + `x' specifies a number. + The return value is the positive one of square roots of a number. If the number is equal to + or less than 0.0, the return value is 0.0. */ +double odsquareroot(double x); + + +/* Get the absolute value of a vector. + `vec' specifies the pointer to an array of numbers. + `vnum' specifies the number of elements of the array. + The return value is the absolute value of a vector. */ +double odvecabsolute(const int *vec, int vnum); + + +/* Get the inner product of two vectors. + `avec' specifies the pointer to one array of numbers. + `bvec' specifies the pointer to the other array of numbers. + `vnum' specifies the number of elements of each array. + The return value is the inner product of two vectors. */ +double odvecinnerproduct(const int *avec, const int *bvec, int vnum); + + + +#undef MYEXTERN + +#if defined(__cplusplus) /* export for C++ */ +} +#endif + +#endif /* duplication check */ + + +/* END OF FILE */ |