diff options
author | Chris Wilson <chris+github@qwirx.com> | 2010-10-18 20:34:25 +0000 |
---|---|---|
committer | Chris Wilson <chris+github@qwirx.com> | 2010-10-18 20:34:25 +0000 |
commit | cd154e05f09a5bd40e258f50a71e89d263143e35 (patch) | |
tree | facb5de5351f04d00c32b0504291f68eb8890b26 /qdbm/odidx.c | |
parent | a1f32183837c489dae93a1813653288e41c60f70 (diff) |
Revert [2710] and [2717], remove TDB and replace with QDBM again, to fix
build on Windows and make it easier to merge Charles' work.
Diffstat (limited to 'qdbm/odidx.c')
-rw-r--r-- | qdbm/odidx.c | 890 |
1 files changed, 890 insertions, 0 deletions
diff --git a/qdbm/odidx.c b/qdbm/odidx.c new file mode 100644 index 00000000..a4b5b406 --- /dev/null +++ b/qdbm/odidx.c @@ -0,0 +1,890 @@ +/************************************************************************************************* + * Utility for indexing document files into a database of Odeum + * Copyright (C) 2000-2007 Mikio Hirabayashi + * This file is part of QDBM, Quick Database Manager. + * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU + * Lesser General Public License as published by the Free Software Foundation; either version + * 2.1 of the License or any later version. QDBM is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * You should have received a copy of the GNU Lesser General Public License along with QDBM; if + * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA. + *************************************************************************************************/ + + +#include <depot.h> +#include <cabin.h> +#include <odeum.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <stdarg.h> +#include <time.h> +#include <signal.h> + +#undef TRUE +#define TRUE 1 /* boolean true */ +#undef FALSE +#define FALSE 0 /* boolean false */ + +#define PATHCHR '/' /* delimiter character of path */ +#define EXTCHR '.' /* delimiter character of extension */ +#define CDIRSTR "." /* string of current directory */ +#define PDIRSTR ".." /* string of parent directory */ +#define MTDBNAME "_mtime" /* name of the database for last modified times */ +#define MTDBLRM 81 /* records in a leaf node of time database */ +#define MTDBNIM 192 /* records in a non-leaf node of time database */ +#define MTDBLCN 64 /* number of leaf cache of time database */ +#define MTDBNCN 32 /* number of non-leaf cache of time database */ +#define SCDBNAME "_score" /* name of the database for scores */ +#define SCDBBNUM 32749 /* bucket number of the score database */ +#define SCDBALIGN -3 /* alignment of the score database */ +#define PATHBUFSIZ 2048 /* size of a path buffer */ +#define MAXLOAD 0.85 /* max ratio of bucket loading */ +#define KEYNUM 32 /* number of keywords to store */ + + +/* for Win32 and RISC OS */ +#if defined(_WIN32) +#undef PATHCHR +#define PATHCHR '\\' +#undef EXTCHR +#define EXTCHR '.' +#undef CDIRSTR +#define CDIRSTR "." +#undef PDIRSTR +#define PDIRSTR ".." +#elif defined(__riscos__) || defined(__riscos) +#include <unixlib/local.h> +int __riscosify_control = __RISCOSIFY_NO_PROCESS; +#undef PATHCHR +#define PATHCHR '.' +#undef EXTCHR +#define EXTCHR '/' +#undef CDIRSTR +#define CDIRSTR "@" +#undef PDIRSTR +#define PDIRSTR "^" +#endif + + +/* global variables */ +const char *progname; /* program name */ +int sigterm; /* flag for termination signal */ + + +/* function prototypes */ +int main(int argc, char **argv); +void setsignals(void); +void sigtermhandler(int num); +void usage(void); +int runregister(int argc, char **argv); +int runrelate(int argc, char **argv); +int runpurge(int argc, char **argv); +int bwimatchlist(const char *str, const CBLIST *keys); +char *fgetl(FILE *ifp); +void otcb(const char *fname, ODEUM *odeum, const char *msg); +void pdperror(const char *name); +void printferror(const char *format, ...); +void printfinfo(const char *format, ...); +const char *datestr(time_t t); +int proclist(const char *name, const char *lfile, int wmax, + const CBLIST *tsuflist, const CBLIST *hsuflist); +int procdir(const char *name, const char *dir, int wmax, + const CBLIST *tsuflist, const CBLIST *hsuflist); +int indexdir(ODEUM *odeum, VILLA *mtdb, const char *name, const char *dir, int wmax, + const CBLIST *tsuflist, const CBLIST *hsuflist); +int indexfile(ODEUM *odeum, VILLA *mtdb, const char *name, const char *file, int wmax, + const CBLIST *tsuflist, const CBLIST *hsuflist); +char *filetouri(const char *file); +ODDOC *makedocplain(const char *uri, const char *text, const char *date); +ODDOC *makedochtml(const char *uri, const char *html, const char *date); +CBMAP *htmlescpairs(void); +int procrelate(const char *name); +int procpurge(const char *name); + + +/* main routine */ +int main(int argc, char **argv){ + int rv; + cbstdiobin(); + progname = argv[0]; + sigterm = FALSE; + setsignals(); + if(argc < 2) usage(); + odsetotcb(otcb); + rv = 0; + if(!strcmp(argv[1], "register")){ + rv = runregister(argc, argv); + } else if(!strcmp(argv[1], "relate")){ + rv = runrelate(argc, argv); + } else if(!strcmp(argv[1], "purge")){ + rv = runpurge(argc, argv); + } else { + usage(); + } + return rv; +} + + +/* set signal handlers */ +void setsignals(void){ + signal(1, sigtermhandler); + signal(2, sigtermhandler); + signal(3, sigtermhandler); + signal(13, sigtermhandler); + signal(15, sigtermhandler); +} + + +/* handler of termination signal */ +void sigtermhandler(int num){ + signal(num, SIG_DFL); + sigterm = TRUE; + printfinfo("the termination signal %d catched", num); +} + + +/* print the usage and exit */ +void usage(void){ + fprintf(stderr, "%s: indexer of document files\n", progname); + fprintf(stderr, "\n"); + fprintf(stderr, "usage:\n"); + fprintf(stderr, " %s register [-l file] [-wmax num] [-tsuf sufs] [-hsuf sufs] name [dir]\n", + progname); + fprintf(stderr, " %s relate name\n", progname); + fprintf(stderr, " %s purge name\n", progname); + fprintf(stderr, "\n"); + exit(1); +} + + +/* parse arguments of register command */ +int runregister(int argc, char **argv){ + char *name, *dir, *lfile, *tsuf, *hsuf, path[PATHBUFSIZ]; + int i, wmax, plen, rv; + CBLIST *tsuflist, *hsuflist; + name = NULL; + dir = NULL; + lfile = NULL; + tsuf = NULL; + hsuf = NULL; + wmax = -1; + for(i = 2; i < argc; i++){ + if(!name && argv[i][0] == '-'){ + if(!strcmp(argv[i], "-l")){ + if(++i >= argc) usage(); + lfile = argv[i]; + } else if(!strcmp(argv[i], "-wmax")){ + if(++i >= argc) usage(); + wmax = atoi(argv[i]); + } else if(!strcmp(argv[i], "-tsuf")){ + if(++i >= argc) usage(); + tsuf = argv[i]; + } else if(!strcmp(argv[i], "-hsuf")){ + if(++i >= argc) usage(); + hsuf = argv[i]; + } else { + usage(); + } + } else if(!name){ + name = argv[i]; + } else if(!dir){ + dir = argv[i]; + } else { + usage(); + } + } + if(!name) usage(); + if(!dir) dir = CDIRSTR; + plen = sprintf(path, "%s", dir); + if(plen > 1 && path[plen-1] == PATHCHR) path[plen-1] = '\0'; + tsuflist = cbsplit(tsuf ? tsuf : ".txt,.text", -1, ","); + hsuflist = cbsplit(hsuf ? hsuf : ".html,.htm", -1, ","); + if(lfile){ + rv = proclist(name, lfile, wmax, tsuflist, hsuflist); + } else { + rv = procdir(name, path, wmax, tsuflist, hsuflist); + } + cblistclose(hsuflist); + cblistclose(tsuflist); + return rv; +} + + +/* parse arguments of relate command */ +int runrelate(int argc, char **argv){ + char *name; + int i, rv; + name = NULL; + for(i = 2; i < argc; i++){ + if(!name && argv[i][0] == '-'){ + usage(); + } else if(!name){ + name = argv[i]; + } else { + usage(); + } + } + if(!name) usage(); + rv = procrelate(name); + return rv; +} + + +/* parse arguments of purge command */ +int runpurge(int argc, char **argv){ + char *name; + int i, rv; + name = NULL; + for(i = 2; i < argc; i++){ + if(!name && argv[i][0] == '-'){ + usage(); + } else if(!name){ + name = argv[i]; + } else { + usage(); + } + } + if(!name) usage(); + rv = procpurge(name); + return rv; +} + + +/* case insensitive backward matching with a list */ +int bwimatchlist(const char *str, const CBLIST *keys){ + int i; + for(i = 0; i < cblistnum(keys); i++){ + if(cbstrbwimatch(str, cblistval(keys, i, NULL))) return TRUE; + } + return FALSE; +} + + +/* read a line */ +char *fgetl(FILE *ifp){ + char *buf; + int c, len, blen; + buf = NULL; + len = 0; + blen = 256; + while((c = fgetc(ifp)) != EOF){ + if(blen <= len) blen *= 2; + buf = cbrealloc(buf, blen + 1); + if(c == '\n') c = '\0'; + buf[len++] = c; + if(c == '\0') break; + } + if(!buf) return NULL; + buf[len] = '\0'; + return buf; +} + + +/* report the outturn */ +void otcb(const char *fname, ODEUM *odeum, const char *msg){ + char *name; + name = odname(odeum); + printf("%s: %s: %s: %s\n", progname, fname, name, msg); + free(name); +} + + +/* print an error message */ +void pdperror(const char *name){ + printf("%s: ERROR: %s: %s\n", progname, name, dperrmsg(dpecode)); + fflush(stdout); +} + + +/* print formatted error string and flush the buffer */ +void printferror(const char *format, ...){ + va_list ap; + va_start(ap, format); + printf("%s: ERROR: ", progname); + vprintf(format, ap); + putchar('\n'); + fflush(stdout); + va_end(ap); +} + + +/* print formatted information string and flush the buffer */ +void printfinfo(const char *format, ...){ + va_list ap; + va_start(ap, format); + printf("%s: INFO: ", progname); + vprintf(format, ap); + putchar('\n'); + fflush(stdout); + va_end(ap); +} + + +/* get static string of the date */ +const char *datestr(time_t t){ + static char buf[32]; + struct tm *stp; + if(!(stp = localtime(&t))) return "0000/00/00 00:00:00"; + sprintf(buf, "%04d/%02d/%02d %02d:%02d:%02d", + stp->tm_year + 1900, stp->tm_mon + 1, stp->tm_mday, + stp->tm_hour, stp->tm_min, stp->tm_sec); + return buf; +} + + +/* processing with finding files in a list file */ +int proclist(const char *name, const char *lfile, int wmax, + const CBLIST *tsuflist, const CBLIST *hsuflist){ + ODEUM *odeum; + VILLA *mtdb; + FILE *ifp; + char *line, path[PATHBUFSIZ]; + int err, fatal; + if(!strcmp(lfile, "-")){ + ifp = stdin; + } else { + if(!(ifp = fopen(lfile, "rb"))){ + printferror("%s: file cannot be opened", lfile); + return 1; + } + } + printfinfo("%s: registration started", name); + if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){ + pdperror(name); + if(ifp != stdin) fclose(ifp); + return 1; + } + sprintf(path, "%s%c%s", name, PATHCHR, MTDBNAME); + if(!(mtdb = vlopen(path, VL_OWRITER | VL_OCREAT, VL_CMPLEX))){ + pdperror(name); + odclose(odeum); + if(ifp != stdin) fclose(ifp); + return 1; + } + vlsettuning(mtdb, MTDBLRM, MTDBNIM, MTDBLCN, MTDBNCN); + printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + err = FALSE; + while((line = fgetl(ifp)) != NULL){ + if(sigterm){ + printferror("aborting due to a termination signal"); + free(line); + err = TRUE; + break; + } + if(!indexfile(odeum, mtdb, name, line, wmax, tsuflist, hsuflist)) err = TRUE; + free(line); + } + fatal = odfatalerror(odeum); + printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + if(!vlclose(mtdb)){ + pdperror(name); + err = TRUE; + } + if(!odclose(odeum)){ + pdperror(name); + err = TRUE; + } + if(ifp != stdin) fclose(ifp); + if(err){ + printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : ""); + } else { + printfinfo("%s: registration completed successfully", name); + } + return err ? 1 : 0; +} + + +/* processing with finding files in a directory */ +int procdir(const char *name, const char *dir, int wmax, + const CBLIST *tsuflist, const CBLIST *hsuflist){ + ODEUM *odeum; + VILLA *mtdb; + char path[PATHBUFSIZ]; + int err, fatal; + printfinfo("%s: registration started", name); + if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){ + pdperror(name); + return 1; + } + sprintf(path, "%s%c%s", name, PATHCHR, MTDBNAME); + if(!(mtdb = vlopen(path, VL_OWRITER | VL_OCREAT, VL_CMPLEX))){ + pdperror(name); + odclose(odeum); + return 1; + } + vlsettuning(mtdb, MTDBLRM, MTDBNIM, MTDBLCN, MTDBNCN); + printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + err = FALSE; + if(!indexdir(odeum, mtdb, name, dir, wmax, tsuflist, hsuflist)) err = TRUE; + fatal = odfatalerror(odeum); + printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + if(!vlclose(mtdb)){ + pdperror(name); + err = TRUE; + } + if(!odclose(odeum)){ + pdperror(name); + err = TRUE; + } + if(err){ + printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : ""); + } else { + printfinfo("%s: registration completed successfully", name); + } + return err ? 1 : 0; +} + + +/* find and index files in a directory */ +int indexdir(ODEUM *odeum, VILLA *mtdb, const char *name, const char *dir, int wmax, + const CBLIST *tsuflist, const CBLIST *hsuflist){ + CBLIST *files; + const char *file; + char path[PATHBUFSIZ]; + int i, isroot, isdir, err; + if(!(files = cbdirlist(dir))){ + printferror("%s: directory cannot be opened", dir); + return FALSE; + } + isroot = dir[0] == PATHCHR && dir[1] == '\0'; + err = FALSE; + for(i = 0; i < cblistnum(files); i++){ + if(sigterm){ + printferror("aborting due to a termination signal"); + cblistclose(files); + return FALSE; + } + file = cblistval(files, i, NULL); + if(!strcmp(file, CDIRSTR) || !strcmp(file, PDIRSTR)) continue; + if(isroot){ + sprintf(path, "%s%s", dir, file); + } else { + sprintf(path, "%s%c%s", dir, PATHCHR, file); + } + if(!cbfilestat(path, &isdir, NULL, NULL)){ + printferror("%s: file does not exist", file); + err = TRUE; + continue; + } + if(isdir){ + if(!indexdir(odeum, mtdb, name, path, wmax, tsuflist, hsuflist)) err = TRUE; + } else { + if(!indexfile(odeum, mtdb, name, path, wmax, tsuflist, hsuflist)) err = TRUE; + } + } + cblistclose(files); + return err ? FALSE : TRUE; +} + + +/* index a file into the database */ +int indexfile(ODEUM *odeum, VILLA *mtdb, const char *name, const char *file, int wmax, + const CBLIST *tsuflist, const CBLIST *hsuflist){ + static int cnt = 0; + char *vbuf, *buf, *uri; + const char *title; + int size, hot, vsiz, wnum, bnum; + time_t mtime; + ODDOC *doc; + if(!cbfilestat(file, NULL, &size, &mtime)){ + printferror("%s: file does not exist", file); + return FALSE; + } + hot = TRUE; + if((vbuf = vlget(mtdb, file, -1, &vsiz)) != NULL){ + if(vsiz == sizeof(int) && mtime <= *(int *)vbuf) hot = FALSE; + free(vbuf); + } + if(!hot){ + printfinfo("%s: passed", file); + return TRUE; + } + doc = NULL; + uri = filetouri(file); + if(bwimatchlist(file, tsuflist)){ + if(!(buf = cbreadfile(file, NULL))){ + printferror("%s: file cannot be opened", file); + return FALSE; + } + doc = makedocplain(uri, buf, datestr(mtime)); + free(buf); + } else if(bwimatchlist(file, hsuflist)){ + if(!(buf = cbreadfile(file, NULL))){ + printferror("%s: file cannot be opened", file); + return FALSE; + } + doc = makedochtml(uri, buf, datestr(mtime)); + free(buf); + } + free(uri); + if(doc){ + if(!(title = oddocgetattr(doc, "title")) || strlen(title) < 1){ + if((title = strrchr(file, PATHCHR)) != NULL){ + title++; + } else { + title = file; + } + oddocaddattr(doc, "title", title); + } + if(odput(odeum, doc, wmax, TRUE) && + vlput(mtdb, file, -1, (char *)&mtime, sizeof(int), VL_DOVER)){ + printfinfo("%s: registered: id=%d wnum=%d", + file, oddocid(doc), cblistnum(oddocnwords(doc))); + cnt++; + } else { + pdperror(file); + } + oddocclose(doc); + } + wnum = odwnum(odeum); + bnum = odbnum(odeum); + if(wnum != -1 && bnum != -1 && (double)wnum / (double)bnum > MAXLOAD){ + printfinfo("%s: optimizing started: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + if(!odoptimize(odeum)){ + pdperror(file); + return FALSE; + } + printfinfo("%s: optimizing completed: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + } + if(cnt >= 256){ + printfinfo("%s: database status: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + cnt = 0; + } + return TRUE; +} + + +/* make the url from file path */ +char *filetouri(const char *file){ + CBLIST *list; + char str[PATHBUFSIZ], *wp, *enc; + const char *name; + int i, nsiz; + sprintf(str, "%c", PATHCHR); + list = cbsplit(file, -1, str); + wp = str; + for(i = 0; i < cblistnum(list); i++){ + if(i > 0) *(wp++) = '/'; + name = cblistval(list, i, &nsiz); + enc = cburlencode(name, nsiz); + wp += sprintf(wp, "%s", enc); + free(enc); + } + cblistclose(list); + *wp = '\0'; + return cbmemdup(str, -1); +} + + +/* make a document of plain text */ +ODDOC *makedocplain(const char *uri, const char *text, const char *date){ + ODDOC *doc; + CBLIST *awords; + const char *asis; + char *normal; + int i; + doc = oddocopen(uri); + if(date) oddocaddattr(doc, "date", date); + awords = odbreaktext(text); + for(i = 0; i < cblistnum(awords); i++){ + asis = cblistval(awords, i, NULL); + normal = odnormalizeword(asis); + oddocaddword(doc, normal, asis); + free(normal); + } + cblistclose(awords); + return doc; +} + + +/* make a document of HTML */ +ODDOC *makedochtml(const char *uri, const char *html, const char *date){ + ODDOC *doc; + CBMAP *pairs; + CBLIST *elems, *awords; + const char *text, *asis; + char *rtext, *normal; + int i, j, body; + pairs = htmlescpairs(); + doc = oddocopen(uri); + if(date) oddocaddattr(doc, "date", date); + elems = cbxmlbreak(html, TRUE); + body = FALSE; + for(i = 0; i < cblistnum(elems); i++){ + text = cblistval(elems, i, NULL); + if(cbstrfwimatch(text, "<title")){ + i++; + if(i < cblistnum(elems)){ + text = cblistval(elems, i, NULL); + if(text[0] == '<') text = ""; + rtext = cbreplace(text, pairs); + for(j = 0; rtext[j] != '\0'; j++){ + if(strchr("\t\n\v\f\r", rtext[j])) rtext[j] = ' '; + } + while(--j >= 0){ + if(rtext[j] != ' ') break; + rtext[j] = '\0'; + } + for(j = 0; rtext[j] != '\0'; j++){ + if(rtext[j] != ' ') break; + } + oddocaddattr(doc, "title", rtext + j); + awords = odbreaktext(rtext); + for(j = 0; j < cblistnum(awords); j++){ + asis = cblistval(awords, j, NULL); + normal = odnormalizeword(asis); + oddocaddword(doc, normal, ""); + free(normal); + } + cblistclose(awords); + free(rtext); + } + } else if(cbstrfwimatch(text, "<body")){ + body = TRUE; + } else if(body && text[0] != '<'){ + rtext = cbreplace(text, pairs); + awords = odbreaktext(rtext); + for(j = 0; j < cblistnum(awords); j++){ + asis = cblistval(awords, j, NULL); + normal = odnormalizeword(asis); + oddocaddword(doc, normal, asis); + free(normal); + } + cblistclose(awords); + free(rtext); + } + } + if(!body){ + for(i = 0; i < cblistnum(elems); i++){ + text = cblistval(elems, i, NULL); + if(cbstrfwimatch(text, "<title")){ + i++; + } else if(text[0] != '<'){ + rtext = cbreplace(text, pairs); + awords = odbreaktext(rtext); + for(j = 0; j < cblistnum(awords); j++){ + asis = cblistval(awords, j, NULL); + normal = odnormalizeword(asis); + oddocaddword(doc, normal, asis); + free(normal); + } + cblistclose(awords); + free(rtext); + } + } + } + cblistclose(elems); + return doc; +} + + +/* get pairs of escaping characters */ +CBMAP *htmlescpairs(void){ + char *latinext[] = { + " ", "!", "(cent)", "(pound)", "(currency)", "(yen)", "|", "(section)", "\"", "(C)", + "", "<<", "(not)", "-", "(R)", "~", "(degree)", "+-", "^2", "^3", + "'", "(u)", "(P)", "*", ",", "^1", "", ">>", "(1/4)", "(1/2)", + "(3/4)", "?", "A", "A", "A", "A", "A", "A", "AE", "C", + "E", "E", "E", "E", "I", "I", "I", "I", "D", "N", + "O", "O", "O", "O", "O", "*", "O", "U", "U", "U", + "U", "Y", "P", "s", "a", "a", "a", "a", "a", "a", + "ae", "c", "e", "e", "e", "e", "i", "i", "i", "i", + "o", "n", "o", "o", "o", "o", "o", "/", "o", "u", + "u", "u", "u", "y", "p", "y", NULL + }; + static CBMAP *pairs = NULL; + char kbuf[8], vbuf[8]; + int i, ksiz, vsiz; + if(pairs) return pairs; + pairs = cbmapopen(); + cbglobalgc(pairs, (void (*)(void *))cbmapclose); + cbmapput(pairs, "&", -1, "&", -1, TRUE); + cbmapput(pairs, "<", -1, "<", -1, TRUE); + cbmapput(pairs, ">", -1, ">", -1, TRUE); + cbmapput(pairs, """, -1, "\"", -1, TRUE); + cbmapput(pairs, "'", -1, "'", -1, TRUE); + cbmapput(pairs, " ", -1, " ", -1, TRUE); + cbmapput(pairs, "©", -1, "(C)", -1, TRUE); + cbmapput(pairs, "®", -1, "(R)", -1, TRUE); + cbmapput(pairs, "™", -1, "(TM)", -1, TRUE); + for(i = 1; i <= 127; i++){ + ksiz = sprintf(kbuf, "&#%d;", i); + vsiz = sprintf(vbuf, "%c", i); + cbmapput(pairs, kbuf, ksiz, vbuf, vsiz, TRUE); + } + cbmapput(pairs, "‚", -1, ",", -1, TRUE); + cbmapput(pairs, "„", -1, ",,", -1, TRUE); + cbmapput(pairs, "…", -1, "...", -1, TRUE); + cbmapput(pairs, "‹", -1, "<", -1, TRUE); + cbmapput(pairs, "‘", -1, "'", -1, TRUE); + cbmapput(pairs, "’", -1, "'", -1, TRUE); + cbmapput(pairs, "“", -1, "\"", -1, TRUE); + cbmapput(pairs, "”", -1, "\"", -1, TRUE); + cbmapput(pairs, "–", -1, "-", -1, TRUE); + cbmapput(pairs, "—", -1, "-", -1, TRUE); + cbmapput(pairs, "˜", -1, "~", -1, TRUE); + cbmapput(pairs, "™", -1, "(TM)", -1, TRUE); + cbmapput(pairs, "›", -1, ">", -1, TRUE); + for(i = 0; latinext[i]; i++){ + ksiz = sprintf(kbuf, "&#%d;", i + 160); + cbmapput(pairs, kbuf, ksiz, latinext[i], -1, TRUE); + } + return pairs; +} + + +/* register scores of documents */ +int procrelate(const char *name){ + ODEUM *odeum; + DEPOT *scdb; + ODDOC *doc; + CBMAP *scores; + const char *file; + char path[PATHBUFSIZ], *mbuf; + int err, fatal, id, msiz; + printfinfo("%s: relating started", name); + if(!(odeum = odopen(name, OD_OWRITER))){ + pdperror(name); + return 1; + } + sprintf(path, "%s%c%s", name, PATHCHR, SCDBNAME); + if(!(scdb = dpopen(path, OD_OWRITER | OD_OCREAT, SCDBBNUM))){ + pdperror(name); + odclose(odeum); + return 1; + } + if(!dpsetalign(scdb, SCDBALIGN)){ + pdperror(name); + dpclose(scdb); + odclose(odeum); + return 1; + } + printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + err = FALSE; + if(!oditerinit(odeum)){ + pdperror(name); + err = TRUE; + } else { + while(TRUE){ + if(sigterm){ + printferror("aborting due to a termination signal"); + err = TRUE; + break; + } + if(!(doc = oditernext(odeum))){ + if(dpecode != DP_ENOITEM){ + pdperror(name); + err = TRUE; + } + break; + } + file = oddocuri(doc); + id = oddocid(doc); + scores = oddocscores(doc, KEYNUM, odeum); + mbuf = cbmapdump(scores, &msiz); + if(!dpput(scdb, (char *)&id, sizeof(int), mbuf, msiz, DP_DOVER)){ + pdperror(name); + err = TRUE; + } else { + printfinfo("%s: related", file); + } + free(mbuf); + cbmapclose(scores); + oddocclose(doc); + if(err) break; + } + } + fatal = odfatalerror(odeum); + printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + if(!dpclose(scdb)){ + pdperror(name); + err = TRUE; + } + if(!odclose(odeum)){ + pdperror(name); + err = TRUE; + } + if(err){ + printfinfo("%s: relating was over%s", name, fatal ? " with fatal error" : ""); + } else { + printfinfo("%s: relating completed successfully", name); + } + return err ? 1 : 0; +} + + +/* purge documents which is not existing. */ +int procpurge(const char *name){ + ODEUM *odeum; + ODDOC *doc; + const char *file; + int err, fatal; + printfinfo("%s: purging started", name); + if(!(odeum = odopen(name, OD_OWRITER))){ + pdperror(name); + return 1; + } + printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + err = FALSE; + if(!oditerinit(odeum)){ + pdperror(name); + err = TRUE; + } else { + while(TRUE){ + if(sigterm){ + printferror("aborting due to a termination signal"); + err = TRUE; + break; + } + if(!(doc = oditernext(odeum))){ + if(dpecode != DP_ENOITEM){ + pdperror(name); + err = TRUE; + } + break; + } + file = oddocuri(doc); + if(cbfilestat(file, NULL, NULL, NULL)){ + printfinfo("%s: passed", file); + } else { + if(!odout(odeum, file)){ + pdperror(file); + err = TRUE; + } + printfinfo("%s: purged", file); + } + oddocclose(doc); + } + } + fatal = odfatalerror(odeum); + printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d", + name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum)); + if(!odclose(odeum)){ + pdperror(name); + err = TRUE; + } + if(err){ + printfinfo("%s: purging was over%s", name, fatal ? " with fatal error" : ""); + } else { + printfinfo("%s: purging completed successfully", name); + } + return err ? 1 : 0; +} + + + +/* END OF FILE */ |