/*
** swish.c
** Uploaded by: qin5330
** Upload date: 2007-01-05
** Resource size: 114k
** File size: 16k
*/
/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**---------------------------------------------------------
** Added support for METADATA
** G. Hill ghill@library.berkeley.edu 3/18/97
**
** Added printing of common words
** G.Hill 4/7/97 ghill@library.berkeley.edu
**
*/
#define MAIN_FILE

#include <ctype.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "swish.h"
#include "error.h"
#include "list.h"
#include "search.h"
#include "index.h"
#include "string.h"
#include "file.h"
#include "merge.h"
#include "docprop.h"
/*
** Table of pointers to all the indexing data source structures.
** It is defined elsewhere; main() walks it until it reaches a null
** entry and uses the first element as the default data source.
*/
extern struct _indexing_data_source_def *data_sources[];
- int main(argc, argv)
- int argc;
- char **argv;
- {
- char c, word[MAXWORDLEN], wordlist[MAXSTRLEN],
- maxhitstr[MAXSTRLEN], structstr[MAXSTRLEN];
- char tmpindex1[MAXFILELEN], tmpindex2[MAXFILELEN],
- index1[MAXSTRLEN], index2[MAXSTRLEN],
- index3[MAXSTRLEN], index4[MAXSTRLEN];
- int i, j, hasindex, hasdir, hasconf, hasverbose, structure,
- totalfiles, stopwords, index, decode, merge, pos,
- hasMetaName;
- long offsetstart, starttime, stoptime, plimit, flimit;
- FILE *fp1, *fp2;
- struct file *filep;
- struct entry *entryp;
- struct swline *conflist, *tmplist;
-
- index = decode = merge = 0;
- hasindex = hasdir = hasconf = hasverbose = hasMetaName = 0;
- followsymlinks = stopwords = 0;
- totalwords = stopwords = stopPos = 0;
- applyStemmingRules = 0; /* added 11/24/98 */
- useCustomOutputDelimiter = 0; /* added 11/24/98 */
- customOutputDelimiter[0] = ' '; /* added 11/24/98 */
- ignoreTotalWordCountWhenRanking = 0; /* added 11/24/98 */
- filelist = NULL;
- entrylist = NULL;
- dirlist = indexlist = conflist = tmplist = NULL;
- replacelist = NULL;
- metaEntryList = NULL;
- maxhits = -1;
- verbose = VERBOSE;
- plimit = PLIMIT;
- flimit = FLIMIT;
- minwordlimit = MINWORDLIMIT;
- maxwordlimit = MAXWORDLIMIT;
- indexComments = 1;
- nocontentslist = 0;
- strcpy(wordchars,WORDCHARS);
- strcpy(beginchars,BEGINCHARS);
- strcpy(endchars,ENDCHARS);
- strcpy(ignorelastchar, IGNORELASTCHAR);
- strcpy(ignorefirstchar, IGNOREFIRSTCHAR);
- structure = 1;
- wordlist[0] = ' ';
- structstr[0] = ' ';
- indexn[0] = ' ';
- indexd[0] = ' ';
- indexp[0] = ' ';
- indexa[0] = ' ';
- setlocale(LC_CTYPE,"");
- /* By default we are set up to use the first data source in the list */
- IndexingDataSource = data_sources[0];
-
- if (argc == 1)
- usage();
- while (--argc > 0)
- {
- ++argv;
- if ((*argv)[0] != '-')
- usage();
- c = (*argv)[1];
- if ((*argv)[2] != ' ' && isalpha((*argv)[2]))
- usage();
- if (c == 'i')
- {
- index = 1;
- while ((argv + 1)[0] != ' ' && *(argv + 1)[0] != '-') {
- dirlist = (struct swline *)
- addswline(dirlist, (++argv)[0]);
- argc--;
- }
- }
- else if (c == 'w')
- {
- while ((argv + 1)[0] != ' ' && *(argv + 1)[0] != '-')
- {
- strcpy(word, (++argv)[0]);
- argc--;
- sprintf(wordlist, "%s%s%s", wordlist,
- (wordlist[0] == ' ') ? "" : " ", word);
- }
- }
- else if (c == 'S')
- {
- struct _indexing_data_source_def **data_source;
- const char* opt = (++argv)[0];
- argc--;
- for (data_source = data_sources; *data_source != 0; data_source++ )
- {
- if (strcmp(opt, (*data_source)->IndexingDataSourceId) == 0)
- {
- break;
- }
- }
-
- if (!*data_source)
- {
- sprintf(errorstr, "Unknown -S option "%s"", opt);
- progerr(errorstr);
- }
- else
- {
- IndexingDataSource = *data_source;
- }
- }
- #ifdef SUPPORT_DOC_PROPERTIES
- else if (c == 'p') {
- /* -p <property_to_display> [<property_to_display>]* */
- while ((argv + 1)[0] != ' ' && *(argv + 1)[0] != '-')
- {
- addSearchResultDisplayProperty((++argv)[0]);
- argc--;
- }
- }
- #endif
- else if (c == 'f')
- {
- while ((argv + 1)[0] != ' ' && *(argv + 1)[0] != '-')
- {
- indexlist = (struct swline *)
- addswline(indexlist, (++argv)[0]);
- argc--;
- }
- }
- else if (c == 'c')
- {
- index = 1;
- hasconf = 1;
- while ((argv + 1)[0] != ' ' && *(argv + 1)[0] != '-') {
- conflist = (struct swline *)
- addswline(conflist, (++argv)[0]);
- argc--;
- }
- }
- else if (c == 'C') {
- while ((argv + 1)[0] != ' ' && *(argv + 1)[0] != '-')
- {
- conflist = (struct swline *)
- addswline(conflist, (++argv)[0]);
- argc--;
- }
- if (conflist == NULL)
- progerr("Specifiy the configuration file.");
- else
- hasMetaName = 1;
- }
- else if (c == 'l') {
- followsymlinks = 1;
- argc--;
- }
- else if (c == 'm') {
- if ((argv + 1)[0] == ' ')
- maxhits = -1;
- else {
- strcpy(maxhitstr, (++argv)[0]);
- if (lstrstr(maxhitstr, "all"))
- maxhits = -1;
- else if (isdigit(maxhitstr[0]))
- maxhits = atoi(maxhitstr);
- else
- maxhits = -1;
- argc--;
- }
- }
- else if (c == 't') {
- if ((argv + 1)[0] == ' ')
- progerr("Specify tag fields (HBtheca).");
- else {
- structure = 0;
- strcpy(structstr, (++argv)[0]);
- argc--;
- }
- }
- else if (c == 'v') {
- hasverbose = 1;
- if ((argv + 1)[0] == ' ') {
- verbose = 3;
- break;
- }
- else if (!isdigit((argv + 1)[0][0]))
- verbose = 3;
- else
- verbose = atoi((++argv)[0]);
- argc--;
- }
- else if (c == 'V')
- printversion();
- else if (c == 'z' || c == 'h' || c == '?')
- usage();
- else if (c == 'M') {
- merge = 1;
- while ((argv + 1)[0] != ' ' && *(argv + 1)[0] != '-') {
- indexlist = (struct swline *)
- addswline(indexlist, (++argv)[0]);
- argc--;
- }
- }
- else if (c == 'D') {
- decode = 1;
- while ((argv + 1)[0] != ' ' && *(argv + 1)[0] != '-') {
- indexlist = (struct swline *)
- addswline(indexlist, (++argv)[0]);
- argc--;
- }
- }
- else if (c == 'd')
- {
- /* added 11/24/98 MG */
- if (((argv + 1)[0] != ' ') && (*(argv + 1)[0] != '-'))
- {
- useCustomOutputDelimiter = 1;
- strcpy(customOutputDelimiter, (++argv)[0]);
- if (strcmp(customOutputDelimiter, "dq") == 0)
- strcpy(customOutputDelimiter, """); /* double quote is cool */
- argc--;
- }
- }
- else
- usage();
- if (argc == 0)
- break;
- }
- hasdir = (dirlist == NULL) ? 0 : 1;
- hasindex = (indexlist == NULL) ? 0 : 1;
-
- if (index && merge)
- index = 0;
-
- if (decode) {
-
- if (!hasindex)
- progerr("Specify the index file to decode.");
-
- while (indexlist != NULL) {
-
- if ((fp1 = openIndexFileForRead(indexlist->line)) == NULL) {
- sprintf(errorstr,
- "Couldn't open the index file "%s".",
- indexlist->line);
- progerr(errorstr);
- }
- if (!isokindexheader(fp1)) {
- sprintf(errorstr,
- ""%s" has an unknown format.",
- indexlist->line);
- progerr(errorstr);
- }
-
- decompress(fp1);
- putchar('n');
- fclose(fp1);
-
- indexlist = indexlist->next;
- }
- exit(0);
-
- }
- else if (index)
- {
- printf("Indexing Data Source: "%s"n", IndexingDataSource->IndexingDataSourceName);
- if (hasconf)
- {
- while (conflist != NULL) {
- getdefaults(conflist->line, &hasdir, &hasindex,
- &plimit, &flimit, hasverbose);
- conflist = conflist->next;
- }
- }
- if (!hasindex)
- indexlist = (struct swline *) addswline(indexlist, INDEXFILE);
- if (!hasdir)
- progerr("Specify directories or files to index.");
-
- if (verbose < 0)
- verbose = 0;
- if (verbose > 4)
- verbose = 4;
- if (verbose)
- starttime = getthetime();
-
- while (dirlist != NULL) {
- indexpath(dirlist->line);
- dirlist = dirlist->next;
- }
-
- if ((fp1 = openIndexFileForWrite(indexlist->line)) == NULL) {
- sprintf(errorstr,
- "Couldn't write the index file "%s".",
- indexlist->line);
- progerr(errorstr);
- }
-
- if (verbose > 1)
- putchar('n');
- if (verbose)
- printf("Removing very common words... ");
-
- filep = filelist;
- totalfiles = getfilecount(filep);
- entryp = entrylist;
- stopwords = removestops(entryp, totalfiles, plimit, flimit);
-
- if (verbose) {
- if (stopwords) {
- printf("%d word%s removed.n",
- stopwords, (stopwords == 1) ? "" : "s");
- printf("%d words removed not in common words array:n",
- stopPos);
- for (pos = 0; pos < stopPos; pos++)
- printf("%s, ", stopList[pos]);
- printf("n");
- }
- else
- printf("no words removed.n");
- printf("Writing main index... ");
- }
-
- printheader(fp1, indexlist->line, totalwords, totalfiles, 0);
-
- offsetstart = ftell(fp1);
- for (i = 0; i < MAXCHARS; i++)
- fprintf(fp1, "%016li", offsets[i]);
- fputc('n', fp1);
-
- printindex(entrylist, fp1);
- printstopwords(fp1);
-
- if (verbose) {
- if (totalwords)
- printf("%d unique word%s indexed.n",
- totalwords, (totalwords == 1) ? "" : "s");
- else
- printf("no unique words indexed.n");
- printf("Writing file index... ");
- }
-
- printfilelist(filelist, fp1);
- printfileoffsets(fp1);
- printMetaNames(fp1);
- fclose(fp1);
-
- fp2 = openIndexFileForReadAndWrite(indexlist->line);
- fseek(fp2, offsetstart, 0);
- for (i = 0; i < MAXCHARS; i++)
- fprintf(fp2, "%016li", offsets[i]);
- fclose(fp2);
-
- if (verbose)
- {
- if (totalfiles)
- printf("%d file%s indexed.n", totalfiles,
- (totalfiles == 1) ? "" : "s");
- else
- printf("no files indexed.n");
-
- stoptime = getthetime();
- printrunning(starttime, stoptime);
- printf("Indexing done!n");
- }
- #ifdef INDEXPERMS
- chmod(indexlist->line, INDEXPERMS);
- #endif
- exit(0);
- }
- else if (merge)
- {
-
- if (indexlist == NULL)
- progerr("Specify index files and an output file.");
- if (hasconf)
- {
- while (conflist != NULL)
- {
- getdefaults(conflist->line, &hasdir, &hasindex,
- &plimit, &flimit, hasverbose);
- conflist = conflist->next;
- }
- }
-
- tmplist = indexlist;
- for (i = 0; tmplist != NULL; i++) {
- strcpy(index4, tmplist->line);
- tmplist = tmplist->next;
- }
- j = i - 2;
- if (i < 3)
- progerr("Specify index files and an output file.");
-
- sprintf(tmpindex1, tmpnam(NULL));
- sprintf(tmpindex2, tmpnam(NULL));
-
- i = 1;
- strcpy(index1, indexlist->line);
- indexlist = indexlist->next;
- while (i <= j) {
- strcpy(index2, indexlist->line);
- if (i % 2) {
- if (i != 1)
- strcpy(index1, tmpindex2);
- strcpy(index3, tmpindex1);
- }
- else {
- strcpy(index1, tmpindex1);
- strcpy(index3, tmpindex2);
- }
- if (i == j)
- strcpy(index3, index4);
- readmerge(index1, index2, index3);
- indexlist = indexlist->next;
- i++;
- }
- #ifdef INDEXPERMS
- chmod(index3, INDEXPERMS);
- #endif
- if (isfile(tmpindex1))
- remove(tmpindex1);
- if (isfile(tmpindex2))
- remove(tmpindex2);
-
- }
- else
- {
-
- for (i = 0; structstr[i] != ' '; i++)
- {
- switch (structstr[i])
- {
- case 'H':
- structure |= IN_HEAD;
- break;
- case 'B':
- structure |= IN_BODY;
- break;
- case 't':
- structure |= IN_TITLE;
- break;
- case 'h':
- structure |= IN_HEADER;
- break;
- case 'e':
- structure |= IN_EMPHASIZED;
- break;
- case 'c':
- structure |= IN_COMMENTS;
- break;
- default:
- structure |= IN_FILE;
- break;
- }
- }
-
- if (maxhits <= 0)
- maxhits = -1;
- if (!hasindex)
- indexlist = (struct swline *)
- addswline(indexlist, INDEXFILE);
- if (hasMetaName)
- while (conflist != NULL) {
- getdefaults(conflist->line, &hasdir, &hasindex,
- &plimit, &flimit, hasverbose);
- conflist = conflist->next;
- }
- search(wordlist, indexlist, structure);
-
- }
-
- exit(0);
- return 0;
- }
/* Returns the current calendar time as reported by time(), converted
** to long.
*/
long getthetime(void)
{
    return (long) time(NULL);
}
- /* Prints the running time (the time it took for indexing).
- */
- void printrunning(starttime, stoptime)
- long starttime;
- long stoptime;
- {
- int minutes, seconds;
-
- minutes = (stoptime - starttime) / SECSPERMIN;
- seconds = (stoptime - starttime) % SECSPERMIN;
- printf("Running time: ");
- if (minutes)
- printf("%d minute%s", minutes, (minutes == 1) ? "" : "s");
- if (minutes && seconds)
- printf(", ");
- if (seconds)
- printf("%d second%s", seconds, (seconds == 1) ? "" : "s");
- if (!minutes && !seconds)
- printf("Less than a second");
- printf(".n");
- }
- /* Prints the SWISH usage.
- */
- void usage()
- {
- const char* defaultIndexingSystem = "";
- printf(" usage: swish [-i dir file ... ] [-S system] [-c file] [-f file] [-l] [-v (num)]n");
- printf(" swish -w word1 word2 ... [-f file1 file2 ...] [-p prop1 ...] [-m num] [-t str] [-d delim]n");
- printf(" swish -M index1 index2 ... outputfilen");
- printf(" swish -D filen");
- printf(" swish -Vn");
- putchar('n');
- printf("options: defaults are in bracketsn");
- printf(" -S : specify which indexing system to use.n");
- printf(" Valid options are:n");
- #ifdef ALLOW_FILESYSTEM_INDEXING_DATA_SOURCE
- printf(" "fs" - index local files in your File Systemn");
- if (!*defaultIndexingSystem)
- defaultIndexingSystem = "fs";
- #endif
- #ifdef ALLOW_HTTP_INDEXING_DATA_SOURCE
- printf(" "http" - index web site files using a web crawlern");
- if (!*defaultIndexingSystem)
- defaultIndexingSystem = "http";
- #endif
- printf(" The default value is: "%s"n", defaultIndexingSystem);
- printf(" -i : create an index from the specified filesn");
- printf(" -w : search for words "word1 word2 ..."n");
- printf(" -t : tags to search in - specify as a stringn");
- printf(" "HBthec" - in head, body, title, header,n");
- printf(" emphasized, commentsn");
- printf(" -f : index file to create or search from [%s]n", INDEXFILE);
- printf(" -c : configuration file to use for indexingn");
- printf(" -v : verbosity level (0 to 3) [%d]n", VERBOSE);
- printf(" -l : follow symbolic links when indexingn");
- printf(" -m : the maximum number of results to return [%d]n", MAXHITS);
- printf(" -M : merges index filesn");
- printf(" -D : decodes an index filen");
- #ifdef SUPPORT_DOC_PROPERTIES
- printf(" -p : include these document properties in the output "prop1 prop2 ..."n");
- #endif
- printf(" -d : next param is delimiter. use "-d dq" to use a double quoten");
- printf(" -V : prints the current versionnn");
- printf("version: %sn", VERSION);
- printf(" docs: http://sunsite.berkeley.edu/SWISH-E/n");
- exit(1);
- }
- void printversion()
- {
- printf("SWISH-E %sn", VERSION);
- exit(0);
- }
/*
 * fopen() modes for index files. All fopen() calls on index files go
 * through the openIndexFileFor*() routines below, which use these.
 *
 * Index files must be opened with the "b" flag on systems that treat
 * text and binary streams differently (Win32 among them). The flag has
 * no effect on POSIX systems, so it is used everywhere instead of being
 * selected per platform.
 *
 * Note: text files should still be opened without "b", otherwise
 * end-of-line processing is not done correctly on Win32.
 */
#define FILEMODE_READ      "rb"
#define FILEMODE_WRITE     "wb"
#define FILEMODE_READWRITE "rb+"
- FILE* openIndexFileForWrite(filename)
- char* filename;
- {
- return fopen(filename, FILEMODE_WRITE);
- }
- FILE* openIndexFileForRead(filename)
- char* filename;
- {
- return fopen(filename, FILEMODE_READ);
- }
- FILE* openIndexFileForReadAndWrite(filename)
- char* filename;
- {
- return fopen(filename, FILEMODE_READWRITE);
- }
- /*
- * Invoke the methods of the current Indexing Data Source
- */
- void indexpath(path)
- char *path;
- {
- /* invoke routine to index a "path" */
- (*IndexingDataSource->indexpath_fn)(path);
- }
- int vgetc(vp)
- void *vp;
- {
- /* invoke routine to get char from "file" */
- return (*IndexingDataSource->vgetc_fn)(vp);
- }
- int vsize(vp)
- void *vp;
- {
- /* invoke routine to get size of "file" */
- return (*IndexingDataSource->vsize_fn)(vp);
- }
- int parseconfline(line)
- char *line;
- {
- /* invoke routine to parse config file lines */
- return (*IndexingDataSource->parseconfline_fn)(line);
- }