/*
** search.c
** Uploaded by: qin5330
** Upload date: 2007-01-05
** Archive size: 114k
** File size: 28k
*/
/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**-----------------------------------------------------------------
** Changes in expandstar and parseterm to fix the wildcard * problem.
** G. Hill, ghill@library.berkeley.edu 3/11/97
**
** Changes in notresultlist, parseterm, and fixnot to fix the NOT problem
** G. Hill, ghill@library.berkeley.edu 3/13/97
**
** Changes in search, parseterm, fixnot, operate, getfileinfo
** to support METADATA
** G. Hill 3/18/97 ghill@library.berkeley.edu
**
** Change in search to allow for search with a list including
** also some empty indexes.
** G. Hill after a suggestion by J. Winstead 12/18/97
**
** Created countResults for number of hits in search
** G. Hill 12/18/97
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "swish.h"
#include "search.h"
#include "file.h"
#include "list.h"
#include "string.h"
#include "merge.h"
#include "hash.h"
#include "mem.h"
#include "docprop.h"
#include "stemmer.h"
- /* The main search function.
- ** Parentheses are stripped out, things made lowercase,
- ** extra blanks removed, etc.
- */
- void search(words, indexlist, structure)
- char *words;
- struct swline *indexlist;
- int structure;
- {
- int i, j, metaName, indexYes, totalResults;
- float num;
- char word[MAXWORDLEN];
- struct result *resultlist;
- struct sortresult *sortresultlist;
- struct swline *tmplist;
- FILE *fp;
- #ifdef DEBUG
- struct swline *newp2;
- #endif
- #if IGNORE_STOPWORDS_IN_QUERY
- struct swline *pointer1, *pointer2;
- #endif
-
- searchwordlist = NULL;
- metaName = 1;
- indexYes = 0;
-
- for (i = j = 0; words[i] != ' ' && words[i] != 'n'; i++)
- {
- if (isspace(words[i]) || words[i] == '(' || words[i] == ')' || words[i] == '=')
- {
- if (words[i] == '=')
- {
- if (j != 0)
- {
- if (words[i-1] != '\')
- {
- word[j] = ' ';
- searchwordlist = (struct swline *) addswline(searchwordlist,
- (char *) convertentities(word));
- j = 0;
- searchwordlist = (struct swline *) addswline(searchwordlist, "=");
- }
- else
- {
- /* Needs to erase the '' */
- j--;
- word[j] = tolower(words[i]);
- j++;
- }
- }
- else
- {
- searchwordlist = (struct swline *) addswline(searchwordlist, "=");
- }
- }
- else
- {
- if (j)
- {
- word[j] = ' ';
- searchwordlist = (struct swline *) addswline(searchwordlist,
- (char *) convertentities(word));
- j = 0;
- }
- if (words[i] == '(')
- {
- searchwordlist = (struct swline *) addswline(searchwordlist, "(");
- }
- if (words[i] == ')')
- {
- searchwordlist = (struct swline *)
- addswline(searchwordlist, ")");
- }
- }
- }
- else
- {
- word[j] = tolower(words[i]);
- j++;
- }
- }
- if (j)
- {
- word[j] = ' ';
- searchwordlist = (struct swline *) addswline(searchwordlist,
- (char *) convertentities(word));
- }
-
- printf("%sn", INDEXHEADER);
- if (words[0] == ' ')
- {
- printf("err: no search words specifiedn.n");
- exit(0);
- }
- while (indexlist != NULL) {
-
- commonerror = bigrank = 0;
-
- if ((fp = openIndexFileForRead(indexlist->line)) == NULL) {
- printf("# Name: unknown indexn");
- printf("err: could not open index filen.n");
- exit(0);
- }
-
- if (!isokindexheader(fp)) {
- printf("err: the index file format is unknownn.n");
- exit(0);
- }
-
- /* Was stemming applied to the index? If so, we want
- * to apply stemming to the search terms as well */
- applyStemmingRules = wasStemmingAppliedToIndex(fp);
-
- getheader(fp);
-
- if (!getindexfilenum(fp)) {
- indexlist = indexlist->next;
- continue;
- }
- else
- { indexYes = 1; /*There is a non-empty index */ }
-
- readoffsets(fp);
- readstopwords(fp);
- readfileoffsets(fp);
- readMetaNames(fp);
-
- #if IGNORE_STOPWORDS_IN_QUERY
- /* Added JM 1/10/98. */
-
- pointer1 = searchwordlist;
- pointer2 = searchwordlist->next;
-
- while (pointer1 != NULL &&
- isstopword(pointer1->line) && !isrule(pointer1->line)) {
-
- searchwordlist = pointer2;
- free(pointer1);
- pointer1 = pointer2;
- pointer2 = pointer2 ? pointer2->next : NULL;
- }
- if (pointer1 == NULL)
- {
- /* This query contained only stopwords! */
- printf("err: all search words too common to be usefuln.n");
- exit(0);
- }
-
- while (pointer2 != NULL) {
- if (isstopword(pointer2->line) && !isrule(pointer2->line)) {
- pointer1->next = pointer2->next;
- free(pointer2);
- }
- else {
- pointer1 = pointer1->next;
- }
- pointer2 = pointer2->next;
- }
- #endif
-
- printf("# Search words:");
- tmplist = searchwordlist;
- while (tmplist != NULL) {
- printf(" %s", tmplist->line);
- tmplist = tmplist->next;
- }
- putchar('n');
-
- resultlist = NULL;
- tmplist = searchwordlist;
- tmplist = (struct swline *) fixnot(tmplist);
- searchwordlist = (struct swline *) expandstar(tmplist, fp);
- #ifdef DEBUG
- newp2 = searchwordlist;
- while (newp2 != NULL) {
- printf("%s ", newp2->line);
- newp2 = newp2->next;
- }
- putchar('n');
- #endif
-
- #ifdef SUPPORT_DOC_PROPERTIES
- initSearchResultProperties();
- #endif
- resultlist = (struct result *) parseterm(fp, 0, metaName);
-
- sortresultlist = NULL;
- while (resultlist != NULL)
- {
- if (resultlist->structure & structure)
- {
- long propPos;
- char* fileInfo = lookupfile(resultlist->filenum, fp, &propPos);
- sortresultlist = (struct sortresult *)
- addsortresult(sortresultlist, resultlist->rank,
- fileInfo,
- propPos,
- resultlist->filenum);
- }
- resultlist = resultlist->next;
- }
-
- if (sortresultlist == NULL) {
- if (commonerror)
- printf("err: a word is too commonn");
- else
- printf("err: no resultsn");
- }
- else {
- if (bigrank)
- num = 1000.0f / (float) bigrank;
- else
- num = 1000.0f;
- totalResults = countResults(sortresultlist);
- printf("# Number of hits: %dn",totalResults);
- printsortedresults(sortresultlist, num, fp);
-
- }
-
- /* keep file open during printsortedresults() so that
- * doc properties can be retrieved */
- fclose(fp);
- searchwordlist = tmplist;
- indexlist = indexlist->next;
-
- }
- if (!indexYes)
- {
- printf("err: the index file(s) is emptyn.n");
- exit(0);
- }
- printf(".n");
- }
- /* This puts parentheses in the right places around not structures
- ** so the parser can do its thing correctly.
- ** It does it both for 'not' and '='; the '=' is used for the METADATA (GH)
- */
- struct swline *fixnot(sp)
- struct swline *sp;
- {
- int openparen, hasnot;
- int openMeta, hasMeta;
- struct swline *tmpp, *newp;
- #ifdef DEBUG
- struct swline *newp2;
- #endif
-
- tmpp = sp;
- newp = NULL;
-
- openparen = 0;
- openMeta = 0;
- hasMeta = 0;
- hasnot = 0;
- while (tmpp != NULL) {
- if ( ((tmpp->line)[0] == '(') && hasnot)
- openparen++;
- else if ( ((tmpp->line)[0] == '(') && hasMeta)
- openMeta++;
- else if ( ((tmpp->line)[0] == ')') && hasnot)
- openparen--;
- else if ( ((tmpp->line)[0] == ')') && hasMeta)
- openMeta--;
- if (isMetaName(tmpp->next)) {
- /* If it is a metaName add the name and = and skip to next */
- hasMeta = 1;
- newp = (struct swline *) addswline(newp, "(");
- newp = (struct swline *) addswline(newp, tmpp->line);
- newp = (struct swline *) addswline(newp, "=");
- tmpp = tmpp->next;
- tmpp = tmpp->next;
- continue;
- }
- if (!strcmp(tmpp->line, "not") ) {
- hasnot = 1;
- newp = (struct swline *) addswline(newp, "(");
- }
- else if (hasnot && !openparen) {
- hasnot = 0;
- newp = (struct swline *) addswline(newp, tmpp->line);
- newp = (struct swline *) addswline(newp, ")");
- tmpp = tmpp->next;
- continue;
- }
- else if (hasMeta && !openMeta) {
- hasMeta = 0;
- newp = (struct swline *) addswline(newp, tmpp->line);
- newp = (struct swline *) addswline(newp, ")");
- tmpp = tmpp->next;
- continue;
- }
- newp = (struct swline *) addswline(newp, tmpp->line);
- if (!strcmp(tmpp->line, "=") ) {
- hasMeta = 1;
- newp = (struct swline *) addswline(newp, "(");
- }
- tmpp = tmpp->next;
- }
-
- #ifdef DEBUG
- newp2 = newp;
- while (newp2 != NULL) {
- printf("%s ", newp2->line);
- newp2 = newp2->next;
- }
- putchar('n');
- #endif
-
- return newp;
- }
- /* Expands words with asterisks as wildcards into a series of
- ** "or" searches. Terms like "quick*" are expanded into
- ** "quicktime or quickly", etc.
- */
- struct swline *expandstar(sp, fp)
- struct swline *sp;
- FILE *fp;
- {
- int i, firsttime, gotstar;
- char foundword[MAXWORDLEN], searchword[MAXWORDLEN];
- struct swline *newp;
-
- newp = NULL;
- while (sp != NULL) {
- strcpy(searchword, sp->line);
- if (searchword[0] != '*' && strchr(searchword, '*')) {
- for (i = gotstar = 0; searchword[i]; i++)
- if (gotstar)
- searchword[i] = ' ';
- else if (searchword[i] == '*') {
- searchword[i] = ' ';
- gotstar = 1;
- }
- firsttime = 0;
- do {
- strcpy(foundword, getmatchword(searchword,
- fp, firsttime));
- if (strcmp(foundword, NOWORD)) {
- /* Add "(" if it is the first time */
- if (firsttime == 0)
- newp = (struct swline *)
- addswline(newp, "(");
- if (firsttime)
- newp = (struct swline *)
- addswline(newp, "or");
- newp = (struct swline *)
- addswline(newp, foundword);
- }
- else {
- if (!firsttime)
- newp = (struct swline *)
- addswline(newp, NOWORD);
- else /*Add ")" if last of many */
- newp = (struct swline *)
- addswline(newp, ")");
- break;
- }
- firsttime++;
- } while (strcmp(foundword, NOWORD));
- }
- else {
- newp = (struct swline *) addswline(newp,
- searchword);
- }
- sp = sp->next;
- }
- return newp;
- }
- /* If firsttime is 1, returns the first match to a beginnng of a word.
- ** Else if it's 0, returns the next match, until nothing is found,
- ** in which case NULL is returned.
- */
- char *getmatchword(word, fp, firsttime)
- char *word;
- FILE *fp;
- int firsttime;
- {
- int i, c, found;
- char *d;
- static char fileword[MAXWORDLEN];
-
- if (!firsttime) {
- for (i = found = 0; indexchars[i] != ' '; i++)
- if (word[0] == indexchars[i]) {
- fseek(fp, offsets[i], 0);
- found = 1;
- }
- if (!found)
- return NOWORD;
- }
-
- if (offsets[STOPWORDPOS] == ftell(fp))
- return NOWORD;
- for (i = 0; (c = fgetc(fp)) != 0; ) {
- if (c == ':') {
- fileword[i] = ' ';
- i = 0;
- while ((c = fgetc(fp)) != 0)
- ;
- if (fileword[0] != word[0])
- return NOWORD;
- d = (char *) strstr(fileword, word);
- if (d != NULL && d == &fileword[0])
- return fileword;
- else {
- if (offsets[STOPWORDPOS] == ftell(fp))
- return NOWORD;
- }
- }
- else
- fileword[i++] = c;
- }
- return NOWORD;
- }
- /* Reads and prints the header of an index file.
- */
- void getheader(fp)
- FILE *fp;
- {
- int c;
- char line[MAXSTRLEN];
-
- fgets(line, MAXSTRLEN, fp);
- while (1) {
- c = fgetc(fp);
- ungetc(c, fp);
- if (c == '#') {
- fgets(line, MAXSTRLEN, fp);
- printf("%s", line);
- continue;
- }
- else
- break;
- }
- fseek(fp, 0, 0);
- }
- /* Reads the offsets in the index file so word lookup is faster.
- */
- void readoffsets(fp)
- FILE *fp;
- {
- int c, i, k;
- long j, num;
-
- for (i = 0; i < MAXCHARS; i++)
- offsets[i] = 0;
-
- fseek(fp, 0, 0);
- while (1) {
- c = fgetc(fp);
- if (c == '#') {
- do {
- c = fgetc(fp);
- } while (c && c != 'n');
- continue;
- }
- else
- break;
- }
-
- j = 0;
- while (c != EOF && c != 'n') {
- k = MAXLONGLEN;
- for (num = 0; c && isdigit(c) && k--; ) {
- num = (num * 10) + (c - '0');
- c = fgetc(fp);
- }
- offsets[j++] = num;
- }
- }
- /* Reads the stopwords in the index file.
- */
- void readstopwords(fp)
- FILE *fp;
- {
- int i, c;
- char word[MAXWORDLEN];
-
- fseek(fp, offsets[STOPWORDPOS], 0);
- for (i = 0; (c = fgetc(fp)) != 'n' && c != EOF; )
- if (!isspace(c))
- word[i++] = c;
- else {
- word[i] = ' ';
- addstophash(word);
- i = 0;
- }
- }
- /* Reads the metaNames from the index
- */
- void readMetaNames(fp)
- FILE *fp;
- {
- int i, c;
- char word[MAXWORDLEN];
-
- fseek(fp, offsets[METANAMEPOS], 0);
- for (i = 0; (c = fgetc(fp)) != 'n' && c != EOF; )
- {
- if (!isspace(c))
- {
- word[i++] = c;
- }
- else
- {
- int docPropStyle = 0;
- char* docPropStyleTmp;
- word[i] = ' ';
- /* parse the meta name style:
- * <name>"0 -> normal meta name [default]
- * <name>"1 -> doc property name
- * <name>"2 -> both
- */
- docPropStyleTmp = strrchr(word, '"');
- if (docPropStyleTmp != NULL)
- {
- *docPropStyleTmp++ = ' '; /* remove (and move past) quote */
- docPropStyle = atoi(docPropStyleTmp);
- }
- /* add the meta tag, possible twice */
- if ((docPropStyle == 0) || (docPropStyle == 2))
- addMetaEntry(&metaEntryList, word, 0); /* as metaName */
- if ((docPropStyle == 1) || (docPropStyle == 2))
- addMetaEntry(&metaEntryList, word, 1); /* as docProp */
- i = 0;
- }
- }
- }
- /* Reads the file offset table in the index file.
- */
- void readfileoffsets(fp)
- FILE *fp;
- {
- int j, k, c;
- long num;
-
- j = 0;
- fseek(fp, offsets[FILEOFFSETPOS], 0);
- c = fgetc(fp);
- while (c != EOF && c != 'n') {
- k = MAXLONGLEN;
- for (num = 0; c != EOF && isdigit(c) && k--; ) {
- num = (num * 10) + (c - '0');
- c = fgetc(fp);
- }
- addtofilehashlist(j++, num);
- }
- }
- /* The recursive parsing function.
- ** This was a headache to make but ended up being surprisingly easy. :)
- ** parseone tells the function to only operate on one word or term.
- */
- struct result *parseterm(fp, parseone, metaName)
- FILE *fp;
- int parseone;
- int metaName;
- {
- int rulenum;
- char word[MAXWORDLEN];
- struct result *rp, *newrp;
- /*
- * The andLevel is used to help keep the ranking function honest
- * when it ANDs the results of the latest search term with
- * the results so far (rp). The idea is that if you AND three
- * words together you ultimately want to resulting rank to
- * be the average of all three individual work ranks. By keeping
- * a running total of the number of terms already ANDed, the
- * next AND operation can properly scale the average-rank-so-far
- * and recompute the new average properly (see andresultlists()).
- * This implementation is a little weak in that it will not average
- * across terms that are in parenthesis. (It treats an () expression
- * as one term, and weights it as "one".)
- */
- int andLevel = 0; /* number of terms ANDed so far */
-
-
- rp = NULL;
-
- rulenum = OR_RULE;
- while (searchwordlist != NULL) {
- strcpy(word, searchwordlist->line);
-
- if (rulenum == NO_RULE)
- rulenum = DEFAULT_RULE;
- if (isunaryrule(word)) {
- searchwordlist = searchwordlist->next;
- rp = (struct result *) parseterm(fp, 1, metaName);
- rp = (struct result *) notresultlist(rp, fp);
- /* Wild goose chase */
- rulenum = NO_RULE;
- continue;
- }
- else if (isbooleanrule(word)) {
- rulenum = getrulenum(word);
- searchwordlist = searchwordlist->next;
- continue;
- }
-
- if (rulenum != AND_RULE)
- andLevel = 0; /* reset */
- else if (rulenum == AND_RULE)
- andLevel++;
-
- if (word[0] == '(') {
-
- searchwordlist = searchwordlist->next;
- newrp = (struct result *) parseterm(fp, 0, metaName);
-
- if (rulenum == AND_RULE)
- rp = (struct result *)
- andresultlists(rp, newrp, andLevel);
- else if (rulenum == OR_RULE)
- rp = (struct result *)
- orresultlists(rp, newrp);
- if (searchwordlist == NULL)
- break;
-
- rulenum = NO_RULE;
- continue;
-
- }
- else if (word[0] == ')') {
- searchwordlist = searchwordlist->next;
- break;
- }
-
- /* Check if the next word is '=' */
- if ( isMetaName(searchwordlist->next) ) {
- metaName = getMetaName(word);
- if (metaName == 1){
- printf ("err: The metaName %s doesn't exist in user configfilen", word);
- exit(0);
- }
- /* Skip both the metaName end the '=' */
- searchwordlist = searchwordlist->next->next;
- newrp = (struct result *) parseterm(fp, 1, metaName);
- if (rulenum == AND_RULE)
- rp = (struct result *) andresultlists(rp, newrp, andLevel);
- else if (rulenum == OR_RULE)
- rp = (struct result *) orresultlists(rp, newrp);
- if (searchwordlist == NULL)
- break;
-
- rulenum = NO_RULE;
- metaName = 1;
- continue;
- }
-
- rp = (struct result *) operate(rp, rulenum, word,
- fp, metaName,
- andLevel);
-
- if (parseone) {
- searchwordlist = searchwordlist->next;
- break;
- }
- rulenum = NO_RULE;
-
- searchwordlist = searchwordlist->next;
- }
-
- return rp;
- }
- /* Looks up a word in the index file -
- ** it calls getfileinfo(), which does the real searching.
- */
- struct result *operate(rp, rulenum, word, fp, metaName, andLevel)
- struct result *rp;
- int rulenum;
- char *word;
- FILE *fp;
- int metaName;
- int andLevel;
- {
- int i, found;
- struct result *newrp, *returnrp;
-
- if (applyStemmingRules)
- {
- /* apply stemming algorithm to the search term */
- Stem(word);
- }
- if (isstopword(word) && !isrule(word))
- {
- if (rulenum == OR_RULE && rp != NULL)
- return rp;
- else
- commonerror = 1;
- }
-
- for (i = found = 0; indexchars[i] != ' '; i++)
- {
- if (word[0] == indexchars[i])
- {
- fseek(fp, offsets[i], 0);
- found = 1;
- }
- }
- if (!found)
- {
- if (rulenum == AND_RULE)
- return NULL;
- else if (rulenum == OR_RULE)
- return rp;
- }
-
- newrp = (struct result *) getfileinfo(word, fp, metaName);
- if (rulenum == AND_RULE)
- returnrp = (struct result *) andresultlists(rp, newrp, andLevel);
- else if (rulenum == OR_RULE)
- returnrp = (struct result *) orresultlists(rp, newrp);
- else if (rulenum == NOT_RULE)
- returnrp = (struct result *) notresultlist(newrp, fp);
- return returnrp;
- }
- /* Looks up a file name in the index file.
- */
- char *lookupfile(filenum, fp, propPos)
- int filenum;
- FILE *fp;
- long *propPos;
- {
- static char line[MAXSTRLEN];
-
- fseek(fp, getfilenum(decodefilenum(filenum) - 1), 0);
- fgets(line, MAXSTRLEN, fp);
-
- #ifdef SUPPORT_DOC_PROPERTIES
- if (propPos != NULL)
- *propPos = ftell(fp);
- #endif
-
- return line;
- }
- /* Finds a word and returns its corresponding file and rank information list.
- ** If not found, NULL is returned.
- */
- struct result *getfileinfo(word, fp, metaName)
- char *word;
- FILE *fp;
- int metaName;
- {
- int i, c, x, countnum, rank, filenum, structure;
- char fileword[MAXWORDLEN];
- struct result *rp;
- int res;
-
- rp = NULL;
-
- for (i = 0; (c = fgetc(fp)) != 0; ) {
- if (c == ':') {
- fileword[i] = ' ';
- i = 0;
- res = strcmp(word,fileword);
- if (!res)
- break;
- else if (res > 0){
- while ((c = fgetc(fp)) != 0)
- ;
- if (offsets[STOPWORDPOS] == ftell(fp))
- return NULL;
- continue;
- }
- else if (res < 0)
- return NULL;
- }
- else
- fileword[i++] = c;
- }
- if (c == 0)
- return NULL;
-
- countnum = 1;
-
- ungetc(c, fp);
- while ((c = fgetc(fp)) != 0)
- {
- x = 0;
- do {
- c = fgetc(fp);
- if (c == 0)
- return rp;
- x *= 128;
- x += c & 127;
- } while (c & 128);
- if (x)
- {
- if (countnum == 1) {
- filenum = x;
- countnum++;
- }
- else if (countnum == 2) {
- rank = x;
- countnum++;
- }
- else if (countnum == 3) {
- structure = x;
- countnum++;
- }
- else if (countnum == 4) {
- if ( x == metaName )
- {
- rp = (struct result *) addtoresultlist(rp, filenum, rank, structure);
- if (verbose == 4)
- {
- /* dump diagnostic info */
- char* pos;
- char* fileinfo;
- long curFilePos;
- curFilePos = ftell(fp); /* save */
- fileinfo = lookupfile(filenum, fp, NULL);
- pos = strchr(fileinfo, '"'); /* after file name */
- if (pos)
- *(pos-1) = ' '; /* truncate */
-
- printf("# diagt%st%st%dn",
- fileinfo,
- word,
- rank);
-
- if (pos)
- *(pos-1) = ' '; /* restore */
- fseek(fp, curFilePos, 0); /* restore */
- }
-
- }
- countnum = 1;
- }
- }
- }
-
- return rp;
- }
/* Is a word a rule?
** Returns 1 for "and", "or" and "not" (exact match), 0 otherwise.
*/
int isrule(word)
char *word;
{
    return !strcmp(word, "and") || !strcmp(word, "or") || !strcmp(word, "not");
}
/* Is a word a boolean rule?
** Returns 1 for "and" and "or" (exact match), 0 otherwise.
*/
int isbooleanrule(word)
char *word;
{
    return !strcmp(word, "and") || !strcmp(word, "or");
}
/* Is a word a unary rule?
** Returns 1 for "not" (exact match), 0 otherwise.
*/
int isunaryrule(word)
char *word;
{
    return !strcmp(word, "not");
}
- /* Return the number for a rule.
- */
- int getrulenum(word)
- char *word;
- {
- if (!strcmp(word, "and"))
- return AND_RULE;
- else if (!strcmp(word, "or"))
- return OR_RULE;
- else if (!strcmp(word, "not"))
- return NOT_RULE;
- return NO_RULE;
- }
- /* Takes two lists of results from searches and ANDs them together.
- */
- struct result *andresultlists(r1, r2, andLevel)
- struct result *r1;
- struct result *r2;
- int andLevel;
- {
- static struct result *tmpnode, *newnode;
-
- if (r1 == NULL || r2 == NULL)
- return NULL;
-
- newnode = NULL;
- if (andLevel < 1)
- andLevel = 1;
-
- while (r1 != NULL) {
- tmpnode = r2;
- while (tmpnode != NULL) {
- if (r1->filenum == tmpnode->filenum)
- {
- /*
- * Computing the new rank is interesting because
- * we want to weight each of the words that was
- * previously ANDed equally along with the new word.
- * We compute a running average using andLevel and
- * simply scale up the old average (in r1->rank)
- * and recompute a new, equally weighted average.
- */
- int newRank;
- /*newRank = (r1->rank + tmpnode->rank) / 2;*/
- newRank = ((r1->rank * andLevel) + tmpnode->rank) / (andLevel+1);
- newnode = (struct result *)
- addtoresultlist(newnode,
- r1->filenum,
- newRank,
- r1->structure & tmpnode->structure);
- }
- tmpnode = tmpnode->next;
- }
- r1 = r1->next;
- }
-
- return newnode;
- }
- /* Takes two lists of results from searches and ORs them together.
- */
- struct result *orresultlists(r1, r2)
- struct result *r1;
- struct result *r2;
- {
- int i;
- struct result *rp;
- static struct result *newnode;
-
- newnode = NULL;
-
- if (r1 == NULL)
- return r2;
- else if (r2 == NULL)
- return r1;
-
- initresulthashlist();
- while (r1 != NULL) {
- mergeresulthashlist(r1->filenum, r1->rank, r1->structure);
- r1 = r1->next;
- }
- while (r2 != NULL) {
- mergeresulthashlist(r2->filenum, r2->rank, r2->structure);
- r2 = r2->next;
- }
- for (i = 0; i < HASHSIZE; i++) {
- rp = resulthashlist[i];
- while (rp != NULL) {
- newnode = (struct result *) addtoresultlist(newnode,
- rp->filenum, rp->rank, rp->structure);
- rp = rp->next;
- }
- }
-
- return newnode;
- }
- /* This performs the NOT unary operation on a result list.
- ** NOTed files are marked with a default rank of 1000.
- **
- ** Basically it returns all the files that have not been
- ** marked (GH)
- */
- struct result *notresultlist(rp, fp)
- struct result *rp;
- FILE *fp;
- {
- int i, filenums;
- struct result *newp;
-
- newp = NULL;
-
- initmarkentrylist();
- while (rp != NULL) {
- marknum(rp->filenum);
- rp = rp->next;
- }
-
- filenums = getindexfilenum(fp);
-
- for (i = 1; i <= filenums; i++) {
- if (!ismarked(i))
- newp = (struct result *) addtoresultlist(newp, i, 1000, IN_ALL);
- }
-
- return newp;
- }
- /* Adds a file number and rank to a list of results.
- */
- struct result *addtoresultlist(rp, filenum, rank, structure)
- struct result *rp;
- int filenum;
- int rank;
- int structure;
- {
- struct result *newnode;
- static struct result *head;
-
- newnode = (struct result *) emalloc(sizeof(struct result));
- newnode->filenum = filenum;
- newnode->rank = rank;
- newnode->structure = structure;
- newnode->next = NULL;
-
- if (rp == NULL)
- rp = newnode;
- else
- head->next = newnode;
-
- head = newnode;
-
- return rp;
- }
- /* Adds the results of a search, sorts them by rank.
- */
- struct sortresult *addsortresult(sp, rank, fileinfo, propPos, filenum)
- struct sortresult *sp;
- int rank;
- char *fileinfo;
- long propPos;
- int filenum;
- {
- if (rank > bigrank)
- bigrank = rank;
-
- if (sp == NULL) {
- char* endOfLinePos;
- sp = (struct sortresult *) emalloc(sizeof(struct sortresult));
- sp->rank = rank;
- sp->fileinfo = (char *) mystrdup(fileinfo);
- sp->left = sp->right = NULL;
- /* formatting search results is easier without the newline */
- endOfLinePos = strchr(sp->fileinfo, 'n');
- if (endOfLinePos)
- *endOfLinePos = ' ';
- #ifdef SUPPORT_DOC_PROPERTIES
- sp->propPos = propPos; /* allows later lookup of doc properties */
- #endif
- }
- else {
- if (sp->rank < rank)
- sp->left = (struct sortresult *) addsortresult(sp->left, rank, fileinfo, propPos, filenum);
- else
- sp->right = (struct sortresult *) addsortresult(sp->right, rank, fileinfo, propPos, filenum);
- }
-
- return sp;
- }
- /* Counts the number of files that are the result
- of a search
- */
- int countResults(sp)
- struct sortresult *sp;
- {
- int tot;
-
- if (sp == NULL)
- return 0;
- else
- {
- tot = countResults(sp->right) + countResults(sp->left) + 1;
- }
- return tot;
- }
- /* Prints the final results of a search.
- */
- void printsortedresults(sp, num, fp)
- struct sortresult *sp;
- double num;
- FILE* fp;
- {
- int rank;
-
- if (sp != NULL)
- {
- printsortedresults(sp->left, num, fp);
- rank = (int) ((float) sp->rank * num);
- if (rank >= 999)
- rank = 1000;
- if (maxhits)
- {
- if (useCustomOutputDelimiter)
- {
- /* parse fileinfo into filename and title */
- char* filename;
- char* title;
- char* endOfTitle = NULL;
- char* fileSize;
- filename = sp->fileinfo;
- title = strchr(filename, '"');
- if (title == NULL)
- {
- title = "";
- fileSize = "0";
- }
- else
- {
- *(title-1) = ' '; /* remove space between filename and title */
- title++; /* past double quote */
- endOfTitle = strchr(title, '"'); /* end of title */
- if (endOfTitle)
- {
- *endOfTitle = ' ';
- fileSize = endOfTitle+1;
- while (*fileSize == ' ')
- fileSize++;
- }
- else
- {
- fileSize = "0";
- }
- }
- printf("%d%s%s%s%s%s%s",
- (rank <= 0) ? 1 : rank,
- customOutputDelimiter,
- filename,
- customOutputDelimiter,
- title,
- customOutputDelimiter,
- fileSize);
- if (*title)
- {
- /* restore fileinfo... */
- *(--title) = ' '; /* restore space */
- if (endOfTitle)
- *endOfTitle = '"';
- }
- }
- else
- {
- printf("%d %s", (rank <= 0) ? 1 : rank, sp->fileinfo);
- }
- #ifdef SUPPORT_DOC_PROPERTIES
- printSearchResultProperties(sp->propPos, fp);
- #endif
- printf("n");
-
- if (maxhits > 0)
- {
- maxhits--;
- }
-
- }
- printsortedresults(sp->right, num, fp);
- }
- }
/* Reads a compressed line. This is just here for testing, etc.
**
** Skips bytes up to and including the first ':'.  After that, each pass
** consumes one byte and then a run of bytes that ends with the first
** byte whose bit 7 is clear.  Returns when a NUL byte is read inside
** such a run, or at end of file.
*/
void getrawindexline(fp)
FILE *fp;
{
    int c, inword;

    inword = 1;
    while ((c = fgetc(fp)) != EOF) {
        if (c == ':' && inword)
            inword = 0;
        if (inword)
            continue;
        do {
            c = fgetc(fp);
            /* EOF has bit 7 set, so without this check a truncated
             * file would keep the loop running forever */
            if (c == 0 || c == EOF)
                return;
        } while (c & 128);
    }
}
- /* Does an index file have a readable format?
- */
- int isokindexheader(fp)
- FILE *fp;
- {
- char line[MAXSTRLEN];
-
- fseek(fp, 0, 0);
- fgets(line, MAXSTRLEN, fp);
- if (line[strlen(line) - 1] == 'n')
- line[strlen(line) - 1] = ' ';
- if (strcmp(line, INDEXHEADER)) {
- fseek(fp, 0, 0);
- return 0;
- }
- fseek(fp, 0, 0);
- return 1;
- }
- int wasStemmingAppliedToIndex(fp)
- FILE *fp;
- {
- /* Check the header for the magic line "# Stemming Applied:"
- * and see if stemming was applied.
- * Return 1 if it was, 0 otherwise
- */
- char line[MAXSTRLEN];
- int stemmingDone = 0; /* assume no stemming */
- int hdrLen;
- hdrLen = strlen(STEMMINGHEADER);
- fseek(fp, 0, 0);
- fgets(line, MAXSTRLEN, fp);
- while (line[0] == '#')
- {
- if (strncmp(line, STEMMINGHEADER, hdrLen) == 0)
- {
- /* found the line. what does it say? */
- stemmingDone = atoi(line+hdrLen);
- break;
- }
- fgets(line, MAXSTRLEN, fp);
- }
- fseek(fp, 0, 0);
- return stemmingDone;
- }
- /* Returns the value associated with the metaName if it exists
- */
- int getMetaName(word)
- char * word;
- {
- struct metaEntry* temp;
-
- for (temp = metaEntryList; temp != NULL; temp = temp->next)
- if (!strcmp(temp->metaName, word))
- return temp->index;
-
- return 1;
- }
- /* Checks if the next word is "="
- */
- int isMetaName (searchWord)
- struct swline* searchWord;
- {
- if (searchWord == NULL)
- return 0;
- if (!strcmp(searchWord->line, "=") )
- return 1;
- return 0;
- }
-