index.c
上传用户:qin5330
上传日期:2007-01-05
资源大小:114k
文件大小:29k
- /*
- ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
- ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
- **
- ** This program and library is free software; you can redistribute it and/or
- ** modify it under the terms of the GNU (Library) General Public License
- ** as published by the Free Software Foundation; either version 2
- ** of the License, or any later version.
- **
- ** This program is distributed in the hope that it will be useful,
- ** but WITHOUT ANY WARRANTY; without even the implied warranty of
- ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ** GNU (Library) General Public License for more details.
- **
- ** You should have received a copy of the GNU (Library) General Public License
- ** along with this program; if not, write to the Free Software
- ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- **--------------------------------------------------------------------
- ** ** ** PATCHED 5/13/96, CJC
- **
- ** Added code to countwords and countwordstr to disregard the last char
- ** if required by the config.h
- ** G. Hill 3/12/97 ghill@library.berkeley.edu
- **
- ** Changed addentry, countwords, countwordstr, parsecomment, printindex
- ** added createMetaEntryList, getMeta, parseMetaData
- ** to support METADATA
- ** G. Hill 3/18/97 ghill@library.berkeley.edu
- **
- ** Changed removestops to support printing of stop words
- ** G. Hill 4/7/97
- **
- ** Changed countwords, countwordstr, and parseMetaData to disregard the
- ** first char if required by the config.h
- ** G.Hill 10/16/97 ghill@library.berkeley.edu
- **
- ** Added stripIgnoreLastChars and isIgnoreLastChar routines which iteratively
- ** remove all ignore characters from the end of each word.
- ** P. Bergner 10/5/97 bergner@lcse.umn.edu
- **
- ** Added stripIgnoreFirstChars and isIgnoreFirstChar to make stripping of
- ** the ignore first chars iterative.
- ** G. Hill 11/19/97 ghill@library.berkeley.edu
- **
- ** Added possibility of use of quotes and brackets in meta CONTENT countwords, parsemetadata
- ** G. Hill 1/14/98
- **
- ** Added regex for replace rule G.Hill 1/98
- */
- #include "swish.h"
- #include "index.h"
- #include "hash.h"
- #include "mem.h"
- #include "string.h"
- #include "check.h"
- #include "search.h"
- #include "docprop.h"
- #include "stemmer.h"
- /* Stores file names in alphabetical order so they can be
- ** indexed alphabetically. No big whoop.
- */
- struct sortentry *addsortentry(e, filename, title)
- struct sortentry *e;
- char *filename;
- char *title;
- {
- if (e == NULL) {
- e = (struct sortentry *) emalloc(sizeof(struct sortentry));
- e->filename = (char *) mystrdup(filename);
- e->title = (char *) mystrdup(title);
- e->left = e->right = NULL;
- }
- else {
- if (strcmp(e->filename, filename) > 0)
- e->left = (struct sortentry *)
- addsortentry(e->left, filename, title);
- else
- e->right = (struct sortentry *)
- addsortentry(e->right, filename, title);
- }
-
- return e;
- }
- /* Adds a word to the master index tree.
- */
- struct entry *addentry(e, word, filenum, emphasized, structure, metaName)
- struct entry *e;
- char *word;
- int filenum;
- int emphasized;
- int structure;
- int metaName;
- {
- int isbigger;
- struct location *tp, *oldtp;
-
- if (e == NULL) {
- e = (struct entry *) emalloc(sizeof(struct entry));
- e->word = (char *) mystrdup(word);
- e->tfrequency = 1;
- e->locationlist = (struct location *)
- emalloc(sizeof(struct location));
- e->locationlist->filenum = filenum;
- e->locationlist->frequency = 1;
- e->locationlist->emphasized = emphasized;
- e->locationlist->structure = structure;
- e->locationlist->metaName = metaName;
- e->locationlist->next = NULL;
- e->left = e->right = NULL;
- totalwords++;
- }
- else {
- isbigger = wordcompare(e->word, word);
- if (isbigger == 0) {
- tp = e->locationlist;
- while (tp != NULL) {
- if (tp->filenum == filenum && tp->metaName == metaName)
- break;
- oldtp = tp;
- tp = tp->next;
- }
- if (tp == NULL) {
- tp = (struct location *) emalloc(sizeof(struct
- location));
- tp->filenum = filenum;
- tp->frequency = 1;
- tp->emphasized = emphasized;
- tp->structure = structure;
- tp->metaName = metaName;
- tp->next = NULL;
- oldtp->next = tp;
- if (!emphasized)
- e->tfrequency = e->tfrequency + 1;
- }
- else {
- if ((tp->filenum == filenum) && (tp->metaName == metaName))
- {
- tp->frequency++;
- if (emphasized)
- tp->emphasized++;
- tp->structure |= structure;
- }
- }
- }
- else if (isbigger > 0)
- {
- e->left = (struct entry *)
- addentry(e->left, word, filenum, emphasized, structure, metaName);
- }
- else
- {
- e->right = (struct entry *)
- addentry(e->right, word, filenum, emphasized, structure, metaName);
- }
- }
-
- return e;
- }
- /* Adds a file to the master list of files and file numbers.
- */
- struct file *addtofilelist(filep, filename, title, size, newFileEntry)
- struct file *filep;
- char *filename;
- char *title;
- int size;
- struct file ** newFileEntry;
- {
- struct file *newnode;
- static struct file *filelistp = NULL;
-
- newnode = (struct file *) emalloc(sizeof(struct file));
- if (newFileEntry != NULL)
- {
- *newFileEntry = newnode; /* pass object pointer up to caller */
- }
- newnode->filename = (char *) mystrdup(filename);
- newnode->title = (char *) mystrdup(title);
- newnode->size = size;
- newnode->next = NULL;
-
- #ifdef SUPPORT_DOC_PROPERTIES
- newnode->docProperties = NULL;
- #endif
- if (filep == NULL)
- filep = newnode;
- else if (filelistp != NULL)
- filelistp->next = newnode;
-
- filelistp = newnode;
-
- return filep;
- }
- /* Just goes through the master list of files and
- ** counts 'em.
- */
- int getfilecount(filep)
- struct file *filep;
- {
- int i;
-
- for (i = 0; filep != NULL; filep = filep->next)
- i++;
-
- return i;
- }
- /* Returns the nicely formatted date.
- */
- char *getthedate()
- {
- static char date[MAXSTRLEN];
- time_t time;
-
- time = (time_t) getthetime();
- /* strftime(date, MAXSTRLEN, "%x %X", (struct tm *) localtime(&time));*/
- strftime(date, MAXSTRLEN, "%d/%m/%y %H:%M:%S %Z",
- (struct tm *) localtime(&time));
-
- return date;
- }
- /* Indexes all the words in a file and adds the appropriate information
- ** to the appropriate structures.
- */
- int countwords(vp, filename, title, indextitleonly)
- void *vp;
- char *filename;
- char *title;
- int indextitleonly;
- {
- int c, i, j, inword, ftotalwords, emphasized, structure;
- int metaName;
- static int filenum;
- char word[MAXWORDLEN], tag[MAXSTRLEN];
- struct file *thisFileEntry = NULL;
-
- ftotalwords = 0;
- if (indextitleonly) {
- filelist = addtofilelist(filelist, filename, title, vsize(vp), NULL);
- filenum++;
- if (!(filenum % 128))
- filenum++;
- addtofwordtotals(filenum, 100);
- return (countwordstr(title, filenum, 0));
- }
-
- filelist = addtofilelist(filelist, filename, title, vsize(vp), &thisFileEntry);
- filenum++;
- if (!(filenum % 128))
- filenum++;
-
- c = 1;
- i = j = 0;
- inword = 0;
- emphasized = 0;
- structure = 1;
- metaName = 1;
-
- while (c != EOF && (c = vgetc(vp)) != EOF)
- {
- if (!inword) {
- if (iswordchar((char)c)) {
- i = 0;
- word[i++] = c;
- if (i == MAXWORDLEN)
- i--;
- inword = 1;
- }
- }
- else if (inword)
- {
- if (!iswordchar((char)c))
- {
- word[i++] = ' ';
- if (i == MAXWORDLEN)
- word[--i] = ' ';
-
- /* Move this stuff after entities are converted
- for (i = 0; word[i]; i++)
- word[i] = tolower(word[i]);
- i = 0;
- */
- if (isokword(word))
- {
- strcpy(word, (char *) convertentities(word));
- }
-
- /* Ok, can now go to lowercase, the whole problem
- was with entities Á would become á
- */
- for (i = 0; word[i]; i++)
- word[i] = tolower(word[i]);
- i = 0;
-
- /* Get rid of the last char's */
- stripIgnoreLastChars(word);
-
- /* Get rid of the first char */
- stripIgnoreFirstChars(word);
-
- if (applyStemmingRules)
- {
- /* apply stemming algorithm to the word to index */
- Stem(word);
- }
- /* Sorry, have to do isokword() twice to filter out converted strings! */
-
- if (hasokchars(word) && isokword(word))
- {
- #ifdef DEBUG
- printf(" %s %dn", word, structure);
- #endif
- entrylist = (struct entry *)
- addentry(entrylist, word, filenum, emphasized, structure, metaName);
- ftotalwords++;
- }
- inword = 0;
- }
- else {
- word[i++] = c;
- if (i == MAXWORDLEN)
- i--;
- }
- }
- if (c == '<' && !INDEXTAGS) {
- j = 0;
- while ((c = vgetc(vp)) != EOF) {
- tag[j++] = c;
- if (j == MAXSTRLEN)
- j--;
- if (c == '>' && notEscaped(tag,j) ) {
- if (j)
- tag[--j] = ' ';
- else
- tag[j] = ' ';
- #ifdef DEBUG
- printf("t: %sn", tag);
- #endif
- structure = getstructure(tag,
- structure);
- #ifdef DEBUG
- printf("s: %dn", structure);
- #endif
- if ((tag[0]=='!') &&
- lstrstr(tag,"META") &&
- (lstrstr(tag,"START") ||
- lstrstr(tag,"END") ) )
- {
- if (lstrstr(tag, "START"))
- {
- metaName=getMeta(tag, NULL);
- /* If there is not a legal metaName
- ** the program quits into getMeta
- */
- }
- else if (lstrstr(tag, "END") )
- {
- metaName = 1;
- }
- }
- else if ( (tag[0] != '!') &&
- (lstrstr(tag, "META")) &&
- (lstrstr(tag,"NAME")) &&
- (lstrstr(tag,"CONTENT")) )
- {
- ftotalwords += parseMetaData(tag, filenum, structure, thisFileEntry);
- }
- else if ( (tag[0] == '!') && indexComments)
- {
- ftotalwords += parsecomment(tag, filenum, structure,1);
- }
-
- if ((structure & IN_HEADER) ||
- (structure & IN_TITLE))
- emphasized = 5;
- else
- emphasized = 0;
-
- break;
- }
- }
- }
- }
- addtofwordtotals(filenum, ftotalwords);
- return ftotalwords;
- }
- /* Indexes the words in a string, such as a file name or an
- ** HTML title.
- */
- int countwordstr(s, filenum, emphasized)
- char *s;
- int filenum;
- int emphasized;
- {
- int i, j, inword, wordcount;
- char c, word[MAXWORDLEN], tmpstr[MAXFILELEN];
-
- sprintf(tmpstr, "%s ", s);
- for (j = inword = wordcount = 0; (c = tmpstr[j]) != ' '; j++) {
- if (!inword) {
- if (iswordchar(c)) {
- i = 0;
- word[i++] = c;
- if (i == MAXWORDLEN)
- i--;
- inword = 1;
- }
- }
- else {
- if (!iswordchar(c)) {
- wordcount++;
- word[i] = ' ';
-
- /* Same as above, move after convertentities
- for (i = 0; word[i]; i++)
- word[i] = tolower(word[i]);
- */
-
- if (isokword(word))
- strcpy(word, (char *)
- convertentities(word));
-
- /* Ok to go lower now */
- for (i = 0; word[i]; i++)
- word[i] = tolower(word[i]);
-
- /* Get rid of specified last char's */
- stripIgnoreLastChars (word);
-
- /* Get rid of the first char */
- stripIgnoreFirstChars(word);
-
- if (hasokchars(word) && isokword(word))
- entrylist = (struct entry *)
- addentry(entrylist, word,
- filenum, emphasized, IN_FILE, 1);
- inword = 0;
- }
- else {
- word[i++] = c;
- if (i == MAXWORDLEN)
- i--;
- }
- }
- }
-
- return wordcount;
- }
- /* This returns the value corresponding to the HTML structures
- ** a word is in.
- */
- int getstructure(tag, structure)
- char *tag;
- int structure;
- {
- int len;
- char oldChar = 0;
- char* endOfTag = NULL;
- char* pos;
- pos = tag;
- while (*pos)
- {
- if (isspace(*pos))
- {
- endOfTag = pos; /* remember where we are... */
- oldChar = *pos; /* ...and what we saw */
- *pos = ' '; /* truncate string, for now */
- }
- else
- pos++;
- }
- if (ourstricmp(tag, "/title") == 0)
- structure &= ~IN_TITLE;
- else if (ourstricmp(tag, "title") == 0)
- structure |= IN_TITLE;
- else if (ourstricmp(tag, "/head") == 0)
- structure &= ~IN_HEAD;
- else if (ourstricmp(tag, "head") == 0)
- structure |= IN_HEAD;
- else if (ourstricmp(tag, "/body") == 0)
- structure &= ~IN_BODY;
- else if (ourstricmp(tag, "body") == 0)
- structure |= IN_BODY;
- else if (tag[0] == '/' && tolower(tag[1]) == 'h' && isdigit(tag[2]))
- structure &= ~IN_HEADER;
- else if (tolower(tag[0]) == 'h' && isdigit(tag[1]))
- structure |= IN_HEADER;
- else if ((ourstricmp(tag, "/em") == 0) ||
- (ourstricmp(tag, "/strong") == 0))
- structure &= ~IN_EMPHASIZED;
- else if ((ourstricmp(tag, "em") == 0) ||
- (ourstricmp(tag, "strong") == 0))
- structure |= IN_EMPHASIZED;
- else if ((ourstricmp(tag, "b") == 0) || (ourstricmp(tag, "i") == 0))
- structure |= IN_EMPHASIZED;
- else if ((ourstricmp(tag, "/b") == 0) ||
- (ourstricmp(tag, "/i") == 0))
- structure &= ~IN_EMPHASIZED;
- if (endOfTag != NULL)
- {
- *endOfTag = oldChar;
- }
- return structure;
- }
- /* Parses the words in a comment.
- */
- int parsecomment(tag, filenum, structure, metaName)
- char *tag;
- int filenum;
- int structure;
- int metaName;
- {
- int i, j, inword, wordcount, emphasized;
- char c, word[MAXWORDLEN];
-
- if (EMPHASIZECOMMENTS)
- emphasized = 5;
- else
- emphasized = 0;
- structure |= IN_COMMENTS;
- for (j = 1, inword = wordcount = 0; (c = tag[j]) != ' '; j++) {
- if (!inword) {
- if (iswordchar(c)) {
- i = 0;
- word[i++] = c;
- if (i == MAXWORDLEN)
- i--;
- inword = 1;
- }
- }
- else {
- if (!iswordchar(c)) {
- wordcount++;
- word[i] = ' ';
- for (i = 0; word[i]; i++)
- word[i] = tolower(word[i]);
- if (isokword(word))
- strcpy(word, (char *)
- convertentities(word));
- if (hasokchars(word) && isokword(word))
- {
- if (applyStemmingRules)
- {
- /* apply stemming algorithm to the word to index */
- Stem(word);
- }
- entrylist = (struct entry *)
- addentry(entrylist, word, filenum, emphasized, structure, metaName);
- }
- inword = 0;
- }
- else {
- word[i++] = c;
- if (i == MAXWORDLEN)
- i--;
- }
- }
- }
-
- return wordcount;
- }
- /* Removes words that occur in over _plimit_ percent of the files and
- ** that occur in over _flimit_ files (marks them as stopwords, that is).
- */
- int removestops(ep, totalfiles, plimit, flimit)
- struct entry *ep;
- int totalfiles;
- int plimit;
- int flimit;
- {
- int percent, wordfilecount, stopwords;
- struct location *lp;
-
- stopwords = 0;
- if (ep != NULL) {
- stopwords += removestops(ep->left, totalfiles, plimit, flimit);
- lp = ep->locationlist;
- wordfilecount = 0;
- while (lp != NULL) {
- wordfilecount++;
- lp = lp->next;
- }
- percent = (int) (((float) wordfilecount / (float) totalfiles) * 100.0f);
- if (percent >= plimit && wordfilecount >= flimit) {
- addStopList(ep->word);
- addstophash(ep->word);
- stopwords++;
- }
- stopwords += removestops(ep->right,totalfiles, plimit, flimit);
- }
- return stopwords;
- }
- /* This is somewhat similar to the rank calculation algorithm
- ** from WAIS (I think). Any suggestions for improvements?
- ** Note that ranks can't be smaller than 1, emphasized words
- ** (words in titles, headers) have ranks multiplied by at least 5
- ** (just a guess), and ranks divisible by 128 are bumped up by one
- ** (to make the compression scheme with with ' ' as a line delimiter
- ** work). Fudging with the ranks doesn't seem to make much difference.
- */
- int getrank(freq, tfreq, words, emphasized)
- int freq;
- int tfreq;
- int words;
- int emphasized;
- {
- double d, e, f;
- int tmprank;
- if (freq < 5)
- freq = 5;
- d = 1.0 / (double) tfreq;
- e = (log((double) freq) + 10.0) * d;
- if (!ignoreTotalWordCountWhenRanking)
- {
- e /= words;
- }
- else
- {
- /* scale the rank down a bit. a larger has the effect of
- making small differences in work frequency wash out */
- e /= 100;
- }
- f = e * 10000.0;
-
- /*sprintf(rankstr, "%f", f);
- tmprank = atoi(rankstr);*/
- tmprank = (int) f;
- if (tmprank <= 0)
- tmprank = 1;
- if (emphasized)
- tmprank *= emphasized;
- if (!(tmprank % 128))
- tmprank++;
-
- return tmprank;
- }
- /* Prints the index information at the head of index files.
- */
- void printheader(fp, filename, totalwords, totalfiles, merged)
- FILE *fp;
- char *filename;
- int totalwords;
- int totalfiles;
- int merged;
- {
- char *c;
-
- c = (char *) strrchr(filename, '/');
-
- fprintf(fp, "%sn", INDEXHEADER);
- fprintf(fp, "%sn", INDEXVERSION);
- fprintf(fp, "# %sn", (merged) ? "MERGED INDEX" : "");
- fprintf(fp, "# Name: %sn", (indexn[0] == ' ') ? "(no name)" :
- indexn);
- fprintf(fp, "# Saved as: %sn",
- (c == NULL && c + 1 != ' ') ? filename : c + 1);
- fprintf(fp, "# Counts: ");
- if (totalwords)
- fprintf(fp, "%d words%s", totalwords, (totalfiles) ? ", " : "");
- if (totalfiles)
- fprintf(fp, "%d files", totalfiles);
- fprintf(fp, "n");
- fprintf(fp, "# Indexed on: %sn", getthedate());
- fprintf(fp, "# Description: %sn", (indexd[0] == ' ') ?
- "(no description)" : indexd);
- fprintf(fp, "# Pointer: %sn", (indexp[0] == ' ') ?
- "(no pointer)" : indexp);
- fprintf(fp, "# Maintained by: %sn", (indexa[0] == ' ') ?
- "(no maintainer)" : indexa);
- #ifdef SUPPORT_DOC_PROPERTIES
- fprintf(fp, "# DocumentProperties: %sn", "Enabled");
- #endif
- fprintf(fp, "%s %dn", STEMMINGHEADER, applyStemmingRules);
- }
- char* getFileNameByFileNum(int filenum)
- {
- /* for diagnostics only */
- struct file *filep = filelist;
- while ((filep != NULL) && --filenum)
- {
- filep = filep->next;
- }
- if (filep != NULL)
- return filep->filename;
- else
- return "";
- }
- /* Print the index entries that hold the word, rank, and other information.
- */
- void printindex(ep, fp)
- struct entry *ep;
- FILE *fp;
- {
- int i, rank;
- struct location *lp;
-
- if (ep != NULL) {
- printindex(ep->left, fp);
- if (!isstopword(ep->word)) {
-
- for (i = 0; indexchars[i] != ' '; i++)
- if ((ep->word)[0] == indexchars[i] &&
- !offsets[i])
- offsets[i] = ftell(fp);
-
- fprintf(fp, "%s:", ep->word);
- lp = ep->locationlist;
- while (lp != NULL) {
- int totalWords;
- totalWords = gettotalwords(lp->filenum);
- rank = getrank(lp->frequency, ep->tfrequency,
- totalWords, lp->emphasized);
- if (verbose == 4)
- {
- printf("%st%st%dt%dt%dt%dt%dt%dt%dn",
- getFileNameByFileNum(lp->filenum),
- ep->word,
- lp->emphasized,
- rank,
- lp->frequency,
- ep->tfrequency,
- totalWords,
- lp->structure,
- lp->metaName);
- }
- compress(lp->filenum, fp);
- compress(rank, fp);
- compress(lp->structure, fp);
- compress(lp->metaName, fp);
- lp = lp->next;
- }
- fputc(0, fp);
-
- }
- printindex(ep->right, fp);
- }
- }
- /* Prints the list of stopwords into the index file.
- */
- void printstopwords(fp)
- FILE *fp;
- {
- int hashval;
- struct swline *sp;
-
- offsets[STOPWORDPOS] = ftell(fp);
- for (hashval = 0; hashval < HASHSIZE; hashval++) {
- sp = hashstoplist[hashval];
- while (sp != NULL) {
- fprintf(fp, "%s ", sp->line);
- sp = sp->next;
- }
- }
- fprintf(fp, "n");
- }
- void writeFileEntry(filep, fp)
- struct file *filep;
- FILE *fp;
- {
- fprintf(fp, "%s "%s" %dn",
- ruleparse(filep->filename),
- filep->title, filep->size);
- #ifdef SUPPORT_DOC_PROPERTIES
- storeDocProperties(filep->docProperties, fp);
- freeDocProperties(&filep->docProperties);
- #endif
- }
- void readFileEntry(fp, line, docProperties)
- FILE* fp;
- char* line;
- struct docPropertyEntry **docProperties;
- {
- fgets(line, MAXSTRLEN, fp);
- if (docProperties != NULL)
- *docProperties = NULL;
- #ifdef SUPPORT_DOC_PROPERTIES
- /* read (or skip over) the document properties section */
- fetchDocProperties(docProperties, fp);
- #endif
- }
- /* Prints the list of files, titles, and sizes into the index file.
- */
- void printfilelist(filep, fp)
- struct file *filep;
- FILE *fp;
- {
- int i;
-
- i = 0;
- offsets[FILELISTPOS] = ftell(fp);
- while (filep != NULL)
- {
- int filenum = i++;
- addtofilehashlist(filenum, ftell(fp));
- writeFileEntry(filep, fp);
- filep = filep->next;
- }
- }
- /* Prints the list of metaNames into the file index
- */
- void printMetaNames(fp)
- FILE *fp;
- {
- struct metaEntry* entry;
- char docPropStyle[20];
-
- offsets[METANAMEPOS] = ftell(fp);
- for (entry = metaEntryList; entry; entry = entry->next)
- {
- docPropStyle[0] = ' ';
- #ifdef SUPPORT_DOC_PROPERTIES
- if (entry->isDocProperty)
- {
- /* write the meta name style:
- * <name>"0 -> normal meta name [default, so does not have to be written]
- * <name>"1 -> doc property name
- * <name>"2 -> both
- */
- sprintf(docPropStyle, ""%d", (entry->isOnlyDocProperty) ? 1 : 2);
- }
- #endif
- fprintf(fp, "%s%s ", entry->metaName, docPropStyle);
- }
- fprintf(fp, "n");
- }
- /* Prints the list of file offsets into the index file.
- */
- void printfileoffsets(fp)
- FILE *fp;
- {
- int i;
-
- offsets[FILEOFFSETPOS] = ftell(fp);
- for (i = 0; getfilenum(i) != 0; i++)
- fprintf(fp, "%016li", getfilenum(i));
- fprintf(fp,"n");
- }
/* Takes a number and prints it to a file using the simple accordion
** scheme of storing numbers.
**
** The number is split into base-128 digits, written most significant
** first; every digit except the last has bit 7 set. One extra byte
** follows each number. It used to be read from the element before the
** digit buffer (an out-of-bounds access) with bit 7 forced on; it is
** now always 0x80. The byte itself must stay: decompress() consumes
** one byte after every number it decodes. 0x80 is a continuation byte
** with a zero payload, so it also adds nothing if a reader folds it
** into the following number.
**
** For num == 0 only the extra byte is written. A number whose low
** seven bits are all zero ends in a 0 byte, which is what printindex()
** writes to terminate an entry.
*/
void compress(num, fp)
int num;
FILE *fp;
{
    unsigned char digits[8];
    int count = 0;

    while (num && count < (int) sizeof(digits)) {
        digits[count++] = (unsigned char) (num % 128);
        num /= 128;
    }
    while (count-- > 0)
        fputc(digits[count] | (count ? 128 : 0), fp);
    fputc(128, fp);     /* trailing pad byte, see above */
}
- /* Prints out the decompressed values in an index file.*/
- void decompress(fp)
- FILE *fp;
- {
- int c, x, inword, fieldnum;
- long pos;
- char line[MAXSTRLEN], header[MAXHEADCHARS + 1];
-
- readoffsets(fp);
- if (verbose == 4)
- {
- readfileoffsets(fp);
- }
- fseek(fp, 0, 0);
- inword = 1;
- fieldnum = 0;
-
- while (1) {
- c = fgetc(fp);
- ungetc(c, fp);
- if (c == '#') {
- fgets(line, MAXSTRLEN, fp);
- printf("%s", line);
- continue;
- }
- else {
- fgets(header, MAXHEADCHARS + 1, fp);
- printf("%s", header);
- break;
- }
- }
-
- while ((c = fgetc(fp)) != EOF)
- {
- if (c == ':' && inword) {
- inword = 0;
- putchar(c);
- }
- if (inword)
- putchar(c);
- else {
- x = 0;
- do {
- c = fgetc(fp);
- pos = ftell(fp);
- if (pos == offsets[STOPWORDPOS]) {
- putchar('n');
- while (fgets(line, MAXSTRLEN, fp) != NULL)
- {
- printf("%s", line);
- }
- return;
- }
- if (c == 0) {
- putchar('n');
- inword = 1;
- break;
- }
- x *= 128;
- x += c & 127;
- } while (c & 128);
- if (x)
- {
- if (verbose == 4)
- {
- if (fieldnum == 0)
- {
- char* filename;
- char* junk;
-
- pos = ftell(fp);
- filename = lookupfile(x, fp, NULL);
- junk = strchr(filename, '"');
- *(junk-1) = ' ';
- printf(" %s", filename);
- fseek(fp, pos, 0);
- }
- else if (fieldnum == 1)
- {
-
- printf(" %d", x);
- }
- else if (fieldnum == 2)
- {
- }
- else if (fieldnum == 3)
- {
- }
- }
- else
- {
- printf(" %d", x);
- }
-
- fieldnum++;
- if (fieldnum == 4)
- fieldnum = 0;
- }
- }
- }
- }
- /* Parses lines according to the ReplaceRules directives.
- */
- char *ruleparse(line)
- char *line;
- {
- char rule[MAXSTRLEN];
- static char tmpline[MAXSTRLEN], newtmpline[MAXSTRLEN];
- static char line1[MAXSTRLEN], line2[MAXSTRLEN];
- struct swline *tmplist;
-
- if (replacelist == NULL)
- return line;
-
- tmplist = replacelist;
- strcpy(tmpline, line);
- while (1)
- {
- if (tmplist == NULL)
- return tmpline;
- strcpy(rule, tmplist->line);
- tmplist = tmplist->next;
- if (tmplist == NULL)
- return tmpline;
- if (rule == NULL) {
- replacelist = tmplist;
- return tmpline;
- }
- else {
- if (lstrstr(rule, "replace")) {
- strcpy(line1, tmplist->line);
- tmplist = tmplist->next;
- if (tmplist)
- {
- strcpy(line2, tmplist->line);
- tmplist = tmplist->next;
- }
- else
- {
- /* Handle case where 2nd part of replace rule
- ** is an empty string. Config-file parsing
- ** idiosyncrasies cause a replace of "x" to ""
- ** to incompletely represent the rule.
- */
- line2[0] = ' ';
- }
- strcpy(newtmpline, (char *) matchAndChange(tmpline,
- line1, line2));
- }
- else if (lstrstr(rule, "append")) {
- sprintf(newtmpline, "%s%s", tmpline,
- tmplist->line);
- tmplist = tmplist->next;
- }
- else if (lstrstr(rule, "prepend")) {
- sprintf(newtmpline, "%s%s", tmplist->line,
- tmpline);
- tmplist = tmplist->next;
- }
- else if (lstrstr(rule,"remove")) {
- strcpy(newtmpline, (char *)matchAndChange(tmpline,tmplist->line,""));
- }
- strcpy(tmpline, newtmpline);
- }
- }
- }
- /* Get the MetaData index when the whole tag is passed */
- int getMeta(tag, docPropName)
- char* tag;
- int* docPropName;
- {
- char* temp;
- char word[MAXWORDLEN];
- int i;
- struct metaEntry* list;
-
- if (docPropName != NULL)
- {
- *docPropName = 0;
- }
-
- i = 0;
- temp = (char*) lstrstr((char*)tag,(char*) "NAME");
- if (temp == NULL)
- return 1;
-
- temp += strlen("NAME");
-
- /* Get to the '=' sign disreguarding blanks */
- while (temp != NULL) {
- if (strncmp(temp, "=",1))
- temp++;
- else {
- temp++;
- break;
- }
- }
-
- /* Get to the beginning of the word disreguarding blanks and quotes */
- while (temp != NULL) {
- if (!strncmp(temp," ",1) || !strncmp(temp,""",1) )
- temp++;
- else
- break;
- }
-
- /* Copy the word and convert to lowercase */
- while (temp !=NULL && strncmp(temp," ",1)
- && strncmp(temp,""",1) && i<= MAXWORDLEN ) {
- word[i] = *temp++;
- word[i] = tolower(word[i]);
- i++;
- }
-
- if (i == MAXWORDLEN)
- word[--i] = ' ';
- else
- word[i] = ' ';
-
- for (list = metaEntryList; list != NULL; list = list->next)
- {
- if (!strcmp(list->metaName, word) )
- {
- #ifdef SUPPORT_DOC_PROPERTIES
- if ((docPropName != NULL) && (list->isDocProperty))
- {
- *docPropName = list->index;
- }
- if (list->isOnlyDocProperty)
- {
- /* property is not for indexing, so return generic metaName value */
- return 1;
- }
- #endif
- return list->index;
- }
- }
-
- /* If it is ok not to have the name listed, just index as no-name */
- if (OKNOMETA) {
- /* printf ("nwarning: metaName %s does not exiest in the user config file", word); */
- return 1;
- }
- else {
- printf ("nerr: INDEXING FAILUREn");
- printf ("err: The metaName %s does not exist in the user config filen", word);
- exit(0);
- }
-
- }
- /* Parses the Meta tag */
- int parseMetaData(tag, filenum, structure, thisFileEntry)
- char* tag;
- int filenum;
- int structure;
- struct file* thisFileEntry;
- {
- int metaName, j, i, inword, wordcount, emphasized, jstart;
- char* temp;
- char c, word[MAXWORDLEN];
- int docPropName = 0;
-
- wordcount = 0;
- temp = NULL;
- metaName= getMeta(tag, &docPropName);
- temp = (char*) lstrstr((char*) tag,(char*) "CONTENT");
-
- /* if there is no CONTENT is another tag so just ignore the whole thing
- * the check is done here istead of before because META tags do not have
- * a fixed length that can be checked
- */
- if (temp != NULL)
- {
- temp += strlen("CONTENT");
-
- /* Get to the " sign disreguarding blanks */
- while (temp != NULL) {
- if (strncmp(temp, """,1))
- temp++;
- else {
- temp++;
- break;
- }
- }
-
- jstart = strlen(tag) - strlen(temp);
-
- if (EMPHASIZECOMMENTS)
- emphasized = 5;
- else
- emphasized = 0;
-
- #ifdef SUPPORT_DOC_PROPERTIES
- if (docPropName)
- {
- temp = strchr(tag + jstart, '"'); /* first quote after start of CONTENT */
- if (temp != NULL)
- {
- *temp = ' '; /* terminate CONTENT, temporarily */
- addDocProperty(&thisFileEntry->docProperties, docPropName, tag+jstart);
- *temp = '"'; /* restore string */
- }
- }
- #endif
-
- for (j = jstart, inword = wordcount = 0;(c = tag[j]) != ' '; j++) {
- if (!inword) {
- if (iswordchar(c)) {
- i = 0;
- word[i++] = c;
- if (i == MAXWORDLEN)
- i--;
- inword = 1;
- }
- }
- else {
- if (!iswordchar(c)) {
- wordcount++;
- word[i] = ' ';
- for (i = 0; word[i]; i++)
- word[i] = tolower(word[i]);
- if (isokword(word))
- strcpy(word, (char *)
- convertentities(word));
- /* Get rid of the last specified char's */
- stripIgnoreLastChars(word);
-
- /* Get rid of the first char */
- stripIgnoreFirstChars(word);
-
- if (applyStemmingRules)
- {
- /* apply stemming algorithm to the word to index */
- Stem(word);
- }
-
- if (hasokchars(word) && isokword(word))
- entrylist = (struct entry *)
- addentry(entrylist, word,
- filenum, emphasized,
- structure, metaName);
- inword = 0;
- if (c == '"' && tag[j-1] != '\')
- break;
- }
- else {
- word[i++] = c;
- if (i == MAXWORDLEN)
- i--;
- }
- }
- }
- }
- return wordcount;
- }
/* Removes every trailing character of word for which
** isIgnoreLastChar() is true (e.g. both the "." and the "'" in
** "qwerty'."), repeating until the last character is a keeper or the
** word is empty. The word is modified in place.
**
** The scan stops at the start of the string, so an empty word or a
** word made up only of ignorable characters no longer touches the byte
** in front of the buffer.
*/
void stripIgnoreLastChars(char *word)
{
    int i;

    /* Find the end of the word */
    for (i = 0; word[i] != '\0'; i++)
        ;

    /* Iteratively strip off the last character if it's an ignore character */
    while (i > 0 && isIgnoreLastChar(word[i - 1]))
        word[--i] = '\0';
}
- int isIgnoreLastChar(char c)
- {
- int i;
-
- /* Returns TRUE if the character is a member of ignorelastchar,
- ** FALSE otherwise.
- */
- for (i=0; ignorelastchar[i] != ' '; i++)
- {
- if (c == ignorelastchar[i])
- return 1;
- }
-
-
- return 0;
- }
/* Removes every leading character of word for which
** isIgnoreFirstChar() is true by shifting the rest of the word to the
** front of the buffer. The word is modified in place; it may end up
** empty.
*/
void stripIgnoreFirstChars(char *word)
{
    char *src = word;
    char *dst = word;

    /* Keep going until a char not to ignore is found */
    while (isIgnoreFirstChar(*src))
        src++;

    /* If nothing has to be dropped, just return */
    if (src == word)
        return;

    while (*src != '\0')
        *dst++ = *src++;
    *dst = '\0';
}
- int isIgnoreFirstChar(char c)
- {
- int i;
-
- for (i=0; ignorefirstchar[i] != ' '; i++)
- if (c == ignorefirstchar[i])
- return 1;
-
- return 0;
- }
/* Tells whether the character most recently stored in tag is escaped.
**
** tag  buffer being filled
** j    number of characters stored so far, so the newest character is
**      tag[j - 1] and the one before it is tag[j - 2]
**
** Returns 0 when the character before the newest one is a backslash,
** 1 otherwise (including when fewer than two characters are stored).
*/
int notEscaped( char *tag, int j)
{
    if (j > 1 && tag[j - 2] == '\\')
        return 0;

    return 1;
}