搜索引擎

开发平台：
Perl

index.c：源码内容
							/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**--------------------------------------------------------------------
** ** ** PATCHED 5/13/96, CJC
**
** Added code to countwords and countwordstr to disregard the last char
** if required by the config.h
** G. Hill  3/12/97  ghill@library.berkeley.edu
**
** Changed addentry, countwords, countwordstr, parsecomment, printindex
** added createMetaEntryList, getMeta, parseMetaData
** to support METADATA
** G. Hill 3/18/97 ghill@library.berkeley.edu
**
** Changed removestops to support printing of stop words
** G. Hill 4/7/97
**
** Changed countwords, countwrdstr, and parseMetaData to disreguard the
** first char if required by the config.h
** G.Hill 10/16/97  ghill@library.berkeley.edu
**
** Added stripIgnoreLastChars and isIgnoreLastChar routines which iteratively
** remove all ignore characters from the end of each word.
** P. Bergner  10/5/97  bergner@lcse.umn.edu
**
** Added stripIgnoreFirstChars and isIgnoreFirstChar to make stripping of
** the ignore first chars iterative.
** G. Hill 11/19/97 ghill@library.berkeley.edu
**
** Added possibility of use of quotes and brackets in meta CONTENT countwords, parsemetadata
** G. Hill 1/14/98
**
** Added regex for replace rule G.Hill 1/98
*/
#include "swish.h"
#include "index.h"
#include "hash.h"
#include "mem.h"
#include "string.h"
#include "check.h"
#include "search.h"
#include "docprop.h"
#include "stemmer.h"
/* Inserts a (filename, title) pair into the binary tree rooted at e and
** returns the root of the tree (the new node itself when e is NULL).
** Nodes are ordered by strcmp() on the file name; a name that does not
** sort before an existing node (equal names included) goes to that
** node's right subtree. Both strings are copied with mystrdup().
*/
struct sortentry *addsortentry(e, filename, title)
struct sortentry *e;
char *filename;
char *title;
{
	struct sortentry **slot;
	struct sortentry *node;

	/* Walk down to the empty link where the new node belongs. */
	slot = &e;
	while (*slot != NULL) {
		if (strcmp((*slot)->filename, filename) > 0)
			slot = &(*slot)->left;
		else
			slot = &(*slot)->right;
	}

	node = (struct sortentry *) emalloc(sizeof(struct sortentry));
	node->filename = (char *) mystrdup(filename);
	node->title = (char *) mystrdup(title);
	node->left = NULL;
	node->right = NULL;
	*slot = node;

	return e;
}
/* Adds a word to the master index tree.
*/
struct entry *addentry(e, word, filenum, emphasized, structure, metaName)
struct entry *e;
char *word;
int filenum;
int emphasized;
int structure;
int metaName;
{
	int isbigger;
	struct location *tp, *oldtp;
	
	if (e == NULL) {
		e = (struct entry *) emalloc(sizeof(struct entry));
		e->word = (char *) mystrdup(word);
		e->tfrequency = 1;
		e->locationlist = (struct location *)
			emalloc(sizeof(struct location));
		e->locationlist->filenum = filenum;
		e->locationlist->frequency = 1;
		e->locationlist->emphasized = emphasized;
		e->locationlist->structure = structure;
		e->locationlist->metaName = metaName;
		e->locationlist->next = NULL;
		e->left = e->right = NULL;
		totalwords++;
	}
	else {
		isbigger = wordcompare(e->word, word);
		if (isbigger == 0) {
			tp = e->locationlist;
			while (tp != NULL) {
				if (tp->filenum == filenum && tp->metaName == metaName) 
					break;
				oldtp = tp;
				tp = tp->next;
			}
			if (tp == NULL) {
				tp = (struct location *) emalloc(sizeof(struct
					location));
				tp->filenum = filenum;
				tp->frequency = 1;
				tp->emphasized = emphasized;
				tp->structure = structure;
				tp->metaName = metaName;
				tp->next = NULL;
				oldtp->next = tp;
				if (!emphasized)
					e->tfrequency = e->tfrequency + 1;
			}
			else {
				if ((tp->filenum == filenum) && (tp->metaName == metaName)) 
				{
					tp->frequency++;
					if (emphasized)
						tp->emphasized++;
					tp->structure |= structure;
				}
			}
		}
		else if (isbigger > 0)
		{
			e->left = (struct entry *)
			addentry(e->left, word, filenum, emphasized, structure, metaName);
		}
		else
		{
			e->right = (struct entry *)
			addentry(e->right, word, filenum, emphasized, structure, metaName);
		}
	}
	
	return e;
}
/* Appends a new file record (copies of filename and title, plus size)
** to the master file list and returns the head of that list.
**
** filep        - current head of the list, or NULL when it is empty.
** newFileEntry - when non-NULL, receives a pointer to the new record.
**
** The tail of the list is remembered in a function-local static, so
** successive calls append to the record created by the previous call.
*/
struct file *addtofilelist(filep, filename, title, size, newFileEntry)
struct file *filep;
char *filename;
char *title;
int size;
struct file ** newFileEntry;
{
	static struct file *tail = NULL;	/* record added by the previous call */
	struct file *node;

	node = (struct file *) emalloc(sizeof(struct file));
	if (newFileEntry != NULL)
		*newFileEntry = node;	/* hand the new record back to the caller */

	node->filename = (char *) mystrdup(filename);
	node->title = (char *) mystrdup(title);
	node->size = size;
	node->next = NULL;
#ifdef SUPPORT_DOC_PROPERTIES
	node->docProperties = NULL;
#endif

	if (filep != NULL) {
		if (tail != NULL)
			tail->next = node;
	}
	else {
		filep = node;
	}
	tail = node;

	return filep;
}
/* Returns the number of records in the file list starting at filep
** (0 for an empty list).
*/
int getfilecount(filep)
struct file *filep;
{
	int count = 0;

	while (filep != NULL) {
		count++;
		filep = filep->next;
	}

	return count;
}
/* Formats the time reported by getthetime() as local time in the form
** "dd/mm/yy HH:MM:SS ZONE" and returns it.
** The result lives in a static buffer that is overwritten by the next
** call.
*/
char *getthedate()
{
	static char stamp[MAXSTRLEN];
	struct tm *local;
	time_t now;

	now = (time_t) getthetime();
	local = (struct tm *) localtime(&now);
	strftime(stamp, MAXSTRLEN, "%d/%m/%y %H:%M:%S %Z", local);

	return stamp;
}
/* Indexes all the words in a file and adds the appropriate information
** to the appropriate structures.
*/
int countwords(vp, filename, title, indextitleonly)
void *vp;
char *filename;
char *title;
int indextitleonly;
{
	int c, i, j, inword, ftotalwords, emphasized, structure;
	int metaName;
	static int filenum;
	char word[MAXWORDLEN], tag[MAXSTRLEN];
	struct file *thisFileEntry = NULL;
	
	ftotalwords = 0;
	if (indextitleonly) {
		filelist = addtofilelist(filelist, filename, title, vsize(vp), NULL);
		filenum++;
		if (!(filenum % 128))
			filenum++;
		addtofwordtotals(filenum, 100);
		return (countwordstr(title, filenum, 0));
	}
	
	filelist = addtofilelist(filelist, filename, title, vsize(vp), &thisFileEntry);
	filenum++;
	if (!(filenum % 128))
		filenum++;
	
	c = 1;
	i = j = 0;
	inword = 0;
	emphasized = 0;
	structure = 1;
	metaName = 1;
	
	while (c != EOF && (c = vgetc(vp)) != EOF) 
	{
		if (!inword) {
			if (iswordchar((char)c)) {
				i = 0;
				word[i++] = c;
				if (i == MAXWORDLEN)
					i--;
				inword = 1;
			}
		}
		else if (inword) 
		{
			if (!iswordchar((char)c)) 
			{
				word[i++] = '';
				if (i == MAXWORDLEN)
					word[--i] = '';
				
				/* Move this stuff after entities are converted
				for (i = 0; word[i]; i++)
				word[i] = tolower(word[i]);
				i = 0;
				*/
				if (isokword(word))
				{
					strcpy(word, (char *) convertentities(word));
				}
				
				/* Ok, can now go to lowercase, the whole problem
				was with entities &Aacute; would become &aacute;
				*/
				for (i = 0; word[i]; i++)
					word[i] = tolower(word[i]);
				i = 0;
				
				/* Get rid of the last char's */
				stripIgnoreLastChars(word);
				
				/* Get rid of the first char */
				stripIgnoreFirstChars(word);
				
				if (applyStemmingRules)
				{
					/* apply stemming algorithm to the word to index */
					Stem(word);
				}
				/* Sorry, have to do isokword() twice to filter out converted strings! */
				
				if (hasokchars(word) && isokword(word)) 
				{
					#ifdef DEBUG
					printf("	%s %dn", word, structure);
					#endif
					entrylist = (struct entry *)
						addentry(entrylist, word, filenum, emphasized,  structure, metaName);
					ftotalwords++;
				}
				inword = 0;
			}
			else {
				word[i++] = c;
				if (i == MAXWORDLEN)
					i--;
			}
		}
		if (c == '<' && !INDEXTAGS) {
			j = 0;
			while ((c = vgetc(vp)) != EOF) {
				tag[j++] = c;
				if (j == MAXSTRLEN)
					j--;
				if (c == '>' && notEscaped(tag,j) ) {
					if (j)
						tag[--j] = '';
					else
						tag[j] = '';
#ifdef DEBUG
					printf("t: %sn", tag);
#endif
					structure = getstructure(tag,
						structure);
#ifdef DEBUG
					printf("s: %dn", structure);
#endif
					if ((tag[0]=='!') && 
						lstrstr(tag,"META") && 
						(lstrstr(tag,"START") ||
						lstrstr(tag,"END") ) ) 
					{
						if (lstrstr(tag, "START")) 
						{
							metaName=getMeta(tag, NULL);
							/* If there is not a legal metaName
							** the program quits into getMeta
							*/
						}
						else if (lstrstr(tag, "END") ) 
						{
							metaName = 1;
						}
					}
					else if ( (tag[0] != '!') &&
						(lstrstr(tag, "META")) &&
						(lstrstr(tag,"NAME")) &&
						(lstrstr(tag,"CONTENT")) )
					{
						ftotalwords += parseMetaData(tag, filenum, structure, thisFileEntry);
					}
					else if ( (tag[0] == '!') && indexComments)
					{
						ftotalwords += parsecomment(tag, filenum, structure,1);
					}
					
					if ((structure & IN_HEADER) ||
						(structure & IN_TITLE))
						emphasized = 5;
					else
						emphasized = 0;
					
					break;
				}
			}
		}
	}
	addtofwordtotals(filenum, ftotalwords);
	return ftotalwords;
}
/* Indexes the words in a string, such as a file name or an
** HTML title.
*/
int countwordstr(s, filenum, emphasized)
char *s;
int filenum;
int emphasized;
{
	int i, j, inword, wordcount;
	char c, word[MAXWORDLEN], tmpstr[MAXFILELEN];
	
	sprintf(tmpstr, "%s ", s);
	for (j = inword = wordcount = 0; (c = tmpstr[j]) != ''; j++) {
		if (!inword) {
			if (iswordchar(c)) {
				i = 0;
				word[i++] = c;
				if (i == MAXWORDLEN)
					i--;
				inword = 1;
			}
		}
		else {
			if (!iswordchar(c)) {
				wordcount++;
				word[i] = '';
				
				/* Same as above, move after convertentities
				for (i = 0; word[i]; i++)
				word[i] = tolower(word[i]);
				*/
				
				if (isokword(word))
					strcpy(word, (char *)
					convertentities(word));
				
				/* Ok to go lower now */
				for (i = 0; word[i]; i++)
					word[i] = tolower(word[i]);
				
				/* Get rid of specified last char's */
				stripIgnoreLastChars (word);
				
				/* Get rid of the first char */
				stripIgnoreFirstChars(word);
				
				if (hasokchars(word) && isokword(word))
					entrylist = (struct entry *)
					addentry(entrylist, word,
					filenum, emphasized, IN_FILE, 1);
				inword = 0;
			}
			else {
				word[i++] = c;
				if (i == MAXWORDLEN)
					i--;
			}
		}
	}
	
	return wordcount;
}
/* This returns the value corresponding to the HTML structures
** a word is in.
*/
int getstructure(tag, structure)
char *tag;
int structure;
{
        int len;
        char oldChar = 0;
        char* endOfTag = NULL;
        char* pos;
	pos = tag;
        while (*pos)
	  {
                if (isspace(*pos))
		  {
                        endOfTag = pos; /* remember where we are... */
                        oldChar = *pos; /* ...and what we saw */
                        *pos = '';    /* truncate string, for now */
		      }
		else 
		  pos++;
	      }
        if (ourstricmp(tag, "/title") == 0)
                structure &= ~IN_TITLE;
        else if (ourstricmp(tag, "title") == 0)
                structure |= IN_TITLE;
        else if (ourstricmp(tag, "/head") == 0)
                structure &= ~IN_HEAD;
        else if (ourstricmp(tag, "head") == 0)
                structure |= IN_HEAD;
        else if (ourstricmp(tag, "/body") == 0)
                structure &= ~IN_BODY;
        else if (ourstricmp(tag, "body") == 0)
                structure |= IN_BODY;
        else if (tag[0] == '/' && tolower(tag[1]) == 'h' && isdigit(tag[2]))
                structure &= ~IN_HEADER;
        else if (tolower(tag[0]) == 'h' && isdigit(tag[1]))
                structure |= IN_HEADER;
        else if ((ourstricmp(tag, "/em") == 0) || 
		 (ourstricmp(tag, "/strong") == 0))
                structure &= ~IN_EMPHASIZED;
        else if ((ourstricmp(tag, "em") == 0) ||
		 (ourstricmp(tag, "strong") == 0))
                structure |= IN_EMPHASIZED;
        else if ((ourstricmp(tag, "b") == 0) || (ourstricmp(tag, "i") == 0))
                structure |= IN_EMPHASIZED;
        else if ((ourstricmp(tag, "/b") == 0) || 
		 (ourstricmp(tag, "/i") == 0))
                structure &= ~IN_EMPHASIZED;
        if (endOfTag != NULL)
	  {
                *endOfTag = oldChar;
	      }
        return structure;
}
/* Parses the words in a comment.
*/
int parsecomment(tag, filenum, structure, metaName)
char *tag;
int filenum;
int structure;
int metaName;
{
	int i, j, inword, wordcount, emphasized;
	char c, word[MAXWORDLEN];
	
	if (EMPHASIZECOMMENTS)
		emphasized = 5;
	else
		emphasized = 0;
	structure |= IN_COMMENTS;
	for (j = 1, inword = wordcount = 0; (c = tag[j]) != ''; j++) {
		if (!inword) {
			if (iswordchar(c)) {
				i = 0;
				word[i++] = c;
				if (i == MAXWORDLEN)
					i--;
				inword = 1;
			}
		}
		else {
			if (!iswordchar(c)) {
				wordcount++;
				word[i] = '';
				for (i = 0; word[i]; i++)
					word[i] = tolower(word[i]);
				if (isokword(word))
					strcpy(word, (char *)
					convertentities(word));
				if (hasokchars(word) && isokword(word))
				{
					if (applyStemmingRules)
					{
						/* apply stemming algorithm to the word to index */
						Stem(word);
					}
					entrylist = (struct entry *)
						addentry(entrylist, word, filenum, emphasized, structure, metaName);
				}
				inword = 0;
			}
			else {
				word[i++] = c;
				if (i == MAXWORDLEN)
					i--;
			}
		}
	}
	
	return wordcount;
}
/* Walks the index tree in order (left, node, right) and marks as
** stopwords every word whose location-record count is at least flimit
** and amounts to at least plimit percent of totalfiles. Marking means
** calling addStopList() and addstophash() with the word; nothing is
** removed from the tree here.
** Returns the number of words marked.
*/
int removestops(ep, totalfiles, plimit, flimit)
struct entry *ep;
int totalfiles;
int plimit;
int flimit;
{
	struct location *loc;
	int nlocations, pct, marked;

	if (ep == NULL)
		return 0;

	marked = removestops(ep->left, totalfiles, plimit, flimit);

	/* One location record exists per (file, metaName) pair. */
	nlocations = 0;
	for (loc = ep->locationlist; loc != NULL; loc = loc->next)
		nlocations++;

	pct = (int) (((float) nlocations / (float) totalfiles) * 100.0f);
	if (pct >= plimit && nlocations >= flimit) {
		addStopList(ep->word);
		addstophash(ep->word);
		marked++;
	}

	marked += removestops(ep->right, totalfiles, plimit, flimit);
	return marked;
}
/* Computes the rank stored in the index for one word in one file.
**
** freq       - occurrences of the word in the file (values below 5 are
**              treated as 5).
** tfreq      - the word's total frequency across the index.
** words      - total number of words in the file.
** emphasized - emphasis multiplier; 0 means no emphasis.
**
** rank = (ln(freq) + 10) / tfreq, divided by words (or by 100 when
** ignoreTotalWordCountWhenRanking is set, which makes small differences
** in word frequency wash out), then scaled by 10000 and truncated.
** A result below 1 becomes 1, an emphasized rank is multiplied by
** emphasized, and a rank divisible by 128 is bumped up by one so that
** the compression scheme, which uses '\0' as a delimiter, keeps working.
*/
int getrank(freq, tfreq, words, emphasized)
int freq;
int tfreq;
int words;
int emphasized;
{
	double inverse, weight, scaled;
	int rank;

	if (freq < 5)
		freq = 5;

	inverse = 1.0 / (double) tfreq;
	weight = (log((double) freq) + 10.0) * inverse;
	if (ignoreTotalWordCountWhenRanking)
		weight /= 100;
	else
		weight /= words;
	scaled = weight * 10000.0;

	rank = (int) scaled;
	if (rank <= 0)
		rank = 1;
	if (emphasized)
		rank *= emphasized;
	if (rank % 128 == 0)
		rank++;

	return rank;
}
/* Prints the index information at the head of index files.
*/
void printheader(fp, filename, totalwords, totalfiles, merged)
FILE *fp;
char *filename;
int totalwords;
int totalfiles;
int merged;
{
	char *c;
	
	c = (char *) strrchr(filename, '/');
	
	fprintf(fp, "%sn", INDEXHEADER);
	fprintf(fp, "%sn", INDEXVERSION);
	fprintf(fp, "# %sn", (merged) ? "MERGED INDEX" : "");
	fprintf(fp, "# Name: %sn", (indexn[0] == '') ? "(no name)" :
	indexn);
	fprintf(fp, "# Saved as: %sn", 
				(c == NULL && c + 1 != '') ? filename : c + 1);
	fprintf(fp, "# Counts: ");
	if (totalwords)
		fprintf(fp, "%d words%s", totalwords, (totalfiles) ? ", " : "");
	if (totalfiles)
		fprintf(fp, "%d files", totalfiles);
	fprintf(fp, "n");
	fprintf(fp, "# Indexed on: %sn", getthedate());
	fprintf(fp, "# Description: %sn", (indexd[0] == '') ?
		"(no description)" : indexd);
	fprintf(fp, "# Pointer: %sn", (indexp[0] == '') ?
		"(no pointer)" : indexp);
	fprintf(fp, "# Maintained by: %sn", (indexa[0] == '') ?
		"(no maintainer)" : indexa);
	#ifdef SUPPORT_DOC_PROPERTIES
	fprintf(fp, "# DocumentProperties: %sn", "Enabled");
	#endif
	fprintf(fp, "%s %dn", STEMMINGHEADER, applyStemmingRules);
}
/* Diagnostics helper: returns the file name of the filenum-th record
** (counting from 1) of the global file list, or "" when the list ends
** before that record is reached.
*/
char* getFileNameByFileNum(int filenum)
{
	struct file *node;

	for (node = filelist; node != NULL; node = node->next) {
		if (--filenum == 0)
			return node->filename;
	}

	return "";
}
/* Print the index entries that hold the word, rank, and other information.
*/
void printindex(ep, fp)
struct entry *ep;
FILE *fp;
{
	int i, rank;
	struct location *lp;
	
	if (ep != NULL) {
		printindex(ep->left, fp);
		if (!isstopword(ep->word)) {
			
			for (i = 0; indexchars[i] != ''; i++)
				if ((ep->word)[0] == indexchars[i] &&
				!offsets[i])
				offsets[i] = ftell(fp);
			
			fprintf(fp, "%s:", ep->word);
			lp = ep->locationlist;
			while (lp != NULL) {
				int totalWords;
				totalWords = gettotalwords(lp->filenum);
				rank = getrank(lp->frequency, ep->tfrequency,
					totalWords, lp->emphasized);
				if (verbose == 4)
				{
					printf("%st%st%dt%dt%dt%dt%dt%dt%dn",
						getFileNameByFileNum(lp->filenum), 
						ep->word, 
						lp->emphasized, 
						rank, 
						lp->frequency, 
						ep->tfrequency, 
						totalWords,
						lp->structure,
						lp->metaName);
				}
				compress(lp->filenum, fp);
				compress(rank, fp);
				compress(lp->structure, fp);
				compress(lp->metaName, fp);
				lp = lp->next;
			}
			fputc(0, fp);
			
		}
		printindex(ep->right, fp);
	}
}
/* Prints the list of stopwords into the index file.
** The position where the list starts is stored in offsets[STOPWORDPOS].
** Every word in the stopword hash table is written followed by a blank,
** and the list is closed with a newline.
*/
void printstopwords(fp)
FILE *fp;
{
	int hashval;
	struct swline *sp;

	offsets[STOPWORDPOS] = ftell(fp);
	for (hashval = 0; hashval < HASHSIZE; hashval++) {
		for (sp = hashstoplist[hashval]; sp != NULL; sp = sp->next)
			fprintf(fp, "%s ", sp->line);
	}
	fprintf(fp, "\n");
}
void writeFileEntry(filep, fp)
     struct file *filep;
     FILE *fp;
{
	fprintf(fp, "%s "%s" %dn", 
		ruleparse(filep->filename),
		filep->title, filep->size);
	#ifdef SUPPORT_DOC_PROPERTIES
storeDocProperties(filep->docProperties, fp);
	freeDocProperties(&filep->docProperties);
	#endif
}
/* Reads one file record line from fp into line, which must have room
** for MAXSTRLEN characters. At end of file or on a read error line is
** set to the empty string instead of being left undefined.
** When docProperties is non-NULL, *docProperties is reset to NULL first;
** with document properties compiled in, the properties section that
** follows the line is then read (or skipped) by fetchDocProperties().
*/
void readFileEntry(fp, line, docProperties)
     FILE* fp;
     char* line;
     struct docPropertyEntry **docProperties;
{
	if (fgets(line, MAXSTRLEN, fp) == NULL)
		line[0] = '\0';
	if (docProperties != NULL)
		*docProperties = NULL;
#ifdef SUPPORT_DOC_PROPERTIES
	/* read (or skip over) the document properties section  */
	fetchDocProperties(docProperties, fp);
#endif
}
/* Writes every record of the file list into the index file.
** The position where the list starts is stored in offsets[FILELISTPOS].
** Before each record is written its position is registered with
** addtofilehashlist() under a running number that starts at 0.
*/
void printfilelist(filep, fp)
struct file *filep;
FILE *fp;
{
	int filenum;

	offsets[FILELISTPOS] = ftell(fp);
	for (filenum = 0; filep != NULL; filep = filep->next, filenum++) {
		addtofilehashlist(filenum, ftell(fp));
		writeFileEntry(filep, fp);
	}
}
/* Prints the list of metaNames into the file index
** The position where the list starts is stored in offsets[METANAMEPOS].
** Names are separated by blanks and the list ends with a newline.
** With document properties compiled in, a property name is followed by
** a double quote and a style digit:
**     <name>"1   -> doc property name only
**     <name>"2   -> both meta name and doc property
** A normal meta name (style 0) is written without a suffix.
*/
void printMetaNames(fp)
FILE *fp;
{
	struct metaEntry* entry;
	char docPropStyle[20];

	offsets[METANAMEPOS] = ftell(fp);
	for (entry = metaEntryList; entry; entry = entry->next)
	{
		docPropStyle[0] = '\0';
#ifdef SUPPORT_DOC_PROPERTIES
		if (entry->isDocProperty)
		{
			sprintf(docPropStyle, "\"%d", (entry->isOnlyDocProperty) ? 1 : 2);
		}
#endif
		fprintf(fp, "%s%s ", entry->metaName, docPropStyle);
	}
	fprintf(fp, "\n");
}
/* Prints the list of file offsets into the index file.
 */
void printfileoffsets(fp)
FILE *fp;
{
	int i;
	
	offsets[FILEOFFSETPOS] = ftell(fp);
	for (i = 0; getfilenum(i) != 0; i++)
		fprintf(fp, "%016li", getfilenum(i));
	fprintf(fp,"n");
}
/* Takes a number and prints it to a file using the simple
** accordion scheme of storing numbers.
**
** The number is split into 7-bit groups which are written most
** significant group first; every byte except the last has its high bit
** set. Nothing is written for num == 0. num is expected to be positive.
**
** The output loop must stop after group 0: the former "i-- >= 0" test
** ran one extra time, reading s[-1] and writing a stray byte.
*/
void compress(num, fp)
int num;
FILE *fp;
{
	int i, r;
	char s[8];

	i = 0;
	while (num) {
		r = num % 128;
		num /= 128;
		s[i++] = r;
	}
	while (i-- > 0)
		fputc(s[i] | (i ? 128 : 0), fp);
}
/* Prints out the decompressed values in an index file.*/
void decompress(fp)
FILE *fp;
{
	int c, x, inword, fieldnum;
	long pos;
	char line[MAXSTRLEN], header[MAXHEADCHARS + 1];
	
	readoffsets(fp);
	if (verbose == 4)
	{
		readfileoffsets(fp);
	}
	fseek(fp, 0, 0);
	inword = 1;
	fieldnum = 0;
	
	while (1) {
		c = fgetc(fp);
		ungetc(c, fp);
		if (c == '#') {
			fgets(line, MAXSTRLEN, fp);
			printf("%s", line);
			continue;
		}
		else {
			fgets(header, MAXHEADCHARS + 1, fp);
			printf("%s", header);
			break;
		}
	}
	
	while ((c = fgetc(fp)) != EOF) 
	{
		if (c == ':' && inword) {
			inword = 0;
			putchar(c);
		}
		if (inword)
			putchar(c);
		else {
			x = 0;
			do {
				c = fgetc(fp);
				pos = ftell(fp);
				if (pos == offsets[STOPWORDPOS]) {
					putchar('n');
					while (fgets(line, MAXSTRLEN, fp) != NULL)
					{
						printf("%s", line);
					}
					return;
				}
				if (c == 0) {
					putchar('n');
					inword = 1;
					break;
				}
				x *= 128;
				x += c & 127;
			} while (c & 128);
			if (x)
			{
				if (verbose == 4)
				  {
				    if (fieldnum == 0)
				      {
					char* filename;
					char* junk;
	
					pos = ftell(fp);
					filename = lookupfile(x, fp, NULL);
					junk = strchr(filename, '"');
					*(junk-1) = '';	
					printf(" %s", filename);
					fseek(fp, pos, 0);
				      }
				    else if (fieldnum == 1)
				      {
					
					printf(" %d", x);
				      }
				    else if (fieldnum == 2)
				      {
				      }
				    else if (fieldnum == 3)
				      {
				      }
				  }
				else
				  {
					printf(" %d", x);
				  }
				
				fieldnum++;
				if (fieldnum == 4)
				  fieldnum = 0;
			}
		}
	}
}
/* Parses lines according to the ReplaceRules directives.
**
** replacelist is read as a flat sequence of tokens: a rule keyword
** ("replace", "append", "prepend" or "remove", matched with lstrstr)
** followed by its argument(s). The rules are applied to line in order.
**
** Returns line itself when there are no rules, otherwise a pointer to a
** static buffer holding the rewritten text (overwritten by the next
** call).
**
** Changes from the earlier version:
** - "remove" now consumes its pattern token; before, the pattern was
**   read again as if it were the next rule keyword.
** - a token that is not a known keyword is skipped without copying the
**   stale contents of the work buffer over the result.
** - the test "rule == NULL" on a local array could never be true and
**   was dropped.
**
** NOTE(review): a "replace" whose second argument is the empty string
** is only recognised when it is the last rule in the list; anywhere
** else the next token is taken as the replacement. Copies are not
** bounds-checked against MAXSTRLEN.
*/
char *ruleparse(line)
char *line;
{
	char rule[MAXSTRLEN];
	static char tmpline[MAXSTRLEN], newtmpline[MAXSTRLEN];
	static char line1[MAXSTRLEN], line2[MAXSTRLEN];
	struct swline *tmplist;
	int applied;

	if (replacelist == NULL)
		return line;

	tmplist = replacelist;
	strcpy(tmpline, line);
	while (tmplist != NULL)
	{
		strcpy(rule, tmplist->line);
		tmplist = tmplist->next;
		if (tmplist == NULL)
			break;	/* keyword without an argument: stop */

		applied = 1;
		if (lstrstr(rule, "replace")) {
			strcpy(line1, tmplist->line);
			tmplist = tmplist->next;
			if (tmplist)
			{
				strcpy(line2, tmplist->line);
				tmplist = tmplist->next;
			}
			else
			{
				/* Handle case where 2nd part of replace rule
				** is an empty string. Config-file parsing
				** idiosyncrasies cause a replace of "x" to ""
				** to incompletely represent the rule.
				*/
				line2[0] = '\0';
			}
			strcpy(newtmpline, (char *) matchAndChange(tmpline,
				line1, line2));
		}
		else if (lstrstr(rule, "append")) {
			sprintf(newtmpline, "%s%s", tmpline,
				tmplist->line);
			tmplist = tmplist->next;
		}
		else if (lstrstr(rule, "prepend")) {
			sprintf(newtmpline, "%s%s", tmplist->line,
				tmpline);
			tmplist = tmplist->next;
		}
		else if (lstrstr(rule, "remove")) {
			strcpy(newtmpline, (char *) matchAndChange(tmpline, tmplist->line, ""));
			tmplist = tmplist->next;
		}
		else {
			applied = 0;
		}

		if (applied)
			strcpy(tmpline, newtmpline);
	}

	return tmpline;
}
/* Get the MetaData index when the whole tag is passed */
int getMeta(tag, docPropName)
char* tag;
int* docPropName;
{
	char* temp;
	char word[MAXWORDLEN];
	int i;
	struct metaEntry* list;
	
	if (docPropName != NULL)
	{
		*docPropName = 0;
	}
	
	i = 0;
	temp = (char*) lstrstr((char*)tag,(char*) "NAME");
	if (temp == NULL)
		return 1;
	
	temp += strlen("NAME");
	
	/* Get to the '=' sign disreguarding blanks */
	while (temp != NULL) {
		if (strncmp(temp, "=",1))
			temp++;
		else {
			temp++;
			break;
		}
	}
	
	/* Get to the beginning of the word disreguarding blanks and quotes */
	while (temp != NULL) {
		if (!strncmp(temp," ",1) || !strncmp(temp,""",1) )
			temp++;
		else
			break;
	}
	
	/* Copy the word and convert to lowercase */
	while (temp !=NULL && strncmp(temp," ",1) 
		&& strncmp(temp,""",1) && i<= MAXWORDLEN ) {
		word[i] = *temp++;
		word[i] = tolower(word[i]);
		i++;
	}
	
	if (i == MAXWORDLEN)
		word[--i] = '';
	else
		word[i] = '';
	
	for (list = metaEntryList; list != NULL; list = list->next)
	{
		if (!strcmp(list->metaName, word) )
		{
			#ifdef SUPPORT_DOC_PROPERTIES
			if ((docPropName != NULL) && (list->isDocProperty))
			{
				*docPropName = list->index;
			}
			if (list->isOnlyDocProperty)
			{
				/* property is not for indexing, so return generic metaName value */
				return 1;
			}
			#endif
			return list->index;
		}
	}
	
	/* If it is ok not to have the name listed, just index as no-name */
	if (OKNOMETA) {
		/*    printf ("nwarning: metaName %s does not exiest in the user config file", word); */
		return 1;
	}
	else {
		printf ("nerr: INDEXING FAILUREn");
		printf ("err: The metaName %s does not exist in the user config filen", word);
		exit(0);
	}
	
}
/* Parses the Meta tag */
int parseMetaData(tag, filenum, structure, thisFileEntry)
char* tag;
int filenum;
int structure;
struct file* thisFileEntry;
{
	int metaName, j, i, inword, wordcount, emphasized, jstart;
	char* temp;
	char c, word[MAXWORDLEN];
	int docPropName = 0;
	
	wordcount = 0;
	temp = NULL;
	metaName= getMeta(tag, &docPropName);
	temp = (char*) lstrstr((char*) tag,(char*) "CONTENT");
	
	/* if there is no  CONTENT is another tag so just ignore the whole thing
	* the check is done here istead of before because META tags do not have
	* a fixed length that can be checked
	*/
	if (temp != NULL)
    {
		temp += strlen("CONTENT");
		
		/* Get to the " sign disreguarding blanks */
		while (temp != NULL) {
			if (strncmp(temp, """,1))
				temp++;
			else {
				temp++;
				break;
			}
		}
		
		jstart = strlen(tag) - strlen(temp);
		
		if (EMPHASIZECOMMENTS)
			emphasized = 5;
		else
			emphasized = 0;
		
		#ifdef SUPPORT_DOC_PROPERTIES
		if (docPropName)
		{
			temp = strchr(tag + jstart, '"'); /* first quote after start of CONTENT */
			if (temp != NULL)
			{
				*temp = '';	/* terminate CONTENT, temporarily */
				addDocProperty(&thisFileEntry->docProperties, docPropName, tag+jstart);
				*temp = '"';	/* restore string */
			}
		}
		#endif
		
		for (j = jstart, inword = wordcount = 0;(c = tag[j]) != ''; j++) {
			if (!inword) {
				if (iswordchar(c)) {
					i = 0;
					word[i++] = c;
					if (i == MAXWORDLEN)
						i--;
					inword = 1;
				}
			}
			else {
				if (!iswordchar(c)) {
					wordcount++;
					word[i] = '';
					for (i = 0; word[i]; i++)
						word[i] = tolower(word[i]);
					if (isokword(word))
						strcpy(word, (char *)
						convertentities(word));
					/* Get rid of the last specified char's */
					stripIgnoreLastChars(word);
					
					/* Get rid of the first char */
					stripIgnoreFirstChars(word);
					
					if (applyStemmingRules)
					{
						/* apply stemming algorithm to the word to index */
						Stem(word);
					}
					
					if (hasokchars(word) && isokword(word))
						entrylist = (struct entry *)
						addentry(entrylist, word,
						filenum, emphasized, 
						structure, metaName);
					inword = 0;
					if (c == '"' && tag[j-1] != '\')
						break;
				}
				else {
					word[i++] = c;
					if (i == MAXWORDLEN)
						i--;
				}
			}
		}
    }
	return wordcount;
}
/*  These 2 routines fix the problem when a word ends with multiple
**  IGNORELASTCHAR's (eg, qwerty'. ).  The old code correctly deleted
**  the ".", but didn't check if the new last character ("'") is also
**  an ignore character.
**
**  stripIgnoreLastChars removes, in place, every trailing character of
**  word for which isIgnoreLastChar() is true. The loop is bounded at the
**  start of the string, so an empty word or a word made up entirely of
**  ignore characters no longer touches word[-1].
*/
void stripIgnoreLastChars(char *word)
{
	int i;

	/* Find the end of the word */
	for (i = 0; word[i] != '\0'; i++)
		;
	/* Iteratively strip off the last character if it's an ignore character */
	while (i > 0 && isIgnoreLastChar(word[i - 1]))
		word[--i] = '\0';
}
/*  Returns 1 if the character is a member of ignorelastchar,
**  0 otherwise. The string terminator is never a member.
*/
int isIgnoreLastChar(char c)
{
	int i;

	for (i = 0; ignorelastchar[i] != '\0'; i++)
	{
		if (c == ignorelastchar[i])
			return 1;
	}

	return 0;
}
/*  Removes, in place, every leading character of word for which
**  isIgnoreFirstChar() is true by shifting the rest of the word to the
**  front. The scan ends at the string terminator because
**  isIgnoreFirstChar() is never true for it.
*/
void stripIgnoreFirstChars(char *word)
{
	int j, k;
	int i = 0;

	/* Keep going until a char not to ignore is found */
	while (isIgnoreFirstChar(word[i]))
		i++;

	/* If all the char's are valid, just return */
	if (0 == i)
		return;

	for (k = i, j = 0; word[k] != '\0'; j++, k++)
	{
		word[j] = word[k];
	}
	/* Add the terminator */
	word[j] = '\0';
}
/*  Returns 1 if the character is a member of ignorefirstchar,
**  0 otherwise. The string terminator is never a member.
*/
int isIgnoreFirstChar(char c)
{
	int i;

	for (i = 0; ignorefirstchar[i] != '\0'; i++)
	{
		if (c == ignorefirstchar[i])
			return 1;
	}

	return 0;
}
/*  Tells whether the character most recently stored in tag is escaped.
**
**  tag - buffer being filled.
**  j   - number of characters stored so far, so the most recent one is
**        tag[j - 1] and the one before it is tag[j - 2].
**
**  Returns 0 when j > 1 and tag[j - 2] is a backslash, 1 otherwise.
*/
int notEscaped( char *tag, int j)
{
	if (j > 1 && tag[j - 2] == '\\')
		return 0;

	return 1;
}