swish.h
上传用户:qin5330
上传日期:2007-01-05
资源大小:114k
文件大小:14k
- /*
- ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
- ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
- **
- ** This program and library is free software; you can redistribute it and/or
- ** modify it under the terms of the GNU (Library) General Public License
- ** as published by the Free Software Foundation; either version 2
- ** of the License, or any later version.
- **
- ** This program is distributed in the hope that it will be useful,
- ** but WITHOUT ANY WARRANTY; without even the implied warranty of
- ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ** GNU (Library) General Public License for more details.
- **
- ** You should have received a copy of the GNU (Library) General Public License
- ** along with this program; if not, write to the Free Software
- ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- **-------------------------------------------------
- ** Added support for METADATA
- ** G. Hill ghill@library.berkeley.edu 3/18/97
- **
- ** Added Document Properties support
- ** Mark Gaulin gaulin@designinfo.com 11/24/98
- */
- #include <stdio.h>
- #include <string.h>
- #include <math.h>
- #include <sys/types.h>
- #include <sys/stat.h>
- #include <locale.h>
- #include <ctype.h>
- #include "config.h"
- #ifdef NEXTSTEP
- #include <sys/dir.h>
- #else
- #ifdef _WIN32
- #include "win32/dirent.h"
- #else
- #include <dirent.h>
- #endif
- #endif
- #include <ctype.h>
- #include <stdlib.h>
- #include <time.h>
- #include <setjmp.h>
- #ifndef _WIN32
- #include <regex.h>
- #else
- #include "Win32regex.h"
- #endif
/* Release and index-file identification strings. */
#define VERSION "1.3"
#define INDEXHEADER "# SWISH format 1.3"
#define INDEXVERSION "# Swish-e format 1.3"
#define INDEXFILE "index.swish-e"
#define STEMMINGHEADER "# Stemming Applied:"

/* Buffer-length and table-size limits. */
#define MAXFILELEN 1000
#define MAXSTRLEN 2000
#define MAXWORDLEN 1000
#define MAXTITLELEN 200
#define MAXSUFFIXLEN 10
#define MAXENTLEN 10
#define HASHSIZE 101
#define BIGHASHSIZE 1009
#define MAXPAR 10
#define MAXCHARDEFINED 200

/* TI_* are distinct single-bit values (1, 2, 4). */
#define TI_OPEN 1
#define TI_CLOSE 2
#define TI_FOUND 4

/* Placeholder word string. */
#define NOWORD "thisisnotaword"

#define SECSPERMIN 60

/* Rule codes, numbered consecutively from 0. */
#define NO_RULE 0
#define AND_RULE 1
#define OR_RULE 2
#define NOT_RULE 3

/*
 * IN_* are single-bit flags; IN_ALL (127) is the bitwise OR of all seven.
 */
#define IN_FILE 1
#define IN_TITLE 2
#define IN_HEAD 4
#define IN_BODY 8
#define IN_COMMENTS 16
#define IN_HEADER 32
#define IN_EMPHASIZED 64
#define IN_ALL 127

#define MAXLONGLEN 16
#define MAXCHARS 135

/*
 * Derived constants. Each replacement list is fully parenthesized so the
 * macro yields the intended value inside any larger expression
 * (e.g. 2 * METANAMEPOS or n / MAXHEADCHARS).
 */
#define MAXHEADCHARS (MAXLONGLEN * MAXCHARS)

/*
 * The four highest indices of a MAXCHARS-element array.
 * NOTE(review): presumably slots in offsets[], which this header declares
 * with MAXCHARS elements -- confirm against the users of these macros.
 */
#define METANAMEPOS (MAXCHARS - 4)
#define STOPWORDPOS (MAXCHARS - 3)
#define FILELISTPOS (MAXCHARS - 2)
#define FILEOFFSETPOS (MAXCHARS - 1)
/*
 * Descriptor for an Indexing Data Source: two identifying names plus the
 * set of callbacks a data source has to supply.
 *
 * At present there are two kinds of Indexing Data Source: one based on the
 * file system and one that is an HTTP web crawler. Any other data source
 * can be added, provided every function pointer below is initialized.
 */
struct _indexing_data_source_def {
    /* Long name for the data source. */
    const char *IndexingDataSourceName;

    /* Short name for the data source. */
    const char *IndexingDataSourceId;

    /* Routine to index a "path". */
    void (*indexpath_fn)(char *path);

    /* Get the next char from a "file". */
    int (*vgetc_fn)(void *vp);

    /* Get the size of a "file". */
    int (*vsize_fn)(void *vp);

    /* Parse one line of the config file. */
    int (*parseconfline_fn)(char *line);
};
#ifndef SUPPORT_DOC_PROPERTIES

/* Placeholder only: keeps the type name declared when properties are off. */
struct docPropertyEntry {
    int x;
};

#else

/* One document property; entries chain together through `next`. */
struct docPropertyEntry {
    /* Meta field identifier, as obtained from getMetaName(). */
    int metaName;

    /* String taken from the META tag's CONTENTS attribute. */
    char *propValue;

    struct docPropertyEntry *next;
};

#endif
/* One meta-name record; records chain together through `next`. */
struct metaEntry
{
    char *metaName;
    int index;

#if defined(SUPPORT_DOC_PROPERTIES)
    /* Flags telling whether this meta field is a Document Property. */
    char isDocProperty;     /* true when it is a doc property */
    char isOnlyDocProperty; /* true when it is NOT an indexable meta tag
                               (i.e. not listed in MetaNames) */
#endif

    struct metaEntry *next;
};
/* Node with `left` and `right` child links holding one result's info and rank. */
struct sortresult
{
    char *fileinfo;
    int rank;

#if defined(SUPPORT_DOC_PROPERTIES)
    /* File position at which this document's properties are stored. */
    long propPos;
#endif

    struct sortresult *left;
    struct sortresult *right;
};
/* One result record; records chain together through `next`. */
struct result
{
    int filenum;
    int rank;
    int structure; /* NOTE(review): presumably a mask of IN_* flags -- confirm */
    struct result *next;
};
/*
 * One file record (name, title, size); records chain together through
 * `next`. When SUPPORT_DOC_PROPERTIES is defined the record also points at
 * the file's docPropertyEntry data.
 */
struct file {
    char *filename;
    char *title;
    int size;

#ifdef SUPPORT_DOC_PROPERTIES
    struct docPropertyEntry *docProperties;
#endif /* SUPPORT_DOC_PROPERTIES */

    struct file *next;
};
/* Pairs an int file number with a long one; records chain through `next`. */
struct filenum
{
    int fileshort;
    long filelong;
    struct filenum *next;
};
/* One location record for a word; records chain together through `next`. */
struct location
{
    int filenum;
    int frequency;
    int emphasized;
    int structure; /* NOTE(review): presumably a mask of IN_* flags -- confirm */
    int metaName;
    struct location *next;
};
/*
 * Node with `left` and `right` child links holding one word, its
 * tfrequency count and the head of its location list.
 */
struct entry
{
    char *word;
    int tfrequency;
    struct location *locationlist;
    struct entry *left;
    struct entry *right;
};
/* Node with `left` and `right` child links holding a filename and title. */
struct sortentry
{
    char *filename;
    char *title;
    struct sortentry *left;
    struct sortentry *right;
};
/* One string (`line`); records chain together through `next`. */
struct swline
{
    char *line;
    struct swline *next;
};
/* Pairs a file number with a word total; records chain through `next`. */
struct fwordtotal
{
    int filenum;
    int totalwords;
    struct fwordtotal *next;
};
/*
 * VAR prefixes the global declarations in this header. It expands to
 * nothing when MAIN_FILE is defined, and to `extern` otherwise.
 */
#ifdef MAIN_FILE
#define VAR
#else
#define VAR extern
#endif
- VAR struct _indexing_data_source_def *IndexingDataSource;
- VAR struct file *filelist;
- VAR struct entry *entrylist;
- VAR struct swline *replacelist;
- VAR struct swline *searchwordlist;
- VAR struct swline *nocontentslist;
- VAR struct swline *dirlist;
- VAR struct swline *indexlist;
- VAR struct swline *hashstoplist[HASHSIZE];
- VAR char *stopList[HASHSIZE];
- VAR struct result *resulthashlist[HASHSIZE];
- VAR struct fwordtotal *fwordtotals[BIGHASHSIZE];
- VAR struct filenum *filehashlist[BIGHASHSIZE];
- VAR struct metaEntry* metaEntryList;
- VAR long offsets[MAXCHARS];
- VAR char wordchars[MAXCHARDEFINED];
- VAR char beginchars[MAXCHARDEFINED];
- VAR char endchars[MAXCHARDEFINED];
- VAR char ignorelastchar[MAXCHARDEFINED];
- VAR char ignorefirstchar[MAXCHARDEFINED];
- VAR int verbose;
- VAR int minwordlimit;
- VAR int maxwordlimit;
- VAR int bigrank;
- VAR int maxhits;
- VAR int totalwords;
- VAR int followsymlinks;
- VAR int commonerror;
- VAR int stopPos;
- VAR int indexComments;
- VAR int applyStemmingRules; /* added 11/24/98 - MG */
- VAR int useCustomOutputDelimiter; /* added 11/24/98 - MG */
- VAR char customOutputDelimiter[MAXSTRLEN]; /* added 11/24/98 - MG */
- VAR int ignoreTotalWordCountWhenRanking; /* added 11/24/98 - MG */
- VAR char indexn[MAXSTRLEN];
- VAR char indexd[MAXSTRLEN];
- VAR char indexp[MAXSTRLEN];
- VAR char indexa[MAXSTRLEN];
- VAR char errorstr[MAXSTRLEN];
- #ifdef MAIN_FILE
- char *indexchars ="abcdefghijklmnopqrstuvwxyz懒旅呐魄壬仕掏蜗醒矣哉重寠仝圮轃捺徕沅彐玷殛腱眍镳耱篝貊鴾氝