fs.c
上传用户:qin5330
上传日期:2007-01-05
资源大小:114k
文件大小:10k
- /*
- ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
- ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
- **
- ** This program and library is free software; you can redistribute it and/or
- ** modify it under the terms of the GNU (Library) General Public License
- ** as published by the Free Software Foundation; either version 2
- ** of the License, or any later version.
- **
- ** This program is distributed in the hope that it will be useful,
- ** but WITHOUT ANY WARRANTY; without even the implied warranty of
- ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ** GNU (Library) General Public License for more details.
- **
- ** You should have received a copy of the GNU (Library) General Public License
- ** along with this program; if not, write to the Free Software
- ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- **--------------------------------------------------------------------
- */
- #include "swish.h"
- #include "index.h"
- #include "hash.h"
- #include "mem.h"
- #include "file.h"
- #include "string.h"
- #include "list.h"
/* Forward declarations for the file-local helpers defined below. */

/* Entry points for a directory and for a single file. */
static void indexadir(char *dir);
static void indexafile(char *path);

/* In-order walkers over the sortentry trees. */
static void printfiles(struct sortentry *e);
static void printdirs(struct sortentry *e);

/* Per-file predicates. */
static int ishtml(char *filename);
static int isoktitle(char *title);
/*
** File-system specific configuration parameters.
** fs_parseconfline() hands the address of each list to grabCmdOptions()
** together with the directive text shown beside it.  A NULL list means
** no rule of that kind has been recorded.
*/
static struct swline *pathconlist = NULL;   /* FileRules "pathname contains" */
static struct swline *dirconlist = NULL;    /* FileRules "directory contains" */
static struct swline *fileconlist = NULL;   /* FileRules "filename contains" */
static struct swline *titconlist = NULL;    /* FileRules "title contains" */
static struct swline *fileislist = NULL;    /* FileRules "filename is" */
static struct swline *suffixlist = NULL;    /* IndexOnly */
/* NOTE(review): nothing visible in this file ever assigns nocontentslist,
** so printfiles() always sees it as NULL -- confirm whether a directive
** was meant to populate it.
*/
static struct swline *nocontentslist = NULL;
- /* Have we already indexed a file or directory?
- ** This function is used to avoid multiple index entries
- ** or endless looping due to symbolic links.
- */
- static int already_indexed(char *path)
- {
- #ifndef NO_SYMBOLIC_FILE_LINKS
- static struct dev_ino {
- dev_t dev;
- ino_t ino;
- struct dev_ino *next;
- } *inode_hash[BIGHASHSIZE], *p;
-
- struct stat buf;
- char key[34]; /* Hash key -- allow for 64 bit inodes */
- unsigned hashval;
-
- if ( stat( path, &buf ) )
- return 0;
-
- /* Create hash key: string contains device and inode. */
- sprintf( key, "%lx/%lx", (unsigned long)buf.st_dev,
- (unsigned long)buf.st_ino );
-
- hashval = bighash(key); /* Search hash for this file. */
- for ( p = inode_hash[hashval]; p != NULL; p = p->next )
- if ( p->dev == buf.st_dev &&
- p->ino == buf.st_ino )
- { /* We found it. */
- if ( verbose == 3 )
- printf( "Skipping %s: %sn",
- path, "Already indexed." );
- return 1;
- }
-
- /* Not found, make new entry. */
- p = (struct dev_ino*)emalloc(sizeof(struct dev_ino));
- p->dev = buf.st_dev;
- p->ino = buf.st_ino;
- p->next = inode_hash[hashval];
- inode_hash[hashval] = p;
- #endif
-
- return 0;
- }
- /* Recursively goes into a directory and calls the word-indexing
- ** functions for each file that's found.
- */
- static void indexadir(char *dir)
- {
- int badfile;
- DIR *dfd;
- #ifdef NEXTSTEP
- struct direct *dp;
- #else
- struct dirent *dp;
- #endif
- static char s[MAXFILELEN], title[MAXSTRLEN];
- struct sortentry *sortfilelist, *sortdirlist;
- struct swline *tmplist;
-
- sortfilelist = sortdirlist = NULL;
-
- if (islink(dir) && !followsymlinks)
- return;
-
- if ( already_indexed(dir) )
- return;
-
- if (dir[strlen(dir) - 1] == '/')
- dir[strlen(dir) - 1] = ' ';
-
- if ((dfd = opendir(dir)) == NULL)
- return;
-
- while ((dp = readdir(dfd)) != NULL && dirconlist != NULL) {
- badfile = 0;
- tmplist = dirconlist;
- while (tmplist != NULL) {
- if (matchARegex(dp->d_name, tmplist->line)) {
- badfile = 1;
- break;
- }
- tmplist = tmplist->next;
- }
- if (badfile)
- return;
- }
- closedir(dfd);
- dfd = opendir(dir);
-
- while ((dp = readdir(dfd)) != NULL) {
-
- if ((dp->d_name)[0] == '.')
- continue;
- if (islink(dp->d_name) && !followsymlinks)
- continue;
-
- badfile = 0;
- tmplist = fileislist;
- while (tmplist != NULL) {
- if (matchARegex(dp->d_name, tmplist->line)) {
- badfile = 1;
- break;
- }
- tmplist = tmplist->next;
- }
- if (badfile)
- continue;
-
- badfile = 0;
- tmplist = fileconlist;
- while (tmplist != NULL) {
- if (matchARegex(dp->d_name, tmplist->line)) {
- badfile = 1;
- break;
- }
- tmplist = tmplist->next;
- }
- if (badfile)
- continue;
-
- sprintf(s, "%s%s%s", dir, dir[strlen(dir) - 1] == '/' ?
- "" : "/", dp->d_name);
- if (islink(s) && !followsymlinks)
- continue;
-
- badfile = 0;
- tmplist = pathconlist;
- while (tmplist != NULL) {
- if (matchARegex(s, tmplist->line)) {
- badfile = 1;
- break;
- }
- tmplist = tmplist->next;
- }
- if (badfile)
- continue;
-
- if (!isdirectory(s)) {
-
- if ( already_indexed(s) )
- continue;
-
- if (!isoksuffix(dp->d_name, suffixlist))
- continue;
-
- if (ishtml(s)) {
- strcpy(title, (char *) parsetitle(s, s));
- if (!isoktitle(title))
- continue;
- }
- else {
- if (strrchr(s, '/') != NULL)
- strcpy(title, strrchr(s, '/') + 1);
- else
- strcpy(title, s);
- }
- sortfilelist = (struct sortentry *)
- addsortentry(sortfilelist, s, title);
- }
- else {
- sortdirlist = (struct sortentry *)
- addsortentry(sortdirlist, s, s);
- }
- }
-
- closedir(dfd);
-
- printfiles(sortfilelist);
- printdirs(sortdirlist);
- }
- /* Calls the word-indexing function for a single file.
- */
- static void indexafile(char *path)
- {
- int badfile;
- char *t, title[MAXSTRLEN];
- struct sortentry *fileentry;
- struct swline *tmplist;
-
- if (islink(path) && !followsymlinks)
- return;
-
- if ( already_indexed(path) )
- return;
-
- if (path[strlen(path) - 1] == '/')
- path[strlen(path) - 1] = ' ';
-
- badfile = 0;
- tmplist = fileislist;
- while (tmplist != NULL) {
- if (!matchARegex(path, tmplist->line)) {
- badfile = 1;
- break;
- }
- tmplist = tmplist->next;
- }
- if (badfile)
- return;
-
- badfile = 0;
- tmplist = fileconlist;
- while (tmplist != NULL) {
- if (matchARegex(path, tmplist->line)) {
- badfile = 1;
- break;
- }
- tmplist = tmplist->next;
- }
- if (badfile)
- return;
-
- badfile = 0;
- tmplist = pathconlist;
- while (tmplist != NULL) {
- if (matchARegex(path, tmplist->line)) {
- badfile = 1;
- break;
- }
- tmplist = tmplist->next;
- }
- if (badfile)
- return;
-
- if (!isoksuffix(path, suffixlist))
- return;
-
- if (ishtml(path)) {
- strcpy(title, (char *) parsetitle(path, path));
- if (!isoktitle(title))
- return;
- }
- else {
- if ((t = strrchr(path, '/')) != NULL)
- strcpy(title, t + 1);
- else
- strcpy(title, path);
- }
-
- fileentry = (struct sortentry *) emalloc(sizeof(struct sortentry));
- fileentry->filename = (char *) mystrdup(path);
- fileentry->title = (char *) mystrdup(title);
- fileentry->left = fileentry->right = NULL;
-
- printfiles(fileentry);
- }
- /* Indexes the words in the tree of files in alphabetical order.
- */
- static void printfiles(struct sortentry *e)
- {
- int wordcount;
- char *s;
- FILE *fp;
-
- if (e != NULL) {
- printfiles(e->left);
- if (verbose == 3) {
- if ((s = (char *) strrchr(e->filename, '/')) == NULL)
- printf(" %s", e->filename);
- else
- printf(" %s", s + 1);
- }
- if ((fp = fopen(e->filename, "r" )) != NULL ) {
- wordcount = countwords(fp, e->filename, e->title,
- isoksuffix(e->filename, nocontentslist) && nocontentslist != NULL);
- fclose(fp);
- }
- if (verbose == 3) {
- if (wordcount)
- printf(" (%d words)n", wordcount);
- else
- printf(" (no words)n");
- fflush(stdout);
- }
- free(e->filename);
- free(e->title);
- printfiles(e->right);
- free(e);
- }
- }
- /* Prints out the directory names as things are getting indexed.
- ** Calls indexadir() so directories in the tree are indexed,
- ** in alphabetical order...
- */
- static void printdirs(struct sortentry *e)
- {
- if (e != NULL) {
- printdirs(e->left);
- if (verbose == 3)
- printf("nIn dir "%s":n", e->filename);
- else if (verbose == 2)
- printf("Checking dir "%s"...n", e->filename);
- indexadir(e->filename);
- free(e->filename);
- free(e->title);
- printdirs(e->right);
- free(e);
- }
- }
/* This checks if a filename has an HTML suffix.
**
** The suffix is whatever follows the last '.' in filename.  It counts
** as HTML when it BEGINS with "htm", "HTM", "shtml" or "SHTML"; this
** covers "htm", "html", "shtml" and their all-upper-case forms.
** Mixed-case suffixes such as "Html" are not recognised.
**
** Returns 1 for an HTML suffix, 0 otherwise (including no '.' at all
** and a name that ends in '.').
**
** The suffix is inspected in place, so its length is not limited.
*/
static int ishtml(char *filename)
{
    const char *suffix = strrchr(filename, '.');

    if (suffix == NULL)
        return 0;

    suffix++;                   /* step over the '.' */
    if (*suffix == '\0')
        return 0;

    return strncmp(suffix, "htm", 3) == 0
        || strncmp(suffix, "HTM", 3) == 0
        || strncmp(suffix, "shtml", 5) == 0
        || strncmp(suffix, "SHTML", 5) == 0;
}
- /* Check if a particular title should be ignored
- ** according to the settings in the configuration file.
- */
- int isoktitle(title)
- char *title;
- {
- int badfile;
- struct swline *tmplist;
-
- badfile = 0;
- tmplist = titconlist;
- while (tmplist != NULL) {
- if (matchARegex(title, tmplist->line)) {
- badfile = 1;
- break;
- }
- tmplist = tmplist->next;
- }
- if (badfile)
- return 0;
- else
- return 1;
- }
- /********************************************************/
- /* "Public" functions */
- /********************************************************/
- void fs_indexpath(char *path)
- {
- if (isdirectory(path)) {
- if (verbose >= 2)
- printf("nChecking dir "%s"...n",
- path);
- indexadir(path);
- }
- else if (isfile(path)) {
- if (verbose >= 2)
- printf("nChecking file "%s"...n",
- path);
- indexafile(path);
- }
- }
/* Read the next character from the stream behind vp.
** vp is treated as a FILE *; the result is whatever fgetc() returns.
*/
int fs_vgetc(void *vp)
{
    FILE *stream = (FILE *) vp;

    return fgetc(stream);
}
/* Size in bytes of the file behind vp, or -1 when fstat() fails.
** vp is treated as a FILE *.
** NOTE(review): st_size is narrowed to int on return -- confirm that
** callers never meet files too large for an int.
*/
int fs_vsize(void *vp)
{
    struct stat stbuf;

    if (fstat(fileno((FILE *) vp), &stbuf) != 0)
        return -1;

    return stbuf.st_size;
}
- int fs_parseconfline(char *line)
- {
- int rv = 0;
-
- if (grabCmdOptions(line, "IndexOnly", &suffixlist)) { rv = 1; }
- else if (lstrstr(line, "FileRules"))
- {
- if (grabCmdOptions(line, "pathname contains", &pathconlist)) { rv = 1; }
- else if (grabCmdOptions(line, "directory contains", &dirconlist)) { rv = 1; }
- else if (grabCmdOptions(line, "filename contains", &fileconlist)) { rv = 1; }
- else if (grabCmdOptions(line, "title contains", &titconlist)) { rv = 1; }
- else if (grabCmdOptions(line, "filename is", &fileislist)) { rv = 1; }
- else if (grabCmdOptions(line, "pathname contains", &pathconlist)) { rv = 1; }
- }
-
- return rv;
- }
- struct _indexing_data_source_def FileSystemIndexingDataSource = {
- "File-System",
- "fs",
- fs_indexpath,
- fs_vgetc,
- fs_vsize,
- fs_parseconfline
- };