搜索引擎

开发平台：
Perl

http.c：源码内容
							/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**--------------------------------------------------------------------
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
** 
*/
/*
** http.c
*/
#ifndef _WIN32
#include <unistd.h>
#endif
#include <time.h>
#include <stdarg.h>
#include "swish.h"
#include "index.h"
#include "hash.h"
#include "string.h"
#include "mem.h"
#include "file.h"
#include "http.h"
#include "httpserver.h"
/* Crawl configuration; each value can be overridden by http_parseconfline().
**/

/* Depth limit: add_url() skips URLs whose depth is >= maxdepth.
** A value of 0 disables the limit. */
static int maxdepth = 5;
/* Minimum number of seconds get() leaves between two retrievals. */
static int delay = 60;
/* Directory holding the swishspider@<pid>.response/.contents/.links files. */
char tmpdir[MAXSTRLEN] = "/var/tmp";
/* Prefix placed directly in front of the spider program name when the
** command line is built, so it is kept with a trailing slash. */
char spiderdirectory[MAXSTRLEN] = "./";
/* Head of the list of equivalent-server groups; each "equivalentserver"
** configuration line pushes a new group onto the front. */
multiswline *equivalentservers = 0;
/* One entry in the singly linked queue of URLs waiting to be retrieved.
**/
typedef struct urldepth {
    char *url;              /* copy of the URL made with estrdup() in add_url() */
    int depth;              /* link depth at which the URL was found */
    struct urldepth *next;  /* next queued URL, or 0 at the end of the queue */
} urldepth;
static int already_indexed(char *url);
urldepth *add_url(urldepth *list, char *url, int depth, char *baseurl);

/* Queue a URL for retrieval unless one of the crawl filters rejects it.
**
** list     head of the pending-URL list (may be NULL)
** url      candidate URL; it is copied with estrdup() when queued
** depth    link depth at which the URL was found
** baseurl  URL the crawl started from, handed to equivalentserver()
**
** Returns the (possibly new) head of the list.
**
** The filters run in this order: equivalentserver(), the depth limit,
** the nocontentslist suffix check, urldisallowed(), and finally
** already_indexed().  already_indexed() records the URL as a side effect,
** so it has to stay last: a URL rejected by an earlier filter must not be
** recorded as seen.  With verbose == 3 the reason for a skip is printed.
*/
urldepth *add_url(urldepth *list, char *url, int depth, char *baseurl)
{
    urldepth *item;

    if (!equivalentserver(url, baseurl)) {
        if (verbose == 3)
            printf("Skipping %s:  %s\n", url, "Wrong method or server.");
    } else if (maxdepth && (depth >= maxdepth)) {
        /* maxdepth == 0 means "no depth limit" */
        if (verbose == 3)
            printf("Skipping %s:  %s\n", url, "Too deep.");
    } else if (nocontentslist && isoksuffix(url, nocontentslist)) {
        if (verbose == 3)
            printf("Skipping %s: %s\n", url, "Wrong suffix.");
    } else if (urldisallowed(url)) {
        if (verbose == 3)
            printf("Skipping %s:  %s\n", url, "URL disallowed by robots.txt.");
    } else if (!already_indexed(url)) {
        item = (urldepth *)emalloc(sizeof(urldepth));
        item->url = estrdup(url);
        item->depth = depth;
#if 0
        /* Depth first searching: push on the front of the list.
        **/
        item->next = list;
        list = item;
#else
        /* Breadth first searching: append at the end of the list.
        **/
        item->next = NULL;
        if (list == NULL) {
            list = item;
        } else {
            urldepth *walk = list;

            while (walk->next != NULL) {
                walk = walk->next;
            }
            walk->next = item;
        }
#endif
    }

    return list;
}
/* Have we already indexed a file or directory?
** This function is used to avoid multiple index entries
** or endless looping due to symbolic links.
*/
static int already_indexed(char *url)
{
  static struct url_info {
    char *url;
    struct url_info *next;
  } *url_hash[BIGHASHSIZE], *p;
  
  int len;
  unsigned hashval;
  
  /* Hash with via the uri alone.  Depending on the equivalent
  ** servers, we may or may not make the decision of the entire
  ** url or just the uri.
  */
  hashval = bighash(url_uri(url,&len)); /* Search hash for this file. */
  for ( p = url_hash[hashval]; p != NULL; p = p->next )
    if ( (strcmp(url, p->url ) == 0) ||
	 (equivalentserver(url, p->url) &&
	  (strcmp(url_uri(url, &len), url_uri(p->url, &len)) == 0)) )
      {                               /* We found it. */
	if (verbose == 3)
	  printf( "Skipping %s:  %sn",
		  url, "Already indexed." );
	return 1;
      }
  
	/* Not found, make new entry. */
  p = (struct url_info*)emalloc(sizeof(struct url_info));
  p->url = estrdup(url);
  p->next = url_hash[hashval];
  url_hash[hashval] = p;
	
  return 0;
}
/* Locate the method (scheme) part of a URL.
**
** Returns url itself and stores the number of characters in front of the
** first "://" in *plen.  Returns NULL, leaving *plen untouched, when the
** string contains no "://".
*/
char *url_method(char *url, int *plen)
{
    char *separator = strstr(url, "://");

    if (separator != NULL) {
        *plen = separator - url;
        return url;
    }

    return NULL;
}
/* Locate the server[:port] part of a URL.
**
** Returns a pointer to the first character after "://" and stores in *plen
** the number of characters up to, but not including, the next '/' (or up
** to the end of the string when there is no further '/').  Returns NULL
** when url_method() finds no method in the URL.
*/
char *url_serverport(char *url, int *plen)
{
  int schemelen;
  char *host;
  char *slash;

  if (url_method(url, &schemelen) == NULL)
    return NULL;

  /* Step over the method and the three characters of "://". */
  host = url + schemelen + 3;

  slash = strchr(host, '/');
  if (slash != NULL)
    *plen = slash - host;
  else
    *plen = strlen(host);

  return host;
}
/* Locate the URI part of a URL: everything that follows server[:port].
**
** Returns a pointer into url and stores the length of the remainder in
** *plen (0 when nothing follows the server).  Returns NULL when
** url_serverport() cannot find a server part.
*/
char *url_uri(char *url, int *plen)
{
  char *serverport = url_serverport(url, plen);
  char *rest;

  if (serverport == NULL)
    return NULL;

  /* *plen currently holds the length of server[:port]; skip past it. */
  rest = serverport + *plen;
  *plen = strlen(rest);

  return rest;
}
#ifdef _WIN32
#include <stdlib.h>		/* _sleep() */
#include <process.h>	/* _getpid() */
#endif
int get(char *contenttype_or_redirect, time_t *plastretrieval, char *url)
{
  char buffer[MAXSTRLEN];
  int code;
  FILE *fp;
  char *command;
#ifdef _WIN32
  char* spiderprog = "swishspider.pl";
  char commandline[] = "perl %s%s %s/swishspider@%ld "%s"";
#else
  char* spiderprog = "swishspider";
  char commandline[] = "%s%s %s/swishspider@%ld '%s'";
#endif
  
  /* Sleep a little so we don't overwhelm the server
  **/
    if ((time(0) - *plastretrieval) < delay )
      {
	int num_sec = delay - (time(0) - *plastretrieval);
#ifdef _WIN32
	_sleep(num_sec);
#else
	sleep(num_sec);
#endif
	}
    *plastretrieval = time(0);
	
    /* URLs can get quite large so don't depend on a fixed size buffer.  The
    ** +32 is for the pid identifier and the trailing null.
    **/
    command = (char *)emalloc(strlen(spiderdirectory) + strlen(url) +
			      strlen(tmpdir) + strlen(commandline) + strlen(spiderprog) + 32);
    sprintf(command, commandline, spiderdirectory, spiderprog, tmpdir, lgetpid(), url);
    
    if (system(command) == 0) {
      sprintf(buffer, "%s/swishspider@%ld.response", tmpdir, lgetpid());
      fp = fopen(buffer, "r");
      fgets(buffer, sizeof(buffer), fp);
      code = atoi(buffer);
      if ((code == 200) || ((code /100) == 3)) {
	/* read content-type  redirect
	**/ 
	fgets(contenttype_or_redirect, MAXSTRLEN, fp);
	*(contenttype_or_redirect + strlen(contenttype_or_redirect) - 1) = '';
      }
      fclose(fp);
    } else {
      code = 500;
    }
    
    free(command);
    
    return code;
}
/* Format a string printf-style and hand it to cmd.
**
** cmd  function that takes the formatted string (e.g. unlink)
** fmt  printf-style format followed by its arguments
**
** Returns whatever cmd returns.  If the formatted text does not fit in
** MAXSTRLEN bytes, cmd is NOT called and -1 is returned: acting on a
** truncated path could touch the wrong file.
*/
int cmdf(int (*cmd)(const char *), char *fmt, ...)
{
    char buffer[MAXSTRLEN];
    va_list vlist;
    int written;

    va_start(vlist, fmt);
#ifdef _WIN32
    /* Older Microsoft runtimes only provide _vsnprintf(), which returns a
    ** negative value on truncation. */
    written = _vsnprintf(buffer, sizeof(buffer), fmt, vlist);
#else
    written = vsnprintf(buffer, sizeof(buffer), fmt, vlist);
#endif
    va_end(vlist);

    if (written < 0 || (size_t)written >= sizeof(buffer)) {
        return -1;
    }

    return cmd(buffer);
}
/* Read one line of any length from fp.
**
** Returns a pointer to a buffer owned by this function holding the line,
** including its trailing newline when the file supplied one (the last line
** of a file may come back without it).  The buffer is reused and may be
** reallocated by the next call, so the caller must copy anything it wants
** to keep.  Returns NULL when nothing could be read (end of file or error).
*/
char *readline (FILE *fp)
{
    static char *buffer = 0;
    static int buffersize = 512;
    size_t len;
    int oldsize;

    if (buffer == 0) {
        buffer = (char *)emalloc(buffersize);
    }

    /*
    * Try to read in the line
    */
    if (fgets(buffer, buffersize, fp) == NULL) {
        return NULL;
    }

    /*
    * Make sure we read the entire line.  The line is only incomplete when
    * fgets() filled the buffer completely (buffersize - 1 characters plus
    * the terminator) without reaching a newline; in that case double the
    * buffer and read the rest.
    */
    len = strlen(buffer);
    while (len == (size_t)buffersize - 1 && buffer[len - 1] != '\n') {
        oldsize = buffersize;
        buffersize *= 2;
        buffer = (char *)erealloc(buffer, buffersize);

        /*
        * buffer + len is the terminator written by the previous fgets().
        * Overwriting it leaves oldsize + 1 free bytes in the doubled
        * buffer, which is exactly what we let fgets() use.
        */
        if (fgets(buffer + len, oldsize + 1, fp) == NULL) {
            break;
        }
        len += strlen(buffer + len);
    }

    return buffer;
}
/* Cached process id: the system is asked once, on the first call, and the
** remembered value is returned from then on.
*/
pid_t lgetpid()
{
    static pid_t cached = -1;

    if (cached != -1) {
        return cached;
    }

#ifdef _WIN32
    cached = _getpid();
#else
    cached = getpid();
#endif

    return cached;
}
#if 0
/* Testing the robot rules parsing code: print the user agent and every
** Disallow rule recorded for the server that hosts url.
**/
void http_indexpath(char *url)
{
    httpserverinfo *server = getserverinfo(url);
    robotrules *robotrule;

    printf("User-agent: %s\n", server->useragent ? server->useragent : "(none)");
    for (robotrule = server->robotrules; robotrule; robotrule = robotrule->next) {
        printf("Disallow: %s\n", robotrule->disallow);
    }
}
#else
/********************************************************/
/*					"Public" functions					*/
/********************************************************/
/* The main entry point for the module.  For fs.c, decides whether this
** is a file or directory and routes to the correct routine.
*/
void http_indexpath(char *url)
{
    urldepth *urllist = 0;
    urldepth *item;
    char buffer[MAXFILELEN];
    char title[MAXSTRLEN];
    char contenttype[MAXSTRLEN];
    int code;
    FILE *fp;
    int wordcount;
    httpserverinfo *server;
    char *link;
	
    /* prime the pump with the first url
    **/
    urllist = add_url(urllist, url, 0, url);
	
    /* retrieve each url and add urls to a certain depth
    **/
    while (urllist) {
		item = urllist;
		urllist = urllist->next;
		
		if (verbose >= 2) {
			printf("retrieving %s (%d)...n", item->url, item->depth);
			fflush(stdout);
		}
		
		/* We don't check if this url is legal here, because we do that
		** before adding to the list.
		**/
		server = getserverinfo(item->url);
		
		if ((code = get(contenttype, &server->lastretrieval, item->url)) == 200) {
			if (strcmp(contenttype, "text/html") == 0) {
				sprintf(buffer, "%s/swishspider@%ld.contents", tmpdir, lgetpid());
				strcpy(title, (char *) parsetitle(buffer, item->url));
			} else {
				if (strrchr(item->url, '/') != NULL)
					strcpy(title, strrchr(item->url, '/') + 1);
				else
					strcpy(title, item->url);
			}
			
			/* index the file
			**/
			sprintf(buffer, "%s/swishspider@%ld.contents", tmpdir, lgetpid());
			if ((fp = fopen(buffer, "r")) != NULL) {
				wordcount = countwords(fp, item->url, title, strncmp(contenttype, "text/", 5) != 0);
				fclose(fp);
				if (verbose == 3) {
					if (wordcount)
						printf(" (%d words)n", wordcount);
					else
						printf(" (no words)n");
					fflush(stdout);
				}
			}
			
			/* add new links
			**/
			sprintf (buffer, "%s/swishspider@%ld.links", tmpdir, lgetpid());
			if ((fp = fopen(buffer, "r")) != NULL) {
				
			/* URLs can get quite large so don't depend on a fixed size buffer
				**/
				while ((link = readline(fp)) != NULL) {
					*(link + strlen(link) - 1) = '';
					urllist = add_url(urllist, link, item->depth + 1, url);
				}
				fclose (fp);
			}
		} else if ((code / 100) == 3) {
			urllist = add_url(urllist, contenttype, item->depth, url );
		}
		
		/* Clean up the files left by swishspider
		**/
		cmdf(unlink, "%s/swishspider@%ld.response", tmpdir, lgetpid());
		cmdf(unlink, "%s/swishspider@%ld.contents", tmpdir, lgetpid());
		cmdf(unlink, "%s/swishspider@%ld.links", tmpdir, lgetpid());
    }
}
#endif
/* Character reader for the data-source table: vp is a FILE * in disguise.
** Returns the next character of the stream, or EOF.
*/
int http_vgetc(void *vp)
{
	FILE *stream = (FILE *)vp;
	int ch = fgetc(stream);

	return ch;
}
/* Size reader for the data-source table: vp is a FILE * in disguise.
** Returns the size fstat() reports for the underlying descriptor, or -1
** when fstat() fails.
*/
int http_vsize(void *vp)
{
	struct stat info;
	int fd = fileno((FILE *)vp);

	if (fstat(fd, &info) != 0) {
		return -1;
	}

	return info.st_size;
}
#ifdef _WIN32
#define strncasecmp	strnicmp
#endif
int http_parseconfline(char *line)
{
    int rv = 0;
    static char es[] = "equivalentserver";
    char *word;
    int skiplen;
    multiswline *list;
    struct swline *slist;
	if (grabIntValueField(line, "maxdepth", &maxdepth, 0))	{ rv = 1; }
	else if (grabIntValueField(line, "delay", &delay, 0))	{ rv = 1; }
	else if (grabStringValueField(line, "tmpdir", tmpdir))	{ rv = 1; }
	else if (grabStringValueField(line, "spiderdirectory", spiderdirectory))
	{
		int len = strlen(spiderdirectory);
		rv = 1;
		
		/* Make sure the directory has a trailing slash
		**/
		if (len && (spiderdirectory[len - 1] != '/')) 
		{
			strcat(spiderdirectory, "/");
		}
    }
	else if (strncasecmp(line, es, sizeof(es) - 1) == 0) 
	{
		rv = 1;
		
		/* Add a new list of equivalent servers
		**/
		list = (multiswline *)emalloc(sizeof(multiswline));
		list->next = equivalentservers;
		list->list = 0;
		equivalentservers = list;
		
		line += (sizeof(es) - 1);
		while (*(word = getword(line, &skiplen)) != '') {
			/* Add a new entry to this list
			**/
			slist = (struct swline *)emalloc(sizeof(struct swline));
			slist->line = estrdup(word);
			slist->next = list->list;
			list->list = slist;
			
			/* Move to the next word
			**/
			line += skiplen;
		}
    }
    return rv;
}
/* Descriptor through which the rest of the program reaches this module.
** It carries two identifying strings followed by this file's entry points,
** in the order fixed by struct _indexing_data_source_def (declared outside
** this file).
*/
struct _indexing_data_source_def HTTPIndexingDataSource = {
  "HTTP-Crawler",       /* presumably the display name - confirm against the struct */
  "http",               /* presumably the short identifier - confirm against the struct */
  http_indexpath,       /* crawl and index starting from a URL */
  http_vgetc,           /* read one character from a FILE * passed as void * */
  http_vsize,           /* size of the file behind a FILE * passed as void * */
  http_parseconfline    /* handle the crawler's configuration lines */
};