/*
** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
**
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**--------------------------------------------------------------------
** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
**
*/

/*
** http.c
*/

#ifndef _WIN32
#include <unistd.h>
#endif

#include <time.h>
#include <stdarg.h>

#include "swish.h"
#include "index.h"
#include "hash.h"
#include "string.h"
#include "mem.h"
#include "file.h"
#include "http.h"
#include "httpserver.h"

static int maxdepth = 5;
static int delay = 60;
char tmpdir[MAXSTRLEN] = "/var/tmp";
char spiderdirectory[MAXSTRLEN] = "./";
multiswline *equivalentservers = 0;
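/* Work queue entry for the crawl: a url plus the link depth at
** which it was discovered. With the breadth-first branch enabled
** below, the list is consumed from the head and extended at the
** tail.
*/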
typedef struct urldepth {
    char *url;
    int depth;
    struct urldepth *next;
} urldepth;

static int already_indexed(char *url);

urldepth *add_url(urldepth *list, char *url, int depth, char *baseurl);
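/* Add a url to the list of urls to index, subject to the crawl
** rules: it must live on an equivalent server, be within maxdepth
** links of the start url, have an indexable suffix, be allowed by
** robots.txt, and not have been indexed already.
*/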
urldepth *add_url(urldepth *list, char *url, int depth, char *baseurl)
{
    urldepth *item;

    if (!equivalentserver(url, baseurl)) {
        if (verbose == 3)
            printf("Skipping %s: %s\n", url, "Wrong method or server.");
    } else if (maxdepth && (depth >= maxdepth)) {
        if (verbose == 3)
            printf("Skipping %s: %s\n", url, "Too deep.");
    } else if (nocontentslist && isoksuffix(url, nocontentslist)) {
        if (verbose == 3)
            printf("Skipping %s: %s\n", url, "Wrong suffix.");
    } else if (urldisallowed(url)) {
        if (verbose == 3)
            printf("Skipping %s: %s\n", url, "URL disallowed by robots.txt.");
    } else if (!already_indexed(url)) {
        item = (urldepth *)emalloc(sizeof(urldepth));
        item->url = estrdup(url);
        item->depth = depth;
#if 0
        /* Depth first searching
        **/
        item->next = list;
        list = item;
#else
        /* Breadth first searching
        **/
        item->next = 0;
        if (!list) {
            list = item;
        } else {
            urldepth *walk;
            for (walk = list; walk->next; walk = walk->next) {
            }
            walk->next = item;
        }
#endif
    }

    return list;
}
/* Have we already indexed a file or directory?
** This function is used to avoid multiple index entries
** or endless looping due to symbolic links.
*/
static int already_indexed(char *url)
{
    static struct url_info {
        char *url;
        struct url_info *next;
    } *url_hash[BIGHASHSIZE], *p;

    int len;
    unsigned hashval;

    /* Hash on the uri alone. Depending on the equivalent
    ** servers, the match below may be decided by the entire
    ** url or just the uri.
    */
    hashval = bighash(url_uri(url, &len));    /* Search hash for this file. */
    for (p = url_hash[hashval]; p != NULL; p = p->next)
        if ((strcmp(url, p->url) == 0) ||
            (equivalentserver(url, p->url) &&
             (strcmp(url_uri(url, &len), url_uri(p->url, &len)) == 0)))
        {    /* We found it. */
            if (verbose == 3)
                printf("Skipping %s: %s\n", url, "Already indexed.");
            return 1;
        }

    /* Not found, make new entry. */
    p = (struct url_info *)emalloc(sizeof(struct url_info));
    p->url = estrdup(url);
    p->next = url_hash[hashval];
    url_hash[hashval] = p;

    return 0;
}
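/* Return a pointer to the method part of the url ("http" in
** "http://host/path") and store its length in *plen, or return
** NULL if the url has no "://" separator.
*/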
char *url_method(char *url, int *plen)
{
    char *end;

    if ((end = strstr(url, "://")) == NULL) {
        return NULL;
    }
    *plen = end - url;
    return url;
}
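/* Return a pointer to the server[:port] part of the url and store
** its length in *plen, or return NULL for a malformed url.
*/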
char *url_serverport(char *url, int *plen)
{
    int methodlen;
    char *serverstart;
    char *serverend;

    if (url_method(url, &methodlen) == NULL) {
        return NULL;
    }

    /* +3 for "://"
    **/
    serverstart = url + methodlen + 3;
    if ((serverend = strchr(serverstart, '/')) == NULL) {
        *plen = strlen(serverstart);
    } else {
        *plen = serverend - serverstart;
    }

    return serverstart;
}
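/* Return a pointer to the uri part of the url (everything from the
** first '/' after the server) and store its length in *plen, or
** return NULL for a malformed url.
*/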
char *url_uri(char *url, int *plen)
{
    if ((url = url_serverport(url, plen)) == 0) {
        return 0;
    }
    url += *plen;
    *plen = strlen(url);
    return url;
}

#ifdef _WIN32
#include <stdlib.h>     /* _sleep() */
#include <process.h>    /* _getpid() */
#endif
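/* Retrieve a url by running the external swishspider helper and
** return the HTTP status code. For a 200 or 3xx response, the
** Content-Type or redirect target is copied into
** contenttype_or_redirect. *plastretrieval is used to enforce the
** configured delay between successive requests to the same server.
*/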
int get(char *contenttype_or_redirect, time_t *plastretrieval, char *url)
{
    char buffer[MAXSTRLEN];
    int code;
    FILE *fp;
    char *command;
#ifdef _WIN32
    char *spiderprog = "swishspider.pl";
    char commandline[] = "perl %s%s %s/swishspider@%ld \"%s\"";
#else
    char *spiderprog = "swishspider";
    char commandline[] = "%s%s %s/swishspider@%ld '%s'";
#endif

    /* Sleep a little so we don't overwhelm the server
    **/
    if ((time(0) - *plastretrieval) < delay) {
        int num_sec = delay - (time(0) - *plastretrieval);
#ifdef _WIN32
        _sleep(num_sec);
#else
        sleep(num_sec);
#endif
    }
    *plastretrieval = time(0);

    /* URLs can get quite large so don't depend on a fixed size buffer. The
    ** +32 is for the pid identifier and the trailing null.
    **/
    command = (char *)emalloc(strlen(spiderdirectory) + strlen(url) +
        strlen(tmpdir) + strlen(commandline) + strlen(spiderprog) + 32);
    sprintf(command, commandline, spiderdirectory, spiderprog, tmpdir, lgetpid(), url);

    if (system(command) == 0) {
        sprintf(buffer, "%s/swishspider@%ld.response", tmpdir, lgetpid());
        /* Guard against a missing response file. */
        if ((fp = fopen(buffer, "r")) != NULL) {
            fgets(buffer, sizeof(buffer), fp);
            code = atoi(buffer);
            if ((code == 200) || ((code / 100) == 3)) {
                /* read the content-type or redirect target and
                ** strip its trailing newline
                **/
                fgets(contenttype_or_redirect, MAXSTRLEN, fp);
                *(contenttype_or_redirect + strlen(contenttype_or_redirect) - 1) = '\0';
            }
            fclose(fp);
        } else {
            code = 500;
        }
    } else {
        code = 500;
    }

    free(command);

    return code;
}
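/* Run cmd on a printf-style formatted argument; used below to
** unlink the temporary files left by swishspider.
*/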
int cmdf(int (*cmd)(const char *), char *fmt, ...)
{
    char buffer[MAXSTRLEN];
    va_list vlist;

    va_start(vlist, fmt);
    /* vsnprintf so an oversized argument cannot overflow buffer */
    vsnprintf(buffer, sizeof buffer, fmt, vlist);
    va_end(vlist);

    return cmd(buffer);
}
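/* Read one line of arbitrary length from fp into a static buffer
** that grows as needed. Returns NULL at end of file. The buffer is
** reused, so callers must copy the line if they need to keep it.
*/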
char *readline(FILE *fp)
{
    static char *buffer = 0;
    static int buffersize = 512;

    if (buffer == 0) {
        buffer = (char *)emalloc(buffersize);
    }

    /*
    ** Try to read in the line
    */
    if (fgets(buffer, buffersize, fp) == NULL) {
        return NULL;
    }

    /*
    ** Make sure we read the entire line. If not, double the buffer
    ** size and try to read the rest
    */
    while (buffer[strlen(buffer) - 1] != '\n') {
        buffer = (char *)erealloc(buffer, buffersize * 2);

        /*
        ** The easiest way to verify that this line is okay is to consider
        ** the situation where the buffer is 2 bytes long. Since fgets()
        ** always guarantees to put the trailing NULL, it will have essentially
        ** used only 1 byte. We double it to four, so we now have the left
        ** over byte (that currently contains NULL) in addition to the doubling,
        ** which gets us to read buffersize + 1.
        */
        if (fgets(buffer + buffersize - 1, buffersize + 1, fp) == 0) {
            break;
        }
        buffersize *= 2;
    }

    return buffer;
}
/* A local version of getpid() so that we don't have to suffer
** a system call each time we need it.
*/
pid_t lgetpid()
{
    static pid_t pid = -1;

    if (pid == -1) {
#ifdef _WIN32
        pid = _getpid();
#else
        pid = getpid();
#endif
    }
    return pid;
}
#if 0
/* Testing the robot rules parsing code...
**/
void http_indexpath(char *url)
{
    httpserverinfo *server = getserverinfo(url);
    robotrules *robotrule;

    printf("User-agent: %s\n", server->useragent ? server->useragent : "(none)");
    for (robotrule = server->robotrules; robotrule; robotrule = robotrule->next) {
        printf("Disallow: %s\n", robotrule->disallow);
    }
}
#else
/********************************************************/
/*              "Public" functions                      */
/********************************************************/

/* The main entry point for the module. (The fs.c equivalent decides
** whether the path is a file or directory and routes to the correct
** routine.)
*/
void http_indexpath(char *url)
{
    urldepth *urllist = 0;
    urldepth *item;
    char buffer[MAXFILELEN];
    char title[MAXSTRLEN];
    char contenttype[MAXSTRLEN];
    int code;
    FILE *fp;
    int wordcount;
    httpserverinfo *server;
    char *link;

    /* prime the pump with the first url
    **/
    urllist = add_url(urllist, url, 0, url);

    /* retrieve each url and add urls to a certain depth
    **/
    while (urllist) {
        item = urllist;
        urllist = urllist->next;

        if (verbose >= 2) {
            printf("retrieving %s (%d)...\n", item->url, item->depth);
            fflush(stdout);
        }

        /* We don't check if this url is legal here, because we do that
        ** before adding to the list.
        **/
        server = getserverinfo(item->url);

        if ((code = get(contenttype, &server->lastretrieval, item->url)) == 200) {
            if (strcmp(contenttype, "text/html") == 0) {
                sprintf(buffer, "%s/swishspider@%ld.contents", tmpdir, lgetpid());
                strcpy(title, (char *)parsetitle(buffer, item->url));
            } else {
                if (strrchr(item->url, '/') != NULL)
                    strcpy(title, strrchr(item->url, '/') + 1);
                else
                    strcpy(title, item->url);
            }

            /* index the file
            **/
            sprintf(buffer, "%s/swishspider@%ld.contents", tmpdir, lgetpid());
            if ((fp = fopen(buffer, "r")) != NULL) {
                wordcount = countwords(fp, item->url, title, strncmp(contenttype, "text/", 5) != 0);
                fclose(fp);
                if (verbose == 3) {
                    if (wordcount)
                        printf(" (%d words)\n", wordcount);
                    else
                        printf(" (no words)\n");
                    fflush(stdout);
                }
            }

            /* add new links
            **/
            sprintf(buffer, "%s/swishspider@%ld.links", tmpdir, lgetpid());
            if ((fp = fopen(buffer, "r")) != NULL) {

                /* URLs can get quite large so don't depend on a fixed size buffer
                **/
                while ((link = readline(fp)) != NULL) {
                    /* strip the trailing newline before queueing the link */
                    *(link + strlen(link) - 1) = '\0';
                    urllist = add_url(urllist, link, item->depth + 1, url);
                }
                fclose(fp);
            }
        } else if ((code / 100) == 3) {
            /* a 3xx response: contenttype holds the redirect target,
            ** so requeue it at the same depth
            **/
            urllist = add_url(urllist, contenttype, item->depth, url);
        }

        /* Clean up the files left by swishspider
        **/
        cmdf(unlink, "%s/swishspider@%ld.response", tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.contents", tmpdir, lgetpid());
        cmdf(unlink, "%s/swishspider@%ld.links", tmpdir, lgetpid());
    }
}
#endif
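/* Character-at-a-time and file-size callbacks for the indexing
** core; vp is the FILE * for the retrieved contents file.
*/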
int http_vgetc(void *vp)
{
    return fgetc((FILE *)vp);
}

int http_vsize(void *vp)
{
    struct stat stbuf;

    return fstat(fileno((FILE *)vp), &stbuf) ? -1 : stbuf.st_size;
}

#ifdef _WIN32
#define strncasecmp strnicmp
#endif
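/* Parse the configuration directives this module understands. For
** illustration only (directive names as matched below; the values
** and exact syntax depend on the SWISH configuration in use):
**
**   maxdepth 5
**   delay 60
**   tmpdir /var/tmp
**   spiderdirectory /usr/local/lib/swish/
**   equivalentserver http://www.example.com http://example.com
*/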
int http_parseconfline(char *line)
{
    int rv = 0;
    static char es[] = "equivalentserver";
    char *word;
    int skiplen;
    multiswline *list;
    struct swline *slist;

    if (grabIntValueField(line, "maxdepth", &maxdepth, 0)) { rv = 1; }
    else if (grabIntValueField(line, "delay", &delay, 0)) { rv = 1; }
    else if (grabStringValueField(line, "tmpdir", tmpdir)) { rv = 1; }
    else if (grabStringValueField(line, "spiderdirectory", spiderdirectory)) {
        int len = strlen(spiderdirectory);
        rv = 1;

        /* Make sure the directory has a trailing slash
        **/
        if (len && (spiderdirectory[len - 1] != '/')) {
            strcat(spiderdirectory, "/");
        }
    }
    else if (strncasecmp(line, es, sizeof(es) - 1) == 0) {
        rv = 1;

        /* Add a new list of equivalent servers
        **/
        list = (multiswline *)emalloc(sizeof(multiswline));
        list->next = equivalentservers;
        list->list = 0;
        equivalentservers = list;

        line += (sizeof(es) - 1);
        while (*(word = getword(line, &skiplen)) != '\0') {
            /* Add a new entry to this list
            **/
            slist = (struct swline *)emalloc(sizeof(struct swline));
            slist->line = estrdup(word);
            slist->next = list->list;
            list->list = slist;

            /* Move to the next word
            **/
            line += skiplen;
        }
    }

    return rv;
}
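/* Registration record tying the "http" method name to this
** module's entry points.
*/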
struct _indexing_data_source_def HTTPIndexingDataSource = {
    "HTTP-Crawler",
    "http",
    http_indexpath,
    http_vgetc,
    http_vsize,
    http_parseconfline
};