httpserver.c
上传用户:qin5330
上传日期:2007-01-05
资源大小:114k
文件大小:10k
- /*
- ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
- ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
- **
- ** This program and library is free software; you can redistribute it and/or
- ** modify it under the terms of the GNU (Library) General Public License
- ** as published by the Free Software Foundation; either version 2
- ** of the License, or any later version.
- **
- ** This program is distributed in the hope that it will be useful,
- ** but WITHOUT ANY WARRANTY; without even the implied warranty of
- ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ** GNU (Library) General Public License for more details.
- **
- ** You should have received a copy of the GNU (Library) General Public License
- ** long with this program; if not, write to the Free Software
- ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- **--------------------------------------------------------------------
- ** All the code in this file added by Ron Klachko ron@ckm.ucsf.edu 9/98
- **
- */
- /*
- ** httpserver.c
- */
- #ifndef _WIN32
- #include <unistd.h>
- #endif
- #include <time.h>
- #include <stdarg.h>
- #include "swish.h"
- #include "string.h"
- #include "mem.h"
- #include "http.h"
- #include "httpserver.h"
- /* The list of servers that we are acting on.
- **/
- static httpserverinfo *servers = 0;
- static void parserobotstxt(FILE *fp, httpserverinfo *server);
- static char *isolatevalue(char *line, char *keyword, int *plen);
- static int serverinlist(char *url, struct swline *list);
- /* Find the robot rules for this URL. If haven't retrieved them
- ** yet, do so now.
- **/
- httpserverinfo *getserverinfo(char *url)
- {
- httpserverinfo *server;
- char *method;
- int methodlen;
- char *serverport;
- int serverportlen;
- char contenttype[MAXSTRLEN];
- char buffer[MAXSTRLEN];
- FILE *fp;
-
- if ((method = url_method(url, &methodlen)) == 0) {
- return 0;
- }
- if ((serverport = url_serverport(url, &serverportlen)) == 0) {
- return 0;
- }
-
- /* Search for the rules
- **/
- for (server = servers; server; server = server->next) {
- if (equivalentserver(url, server->baseurl)) {
- return server;
- }
- }
-
- /* Create a new entry for this server and add it to the list.
- **/
- server = (httpserverinfo *)emalloc(sizeof(httpserverinfo));
-
- /* +3 for the ://, +1 for the trailing /, +1 for the terminating null
- **/
- server->baseurl = (char *)emalloc(methodlen + serverportlen + 5);
- sprintf (server->baseurl, "%.*s://%.*s/", methodlen, method, serverportlen, serverport );
-
- server->lastretrieval = 0;
- server->robotrules = 0;
- server->next = servers;
- servers = server;
-
- /* Only http(s) servers can full rules, all the other ones just get dummies
- ** (this is useful for holding last retrieval)
- **
- ** http://info.webcrawler.com/mak/projects/robots/norobots.html holds what
- ** many people consider the official web exclusion rules. Unfortunately,
- ** the rules are not consistent about how records are formed. One line
- ** states "the file consists of one or more records separated by one or more
- ** blank lines" while another states "the record starts with one or more User-agent
- ** lines, followed by one or more Disallow lines."
- **
- ** So, does a blank line after a User-agent line end a record? The spec is
- ** unclear on this matter. If the next legal line afer the blank line is
- ** a Disallow line, the blank line should most likely be ignored. But what
- ** if the next line is another User-agent line? For example:
- **
- ** User-agent: MooBot
- **
- ** User-agent: CreepySpider
- ** Disallow: /cgi-bin
- **
- ** One interpretation (based on blank lines termination records) is that MooBot
- ** may visit any location (since there are no Disallows for it). Another
- ** interpretation (based on records needing both User-agent and Disallow lines)
- ** is that MooBot may not visit /cgi-bin
- **
- ** While poking around, I found at least one site (www.sun.com) that uses blank
- ** lines within records. Because of that, I have decided to rely on records
- ** having both User-agent and Disallow lines (the second interpretation above).
- **/
- if (strncmp(server->baseurl, "http", 4) == 0) {
- sprintf(buffer, "%srobots.txt", server->baseurl);
- if (get(contenttype, &server->lastretrieval, buffer) == 200) {
- sprintf(buffer, "%s/swishspider@%ld.contents", tmpdir, lgetpid());
- fp = fopen(buffer, "r");
- parserobotstxt(fp, server);
- fclose(fp);
- }
-
- cmdf(unlink, "%s/swishspider@%ld.response", tmpdir, lgetpid());
- cmdf(unlink, "%s/swishspider@%ld.contents", tmpdir, lgetpid());
- cmdf(unlink, "%s/swishspider@%ld.links", tmpdir, lgetpid());
- }
-
- return server;
- }
- int urldisallowed(char *url)
- {
- httpserverinfo *server;
- robotrules *rule;
- char *uri;
- int urilen;
-
- if ((server = getserverinfo(url)) == 0) {
- return 1;
- }
- if ((uri = url_uri(url, &urilen)) == 0) {
- return 1;
- }
-
- for (rule = server->robotrules; rule; rule = rule->next) {
- if (strncmp(uri, rule->disallow, strlen(rule->disallow)) == 0) {
- return 1;
- }
- }
-
- return 0;
- }
- static char useragent[] = "user-agent:";
- static char disallow[] = "disallow:";
- static char swishspider[] = "swishspider";
- static void parserobotstxt(FILE *fp, httpserverinfo *server)
- {
- char buffer[MAXSTRLEN];
- enum {START, USERAGENT, DISALLOW} state = START;
- enum {SPECIFIC, GENERIC, SKIPPING} useragentstate = SKIPPING;
- char *p;
- int len;
- robotrules *entry;
- robotrules *entry2;
-
- server->useragent = 0;
-
- while (fgets(buffer, sizeof buffer, fp) != 0) {
- /* Remove end of line indicators
- **/
- while ((*(buffer + strlen(buffer) - 1) == 'r') ||
- (*(buffer + strlen(buffer) - 1) == 'n')) {
- *(buffer + strlen(buffer) - 1) = ' ';
- }
-
- if ((*buffer == '#') || (*buffer == ' '))
- continue;
-
- if (strncasecmp(buffer, useragent, sizeof(useragent) - 1) == 0) {
- switch (state) {
- case DISALLOW:
- /* Since we found our specific user-agent, we can
- ** skip the rest of the file.
- **/
- if (useragentstate == SPECIFIC) {
- return;
- }
-
- useragentstate = SKIPPING;
-
- /* explict fallthrough */
-
- case START:
- case USERAGENT:
- state = USERAGENT;
-
- if (useragentstate != SPECIFIC) {
- p = isolatevalue(buffer, useragent, &len);
-
- if ((len == (sizeof(swishspider) - 1)) &&
- (strncasecmp(p, swishspider, sizeof(swishspider) - 1) == 0) ) {
- useragentstate = SPECIFIC;
-
- /* We might have already parsed generic rules,
- ** so clean them up if necessary.
- */
- if (server->useragent) {
- free(server->useragent);
- }
- for (entry = server->robotrules; entry; ) {
- entry2 = entry->next;
- free(entry);
- entry = entry2;
- }
- server->robotrules = 0;
-
- server->useragent = (char *)emalloc(len + 1);
- strncpy(server->useragent, p, len);
- *(server->useragent + len) = ' ';
-
- }
- else if ((len == 1) && (*p == '*')) {
- useragentstate = GENERIC;
- server->useragent = (char *)emalloc(2);
- strcpy(server->useragent, "*");
- }
-
- }
-
-
- break;
-
- }
- }
-
- if (strncasecmp(buffer, disallow, sizeof(disallow) - 1) == 0) {
- state = DISALLOW;
- if (useragentstate != SKIPPING) {
- p = isolatevalue(buffer, disallow, &len);
- if (len) {
- entry = (robotrules *)emalloc(sizeof(robotrules));
- entry->next = server->robotrules;
- server->robotrules = entry;
- entry->disallow = (char *)emalloc(len + 1);
- strncpy(entry->disallow, p, len);
- *(entry->disallow + len) = ' ';
- }
- }
- }
- }
- }
- static char *isolatevalue(char *line, char *keyword, int *plen)
- {
- /* Find the beginning of the value
- **/
- for (line += strlen(keyword); isspace(*line); line++ ) {
- }
-
- /* Strip off trailing spaces
- **/
- for (*plen = strlen(line); isspace(*(line + *plen - 1)); (*plen)--) {
- }
-
- return line;
- }
- int equivalentserver(char *url, char *baseurl)
- {
- char *method;
- int methodlen;
- char *serverport;
- int serverportlen;
- char *basemethod;
- int basemethodlen;
- char *baseserverport;
- int baseserverportlen;
- multiswline *walk;
-
- method = url_method(url, &methodlen);
- serverport = url_serverport(url, &serverportlen);
- basemethod = url_method(baseurl, &basemethodlen);
- baseserverport = url_serverport(baseurl, &baseserverportlen);
-
- if (!method || !serverport || !basemethod || !baseserverport) {
- return 0;
- }
-
- /* If this is the same server, we just go for it
- **/
- if ((methodlen == basemethodlen) && (serverportlen == baseserverportlen) &&
- (strncasecmp(method, basemethod, methodlen) == 0) &&
- (strncasecmp(serverport, baseserverport, serverportlen) == 0)) {
- return 1;
- }
-
- /* Do we find the method/server info for this and the base url
- ** in the same equivalence list?
- **/
- for (walk = equivalentservers; walk; walk = walk->next ) {
- if (serverinlist(url, walk->list) &&
- serverinlist(baseurl, walk->list)) {
- return 1;
- }
- }
-
- return 0;
- }
- static int serverinlist(char *url, struct swline *list)
- {
- char *method;
- int methodlen;
- char *serverport;
- int serverportlen;
- char *listmethod;
- int listmethodlen;
- char *listserverport;
- int listserverportlen;
-
- method = url_method(url, &methodlen);
- serverport = url_serverport(url, &serverportlen);
- if (!method || !serverport) {
- return 0;
- }
-
- for ( ; list; list = list->next) {
- listmethod = url_method(list->line, &listmethodlen);
- listserverport = url_serverport(list->line, &listserverportlen);
- if (listmethod && listserverport) {
- if ((methodlen == listmethodlen) && (serverportlen == listserverportlen) &&
- (strncasecmp(method, listmethod, methodlen) == 0) &&
- (strncasecmp(serverport, listserverport, serverportlen) == 0)) {
- return 1;
- }
- }
- }
- return 0;
- }