FetchRobotsTXTTaskImpl.java
Uploaded by: qing5858
Upload date: 2015-10-27
Archive size: 6056k
File size: 4k
Category: Search Engine
Platform: Java

package net.javacoding.jspider.core.task.work;

import net.javacoding.jspider.api.model.HTTPHeader;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.logging.LogFactory;
import net.javacoding.jspider.core.event.CoreEvent;
import net.javacoding.jspider.core.event.impl.*;
import net.javacoding.jspider.core.task.WorkerTask;
import net.javacoding.jspider.core.util.http.HTTPHeaderUtil;

import java.io.*;
import java.net.*;

/**
 * Worker task that fetches the robots.txt file of a site and reports
 * the outcome (fetched, missing, or failed) as a core event.
 *
 * $Id: FetchRobotsTXTTaskImpl.java,v 1.19 2003/04/25 21:29:05 vanrogu Exp $
 */
public class FetchRobotsTXTTaskImpl extends BaseWorkerTaskImpl {

    protected URL url;
    protected Site site;

    public FetchRobotsTXTTaskImpl(SpiderContext context, URL url, Site site) {
        super(context, WorkerTask.WORKERTASK_SPIDERTASK);
        this.url = url;
        this.site = site;
    }

    public void prepare() {
        // Honour the per-site throttle before opening a connection.
        context.throttle(site);
    }
    public void execute() {
        CoreEvent event = null;
        URLConnection connection = null;
        InputStream inputStream = null;
        int httpStatus = 0;
        HTTPHeader[] headers = null;
        try {
            connection = url.openConnection();

            // The robots.txt RFC draft states that redirects should be followed.
            // see: http://www.robotstxt.org/wc/norobots-rfc.txt
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).setInstanceFollowRedirects(true);
            }

            connection.setRequestProperty("User-agent", site.getUserAgent());
            context.preHandle(connection, site);

            long start = System.currentTimeMillis();
            connection.connect();

            if (connection instanceof HttpURLConnection) {
                httpStatus = ((HttpURLConnection) connection).getResponseCode();
                switch (httpStatus) {
                    case HttpURLConnection.HTTP_MOVED_PERM:
                    case HttpURLConnection.HTTP_MOVED_TEMP:
                        // A redirect that was not followed automatically;
                        // give up on this fetch (the event stays null).
                        return;
                    default:
                        break;
                }
            }

            // Buffer the whole response body in memory.
            inputStream = new BufferedInputStream(connection.getInputStream());
            ByteArrayOutputStream os = new ByteArrayOutputStream();
            try {
                int i = inputStream.read();
                while (i != -1) {
                    os.write(i);
                    i = inputStream.read();
                }
            } catch (IOException e) {
                LogFactory.getLog(FetchRobotsTXTTaskImpl.class).error("i/o exception during fetch robots.txt", e);
            }

            String contentType = connection.getContentType();
            int size = connection.getContentLength();
            int timeMs = (int) (System.currentTimeMillis() - start);
            headers = HTTPHeaderUtil.getHeaders(connection);

            // Classify the result: 2xx/30x = fetched, 4xx = no robots.txt,
            // anything else = error.
            if (httpStatus >= 200 && httpStatus < 303) {
                event = new RobotsTXTSpideredOkEvent(url, context, url, httpStatus, connection, contentType, timeMs, size, os.toByteArray(), headers);
            } else if (httpStatus >= 400 && httpStatus < 500) {
                event = new RobotsTXTUnexistingEvent(url, context, url, httpStatus, connection, headers, null);
            } else {
                event = new RobotsTXTSpideredErrorEvent(url, context, url, httpStatus, connection, headers, null);
            }
        } catch (FileNotFoundException e) {
            // Some protocol handlers signal a 404 via FileNotFoundException.
            headers = HTTPHeaderUtil.getHeaders(connection);
            event = new RobotsTXTUnexistingEvent(url, context, url, 404, connection, headers, e);
        } catch (Exception e) {
            event = new RobotsTXTSpideredErrorEvent(url, context, url, httpStatus, connection, headers, e);
        } finally {
            notifyEvent(url, event);
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    LogFactory.getLog(FetchRobotsTXTTaskImpl.class).error("i/o exception closing inputstream", e);
                }
            }
        }
    }
}
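
The task above boils down to a small java.net idiom: open an HttpURLConnection for /robots.txt, follow redirects as the robots.txt RFC draft recommends, buffer the body, and classify the status code (2xx/30x fetched, 4xx means the site has no robots.txt, anything else is an error). The standalone sketch below shows that same pattern outside of JSpider; the class name RobotsFetchSketch, the example URL, and the user-agent string are illustrative assumptions, not part of JSpider.

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

// Standalone sketch of the fetch pattern used by FetchRobotsTXTTaskImpl.
// Class name, target URL, and user-agent are illustrative only.
public class RobotsFetchSketch {
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://www.example.com/robots.txt");
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        // Follow redirects, as the robots.txt RFC draft recommends.
        connection.setInstanceFollowRedirects(true);
        connection.setRequestProperty("User-agent", "MySpider/1.0");
        try {
            int status = connection.getResponseCode();
            if (status >= 200 && status < 303) {
                // Buffer the body in memory, as the task does.
                ByteArrayOutputStream body = new ByteArrayOutputStream();
                try (InputStream in = connection.getInputStream()) {
                    byte[] buffer = new byte[4096];
                    int n;
                    while ((n = in.read(buffer)) != -1) {
                        body.write(buffer, 0, n);
                    }
                }
                System.out.println("robots.txt fetched, " + body.size() + " bytes");
            } else if (status >= 400 && status < 500) {
                // Corresponds to RobotsTXTUnexistingEvent: no robots.txt on this site.
                System.out.println("no robots.txt (HTTP " + status + ")");
            } else {
                // Corresponds to RobotsTXTSpideredErrorEvent.
                System.out.println("fetch failed (HTTP " + status + ")");
            }
        } finally {
            connection.disconnect();
        }
    }
}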