SpiderHttpURLTask.java
Uploaded by: qing5858
Upload date: 2015-10-27
Archive size: 6056k
File size: 4k
Source category: search engine
Development platform: Java

package net.javacoding.jspider.core.task.work;

import net.javacoding.jspider.api.model.HTTPHeader;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.logging.LogFactory;
import net.javacoding.jspider.core.event.CoreEvent;
import net.javacoding.jspider.core.event.impl.*;
import net.javacoding.jspider.core.task.WorkerTask;
import net.javacoding.jspider.core.util.http.HTTPHeaderUtil;
import net.javacoding.jspider.core.util.URLUtil;

import java.io.*;
import java.net.*;

/**
 * Worker task that fetches a single HTTP resource for the spider.
 *
 * $Id: SpiderHttpURLTask.java,v 1.19 2003/04/10 16:19:14 vanrogu Exp $
 *
 * @author Günther Van Roey
 */
public class SpiderHttpURLTask extends BaseWorkerTaskImpl {

    protected URL url;
    protected Site site;

    public SpiderHttpURLTask(SpiderContext context, URL url, Site site) {
        super(context, WorkerTask.WORKERTASK_SPIDERTASK);
        this.url = url;
        this.site = site;
    }

    public void prepare() {
        // Respect the per-site throttle before the actual fetch starts.
        context.throttle(site);
    }

    public void execute() {
        CoreEvent event = null;
        URLConnection connection = null;
        InputStream inputStream = null;
        int httpStatus = 0;
        HTTPHeader[] headers = null;
        try {
            connection = url.openConnection();
            if (connection instanceof HttpURLConnection) {
                // Redirects are reported as URLFoundEvents instead of being followed transparently.
                ((HttpURLConnection) connection).setInstanceFollowRedirects(false);
            }
            connection.setRequestProperty("User-agent", site.getUserAgent());
            context.preHandle(connection, site);

            long start = System.currentTimeMillis();
            connection.connect();

            if (connection instanceof HttpURLConnection) {
                httpStatus = ((HttpURLConnection) connection).getResponseCode();
                switch (httpStatus) {
                    case HttpURLConnection.HTTP_MOVED_PERM:
                    case HttpURLConnection.HTTP_MOVED_TEMP:
                        // Report the redirect target as a newly found URL.
                        String redirectURL = connection.getHeaderField("location");
                        notifyEvent(url, new URLFoundEvent(context, url, URLUtil.normalize(new URL(redirectURL))));
                        break;
                    default:
                        break;
                }
            }

            // Read the complete response body into memory, counting the bytes as we go.
            inputStream = new BufferedInputStream(connection.getInputStream());
            ByteArrayOutputStream os = new ByteArrayOutputStream();
            InputStream is = new BufferedInputStream(inputStream);
            //int size = connection.getContentLength();
            int size = 0;
            try {
                int i = is.read();
                while (i != -1) {
                    size++;
                    os.write(i);
                    i = is.read();
                }
            } catch (IOException e) {
                LogFactory.getLog(SpiderHttpURLTask.class).error("i/o exception during fetch", e);
            }

            String contentType = connection.getContentType();
            int timeMs = (int) (System.currentTimeMillis() - start);
            headers = HTTPHeaderUtil.getHeaders(connection);

            if (httpStatus >= 200 && httpStatus < 303) {
                event = new URLSpideredOkEvent(context, url, httpStatus, connection, contentType, timeMs, size, os.toByteArray(), headers);
            } else {
                event = new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, null);
            }
            context.postHandle(connection, site);
        } catch (FileNotFoundException e) {
            // HttpURLConnection reports a missing resource (404) by throwing FileNotFoundException.
            headers = HTTPHeaderUtil.getHeaders(connection);
            event = new URLSpideredErrorEvent(context, url, 404, connection, headers, e);
        } catch (Exception e) {
            LogFactory.getLog(this.getClass()).error("exception during spidering", e);
            event = new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, e);
        } finally {
            // Always notify listeners, then release the connection's input stream.
            notifyEvent(url, event);
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    LogFactory.getLog(SpiderHttpURLTask.class).error("i/o exception closing inputstream", e);
                }
            }
        }
    }
}
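
For comparison only: the copy loop in execute() reads the response one byte at a time through a BufferedInputStream. A common alternative is to read in blocks; the helper below is a hypothetical sketch (not part of JSpider) showing that pattern, with the byte count available from the returned array's length.

import java.io.*;

// Hypothetical helper (not part of JSpider): reads an entire stream into a
// byte array using block reads instead of the byte-by-byte loop above.
final class StreamUtil {
    static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        byte[] buffer = new byte[4096];   // copy up to 4 KB per read() call
        int read;
        while ((read = in.read(buffer)) != -1) {
            os.write(buffer, 0, read);    // write only the bytes actually read
        }
        return os.toByteArray();
    }
}

Used in place of the existing loop, this would amount to byte[] content = StreamUtil.readFully(inputStream); int size = content.length; the observable behaviour is the same, only the number of read() calls differs.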