InterpreteHTMLTask.java
上传用户:qing5858
上传日期:2015-10-27
资源大小:6056k
文件大小:4k
源码类别:

搜索引擎

开发平台:

Java

  1. package net.javacoding.jspider.core.task.work;
  2. import net.javacoding.jspider.api.model.*;
  3. import net.javacoding.jspider.api.event.resource.*;
  4. import net.javacoding.jspider.core.SpiderContext;
  5. import net.javacoding.jspider.core.model.EMailAddressInternal;
  6. import net.javacoding.jspider.core.logging.LogFactory;
  7. import net.javacoding.jspider.core.event.CoreEvent;
  8. import net.javacoding.jspider.core.event.impl.*;
  9. import net.javacoding.jspider.core.task.WorkerTask;
  10. import net.javacoding.jspider.core.util.html.URLFinder;
  11. import net.javacoding.jspider.core.util.html.URLFinderCallback;
  12. import net.javacoding.jspider.core.util.EMailAddressUtil;
  13. import java.io.*;
  14. import java.net.URL;
  15. /**
  16.  *
  17.  * $Id: InterpreteHTMLTask.java,v 1.15 2003/04/10 16:19:14 vanrogu Exp $
  18.  *
  19.  * @author G黱ther Van Roey
  20.  */
  21. public class InterpreteHTMLTask extends BaseWorkerTaskImpl implements URLFinderCallback {
  22.     protected FetchedResource spideredResource;
  23.     protected URL url;
  24.     protected URL contextURL;
  25.     public InterpreteHTMLTask(SpiderContext context, FetchedResource resource) {
  26.         super(context, WorkerTask.WORKERTASK_THINKERTASK);
  27.         this.spideredResource = resource;
  28.         url = spideredResource.getURL();
  29.         contextURL = url;
  30.     }
  31.     public void prepare() {
  32.     }
  33.     public void execute() {
  34.         CoreEvent event = null;
  35.         try {
  36.             InputStream inputStream = spideredResource.getInputStream();
  37.             BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
  38.             String line = br.readLine();
  39.             while (line != null) {
  40.                 URLFinder.findURLs(this, line);
  41.                 line = br.readLine();
  42.             }
  43.             event = new ResourceParsedOkEvent(context, url);
  44.         } catch (IOException e) {
  45.             LogFactory.getLog(InterpreteHTMLTask.class).error("i/o exception during parse", e);
  46.             event = new ResourceParsedErrorEvent(context, url, e);
  47.         } catch (Exception e) {
  48.             LogFactory.getLog(InterpreteHTMLTask.class).error("exception during parse", e);
  49.             event = new ResourceParsedErrorEvent(context, url, e);
  50.         } finally {
  51.             notifyEvent(url, event );
  52.         }
  53.     }
  54.     public void urlFound(URL foundURL) {
  55.         if (EMailAddressUtil.isEMailAddress(foundURL)) {
  56.             String emailAddress = EMailAddressUtil.getEMailAddress(foundURL);
  57.             EMailAddress address = context.getStorage().getEMailAddressDAO().find(emailAddress);
  58.             if (address == null) {
  59.                 address = new EMailAddressInternal(emailAddress);
  60.                 context.getEventDispatcher().dispatch(new EMailAddressDiscoveredEvent(this.spideredResource, emailAddress));
  61.             }
  62.             context.getStorage().getEMailAddressDAO().register(spideredResource, address);
  63.             context.getEventDispatcher().dispatch(new EMailAddressReferenceDiscoveredEvent(this.spideredResource, address));
  64.         } else {
  65.             notifyEvent(url, new URLFoundEvent(context, url, foundURL));
  66.         }
  67.     }
  68.     public void malformedUrlFound(String malformedURL) {
  69.         context.getEventDispatcher().dispatch(new MalformedURLFoundEvent(context.getStorage().getResourceDAO().getResource(url), malformedURL));
  70.     }
  71.     public URL getContextURL() {
  72.         return contextURL;
  73.     }
  74.     public void setContextURL(URL url) {
  75.         this.contextURL = url;
  76.     }
  77.     public void malformedContextURLFound(String malformedURL) {
  78.         context.getEventDispatcher().dispatch(new MalformedBaseURLFoundEvent(spideredResource, malformedURL));
  79.     }
  80. }