AgentImpl.java
上传用户:qing5858
上传日期:2015-10-27
资源大小:6056k
文件大小:10k
源码类别:

搜索引擎

开发平台:

Java

  1. package net.javacoding.jspider.core.impl;
  2. import net.javacoding.jspider.api.event.resource.*;
  3. import net.javacoding.jspider.api.event.site.*;
  4. import net.javacoding.jspider.api.model.*;
  5. import net.javacoding.jspider.core.Agent;
  6. import net.javacoding.jspider.core.SpiderContext;
  7. import net.javacoding.jspider.core.dispatch.EventDispatcher;
  8. import net.javacoding.jspider.core.event.CoreEvent;
  9. import net.javacoding.jspider.core.event.CoreEventVisitor;
  10. import net.javacoding.jspider.core.event.impl.*;
  11. import net.javacoding.jspider.core.exception.SpideringDoneException;
  12. import net.javacoding.jspider.core.exception.TaskAssignmentException;
  13. import net.javacoding.jspider.core.logging.Log;
  14. import net.javacoding.jspider.core.logging.LogFactory;
  15. import net.javacoding.jspider.core.model.SiteInternal;
  16. import net.javacoding.jspider.core.storage.Storage;
  17. import net.javacoding.jspider.core.task.*;
  18. import net.javacoding.jspider.core.task.work.*;
  19. import net.javacoding.jspider.core.util.URLUtil;
  20. import java.io.ByteArrayInputStream;
  21. import java.net.URL;
  22. /**
  23.  *
  24.  * $Id: AgentImpl.java,v 1.32 2003/04/29 17:53:47 vanrogu Exp $
  25.  *
  26.  * @author G黱ther Van Roey
  27.  */
  28. public class AgentImpl implements Agent, CoreEventVisitor {
  29.     protected Storage storage;
  30.     protected SpiderContext context;
  31.     protected EventDispatcher eventDispatcher;
  32.     protected Scheduler scheduler;
  33.     protected Log log;
  34.     public AgentImpl(SpiderContext context) {
  35.         this.context = context;
  36.         this.storage = context.getStorage();
  37.         this.eventDispatcher = context.getEventDispatcher();
  38.         this.scheduler = new SchedulerFactory().createScheduler(context);
  39.         log = LogFactory.getLog(Agent.class);
  40.     }
  41.     public synchronized void start() {
  42.         URL baseURL = context.getBaseURL();
  43.         visit(null, new URLFoundEvent(context, null, baseURL));
  44.         notifyAll();
  45.     }
  46.     public synchronized void flagDone(WorkerTask task) {
  47.         scheduler.flagDone(task);
  48.         notifyAll();
  49.     }
  50.     public synchronized WorkerTask getThinkerTask() throws TaskAssignmentException {
  51.         while (true) {
  52.             try {
  53.                 return scheduler.getThinkerTask();
  54.             } catch (SpideringDoneException e) {
  55.                 throw e;
  56.             } catch (TaskAssignmentException e) {
  57.                 try {
  58.                     wait();
  59.                 } catch (InterruptedException e1) {
  60.                     Thread.currentThread().interrupt();
  61.                 }
  62.             }
  63.         }
  64.     }
  65.     public synchronized WorkerTask getSpiderTask() throws TaskAssignmentException {
  66.         while (true) {
  67.             try {
  68.                 return scheduler.getFethTask();
  69.             } catch (SpideringDoneException e) {
  70.                 throw e;
  71.             } catch (TaskAssignmentException e) {
  72.                 try {
  73.                     wait();
  74.                 } catch (InterruptedException e1) {
  75.                     Thread.currentThread().interrupt();
  76.                 }
  77.             }
  78.         }
  79.     }
  80.     /**
  81.      * @param foundURL
  82.      */
  83.     public synchronized void scheduleForSpidering(URL foundURL) {
  84.         URL siteURL = URLUtil.getSiteURL(foundURL);
  85.         Site site = storage.getSiteDAO().find(siteURL);
  86.         scheduler.schedule(new SpiderHttpURLTask(context, foundURL, site));
  87.         notifyAll();
  88.     }
  89.     public synchronized void scheduleForParsing(URL url) {
  90.         scheduler.schedule(new InterpreteHTMLTask(context, (FetchedResource) storage.getResourceDAO().getResource(url)));
  91.         notifyAll();
  92.     }
  93.     public synchronized void registerEvent(URL url, CoreEvent event) {
  94.         event.accept(url, this);
  95.         notifyAll();
  96.     }
  97.     public void visit(URL url, CoreEvent event) {
  98.         log.error("ERROR -- UNHANDLED COREEVENT IN AGENT !!!");
  99.     }
  100.     public void visit(URL url, URLSpideredOkEvent event) {
  101.         storage.getResourceDAO().setSpidered(url, event);
  102.         eventDispatcher.dispatch(new ResourceFetchedEvent(storage.getResourceDAO().getResource(url)));
  103.         scheduler.schedule(new DecideOnParsingTask(context, url));
  104.     }
  105.     public void visit(URL url, URLSpideredErrorEvent event) {
  106.         storage.getResourceDAO().setError(url, event);
  107.         eventDispatcher.dispatch(new ResourceFetchErrorEvent(storage.getResourceDAO().getResource(url), event.getHttpStatus()));
  108.     }
  109.     public void visit(URL url, ResourceParsedOkEvent event) {
  110.         storage.getResourceDAO().setParsed(url, event);
  111.         eventDispatcher.dispatch(new ResourceParsedEvent(storage.getResourceDAO().getResource(url)));
  112.     }
  113.     public void visit(URL url, ResourceParsedErrorEvent event) {
  114.         storage.getResourceDAO().setError(url, event);
  115.     }
  116.     public void visit(URL url, URLFoundEvent event) {
  117.         URL foundURL = event.getFoundURL();
  118.         URL siteURL = URLUtil.getSiteURL(foundURL);
  119.         Site site = storage.getSiteDAO().find(siteURL);
  120.         boolean newResource = (storage.getResourceDAO().getResource(foundURL) == null);
  121.         if (site == null) {
  122.             site = storage.getSiteDAO().createSite(siteURL);
  123.             context.registerNewSite(site);
  124.             storage.getSiteDAO().save(site);
  125.             eventDispatcher.dispatch(new SiteDiscoveredEvent(site));
  126.             if (site.getFetchRobotsTXT()) {
  127.                 if (site.mustHandle()) {
  128.                     URL robotsTXTUrl = URLUtil.getRobotsTXTURL(siteURL);
  129.                     scheduler.schedule(new FetchRobotsTXTTaskImpl(context, robotsTXTUrl, site));
  130.                     if (newResource) {
  131.                         scheduler.block(siteURL, new DecideOnSpideringTask(context, new URLFoundEvent(context, url, foundURL)));
  132.                     }
  133.                 }
  134.             } else {
  135.                 if (site.mustHandle()) {
  136.                     ((SiteInternal) site).registerRobotsTXTSkipped();
  137.                     context.registerRobotsTXTSkipped(site);
  138.                     eventDispatcher.dispatch(new RobotsTXTSkippedEvent(site));
  139.                     if (newResource) {
  140.                         scheduler.schedule(new DecideOnSpideringTask(context, event));
  141.                     }
  142.                 }
  143.                 notifyAll();
  144.             }
  145.         } else if (site.isRobotsTXTHandled()) {
  146.             if (newResource) {
  147.                 scheduler.schedule(new DecideOnSpideringTask(context, event));
  148.             }
  149.             notifyAll();
  150.         } else {
  151.             if (site.mustHandle()) {
  152.                 if (newResource) {
  153.                     scheduler.block(siteURL, new DecideOnSpideringTask(context, new URLFoundEvent(context, url, foundURL)));
  154.                 }
  155.             }
  156.         }
  157.         if (newResource) {
  158.             storage.getResourceDAO().registerURL(foundURL);
  159.             if ( !site.mustHandle()) {
  160.                 storage.getResourceDAO().setIgnoredForFetching(foundURL, event);
  161.             }
  162.             eventDispatcher.dispatch(new ResourceDiscoveredEvent(storage.getResourceDAO().getResource(foundURL)));
  163.         }
  164.         storage.getResourceDAO().registerURLReference(foundURL, url);
  165.         if (url != null) {
  166.             eventDispatcher.dispatch(new ResourceReferenceDiscoveredEvent(storage.getResourceDAO().getResource(url), storage.getResourceDAO().getResource(foundURL)));
  167.         }
  168.     }
  169.     public void visit(URL url, RobotsTXTSpideredOkEvent event) {
  170.         URL robotsTxtURL = event.getRobotsTXTURL();
  171.         URL siteURL = URLUtil.getSiteURL(robotsTxtURL);
  172.         SiteInternal site = (SiteInternal) storage.getSiteDAO().find(siteURL);
  173.         DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
  174.         for (int i = 0; i < tasks.length; i++) {
  175.             scheduler.schedule(tasks[i]);
  176.         }
  177.         storage.getResourceDAO().registerURL(robotsTxtURL);
  178.         storage.getResourceDAO().setSpidered(robotsTxtURL, event);
  179.         storage.getResourceDAO().setIgnoredForParsing(robotsTxtURL);
  180.         Resource resource = storage.getResourceDAO().getResource(robotsTxtURL);
  181.         byte[] bytes = event.getBytes();
  182.         site.registerRobotsTXT();
  183.         eventDispatcher.dispatch(new ResourceDiscoveredEvent(resource));
  184.         eventDispatcher.dispatch(new ResourceFetchedEvent(resource));
  185.         eventDispatcher.dispatch(new RobotsTXTFetchedEvent(site, new String(bytes)));
  186.         context.registerRobotsTXT(site, new ByteArrayInputStream(bytes));
  187.         storage.getSiteDAO().save(site);
  188.     }
  189.     public void visit(URL url, RobotsTXTSpideredErrorEvent event) {
  190.         URL robotsTxtURL = event.getRobotsTXTURL();
  191.         URL siteURL = URLUtil.getSiteURL(robotsTxtURL);
  192.         Site site = storage.getSiteDAO().find(siteURL);
  193.         ((SiteInternal) site).registerRobotsTXTError();
  194.         DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
  195.         for (int i = 0; i < tasks.length; i++) {
  196.             scheduler.schedule(tasks[i]);
  197.         }
  198.         storage.getResourceDAO().registerURL(robotsTxtURL);
  199.         storage.getResourceDAO().setError(robotsTxtURL, event);
  200.         eventDispatcher.dispatch(new RobotsTXTFetchErrorEvent(site, event.getException()));
  201.         context.registerRobotsTXTError(site);
  202.         storage.getSiteDAO().save(site);
  203.     }
  204.     public void visit(URL url, RobotsTXTUnexistingEvent event) {
  205.         URL robotsTxtURL = event.getRobotsTXTURL();
  206.         URL siteURL = URLUtil.getSiteURL(robotsTxtURL);
  207.         Site site = storage.getSiteDAO().find(siteURL);
  208.         ((SiteInternal) site).registerNoRobotsTXTFound();
  209.         DecideOnSpideringTask[] tasks = scheduler.unblock(siteURL);
  210.         for (int i = 0; i < tasks.length; i++) {
  211.             scheduler.schedule(tasks[i]);
  212.         }
  213.         storage.getSiteDAO().save(site);
  214.         eventDispatcher.dispatch(new RobotsTXTMissingEvent(site));
  215.     }
  216. }