LinkExtractor.java
资源名称:08.zip [点击查看]
上传用户:ynjin1970
上传日期:2014-10-13
资源大小:6438k
文件大小:4k
源码类别:
中间件编程
开发平台:
Visual C++
- //package org.apache.lucene.index;
- import java.io.*;
- import org.htmlparser.Node;
- import org.htmlparser.Parser;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.util.ParserException;
- import java.util.*;
- import java.net.*;
- /**
- * LinkExtractor extracts all the links from the given webpage
- * and prints them on standard output.
- */
- public class LinkExtractor {
- private String location;
- private Parser parser;
- private static int b=0;
- private static int tID;
- private static int iNode;
- public final int DEEP=3; //遍历的深度
- public static Vector svecLink, svecOutlink;
- public static String hostName;
- public static boolean bl;
- public LinkExtractor(String location) {
- this.location = location;
- hostName=GetHostName(location);
- System.out.println("主机名称是 "+hostName);
- bl=false;
- svecLink=new Vector();
- svecOutlink=new Vector();
- svecLink.add(location);
- }
- public void extractLinks(String loc) throws ParserException {
- System.out.println("Parsing "+loc+" for links...");
- Vector vecTemp=new Vector();
- try {
- this.parser = new Parser(loc); // Create the parser object
- parser.registerScanners(); // Register standard scanners (Very Important)
- bl=true;
- }
- catch (ParserException e) {
- bl=false;
- e.printStackTrace();
- }
- String ss,str1;
- URL wwwurl;
- boolean byes;
- int a=0;
- b++;
- Node [] links = parser.extractAllNodesThatAre(LinkTag.class);
- for (int i = 0;i < links.length;i++) {
- if(bl)
- {
- byes=true;
- System.out.println("Total url is "+links.length+"This page has url "+i);
- LinkTag linkTag = (LinkTag)links[i];
- str1=linkTag.getLink();
- // System.out.println("the url is "+str1);&&!svecOutlink.contains(str1)
- if(str1.equals("")) continue;
- if(str1.charAt(str1.length()-1)=='/'
- ||str1.charAt(str1.length()-1)=='\')
- str1=str1.substring(0,str1.length()-1);
- if(!svecLink.contains(str1))
- {
- try
- {
- wwwurl=new URL(str1);
- wwwurl.getContent();
- }
- catch(MalformedURLException e)
- {
- byes=false;
- }
- catch(IOException e)
- {
- byes=false;
- }
- if(GetHostName(str1).equals(hostName) && byes)
- {
- a++;
- tID++;
- svecLink.add(str1);
- vecTemp.add(str1);
- System.out.println("the url is "+str1);
- }
- else
- {
- svecOutlink.add(str1);
- }
- }
- }
- }
- String strNew;
- if(a>0&&b<=DEEP)
- {
- for(int i=0;i<vecTemp.size();i++)
- {
- strNew=(String)vecTemp.get(i);
- System.out.println("this is "+strNew);
- extractLinks(strNew);
- }
- }
- }
- boolean linkAttribute(String strLink)
- {
- return true;
- }
- static void printCol(Enumeration col)
- {
- String str;
- while(col.hasMoreElements())
- {
- str=(String)col.nextElement();
- System.out.println(str);
- }
- }
- public String GetHostName(String hostname)
- {
- URL aurl;
- String ss=" ";
- try
- {
- aurl=new URL(hostname);
- ss=aurl.getHost();
- }
- catch(MalformedURLException e)
- {
- e.printStackTrace();
- //return "null";
- }
- return ss;
- }
- public static void main(String[] args) {
- /*
- if (args.length<0) {
- System.err.println("Syntax Error : Please provide the location(URL or file) to parse");
- System.exit(-1);
- }*/
- Vector allLink=new Vector();
- String strNew,strall1,strall2,str;
- String ss="http://www.dlut.edu.cn/";
- LinkExtractor linkExtractor = new LinkExtractor(ss);
- try {
- linkExtractor.extractLinks(ss);
- Enumeration col=svecLink.elements();
- while(col.hasMoreElements())
- {
- str=(String)col.nextElement();
- System.out.println(str);
- }
- }
- catch (ParserException e) {
- e.printStackTrace();
- }
- }
- }