HtmlParserGetLexerUrls.java
上传用户:cctqzzy
上传日期:2022-03-14
资源大小:12198k
文件大小:2k
- package chapter9;
- import org.htmlparser.util.*;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.*;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.nodes.TextNode;
- import org.htmlparser.lexer.*;
- import org.htmlparser.lexer.Stream;
- import org.htmlparser.Node;
- import java.io.*;
- import java.net.*;
- import org.htmlparser.http.ConnectionManager;
- import org.htmlparser.visitors.TextExtractingVisitor;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.filters.HasSiblingFilter;
- import org.htmlparser.util.ParserException;
- public class HtmlParserGetLexerUrls {
- public static void main (String[] args) throws ParserException
- {
- try {
- getLexerUrls("http://www.bnu.edu.cn/","GB2312");
- } catch(ParserException e)
- {
- e.printStackTrace();
- }
- }
-
- public static void getLexerUrls(String url , String pageEncoding) throws ParserException
- {
- Node node = null;
- Lexer lexer = null;
- try {
- ConnectionManager connmgr;
- connmgr = Page.getConnectionManager(); // 生成链接管理器
- lexer = new Lexer(connmgr.openConnection(url)); // 生成分析器
- lexer.getPage().setEncoding(pageEncoding); // 设置网页编码
- node = lexer.nextNode();
- while(node != null ) { // 循环遍历每个节点
- System.out.println(node.toString());
- node = lexer.nextNode();
- }
- } catch (ParserException e) {
- e.printStackTrace();
- }
- }
-
- }