- package chapter2;
- import java.io.*;
- import java.net.*;
- import java.io.File;
- import java.io.FileReader;
- import java.io.FileWriter;
/**
 * Strips HTML markup from a saved page and writes the remaining text to a
 * plain-text file, inserting spaces so that every non-alphanumeric character
 * (e.g. a CJK ideograph) stands alone while runs of ASCII letters and digits
 * stay together.
 *
 * <p>Files are read and written with {@link FileReader}/{@link FileWriter},
 * i.e. with the platform default charset.
 */
public class WebParserFilter {
    /** Default input file (a previously saved HTML page). */
    private static final String SRC_FILE_PATH = "D:\\workshop\\ch2\\htmlsrc.html";
    /** Default output file (the filtered, space-separated text). */
    private static final String DST_FILE_PATH = "D:\\workshop\\ch2\\puresrc.txt";

    /** Chunk size, in chars, used when reading the input file. */
    private static final int BUFFER_SIZE = 8096 * 2;
    /** Character entity that is dropped from the content altogether. */
    private static final String NBSP_ENTITY = "&nbsp;";
    /** Punctuation marks that are replaced by a single space. */
    private static final String PUNCTUATION = "、|”:;.";

    /**
     * Runs the filter on the default input/output files and exits with
     * status 1 if any I/O error occurs.
     *
     * @param args ignored
     * @throws IOException declared for signature compatibility; I/O errors
     *     are handled inside this method
     */
    public static void main(String[] args) throws IOException {
        try {
            ParserFilter();
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确。");
            // Also report the underlying cause so the failure can be diagnosed.
            System.err.println(e);
            System.exit(1);
        }
    }

    /**
     * Filters the default source file into the default destination file.
     *
     * @throws IOException if the source cannot be read or the destination
     *     cannot be written
     */
    public static void ParserFilter() throws IOException {
        ParserFilter(SRC_FILE_PATH, DST_FILE_PATH);
    }

    /**
     * Reads the whole of {@code srcPath}, removes markup, separates the
     * remaining characters with spaces, prints the result to standard output
     * and writes it to {@code dstPath}.
     *
     * @param srcPath path of the HTML file to read
     * @param dstPath path of the text file to create or overwrite
     * @throws IOException if the source cannot be read or the destination
     *     cannot be written
     */
    public static void ParserFilter(String srcPath, String dstPath) throws IOException {
        // The source is read completely (and closed) before the destination
        // is opened, so a missing source never truncates the destination.
        String html = readAll(srcPath);
        String result = separateTokens(stripMarkup(html));

        System.out.println(result);
        try (FileWriter writer = new FileWriter(new File(dstPath))) {
            writer.write(result);
        }
    }

    /** Reads the entire file at {@code path} into a string, chunk by chunk. */
    private static String readAll(String path) throws IOException {
        StringBuilder content = new StringBuilder(BUFFER_SIZE);
        char[] chunk = new char[BUFFER_SIZE];
        try (FileReader reader = new FileReader(new File(path))) {
            int read;
            // A single read() may return fewer chars than the file holds, so
            // keep reading until end of stream.
            while ((read = reader.read(chunk)) != -1) {
                content.append(chunk, 0, read);
            }
        }
        return content.toString();
    }

    /**
     * Returns {@code html} without anything between {@code <} and {@code >},
     * without whitespace and without {@code &nbsp;} entities.
     */
    private static String stripMarkup(String html) {
        StringBuilder text = new StringBuilder(html.length());
        boolean insideTag = false;
        int i = 0;
        while (i < html.length()) {
            char c = html.charAt(i);
            if (insideTag) {
                // Everything up to and including the closing '>' is dropped.
                if (c == '>') {
                    insideTag = false;
                }
                i++;
            } else if (c == '<') {
                insideTag = true;
                i++;
            } else if (isSkippedWhitespace(c)) {
                i++;
            } else if (html.startsWith(NBSP_ENTITY, i)) {
                // startsWith is bounds-safe, unlike indexing i+1..i+5 directly.
                i += NBSP_ENTITY.length();
            } else {
                text.append(c);
                i++;
            }
        }
        return text.toString();
    }

    /**
     * Tells whether {@code c} is whitespace to be removed from the content.
     *
     * <p>NOTE(review): the previous code compared against the letter
     * {@code 'n'} and a repeated {@code ' '}; presumably the escapes
     * {@code \n}, {@code \r} and {@code \t} were lost in transcription.
     * Confirm the intended set.
     */
    private static boolean isSkippedWhitespace(char c) {
        return c == '\n' || c == '\r' || c == '\t' || c == ' ';
    }

    /**
     * Inserts spaces into {@code text}: runs of ASCII letters and digits are
     * kept together, listed punctuation marks become a single space, and
     * every other character is emitted on its own followed by a space.
     */
    private static String separateTokens(String text) {
        StringBuilder out = new StringBuilder(text.length() * 2);
        // True while the previously emitted token was alphanumeric (or at start).
        boolean lastWasAlphanumeric = true;
        for (int m = 0; m < text.length(); m++) {
            char c = text.charAt(m);
            if (isAsciiAlphanumeric(c)) {
                // English letters and digits are not split apart.
                if (!lastWasAlphanumeric) {
                    out.append(' ');
                }
                out.append(c);
                lastWasAlphanumeric = true;
            } else if (PUNCTUATION.indexOf(c) >= 0) {
                // Filter out punctuation marks; the run state is left untouched.
                out.append(' ');
            } else {
                if (lastWasAlphanumeric) {
                    out.append(' ');
                }
                // Chinese (and any other) characters are separated by spaces.
                out.append(c);
                out.append(' ');
                lastWasAlphanumeric = false;
            }
        }
        return out.toString();
    }

    /** Tells whether {@code c} is one of A-Z, a-z or 0-9. */
    private static boolean isAsciiAlphanumeric(char c) {
        return (c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z')
                || (c >= '0' && c <= '9');
    }
}