URLFinder.java
上传用户:qing5858
上传日期:2015-10-27
资源大小:6056k
文件大小:3k
- package net.javacoding.jspider.core.util.html;
- import net.javacoding.jspider.core.util.URLUtil;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.StringTokenizer;
- /**
- * $Id: URLFinder.java,v 1.9 2003/04/10 16:19:17 vanrogu Exp $
- */
- public class URLFinder {
- public static final String basePattern = "<base href=";
- public static final String[] patterns = {
- "href=",
- "src=",
- "background="
- };
- public static void findURLs(URLFinderCallback callback, String line) {
- findBase(callback, line, basePattern);
- for (int i = 0; i < patterns.length; i++) {
- String pattern = patterns[i];
- findURLs(callback, line, pattern);
- }
- }
- protected static void findBase(URLFinderCallback callback, String line, String pattern) {
- String lineLowerCase = line.toLowerCase();
- int pos = lineLowerCase.indexOf(pattern);
- if ( pos != -1 ) {
- String url = "";
- try {
- url = extractURL(line, pos + pattern.length());
- URL baseURL = URLUtil.normalize(new URL(url));
- callback.setContextURL(baseURL);
- } catch (MalformedURLException e) {
- callback.malformedContextURLFound(url);
- }
- }
- }
- protected static void findURLs(URLFinderCallback callback, String line, String pattern) {
- String lineLowerCase = line.toLowerCase();
- int pos = lineLowerCase.indexOf(pattern);
- while (pos != -1) {
- String uri = "";
- try {
- uri = extractURL(line, pos + pattern.length());
- URL baseURL = callback.getContextURL();
- if ( ! URLUtil.isFileSpecified(baseURL)) {
- // Force a slash in case of a folder (to avoid buggy relative refs)
- baseURL = new URL(baseURL.toString() + "/");
- }
- URL foundURL = URLUtil.normalize(new URL(baseURL, uri));
- callback.urlFound(foundURL);
- } catch (MalformedURLException e) {
- callback.malformedUrlFound(uri);
- }
- pos = lineLowerCase.indexOf(pattern, pos + pattern.length());
- }
- }
- protected static String extractURL(String string, int pos) {
- char c = string.charAt(pos);
- String ret = "";
- if (c == ''' || c == '"') {
- string = string.substring(pos + 1);
- } else {
- string = string.substring(pos);
- }
- if (string.length() > 0) {
- c = string.charAt(0);
- if (c == ''' || c == '"' || c == '>') {
- ret = "";
- } else {
- StringTokenizer st = new StringTokenizer(string, " "'>");
- ret = st.nextToken();
- }
- }
- int p = ret.indexOf('#');
- if (p > -1) {
- return ret.substring(0, p);
- } else {
- return ret;
- }
- }
- }