SpiderBaixaURL.java
上传用户:yadaxi
上传日期:2013-07-11
资源大小:20k
文件大小:12k
- import java.io.*;
- import java.net.*;
- //import java.util.*;
- import java.util.regex.*;
- import java.sql.*;
- public class SpiderBaixaURL extends Thread
- {
- URL url_a_baixar;
- Statement stmt = null;
- ResultSet rsVistos = null;
- ResultSet rsPerVeure = null;
- ResultSet rs = null;
- concurrencia conc;
- boolean control;
- Connection conn;
- int n_links=0; //number of links the current url has to all pages (own, and other)
- int n_links_other=0; //number of links the page has to other pages
- int id_scanning;
- public SpiderBaixaURL(concurrencia conc,Connection conn,Statement stmt)
- {
- this.conc=conc;
-
- try{
- this.conn=conn;
- this.stmt=stmt;
-
- primerABaixar();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
- public void run()
- {
- try
- {
- InputStream b;
- String r = url_a_baixar.toString();
- String nomfitx, nomfitx2;
- URL url_a_baixar1 = new URL(r);
- URLConnection url_conn;
- url_conn = url_a_baixar1.openConnection();
- //nomfitx = url_a_baixar.getFile();
- //we get the id of the page we are scanning
- id_scanning();
-
- sense_zip(url_a_baixar);
-
- freeResultSets();
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- }
-
-
- public void sense_zip(URL b)
- {
- try
- {
- String a=new String();
- String ascii=new String();
- String c=new String();
- Character f= new Character('1');
- int i;
-
- System.out.println("Downloading Page..."+url_a_baixar.toString());
-
- BufferedReader dis = new BufferedReader(new InputStreamReader( b.openStream()));
-
- i=dis.read();
- f= new Character((char)i);
-
- c=f.toString();
- a+=c;
- while(i!=-1)
- {
- i=dis.read();
- f= new Character((char)i);
- c=f.toString();
-
- if(i!=-1)
- a+=c;
- }
-
- /* a=eliminaExpresion("<font[^>]*>",a);
- a=eliminaExpresion("</font>",a);
- a=eliminaExpresion("<FONT[^>]*>",a);
- a=eliminaExpresion("</FONT>",a);
- a=eliminaExpresion("<SCRIPT[^>]*>.*</SCRIPT>",a);
- a=eliminaExpresion("<script[^>]*>.*</script>",a);
- */
- HTML2ASCII d = new HTML2ASCII(b.openStream());
-
- do
- {
- i=d.read();
- f= new Character((char)i);
- c=f.toString();
- if(i!=-1)
- ascii+=c;
- }while(i!=-1);
- ascii=ascii.toLowerCase();
-
- //control de la concurrencia
- control=false;
- while(!control)
- {
- System.out.print(".");
- if(!conc.isLocked())
- {
- conc.lock();
- afegirHTML(a,url_a_baixar.toString());
- coincidencies coin = new coincidencies(url_a_baixar,ascii,a,conn,stmt);
- control=true;
- conc.unLock();
- }
- else
- {
- System.out.print(".");
- control=false;
- this.sleep(5000);
- }
- }
-
- System.out.println("Pagina en mem騬ia.");
- Pattern p;
- Matcher m;
- boolean resultado;
- //MAIL p = Pattern.compile("([a-z0-9_]|\-|\.)+@(([a-z0-9_]|\-)+\.)+[a-z]{2,4}");
- p = Pattern.compile("(www\.*)+(([a-z0-9_]|\-)+\.)+[a-z]{2,4}");
- m = p.matcher(a);
-
- resultado=m.find();
- i=0;
- //marquem com a visitada la url q estem llegint
-
- while(resultado)
- {
- i++;
- String nova_adreca= new String(a.subSequence(m.start(),m.end()).toString());
-
- //comprovem que comen鏸 amb http://
- if(!nova_adreca.startsWith("http://"))
- {
- nova_adreca="http://"+nova_adreca;
- }
-
-
-
- //afegim cadascuna de les adreces trobades
- afegirNovaURL(nova_adreca);
-
-
-
- // System.out.println(i+". "+nova_adreca);
- resultado=m.find();
-
- //el poses a vistos i l'elimines
- }
-
- linkCounter();
-
- }
- catch(IOException e)
- {
- posarAVistos(url_a_baixar.toString());
- System.out.println("La URL "+url_a_baixar.toString()+" no existeix.");
- }
- catch(Exception e)
- {
- e.printStackTrace();
- }
- finally
- {
- // it is a good idea to release
- // resources in a finally{} block
- // in reverse-order of their creation
- // if they are no-longer needed
- if (rs != null)
- {
- try
- {
- rs.close();
- }
- catch (SQLException sqlEx)
- { // ignore
- }
- rs = null;
- }
- }
- }
-
-
- //This method will delete all tags that contains format and javascript. Like <font...> and <script>...
- public String eliminaExpresion(String pattern,String text)
- {
- Pattern p;
- Matcher m;
- boolean resultado=true;
- String paraula="";
-
- try
- {
- text=text.replaceAll(pattern,"");//we delete the tag
- }
- catch(Exception e)
- {
- System.out.println("ERROR: Replacing Pattern!");
- e.printStackTrace();
- }
-
- return text;
- }
-
-
- public void posarAVistos(String novaAdreca)
- {
- try
- {
- if(stmt.execute("SELECT * FROM vistes WHERE url='"+novaAdreca+"'"))
- {
- try
- {
- int id_pagina;
- //si existeix ens quedem amb el seu id
- rsPerVeure = stmt.getResultSet();
- rsPerVeure.first();
- id_pagina=rsPerVeure.getInt("id");
- }
- catch(SQLException ex)
- {
- stmt.execute("insert into vistes (url) values ('"+novaAdreca+"')");
- }
- }
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- }
-
- // l'elimines de la taula de per veure
- eliminaPerVeure();
- }
-
- public void eliminaPerVeure()
- {
-
- try{
- // eliminem la url que acabem de fer
- stmt.execute("DELETE FROM perVeure where url ='"+url_a_baixar.toString()+"' ");
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- ex.printStackTrace();
- }
- //fi
-
-
-
- }
-
- public void id_scanning()
- {
- try
- {
- rs = stmt.executeQuery("SELECT id FROM vistes WHERE url='"+url_a_baixar.toString()+"'");
-
- rs.first();
- id_scanning=rs.getInt("id");//we have the id
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- ex.printStackTrace();
- }
- }
-
- //this method will count the number of links on the web we are scanning
- public void linkCounter()
- {
- try
- {
- //Afegim la nova URL
- int propis=n_links-n_links_other;
-
- ResultSet rs = stmt.executeQuery("SELECT * FROM vistes WHERE id ='"+id_scanning+"' ");
- //rs.first();
-
- rs.absolute(1); // moves the cursor to the first row of rs
- rs.updateInt("n_links_propis",propis);
- rs.updateRow();
-
-
- //stmt.execute("insert into vistes (n_links_propis) values ('"+propis+"')");
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- ex.printStackTrace();
-
- }
- }
- public void linkA(String newLink)
- {
-
- try
- {
- int id_scanned,n_times=1;
-
- //have we allready visited the page?
- rsPerVeure = stmt.executeQuery("SELECT id FROM vistes WHERE url='"+newLink+"'");
-
- if(countItems(rsPerVeure)>0)
- {
- rsPerVeure.first();
- id_scanned=rsPerVeure.getInt("id");
-
- if(id_scanned!=id_scanning)
- {
- //how many times have we linked to this page
- rsPerVeure = stmt.executeQuery("SELECT COUNT(*) FROM links WHERE pagina_els_rep='"+id_scanned+"' AND pagina_envia='"+id_scanning+"'");
-
- if(countItems(rsPerVeure)>0)
- {
- rsPerVeure.first();
- n_times=rsPerVeure.getInt(1);
-
- n_times++;
- }
- else
- {
- n_times=1;
- }
-
- stmt.execute("insert into links (pagina_els_rep,pagina_envia,num) values ('"+id_scanned+"','"+id_scanning+"','"+n_times+"')");
- }
- }
- else // we havn't visited yet the web page that we link at, so we can't take the id
- { // TODO in a future!
-
-
- System.out.println("Link a: "+newLink+" no ha estat possible");
-
-
- }
-
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- ex.printStackTrace();
- }
- }
-
- public void afegirNovaURL(String novaAdreca)
- {
-
- linkA(novaAdreca);
- n_links++;
-
- if((!hiEs(novaAdreca,new String("perVeure")))&&(!hiEs(novaAdreca,new String("vistes"))))
- {
- n_links_other++;
- System.out.println("AFEGIDA "+novaAdreca);
-
- try{
- //Afegim la nova URL
- stmt.execute("insert into perVeure (url) values ('"+novaAdreca+"')");
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- }
- //fi
- }
- }
-
- public void primerABaixar()
- {
- try{
- // baixem les dades del servidor MYSQL
- if (stmt.execute("SELECT * FROM perVeure ORDER BY id DESC"))
- {
-
- rsPerVeure = stmt.getResultSet();
- rsPerVeure.last();
- url_a_baixar=new URL(rsPerVeure.getString(2));
- System.out.println("A Baixar!: "+url_a_baixar.toString());
- // rsPerVeure.close();
- posarAVistos(url_a_baixar.toString());
- }
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- }
- catch(Exception ex )
- {
- ex.printStackTrace();
- }
- //fi
- }
- public boolean hiEs(String url,String taula)
- {
- int i=0;
- try
- {
- ResultSet rs = stmt.executeQuery("SELECT * FROM "+taula+" WHERE url='"+url+"'");
-
- while (rs.next())
- {
- //System.out.println(i);
- i++;
- }
- // rs.close();
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- }
-
- if(i>0)
- return true;
- else
- return false;
-
- }
- public void afegirHTML(String a,String url)
- {
- try
- {
- ResultSet rs = stmt.executeQuery("SELECT * FROM vistes WHERE url ='"+url+"' ");
- //rs.first();
-
- rs.absolute(1); // moves the cursor to the first row of rs
- rs.updateString(3,a);
- rs.updateRow(); // updates the row in the data source
- //rs.close();
-
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- }
- }
- public void freeResultSets()
- {
-
- if (rs != null)
- {
- try
- {
- rs.close();
- }
- catch (SQLException sqlEx)
- { // ignore
- }
- rs = null;
- }
- if (rsPerVeure != null)
- {
- try
- {
- rsPerVeure.close();
- }
- catch (SQLException sqlEx)
- { // ignore
- }
- rs = null;
- }
- if (rsVistos != null)
- {
- try
- {
- rsVistos.close();
- }
- catch (SQLException sqlEx)
- { // ignore
- }
- rs = null;
- }
-
- }
-
- //count the number of items are in this query
- public int countItems(ResultSet rs)
- {
- int i=0;
- try{
-
- while(rs.next())
- i++;
-
- }
- catch(Exception e)
- {
- System.out.println("ERROR: while counting items");
- e.printStackTrace();
-
-
- }
- return i;
- }
-
- }