coincidencies.java
上传用户:yadaxi
上传日期:2013-07-11
资源大小:20k
文件大小:9k
- import java.util.regex.*;
- import java.net.*;
- import java.sql.*;
- public class coincidencies
- {
- static Statement stmt = null; // statement used for the queries; assigned from the constructor's stmt argument
- static ResultSet rsPerVeure = null; // result of the most recent query; closed by freeResultSets()
- String paraula=""; // word currently being indexed
- Connection conn; // connection handed to the constructor
-
- public coincidencies(URL url,String ascii,String textSencer,Connection conn,Statement stmt)
- {
- int id_pagina=0,i=0,id_paraula=0;
-
- try
- {
- ascii=ascii.replaceAll(" "," ");
- ascii=ascii.replaceAll(" "," ");
- ascii=ascii.replaceAll("nbsp;"," ");
- ascii=ascii.replaceAll("nbsp"," ");
-
- Class.forName("com.mysql.jdbc.Driver").newInstance();
-
- //Connection conn = DriverManager.getConnection("jdbc:mysql://192.168.1.2/spider?user=spider&password=spider");
-
- //stmt = conn.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE,ResultSet.CONCUR_UPDATABLE);
- this.conn=conn;
- this.stmt=stmt;
-
- //mirem si la p鄃ina ja existeix a la nostra base de dades
- //if (stmt.execute("SELECT * FROM vistes WHERE url='"+url.toString()+"'"))
- rsPerVeure = stmt.executeQuery("SELECT * FROM vistes WHERE url='"+url.toString()+"'");
- if(this.countItems(rsPerVeure)>0)
- {
- //Si existeix agafem el seu id
- // rsPerVeure = stmt.getResultSet();
- rsPerVeure.first();
- id_pagina=rsPerVeure.getInt("id");
- }
- else//if the program runs wel this part never have to run
- {
- // sino en creem un de nou i l'insertem a la BD
- stmt.execute("INSERT INTO vistes (url) VALUES ('"+url.toString()+"')");
-
- rsPerVeure = stmt.executeQuery("SELECT id FROM vistes WHERE url='"+url.toString()+"'");
- //agafem el seu id
- rsPerVeure.first();
- id_pagina=rsPerVeure.getInt(1);
- }
-
- //comen鏴m a parsejar!
- Pattern p;
- Matcher m;
- boolean resultado;
- p = Pattern.compile("\w+");
- m = p.matcher(ascii);
- resultado=m.find();
- while(resultado)
- {
- paraula= new String(ascii.subSequence(m.start(),m.end()).toString());
-
- paraula=paraula.replaceAll("\<.*?\>","").toLowerCase();
- //comprovem si la paraula trobada existeix a la BD
-
- rsPerVeure = stmt.executeQuery("SELECT id FROM paraules WHERE paraula='"+paraula+"'");
-
- //System.out.println(countItems(rsPerVeure));
- if(countItems(rsPerVeure)>0)
- {
- //si existeix ens quedem amb el seu id
- // rsPerVeure = stmt.getResultSet();
- rsPerVeure.first();
- id_paraula=rsPerVeure.getInt(1);
- }
- else
- {
- // System.out.println("6");
- // sino en creem un de nou i l'insertem a la BD
- if(paraula.length()<50)
- {
- stmt.execute("INSERT INTO paraules (paraula) VALUES ('"+paraula+"')");
- rsPerVeure=stmt.executeQuery("SELECT id FROM paraules WHERE paraula='"+paraula+"'");
-
- //Si existeix agafem el seu id
- //rsPerVeure = stmt.getResultSet();
- rsPerVeure.first();
- id_paraula=rsPerVeure.getInt(1);
- }
- }
- if(paraula.length()<50)//max length of a word declared on the database
- {
- rsPerVeure = stmt.executeQuery("SELECT * FROM coincidencies WHERE paraula='"+id_paraula+"' AND pagina='"+id_pagina+"'");
-
- int n_coincidencies=0,id_coincidencies=0;
-
- if(countItems(rsPerVeure)>0)
- {
-
- //si existeix ens quedem amb el seu id
- //rsPerVeure = stmt.getResultSet();
- rsPerVeure.first();
- n_coincidencies=rsPerVeure.getInt("n_coincidencies");
- id_coincidencies=rsPerVeure.getInt("id");
- n_coincidencies++;
- // System.out.println("9");
- //ja existeix, augmentem la coincidencia
- rsPerVeure.updateString("n_coincidencies",new Integer(n_coincidencies).toString());
- rsPerVeure.updateRow();
-
- //ResultSet rs = stmt.executeQuery("UPDATE coincidencies SET n_coincidencies = '"+n_coincidencies+"' WHERE id ='"+id_coincidencies+"' ");
- }
- else
- stmt.execute("INSERT INTO coincidencies (paraula,pagina,n_coincidencies) VALUES ('"+id_paraula+"','"+id_pagina+"','1')");
- }
-
- //busquem la seguent paraula
- resultado=m.find();
- }
- // System.out.println("Indexat: "+paraula);
-
-
- comprovaCoincidencies(textSencer,id_pagina,"<b>.+</b>");
- comprovaCoincidencies(textSencer,id_pagina,"<B>.+</B>");
- comprovaCoincidencies(textSencer,id_pagina,"<h1>.+</h1>");
- comprovaCoincidencies(textSencer,id_pagina,"<H1>.+</H1>");
- freeResultSets();
-
- }
- catch(SQLException ex)
- {
- // handle any errors
- System.out.println("SQLException: " + ex.getMessage());
- System.out.println("SQLState: " + ex.getSQLState());
- System.out.println("VendorError: " + ex.getErrorCode());
- return;
- }
- catch(Exception e)
- {
- e.printStackTrace();
- return;
- }
- }
-
- public void freeResultSets()
- {
- if (rsPerVeure != null)
- {
- try
- {
- rsPerVeure.close();
- }
- catch (SQLException sqlEx)
- { // ignore
- }
- }
- /* if (stmt != null)
- {
- try
- {
- stmt.close();
- }
- catch (SQLException sqlEx)
- {
- // ignore
- }
- stmt = null;
- } */
-
-
- }
-
-
- public int countItems(ResultSet rs)
- {
- int i=0;
- try{
-
- while(rs.next())
- i++;
-
- }
- catch(Exception e)
- {
- System.out.println("ERROR: while counting items");
- e.printStackTrace();
-
-
- }
- return i;
- }
-
- //this method count the words of a web page
- public int wordCount(String ascii)
- {
- Pattern p;
- Matcher m;
- int wordCount=0;
- boolean resultado;
-
- p = Pattern.compile("\w+");
- m = p.matcher(ascii);
- resultado=m.find();
-
- while(resultado)
- {
- wordCount++;
- resultado=m.find();
- }
- return wordCount;
-
-
- }
- //Finds the words that are between <b></b>, <h1></h1> and <u></u> tags.
- //For each word found, the value of n_coincidencies in the coincidencies table is increased.
- //This gives more importance to the words that are between <b></b>, <h1></h1> and <u></u> tags.
- //
- //Only text that is between tags is considered, and only when its
- //length is less than 20 characters (not the whole page).
- public void comprovaCoincidencies(String text,int id_pagina,String pattern)
- {
- Pattern p,p1;
- Matcher m,m1;
- boolean resultado;
- String paraula="",subParaula="";
- int id_paraula=0,coincidencies=0;
-
- try
- {
- text=text.toLowerCase();
- p = Pattern.compile(pattern);
- // p1 = Pattern.compile("<b>.+[</b>]");
- m = p.matcher(text);
-
- resultado=m.find();
-
- while(resultado)
- {
- paraula= new String(text.subSequence(m.start(),m.end()).toString());
- paraula=paraula.replaceAll("\<.*?\>","");//delete <b> </b> tags
- paraula=paraula.replaceAll("!","");//delete the signs that won't be indexed
- paraula=paraula.replaceAll("