All downloads are free. The search and download features use the official Maven repository.

eu.fbk.utils.lsa.util.WebPageComparator Maven / Gradle / Ivy

/*
 * Copyright (2010) Fondazione Bruno Kessler (FBK)
 * 
 * FBK reserves all rights in the Program as delivered.
 * The Program or any portion thereof may not be reproduced
 * in any form whatsoever except as provided by license
 * without the written consent of FBK.  A license under FBK's
 * rights in the Program may be available directly from FBK.
 */

package eu.fbk.utils.lsa.util;

import eu.fbk.utils.lsa.BOW;
import eu.fbk.utils.lsa.LSM;
import eu.fbk.utils.lsa.LSSimilarity;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

//
public class WebPageComparator {

    /**
     * Define a static logger variable so that it references the
     * Logger instance named WebPageComparator.
     */
    static Logger logger = Logger.getLogger(WebPageComparator.class.getName());

    //
    public WebPageComparator(URL page, URL[] concept, LSSimilarity lss) throws IOException {
        logger.info("parsing " + page + "...");
        BOW bow = new BOW(toText(page));
        logger.info("size bow " + bow.size());
        BOW[] bows = new BOW[concept.length];
        for (int i = 0; i < concept.length; i++) {
            logger.info("parsing concept " + concept[i]);
            bows[i] = new BOW(toText(concept[i]));
            logger.info("size concept " + i + " " + bows[i].size());
            float f = lss.compare(bow, bows[i]);
            logger.info(i + " = " + f);
        }

        //logger.info(toText(page));
        interactive(concept, bows, lss);
    } // end constructor

    //
    public void interactive(URL[] concept, BOW[] bows, LSSimilarity lss) throws IOException {
        InputStreamReader reader = null;
        BufferedReader myInput = null;
        while (true) {
            System.out.println("\nPlease write a query and type  to continue (CTRL C to exit):");

            reader = new InputStreamReader(System.in);
            myInput = new BufferedReader(reader);
            String query = myInput.readLine().toString().toLowerCase();

            BOW bow = new BOW(toText(new URL(query)));
            logger.info("size bow " + bow.size());
            logger.info("bow " + bow);
            SortedMap map = new TreeMap(new Comparator() {

                public int compare(Float o1, Float o2) {
                    //Float f1 = (Float) o1;
                    //Float f2 = (Float) o2;
                    //float diff = f2 - f1;
                    if (o1.floatValue() == o2.floatValue()) {
                        return 0;
                    } else if (o1.floatValue() < o2.floatValue()) {
                        return -1;
                    }

                    return 1;
                }

            } // end FloatComparator

            );
            for (int i = 0; i < bows.length; i++) {

                if (bows[i].size() > 0) {
                    //logger.info("size concept " + i + " " + bows[i].size());
                    float f = lss.compare(bow, bows[i]);
                    //logger.info(i + ":" + concept[i] + " = " + f);
                    //System.out.println(i + "\t" + concept[i] + "\t" + f + "\t(" + bows[i].size() + ")");
                    map.put(f, concept[i].toString());

                }
            } // end for i

            PrintWriter pw = new PrintWriter(new FileWriter("output.html"));
            pw.println("");
            logger.info("map size " + map.size());
            Iterator it = map.keySet().iterator();
            int j = 0;
            while (it.hasNext()) {
                Float f = it.next();
                String s = map.get(f);
                pw.println("");
                System.out.println(j + "\t" + f + "\t" + s);
            }
            pw.println("
" + (++j) + "" + f + "" + s + "
"); pw.flush(); pw.close(); } // end while(true) } //end interactive // public class FloatComparator implements Comparator { public int compare(Object o1, Object o2) { Float f1 = (Float) o1; Float f2 = (Float) o2; float diff = f2 - f1; if (diff > 0) { return 1; } else if (diff < 0) { return -1; } return 0; } public boolean equals(Object obj) { return true; } } // end FloatComparator // public String toText(URL url) { Parser parser = null; StringBuilder sb = new StringBuilder(); try { URLConnection con = url.openConnection(); parser = new Parser(con); //NodeList list = parser.parse(null); NodeList list = parser.extractAllNodesThatMatch(new TagNameFilter("P")); // do something with your list of nodes. SimpleNodeIterator it = list.elements(); while (it.hasMoreNodes()) { Node node = it.nextNode(); sb.append(node.toPlainTextString()); sb.append("\n"); } } catch (EncodingChangeException ece) { logger.error(ece); //... do whatever necessary to reset your state here //try { // reset the parser parser.reset(); // try again with the encoding now in force // parser.parse(...); } //catch (ParserException pe) //{ //logger.error(pe); //} } catch (ParserException pe) { logger.error(pe); } catch (IOException e) { logger.error(e); } logger.info(url + "\n" + sb.length()); return sb.toString(); } // end toText // public static void main(String[] args) throws Exception { String logConfig = System.getProperty("log-config"); if (logConfig == null) { logConfig = "log-config.txt"; } PropertyConfigurator.configure(logConfig); if (args.length <= 6) { System.out.println( "Usage: java -mx512M eu.fbk.utils.lsa.util.WebPageComparator input threshold size dim idf page concepts+"); System.exit(1); } File Ut = new File(args[0] + "-Ut"); File Sk = new File(args[0] + "-S"); File r = new File(args[0] + "-row"); File c = new File(args[0] + "-col"); File df = new File(args[0] + "-df"); double threshold = Double.parseDouble(args[1]); int size = Integer.parseInt(args[2]); int dim = 
Integer.parseInt(args[3]); boolean rescaleIdf = Boolean.parseBoolean(args[4]); LSM lsm = new LSM(Ut, Sk, r, c, df, dim, rescaleIdf); LSSimilarity lss = new LSSimilarity(lsm, size); URL page = new URL(args[5]); URL[] concept = new URL[args.length - 6]; for (int i = 0; i < concept.length; i++) { concept[i] = new URL("http://it.wikipedia.org/wiki/" + args[i + 6]); } new WebPageComparator(page, concept, lss); } // end main } // end WebPageComparator




© 2015 - 2025 Weber Informatics LLC | Privacy Policy