nlp.NGramOverlap Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sigma-nlp Show documentation
Natural language processing toolbox using the Sigma knowledge engineering system.
There is a newer version: 1.1
package nlp;

import nlp.corpora.ShuZiInsQA;

import java.io.IOException;
import java.util.*;

/**
 * Created by apease on 10/1/15.
 */
public class NGramOverlap {

    TFIDF tfidf = null;
    public boolean debug = false;

    /** ***************************************************************
     */
    public NGramOverlap(TFIDF tf) throws IOException {

        //System.out.println("Info in TFIDF(): Initializing");
        tfidf = tf;
    }

    /** ***************************************************************
     * @return an integer score of the number of shared ngrams (minus
     * punctuation)
     */
    public int nGramOverlap(String x, String y, int n) {

        //System.out.println("NGramOverlap.overlap(): testing: " + x + " \nand:\n" + y);
        int overlap = 0;
        HashSet common = new HashSet<>();
        String str1 = tfidf.removePunctuation(x);
        //str1 = tfidf.removeStopWords(str1);
        ArrayList s1 = new ArrayList();
        String[] sspl = str1.split(" ");
        s1.addAll(Arrays.asList(sspl));

        String str2 = tfidf.removePunctuation(y);
        //str2 = tfidf.removeStopWords(str2);
        ArrayList s2 = new ArrayList();
        s2.addAll(Arrays.asList(str2.split(" ")));

        for (int i = 0; i < s1.size()+1 - n; i++) {
            StringBuffer s1tok = new StringBuffer();
            for (int z = 0; z < n; z++)
                s1tok.append(s1.get(i + z));
            for (int j = 0; j < s2.size()+1 - n; j++) {
                StringBuffer s2tok = new StringBuffer();
                for (int z = 0; z < n; z++)
                    s2tok.append(s2.get(j+z));
                //System.out.println("'" + s1tok + "' '" + s2tok + "'");
                if (s1tok.toString().equals(s2tok.toString())) {
                    overlap++;
                    common.add(s1tok.toString());
                    //System.out.println("NGramOverlap.nGramOverlap(): match: " + s1tok);
                }
            }
        }
        if (common.size() > 0)
            System.out.println("NGramOverlap.overlap(): common tokens: " + common);
        return overlap;
    }

    /** ***************************************************************
     * @return an integer score of the number of shared ngrams (minus
     * stopwords and punctuation)
     */
    public int cachedNGramOverlap(HashMap> questions,
                                  HashMap> answers, int n) {

        //System.out.println("NGramOverlap.cachedNGramOverlap():  " + questions + " \nand:\n" + answers);
        HashSet qs = questions.get(n);
        if (qs == null) return 0;
        HashSet as = answers.get(n);
        HashSet result = new HashSet<>(qs);
        result.retainAll(as);
        int overlap = result.size();
        if (debug)
            System.out.println("NGramOverlap.cachedNGramOverlap(): common tokens: " + result);
        return overlap;
    }

    /** ***************************************************************
     * @return a map of scores and the set of document IDs that have that
     * score, which is a count of token overlap with the question

    public TreeMap> computeNGramOverlap(String question) {

        ProgressPrinter pp = new ProgressPrinter(10);
        System.out.print("TokenOverlap.computerNGramOverlap(): ");
        TreeMap> result = new TreeMap<>();
        for (String line : tfidf.lines) {
            pp.tick();
            //if (tfidf.lines.indexOf(line) == 8362)
            //    System.out.println("TokenOverlap.computeOverlap(): " + line);
            int score = nGramOverlap(question, line, 2);
            if (score == 0)
                continue;
            float fscore = (float) score;
            ArrayList al = new ArrayList();
            if (result.containsKey(fscore))
                al = result.get(fscore);
            al.add(tfidf.lines.indexOf(line));
            result.put(fscore,al);
        }
        System.out.println();
        return result;
    }
*/
    /** ***************************************************************
     * @return a map of scores and the set of document IDs that have that
     * score, which is a count of token overlap with the question
     */
    public TreeMap> nGramRank(ShuZiInsQA.Dev dev,
                                                       List toScoreIDs,
                                                       ArrayList>> answerNgrams,
                                                       TreeMap> scoredIDs, int n) {

        TreeMap> result = new TreeMap<>();
        result.putAll(scoredIDs);
        for (String id : toScoreIDs) {
            int intID = Integer.parseInt(id);
            int score = cachedNGramOverlap(dev.questionNgrams, answerNgrams.get(intID), n);
            if (debug)
                System.out.println("NGramOverlap.nGramRank(): id: " + id + " as int: " + intID + " with score: " + score);
            if (score == 0)
                continue;
            float fscore = (float) score;
            ArrayList al = new ArrayList();
            if (result.containsKey(fscore))
                al = result.get(fscore);
            al.add(intID);
            result.put(fscore,al);
        }
        if (debug)
            System.out.println("NGramOverlap.nGramRank(): result " + result);
        return result;
    }

    /** ***************************************************************
     */
    public static void main(String[] args) {

        TFIDF cb = null;
        TokenOverlap to = null;
        NGramOverlap ng = null;
        try {
            cb = new TFIDF("/home/apease/Sigma/KBs/stopwords.txt");
            ng = new NGramOverlap(cb);
        }
        catch (IOException ioe) {
            System.out.println("Error in NGramOverlap.main()");
            ioe.printStackTrace();
        }
        System.out.println(ng.nGramOverlap("John likes big trees in the night", "John likes small leaves in the night", 3));
    }
}