// nlp.NGramOverlap Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of sigma-nlp Show documentation
// Show all versions of sigma-nlp Show documentation
// Natural language processing toolbox using the Sigma knowledge engineering system.
package nlp;
import nlp.corpora.ShuZiInsQA;
import java.io.IOException;
import java.util.*;
/**
* Created by apease on 10/1/15.
*/
public class NGramOverlap {
TFIDF tfidf = null;
public boolean debug = false;
/** ***************************************************************
*/
public NGramOverlap(TFIDF tf) throws IOException {
//System.out.println("Info in TFIDF(): Initializing");
tfidf = tf;
}
/** ***************************************************************
* @return an integer score of the number of shared ngrams (minus
* punctuation)
*/
public int nGramOverlap(String x, String y, int n) {
//System.out.println("NGramOverlap.overlap(): testing: " + x + " \nand:\n" + y);
int overlap = 0;
HashSet common = new HashSet<>();
String str1 = tfidf.removePunctuation(x);
//str1 = tfidf.removeStopWords(str1);
ArrayList s1 = new ArrayList();
String[] sspl = str1.split(" ");
s1.addAll(Arrays.asList(sspl));
String str2 = tfidf.removePunctuation(y);
//str2 = tfidf.removeStopWords(str2);
ArrayList s2 = new ArrayList();
s2.addAll(Arrays.asList(str2.split(" ")));
for (int i = 0; i < s1.size()+1 - n; i++) {
StringBuffer s1tok = new StringBuffer();
for (int z = 0; z < n; z++)
s1tok.append(s1.get(i + z));
for (int j = 0; j < s2.size()+1 - n; j++) {
StringBuffer s2tok = new StringBuffer();
for (int z = 0; z < n; z++)
s2tok.append(s2.get(j+z));
//System.out.println("'" + s1tok + "' '" + s2tok + "'");
if (s1tok.toString().equals(s2tok.toString())) {
overlap++;
common.add(s1tok.toString());
//System.out.println("NGramOverlap.nGramOverlap(): match: " + s1tok);
}
}
}
if (common.size() > 0)
System.out.println("NGramOverlap.overlap(): common tokens: " + common);
return overlap;
}
/** ***************************************************************
* @return an integer score of the number of shared ngrams (minus
* stopwords and punctuation)
*/
public int cachedNGramOverlap(HashMap> questions,
HashMap> answers, int n) {
//System.out.println("NGramOverlap.cachedNGramOverlap(): " + questions + " \nand:\n" + answers);
HashSet qs = questions.get(n);
if (qs == null) return 0;
HashSet as = answers.get(n);
HashSet result = new HashSet<>(qs);
result.retainAll(as);
int overlap = result.size();
if (debug)
System.out.println("NGramOverlap.cachedNGramOverlap(): common tokens: " + result);
return overlap;
}
/** ***************************************************************
* @return a map of scores and the set of document IDs that have that
* score, which is a count of token overlap with the question
public TreeMap> computeNGramOverlap(String question) {
ProgressPrinter pp = new ProgressPrinter(10);
System.out.print("TokenOverlap.computerNGramOverlap(): ");
TreeMap> result = new TreeMap<>();
for (String line : tfidf.lines) {
pp.tick();
//if (tfidf.lines.indexOf(line) == 8362)
// System.out.println("TokenOverlap.computeOverlap(): " + line);
int score = nGramOverlap(question, line, 2);
if (score == 0)
continue;
float fscore = (float) score;
ArrayList al = new ArrayList();
if (result.containsKey(fscore))
al = result.get(fscore);
al.add(tfidf.lines.indexOf(line));
result.put(fscore,al);
}
System.out.println();
return result;
}
*/
/** ***************************************************************
* @return a map of scores and the set of document IDs that have that
* score, which is a count of token overlap with the question
*/
public TreeMap> nGramRank(ShuZiInsQA.Dev dev,
List toScoreIDs,
ArrayList>> answerNgrams,
TreeMap> scoredIDs, int n) {
TreeMap> result = new TreeMap<>();
result.putAll(scoredIDs);
for (String id : toScoreIDs) {
int intID = Integer.parseInt(id);
int score = cachedNGramOverlap(dev.questionNgrams, answerNgrams.get(intID), n);
if (debug)
System.out.println("NGramOverlap.nGramRank(): id: " + id + " as int: " + intID + " with score: " + score);
if (score == 0)
continue;
float fscore = (float) score;
ArrayList al = new ArrayList();
if (result.containsKey(fscore))
al = result.get(fscore);
al.add(intID);
result.put(fscore,al);
}
if (debug)
System.out.println("NGramOverlap.nGramRank(): result " + result);
return result;
}
/** ***************************************************************
*/
public static void main(String[] args) {
TFIDF cb = null;
TokenOverlap to = null;
NGramOverlap ng = null;
try {
cb = new TFIDF("/home/apease/Sigma/KBs/stopwords.txt");
ng = new NGramOverlap(cb);
}
catch (IOException ioe) {
System.out.println("Error in NGramOverlap.main()");
ioe.printStackTrace();
}
System.out.println(ng.nGramOverlap("John likes big trees in the night", "John likes small leaves in the night", 3));
}
}