/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package relations;
import corpora.DataLoader;
import utils.DBUtils;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.Statement;
import java.util.*;
/**
* Learns event-extraction rules from an annotated corpus: maps each
* gold-standard event onto the chunk structure of its sentence and stores
* the resulting trigger patterns in the database.
*
* @author Chinh
*/
public class RuleLearner {
SenAnalyzer analyzer; // use default database
SenSimplifier simp;
public RuleLearner(){
}
public RuleLearner(DBUtils sr, DBUtils dest) {
analyzer = new SenAnalyzer(sr, dest);
simp = analyzer.simp;
}
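/**
* Entry point. Expects one argument: the path of the annotated database,
* opened via DBUtils; the same database serves as both source and destination.
* Usage: java relations.RuleLearner <path-to-annotated-database>
*/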
public static void main(String[] args) {
DBUtils db1 = new DBUtils();
String dbsrc = null;
if (args.length == 1) {
dbsrc = args[0];
}
else {
System.out.println("No annotated database declared.");
System.exit(1);
}
// dbsrc = "D:/DataNLP/Dev2011/Data";
db1.openDB(dbsrc);
RuleLearner learner = new RuleLearner(db1, db1);
learner.LearnData();
db1.shutdownDB();
}
String current_txt = "";
String curr_pmid = "";
int curr_sen_id = 0;
List<Word> detectedTrgs = null;
Map<String, Map<String, Counter>> TGCount = new HashMap<String, Map<String, Counter>>();
Map<String, Map<String, Counter>> subTG = new HashMap<String, Map<String, Counter>>();
Map<String, Map<String, Counter>> sharedTG = new HashMap<String, Map<String, Counter>>();
Map<String, Counter> sameChunk = new HashMap<String, Counter>();
String tokens[];
Set<Word> validTG = new HashSet<Word>(); // detected tg match with annotated TG
Map<TData, Word> matchTG = new HashMap<TData, Word>(); // annotated TG <-> detectedTG
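/**
* Main learning loop. For every abstract: load gold events and triggers,
* analyze each sentence, match annotated triggers against detected ones, map
* each event onto a noun-phrase (BasicChunk) or verb-phrase (VerbChunk)
* pattern, and finally store the collected patterns in the database.
* Note: Counter (same package) is assumed here to be a simple mutable int
* wrapper exposing inc() and a public count field, as its usage below implies.
*/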
public void LearnData() {
int total = 0, skip_pro = 0, miss_pro = 0, skip_dic = 0, miss_vb = 0;
analyzer.init();
Set<TData> usedTG = new HashSet<TData>();
List<String> ids = simp.loadPMIDs();
//ids.clear();
//ids.add("PMC-1920263-13-RESULTS-05");
System.out.println("Total abstract: " + ids.size());
List elist;
List trgList;
Map TGmap = new HashMap();
Map EVmap = new HashMap();
Map protMap = new HashMap();
ChunkAnalyzer op = new ChunkAnalyzer();
Word tg, pr, pr2;
TData pro;
List[] out;
List prep;
Map mtrg[] = new HashMap[SenSimplifier.trigger_type.length];
Map rules[] = new Map[SenSimplifier.trigger_type.length];
boolean add;
for (int x = 0; x < SenSimplifier.trigger_type.length; x++) {
rules[x] = new HashMap();
mtrg[x] = new HashMap();
}
int miss_events = 0, skip_trg = 0, success = 0, unknown = 0;
int ev_type;
boolean print = false;
KeyData kdata;
int counter[] = new int[SenSimplifier.trigger_type.length];
int totals[] = new int[SenSimplifier.trigger_type.length];
int map_count[] = new int[SenSimplifier.trigger_type.length];
int mis_count[] = new int[SenSimplifier.trigger_type.length];
int mis_trg[] = new int[SenSimplifier.trigger_type.length];
int total_sen = 0, s_events = 0, skip_sen =0;
boolean ev1, ev2;
int lv1 = 0, lv2 = 0, lv3 = 0, lv0 = 0;
int sen_begin, sen_end;
for (String id : ids) {
EVmap.clear();
TGmap.clear();
out = analyzer.analyze(id);
elist = simp.loadEvent(id);
trgList = simp.loadTrigger(id);
for (TData dt : trgList) { // prepare hash for trigger
TGmap.put(dt.tid, dt);
}
for (EData ev : elist) {
EVmap.put(ev.eid, ev);
}
for (EData ed : elist) {
ed.init(analyzer.proMap, TGmap, EVmap);
}
curr_pmid = id;
List<EData> events[] = analyzer.splitEvents(elist);
for (int i = 0; i < analyzer.senpos.length; i++) {
usedTG.clear();
validTG.clear();
matchTG.clear();
curr_sen_id = i;
detectedTrgs = analyzer.detectedTrg[i];
total += events[i].size(); // count events
print = false;
if (out[i] == null) {
s_events += events[i].size(); // miss due to no trg/pro
for (EData ed : events[i]) {
ev_type = SenSimplifier.hashType.get(ed.type);
mis_count[ev_type]++;
}
if(events[i].size()>0){
skip_sen++;
}
continue;
}
total_sen++; // sen has event
sen_begin = analyzer.senpos[i];
sen_end = analyzer.senpos[i] + analyzer.longsen[i].length();
tokens = analyzer.tokenList.get(i);
prep = analyzer.getPreps(tokens);
protMap.clear();
for (Word w : analyzer.detectedPro[i]) {
protMap.put(w.word, w); // name -> word
}
op.curr_text = analyzer.shortsen[i]; // for debugging
current_txt = analyzer.shortsen[i];
op.analyzeChunk(out[i], analyzer.tagList.get(i), tokens); // split chunks into clauses
countTrg(op);
// map annotated triggers to detected triggers; add true matches to the valid list
for(EData ev:events[i]){
tg = analyzer.findTrigger(ev.trgdata.list, analyzer.detectedTrg[i]);
if(tg!=null){
matchTG.put(ev.trgdata, tg);
validTG.add(tg);
}
}
for (EData ev : events[i]) { // map every gold event in this sentence
curr_event = ev;
pr2 = null;
ev1 = false;
ev2 = false;
ev_type = SenSimplifier.hashType.get(ev.type);
totals[ev_type]++; // count event per type
if (!inSentence(sen_begin, sen_end, ev)) {
skip_pro++;
continue; // Skip: pro belongs to the other sentence
}
if(!usedTG.contains(ev.trgdata)){
add=true;
usedTG.add(ev.trgdata);
}else{
add =false ;
}
tg = matchTG.get(ev.trgdata);
if (tg == null) {
skip_trg++;
mis_trg[ev_type]++;
Counter ct = mtrg[ev_type].get(ev.trgdata.name.toLowerCase());
if (ct != null) {
ct.inc();
} else {
ct = new Counter();
mtrg[ev_type].put(ev.trgdata.name.toLowerCase(), ct);
}
continue;
}
kdata = simp.sharedDic.get(tg.word);
if (kdata.keytype == 1 && !kdata.type.equals(ev.type)) {
skip_dic++;
continue; // wrong type
}
// determining theme type and theme2/cause type
if (ev.data1 instanceof TData) { // pro
pro = (TData) ev.data1;
pr = protMap.get(pro.new_name); // TData -> Word
} else {
pro = ((EData) ev.data1).trgdata;
ev1 = true;
pr = matchTG.get(pro);
if (pr == null) {
skip_trg++;
continue;
}
}
if (ev.data2 != null) { // binding event
TData pro2 = ev.data2;
pr2 = protMap.get(pro2.new_name);
} else if (ev.ecause != null) { // regulatory event
if (ev.ecause instanceof TData) { // cause as pro
TData pro2 = (TData) ev.ecause;
pr2 = protMap.get(pro2.new_name);
} else { // cause as event
ev2 = true;
TData pro2 = ((EData) ev.ecause).trgdata;
pr2 = matchTG.get(pro2);
if (pr2 == null) {
skip_trg++;
continue;
}
}
}
// now find chunk containing trigger and pro
boolean found = false;
// Map event to BasicChunk or VerbChunk
for (BasicChunk bs : op.bsList) {
if (bs.belongTO(tg, pr, pr2)) {
found = eventToNP(bs, tg, pr, pr2, prep, rules, ev_type, ev1, ev2,add);
break;
}
}
if (!found) {
for (VerbChunk vc : op.verbList) {
if (vc.belongTO(tg, pr, pr2)) {
found = eventToVP(vc, tg, pr, pr2, prep, rules, ev_type, ev1, ev2,add);
break;
}
}
}
if (found) {
success++;
map_count[ev_type]++;
switch (ev.getLevel(0)) {
case 0:
lv0++;
break;
case 1:
lv1++;
break;
case 2:
lv2++;
break;
default:
lv3++;
break;
}
} else {
counter[ev_type]++;
miss_events++;
if(!print){
//System.out.println(curr_pmid+" "+curr_sen_id+ current_txt);
print =true;
}
//System.out.println("--->" +ev.getTxt(0));
}
}// event
if(print){
unknown++;
//System.out.println("");
}
}// sentence
}// id
/**
* Debugging information; comment out if not needed.
*/
System.out.println("Sub trg list:-------------------------------------------------------------------------------------");
for(String s: subTG.keySet()){
Map<String, Counter> ct = subTG.get(s);
Map<String, Counter> ct2 = sharedTG.get(s);
if(ct2!=null){
for(String w: ct2.keySet()){
ct.remove(w);
}
}
System.out.println(s+ " number of sub tg: "+ct.size());
for(String w: ct.keySet()){
System.out.println(" --> "+w+" "+ct.get(w).count);
}
System.out.println("");
}
System.out.println("---------------------------------------------------------------------- end sub tg list ----------");
System.out.println("---------------------------------------------------------------------- end sub tg list ----------");
System.out.println("---Number of sentence with miss events "+unknown+" Total: "+total_sen+" Skip sen: "+
skip_sen+" Rc1:(miss) "+unknown*1f/total_sen+" Rc2:(skip) "+skip_sen*1f/total_sen);
System.out.println("-----------------------------------------------------------------------------------------------------------------------\n");
for (int k = 0; k < 9; k++) {
System.out.println(SenSimplifier.trigger_type[k] + " : Total: " + totals[k] + " map events: " + map_count[k] + " | miss map: " + counter[k] + " ->> recall: " + (1f * map_count[k] / totals[k]));
System.out.println(" Miss due to no trg/pro: " + mis_count[k] + " | miss due to dict: " + mis_trg[k]);
}
System.out.println("---------------------------------------------------------------------------------------------------------------------\n");
System.out.println("Total events skip due to no trg/pro: " + s_events + " mis pro: " + miss_pro);
System.out.println(" Total events:" + total + " | events map: " + success + " -> Recall: " + success * 1f / total + " | skip trg: " + skip_trg + " | Skip dic: " + skip_dic + " | skip pro: " + skip_pro + " | miss: " + miss_events);
System.out.println("Number of trig as VB is missed " + miss_vb);
System.out.println("");
System.out.println("Level 0: " + lv0 + " | Level 1: " + lv1 + " | Level 2: " + lv2 + " Level 3: " + lv3);
System.out.println("Noun count: " + nppt + " Verb count: " + vppt + " | Number of event with level >=2 " + vppt2);
System.out.println("----------------missing trgger for each event type-------------------------------");
for (int i = 0; i <= 8; i++) {
System.out.println(SenSimplifier.trigger_type[i] + " total " + mtrg[i].size());
for (String s : mtrg[i].keySet()) {
// System.out.println(s+" --> "+mtrg[i].get(s).count);
}
System.out.println("");
}
System.out.println("--------------------------------------------------------------same chunk , tg > pro------------: "+sameChunk.size());
for(String s: sameChunk.keySet()){
System.out.println(s+ " "+sameChunk.get(s).count);
}
System.out.println("---------------------------------------------------------------------------------");
System.out.println("Combining rules....");
//Map rset = combiningRules(rules);
System.out.println("---> Storing patterns.....");
storePatterns(rules, analyzer.db);
//System.out.println("Storing rules .....");
//storeRuleSet(rset, analyzer.db);
System.out.println("");
}
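/**
* Count candidate triggers per POS tag and chunk type (NP vs. VP) over all
* chunks of the current sentence; counts accumulate in TGCount via add2Map.
*/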
private void countTrg(ChunkAnalyzer ac) {
Set<BasicChunk> bc = new HashSet<BasicChunk>();
List<Chunk> verb = new ArrayList<Chunk>();
Set<Word> used = new HashSet<Word>();
for (BasicChunk bs : ac.bsList) {
bc.add(bs);
}
for (VerbChunk vc : ac.verbList) {
bc.add(vc.subject);
bc.add(vc.object);
verb.add(vc.verb);
}
for (BasicChunk bs : bc) {
for (Chunk c : bs.chunkList) {
for (Word w : c.trigs) {
if (!used.contains(w)) {
add2Map(w.pos_tag, w.word, "NP");
used.add(w);
}
}
}
}
for (Chunk c : verb) {
for (Word w : c.trigs) {
if (!used.contains(w)) {
add2Map(w.pos_tag, w.word, "VP");
used.add(w);
}
}
}
}
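/**
* Increment the counter for trigger word tg under the key POS + chunk type
* (e.g. "NNNP", "VBVP"), creating map entries on first use.
*/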
private void add2Map(String pos, String tg, String type) {
Map<String, Counter> ct = TGCount.get(tg);
if (ct == null) {
ct = new HashMap<String, Counter>();
TGCount.put(tg, ct);
}
String key = pos + type;
Counter c = ct.get(key);
if (c == null) {
c = new Counter(1);
ct.put(key, c);
} else {
c.inc();
}
}
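/**
* Check whether an event lies entirely within the sentence span
* [begin, end]: trigger, theme, and (recursively) any event-valued
* theme or cause must all fall inside the span.
*/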
private boolean inSentence(int begin, int end, EData ev) {
boolean theme = false, cause = false;
TData tg = ev.trgdata;
if (tg.list[0] >= begin && tg.list[1] <= end) { // same trigger
//check theme
if (ev.data1 instanceof TData) {
TData pr1 = (TData) ev.data1;
if (pr1.list[0] >= begin && pr1.list[1] <= end) {
theme = true;
}
} else {
EData ev1 = (EData) ev.data1;
theme = inSentence(begin, end, ev1);
}
// check cause
if (ev.ecause != null) {
if (ev.ecause instanceof EData) {
cause = inSentence(begin, end, (EData) ev.ecause);
} else {
TData pr2 = (TData) ev.ecause;
if (pr2.list[0] >= begin && pr2.list[1] <= end) {
cause = true;
}
}
} else if (ev.data2 != null) { // theme2
TData pr2 = (TData) ev.data2;
if (pr2.list[0] >= begin && pr2.list[1] <= end) {
cause = true;
}
} else {
cause = true;
}
if (theme && cause) {
return true;
}
}
return false;
}
/**
* Map an event onto a BasicChunk (noun-phrase pattern).
*
* @param bs: chunk containing trigger and arguments
* @param tg: trigger
* @param pr: protein 1 (theme)
* @param pr2: theme2 or cause
* @param prep: preposition list
* @param rules: array of rule maps, one per event type
* @param ev_type: event type
*/
int in_distance = 0;
int in_total = 0;
int has_trg = 0;
int in_count = 0;
int prep_order_count = 0;
boolean debug = false;
private boolean eventToNP(BasicChunk bs, Word tg, Word pr, Word pr2, List prep, Map<String, Rules> rules[],
int ev_type, boolean evt1, boolean evt2, boolean add) {
boolean prep1_pos, prep2_pos = false, prep_order = false, in_chunk = false;
String prep1 = "", prep2 = "", ctype, pos_type;
int count1, count2 = 0;// for theme2/cause
int verb_type;
boolean has_theme2 = false;
String themeTrg = "", causeTrg = "";
bs.mergeNP();
if (bs.is_merged) {
ctype = "CP";
} else {
ctype = "NP";
}
if (evt1) { // theme
themeTrg = pr.word;
}
if (evt2) { //cause
causeTrg = pr2.word;
}
pos_type = tg.pos_tag;
prep1_pos = pr.pos > tg.pos; // prep 1 position
nppt++; // NP pattern
if (pr2 == null) { // only theme
if(bs.isSameChunk(tg, pr) && !evt1){
Counter ct = sameChunk.get(tg.word);
if(ct==null){
ct = new Counter(1);
sameChunk.put(tg.word, ct);
}else{
ct.inc();
}
}
count1 = bs.countChunks(tg, pr);
if (prep1_pos) { // TG - PRO
prep1 = getPrep(tg, pr, bs);
if (ev_type <= 5) {
Word sub_tg = findTrg(tg, pr.pos, bs);
if (sub_tg != null) { // found sub-trg
String key = tg.word + tg.pos_tag;
Map<String, Counter> ct = subTG.get(key);
if (ct == null) {
ct = new HashMap<String, Counter>();
subTG.put(key, ct);
}
Counter c = ct.get(sub_tg.word + sub_tg.pos_tag);
if (c == null) {
c = new Counter(1);
ct.put(sub_tg.word + sub_tg.pos_tag, c);
} else {
c.inc();
}
}
}
} else { // PRO - TG
if (!evt1 && bs.inChunk(tg, pr)) {
in_chunk = true;
count1 = 0;
} else if (evt1&& tg.pos == pr.pos) {
in_chunk =true ;
count1 = 0;
}
}
if(in_chunk && !evt1){
Chunk tgc = bs.getChunk(tg.pos);
if(tgc.is_inChunk(tg,tokens)&& tgc.trigs.size()==2){
//System.out.println("---->"+ tgc.getText()+"\n"+curr_pmid+" "+curr_sen_id);
//bs.printChunk();
//System.out.println("");
}
}
} else {
has_theme2 = true; // has cause/theme2
prep2_pos = pr2.pos > tg.pos;
count1 = bs.countChunks(tg, pr);
count2 = bs.countChunks(tg, pr2);
if (prep1_pos && prep2_pos) { // both are behind trig
if (pr.pos > pr2.pos) { // reg only: TG - cause - theme
prep1 = getPrep2(pr2, pr, bs); // theme
prep2 = getPrep(tg, pr2, bs); // cause
prep_order = true; // need to switch order
} else { // binding & reg : TG- theme - cause/theme2
prep2 = getPrep2(pr, pr2, bs); //cause
prep1 = getPrep(tg, pr, bs); // theme
}
} else if (prep1_pos && !prep2_pos) { // cause - tg - theme -> reg only
prep1 = getPrep(tg, pr, bs);
prep2 = getPrepFront(tg, pr2, bs);
} else if (!prep1_pos && prep2_pos) { // theme - tg - cause | theme1 - tg - theme2
prep1 = getPrepFront(tg, pr, bs);
prep2 = getPrep(tg, pr2, bs);
} else { // both are in front of tg
//binding & reg
if (pr.pos > pr2.pos) { // cause/theme2 - theme -TG: should skip this
prep1 = getPrepFront(tg, pr, bs);
prep2 = getPrep2(pr2, pr, bs);
prep_order =true;
} else { // Reg only : theme - cause/theme2 - TG
prep1 = getPrep2(pr, pr2, bs);
prep2 = getPrepFront(tg, pr2, bs);
}
}
if (ev_type == 5) {
in_chunk = bs.inChunk(tg, pr);
// count1 = 0;
} else if (ev_type > 5 && !evt2 && !prep2_pos) { // cause must be pro
in_chunk = bs.inChunk(tg, pr2);
// count2 = 0;
}
}// pr2 condition
verb_type = 0;
Rules rl = rules[ev_type].get(tg.word);
if (rl == null) {
rl = new Rules(ev_type, tg.word);
rules[ev_type].put(tg.word, rl);
}
if (ev_type < 5) {
rl.addPattern(verb_type, pos_type, ctype, prep1_pos, prep1, in_chunk, count1, themeTrg,add);
} else if (ev_type == 5) {
rl.addPattern(verb_type, pos_type, ctype, prep1_pos, prep2_pos, prep_order, prep1, prep2, has_theme2, in_chunk, count1, count2, themeTrg,add);
} else {// regulatory events
rl.addPattern(verb_type, pos_type, ctype, prep1_pos, prep2_pos, prep_order, prep1, prep2, has_theme2, in_chunk, count1, count2, evt1, evt2, themeTrg, causeTrg,add);
}
return true;
}
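/**
* Map an event onto a VerbChunk. If trigger and arguments fall inside the
* subject or object noun phrase, delegate to eventToNP; if the verb itself
* contains the trigger, extract a VP pattern; otherwise merge subject, verb,
* and object into one BasicChunk and retry as an NP pattern.
*/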
private boolean eventToVP(VerbChunk vc, Word tg, Word pr, Word pr2, List prep, Map<String, Rules> rules[],
int ev_type, boolean evt1, boolean evt2, boolean add) {
boolean prep1_pos, prep2_pos = false, prep_order = false, in_chunk = false;
String prep1 = "", prep2 = "", ctype, pos_type;
int count1 = 0, count2 = 0;// count2 -> for theme2/cause
boolean has_theme2 = false;
String childTrg = "", parentTrg = "";
if (vc.subject.belongTO(tg, pr, pr2)) {
eventToNP(vc.subject, tg, pr, pr2, prep, rules, ev_type, evt1, evt2,add);
} else if (vc.object.belongTO(tg, pr, pr2)) {
eventToNP(vc.object, tg, pr, pr2, prep, rules, ev_type, evt1, evt2,add);
} else if (vc.verb.contains(tg)) { // verb contains trigger
vppt++;
if (curr_event.getLevel(0) >= 2) {
vppt2++;
}
ctype = "VP";
pos_type = tg.pos_tag;
prep1_pos = tg.pos < pr.pos;
if (pr2 == null) {
if (vc.subject.containsKey(pr)) {
count1 = vc.subject.getChunkPos(pr.pos);
// or relative clause
prep1 = getPrepFront(tg, pr, vc.subject);
} else if (vc.object.containsKey(pr)) {
count1 = vc.object.getChunkPos(pr.pos);
prep1 = getPrep(tg, pr, vc.object);
}
if (evt1) {
childTrg = pr.word;
}
} else { // Pr2!=null
//for both binding and regulatory events
has_theme2 = true;
prep2_pos = tg.pos < pr2.pos;
if (prep1_pos && prep2_pos) { // both are behind trig
count1 = vc.object.getChunkPos(pr.pos);
count2 = vc.object.getChunkPos(pr2.pos);
if(pr.pos< pr2.pos){
prep1 = getPrep(tg, pr, vc.object);
prep2 = getPrep2(pr, pr2, vc.object);
}else {
prep2 = getPrep(tg, pr2, vc.object);
prep1 = getPrep2(pr2, pr, vc.object);
}
} else if (prep1_pos && !prep2_pos) { // cause - tg - theme
prep1 = getPrep(tg, pr, vc.object);
prep2 = getPrepFront(tg, pr2, vc.subject);
count1 = vc.object.getChunkPos(pr.pos);
count2 = vc.subject.getChunkPos(pr2.pos);
} else if (!prep1_pos && prep2_pos) { // theme - tg - cause
prep1 = getPrepFront(tg, pr, vc.subject);
prep2 = getPrep(tg, pr2, vc.object);
count2 = vc.object.getChunkPos(pr2.pos);
count1 = vc.subject.getChunkPos(pr.pos);
} else if (!prep1_pos && !prep2_pos) { // both are in front of tg
// reg event: few cases, can skip
prep1 = getPrep2(pr, pr2, vc.subject);
prep2 = getPrepFront(tg, pr2, vc.subject);
count1 = vc.subject.getChunkPos(pr.pos);
count2 = vc.subject.getChunkPos(pr2.pos);
if (ev_type > 5) {
return false; // skip this case
}
}
if (ev_type > 5) { // regulatory events: record trigger words of nested theme/cause events
if (evt1) {
childTrg = pr.word;
}
if (evt2) {
parentTrg = pr2.word;
}
}
}
Rules rl = rules[ev_type].get(tg.word);
if (rl == null) {
rl = new Rules(ev_type, tg.word);
rules[ev_type].put(tg.word, rl);
}
if (ev_type < 5) {
rl.addPattern(vc.verb_type, pos_type, ctype, prep1_pos, prep1, in_chunk, count1, childTrg,add);
} else if (ev_type == 5) {
rl.addPattern(vc.verb_type, pos_type, ctype, prep1_pos, prep2_pos, prep_order, prep1, prep2, has_theme2, in_chunk, count1, count2, childTrg,add);
} else {
rl.addPattern(vc.verb_type, pos_type, ctype, prep1_pos, prep2_pos, prep_order, prep1, prep2, has_theme2, in_chunk, count1, count2, evt1, evt2, childTrg, parentTrg,add);
}
} else { // merge subject, verb, and object into a single chunk sequence
BasicChunk new_ch = new BasicChunk();
new_ch.addChunk(vc.subject);
new_ch.addChunk(vc.verb);
new_ch.addChunk(vc.object);
new_ch.is_merged = true;
return eventToNP(new_ch, tg, pr, pr2, prep, rules, ev_type, evt1, evt2,add);
}
return true;
}
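/**
* Preposition for the PRO ... prep TG order: the PP chunk directly before
* the trigger chunk (restricted to the static preps set); if the trigger
* lies outside bs, the trailing PP chunk of bs is used instead.
*/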
public String getPrepFront(Word tg, Word pr, BasicChunk bs) {
int st1, st2;
st1 = bs.getChunkPos(tg.pos);
st2 = bs.getChunkPos(pr.pos);
if (Math.min(st1, st2) >= 0 && st1 > st2) { // both in bs and PRO - TG
Chunk c = bs.chunkList.get(st1 - 1);
if (c.type.endsWith("PP") && preps.contains(c.txt)) {
return c.txt;
}
} else if (st1 < 0 && st2 > 0) { // PRO
Chunk c = bs.chunkList.get(bs.chunkList.size() - 1);
if (c.type.endsWith("PP")) {
return c.txt;
}
}
return "";
}
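/**
* Preposition for the TG prep ... PRO order: the PP chunk directly after
* the trigger chunk (restricted to prepmap); if the trigger lies outside bs,
* the leading PP chunk of bs is used instead.
*/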
public String getPrep(Word tg, Word pr, BasicChunk bs) {
int pos1 = bs.getChunkPos(tg.pos);
int pos2 = bs.getChunkPos(pr.pos);
if (Math.min(pos1, pos2) >= 0 && pos1 < pos2) { // NP
Chunk c = bs.chunkList.get(pos1 + 1);
if (c.type.endsWith("PP") && prepmap.contains(c.txt)) {
return c.txt;
}
} else if (pos1 < 0 && pos2 > 0) { // VP
Chunk c = bs.chunkList.get(0);
if (c.type.endsWith("PP")&& prepmap.contains(c.txt)) {
return c.txt;
}
}
return "";
}
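/**
* First PP chunk (from prepmap) strictly between the chunks of pr1 and pr2;
* used for the second argument in TG - PRO1 - prep - PRO2 patterns.
*/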
public String getPrep2(Word pr1, Word pr2, BasicChunk bs) {
int pos1 = bs.getChunkPos(pr1.pos);
int pos2 = bs.getChunkPos(pr2.pos);
if (Math.min(pos1, pos2) >= 0 && pos1 < pos2) { // NP
int i= pos1+1;
Chunk c;
while(i< pos2){
c = bs.chunkList.get(i);
if (c.type.endsWith("PP")&& prepmap.contains(c.txt)) {
return c.txt;
}
i++ ;
}
}
return "";
}
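/**
* Look for a secondary trigger between trigger tg and position pos2 (at most
* 10 token positions apart). Returns the first candidate that is not an
* already-validated trigger; validated candidates are only tallied in sharedTG.
*/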
private Word findTrg(Word tg, int pos2, BasicChunk bs) {
int pos1 = tg.pos ;
if(pos2-pos1>10){
return null;
}
Chunk c1 = bs.getChunk(pos1);
Chunk c2 = bs.getChunk(pos2);
int begin = c1.begin;
int end = c2.end;
for (Chunk c : bs.chunkList) {
if (c.begin >= begin && c.end <= end) {
for (Word w : c.trigs) {
if (!validTG.contains(w) && w.pos > pos1) {
return w;
} else if (validTG.contains(w) && w.pos > pos1) {
String key = tg.word + tg.pos_tag;
Map<String, Counter> ct = sharedTG.get(key);
if (ct == null) {
ct = new HashMap<String, Counter>();
sharedTG.put(key, ct);
}
Counter count = ct.get(w.word + w.pos_tag);
if (count == null) {
count = new Counter(1);
ct.put(w.word + w.pos_tag, count);
} else {
count.inc();
}
}
}
}
}
return null;
}
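/**
* Persist all learned patterns into the PATTERNS table, which is dropped and
* recreated on each run. One row per (trigger, pattern) pair, including its
* raw count and how often the trigger was detected with that POS/chunk type.
*/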
public void storePatterns(Map<String, Rules>[] map, DBUtils db) {
DBUtils dbs;
dbs = db;
Connection cons;
Statement stms;
PreparedStatement ps;
try {
System.out.println("----> Storing patterns.....");
dbs.dropTable("Patterns");
cons = dbs.getConnection();
stms = cons.createStatement();
String sql = "CREATE CACHED TABLE PATTERNS(TRGKEY VARCHAR(80), TYPE VARCHAR(25), verb_type int, POS varchar(5), "
+ "chunk_type varchar(5), pos1 boolean, pos2 boolean, prep1 varchar(10), prep2 varchar(10), prep_order boolean,"
+ "has_theme2 boolean,in_chunk boolean, chunk1 int, chunk2 int, event1 boolean, event2 boolean,"
+ "trg1 varchar(2000), trg2 varchar(2000), pcount int, detected int)";
stms.executeUpdate(sql);
ps = cons.prepareStatement("INSERT INTO Patterns(trgkey,type,verb_type, pos, chunk_type,pos1, pos2,"
+ "prep1,prep2,prep_order,has_theme2,in_chunk,chunk1,chunk2,event1,event2,trg1,trg2,pcount,detected) VALUES(?,?,?,?,?,?,?,?,?,?,?"
+ ",?,?,?,?,?,?,?,?,?)");
Map<String, Rules> m;
String type;
Rules rule;
RuleData p;
for (int i = 0; i < map.length; i++) {
m = map[i];
type = SenSimplifier.trigger_type[i];
for (String s : m.keySet()) {
rule = m.get(s);
for (String st : rule.map.keySet()) {
p = rule.map.get(st);
ps.setString(1, s);
ps.setString(2, type);
ps.setInt(3, p.verb_type);
ps.setString(4, p.POS);
ps.setString(5, p.chunk_type);
ps.setBoolean(6, p.theme_pos);
ps.setBoolean(7, p.cause_pos);
ps.setString(8, p.prep1);
ps.setString(9, p.prep2);
ps.setBoolean(10, p.prep_order);
ps.setBoolean(11, p.has_cause);
ps.setBoolean(12, p.in_chunk);
ps.setInt(13, p.dist1);
ps.setInt(14, p.dist2);
ps.setBoolean(15, p.event1);
ps.setBoolean(16, p.event2);
ps.setString(17, p.mapToString(p.childMap));
ps.setString(18, p.mapToString(p.parentMap));
ps.setInt(19, p.count);
p.detected = getCountDetectedTG(s,p.POS+p.chunk_type);
ps.setInt(20, p.detected);
ps.executeUpdate();
}
}
}
ps.close();
System.out.println("---DONE---> Saving patterns");
} catch (Exception e) {
System.out.println("ERORR here ");
System.out.println(e);
}
}
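/**
* Aggregate the NP/CP patterns of each trigger into one RuleSet per
* trigger + event type + POS/chunk key; only rule sets seen at least twice
* are kept. Note: order1/order2 are never updated in this method, so the
* order flag effectively always stays true here.
*/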
private void combiningNP(Map<String, Rules> rl, Map<String, RuleSet> ruleset, int i) {
RuleSet rs;
String rkey;
int order1, order2;
Rules rule;
List<RuleData> ls;
String keys[] = {"NNNP", "VBNP", "JJNP", "NNCP", "VBCP", "JJCP",}; // noun, adj, vb in noun phrase
for (String s : rl.keySet()) {
rule = rl.get(s);
rule.initMap();
// process NP chunks
// cases to consider: PRO-TG (PRO)*
// TG prep1 PRO1 (prep2 PRO2)*
for (String subkey : keys) {
ls = rule.getEvalRules(subkey); // all NP pattern of current trg
if (ls == null) {
continue;
}
rkey = s + i + subkey; // trg + type + pos(first two letters) + chunk type
// has NP patterns
rs = new RuleSet();
order1 = 0;
order2 = 0;
for (RuleData dt : ls) {
if (dt.in_chunk) { // in chunk case
rs.in_chunk = true;
rs.inchunk_count += dt.count; // count number of inchunk event
if (!dt.has_cause) { // all event type without theme2/cause
if (dt.event1) {
rs.ecount += dt.count;
} else {
rs.pcount += dt.count;
}
} else if (i == 5 && dt.has_cause) { // theme2 PRO-TG PRO
rs.t2count += dt.count;
rs.pcount += dt.count;
rs.dist2 = Math.max(rs.dist2, dt.dist2);
} else if (i > 5 && dt.theme_pos && dt.has_cause) { // has cause
rs.pcause += dt.count; // assume only pro is cause : PRO - TG Theme (Pro/Evt)
if (dt.event1) {
rs.ecount += dt.count;
} else {
rs.pcount += dt.count;
}
rs.dist1 = Math.max(rs.dist1, dt.dist1);
}
} else if (dt.theme_pos) { // for all POS : TG - prep - PRO
if ((dt.POS.startsWith("NN") && !dt.prep1.isEmpty()) || !dt.POS.startsWith("NN")) {// (NN && prep) or (VB/JJ)
if (!dt.has_cause) { // all event type, no theme2 / cause
if (i <= 5) {
rs.pcount += dt.count;
} else {
if (dt.event1) {
rs.ecount += dt.count;
} else {
rs.pcount += dt.count;
}
}
rs.dist1 = Math.max(rs.dist1, dt.dist1);
} else if (dt.cause_pos && !dt.prep2.isEmpty() && dt.POS.startsWith("NN")) { // TG-prep1-PRO1-prep2-PRO2 ; only NNx
if (i == 5) {
rs.t2count += dt.count;
rs.pcount += dt.count;
} else {
if (dt.event1) {
rs.ecount += dt.count;
} else {
rs.pcount += dt.count;
}
if (dt.event2) {
rs.ecause += dt.count;
} else {
rs.pcause += dt.count;
}
}
rs.dist1 = Math.max(rs.dist1, dt.dist1);
rs.dist2 = Math.max(rs.dist2, dt.dist2);
}
}
} else if (i == 5 && !dt.theme_pos && ((dt.has_cause && dt.cause_pos) || !dt.POS.startsWith("NN"))) { // Binding: PRO1 - TG - PRO2
rs.in_front += dt.count;
if (!dt.prep2.isEmpty()) {
rs.prep_2.add(dt.prep2);
}
if (!dt.prep1.isEmpty()) {
rs.prep_1.add(dt.prep1);
}
rs.dist1 = Math.max(rs.dist1, dt.dist1);
rs.dist2 = Math.max(rs.dist2, dt.dist2);
rs.pcount += dt.count;
}
}
rs.detected = getCountDetectedTG(s, subkey);
if (order2 > order1) {
rs.order = false;
}
if (rs.getFreq() >= 2) {
ruleset.put(rkey, rs);
}
}
}
}
/**
* Get the number of detected occurrences of a trigger with a given POS and
* chunk type.
*
* @param tg: trigger
* @param subkey: POS + chunk type which contains tg
* @return detected count, or 0 if the trigger was never counted
*/
private int getCountDetectedTG(String tg, String subkey) {
Map<String, Counter> ct = TGCount.get(tg);
if (ct != null) {
if (subkey.endsWith("CP")) { // CP->NP
subkey = subkey.substring(0, subkey.length()-2) + "NP";
}
Counter c = ct.get(subkey); //subkey: NNNP;VBNP;JJNP;VBVP;JJVP
return c != null ? c.count : 0;
}
return 0;
}
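/**
* Aggregate VP patterns (VB/JJ triggers) into rule sets, analogous to
* combiningNP; for passive verbs (verb_type 1, POS "VBN") the theme position
* votes on the argument-order flag.
*/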
private void combiningVP(Map<String, Rules> rl, Map<String, RuleSet> ruleset, int i) {
RuleSet rs;
String rkey;
int order1;
Rules rule;
List<RuleData> ls;
String keys[] = {"VBVP", "JJVP"}; // verb and adj triggers in verb phrases
for (String s : rl.keySet()) {
rule = rl.get(s);
rule.initMap();
// VB and JJ triggers in verb phrases
for (String subkey : keys) {
ls = rule.getEvalRules(subkey); // all VP patterns of current trg
if (ls == null) {
continue;
}
rkey = s + i + subkey;
rs = new RuleSet();
order1 = 0;
for (RuleData dt : ls) {
if (dt.count < 2 && i < 5) {
//continue;
}
if (!dt.has_cause) {
if (i <= 5) {
rs.pcount += dt.count;
} else {
if (dt.event1) {
rs.ecount += dt.count;
} else {
rs.pcount += dt.count;
}
}
} else { // has theme2/cause
if (i == 5) {
rs.t2count += dt.count;
rs.pcount += dt.count;
} else {
if (dt.event1) {
rs.ecount += dt.count;
} else {
rs.pcount += dt.count;
}
if (dt.event2) {
rs.ecause += dt.count;
} else {
rs.pcause += dt.count;
}
}
}
if (dt.verb_type == 1 && dt.POS.equals("VBN") && dt.theme_pos) {
order1 += dt.count;
} else if (dt.verb_type == 1 && dt.POS.equals("VBN") && !dt.theme_pos) {
order1 -= dt.count;
}
rs.dist1 = Math.max(rs.dist1, dt.dist1);
rs.dist2 = Math.max(rs.dist2, dt.dist2);
}
rs.detected = getCountDetectedTG(s,subkey);
if (order1 > 0) {
rs.order = false;
}
if (rs.getFreq() >= 2) {
ruleset.put(rkey, rs);
}
}
}
}
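/**
* Combine the raw per-trigger patterns of all 9 event types into a single
* rule-set map, merging NP- and VP-based patterns.
*/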
public Map<String, RuleSet> combiningRules(Map<String, Rules>[] rules) {
Map<String, RuleSet> ruleset = new HashMap<String, RuleSet>();
Map<String, Rules> rl;
for (int i = 0; i < 9; i++) {
rl = rules[i];
combiningNP(rl, ruleset, i);
combiningVP(rl, ruleset, i);
}
return ruleset;
}
public String setToStr(Set<String> set) {
String txt = "";
for (String s : set) {
txt += s + " ";
}
return txt;
}
public String mapToStr(Map<String, Set<String>> map) {
String txt = "";
for (String s : map.keySet()) {
txt += s + ":";
for (String key : map.get(s)) {
txt += key + " ";
}
txt += "|";
}
return txt;
}
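/**
* Persist combined rule sets into the RULESET table, recreated on each run.
* Currently unused in LearnData (see the commented-out calls above).
*/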
public void storeRuleSet(Map<String, RuleSet> map, DBUtils db) {
DBUtils dbs;
dbs = db;
Connection cons;
Statement stms;
PreparedStatement ps;
RuleSet rs;
try {
System.out.println("----> Storing rulesets.....");
dbs.dropTable("RuleSet");
cons = dbs.getConnection();
stms = cons.createStatement();
String sql = "CREATE CACHED TABLE RULESET(KEY VARCHAR(80), INCHUNK BOOLEAN, DIST1 INT, DIST2 INT, PREP VARCHAR(100), "
+ "PREP2 VARCHAR(300), T_ORDER BOOLEAN, PCOUNT INT, ECOUNT INT, T2COUNT INT, PCAUSE INT, ECAUSE INT, "
+ "INPREP VARCHAR(100), in_front int, prep_1 varchar(100), prep_2 varchar(100), detected int, inchunk_count int, apply int)";
stms.executeUpdate(sql);
ps = cons.prepareStatement("INSERT INTO RULESET(key,inchunk,dist1,dist2,prep,prep2,t_order,pcount,ecount,t2count,pcause,"
+ "ecause,INPREP, in_front, prep_1,prep_2, detected, inchunk_count,apply) "
+ "VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)");
for (String s : map.keySet()) {
rs = map.get(s);
ps.setString(1, s);
ps.setBoolean(2, rs.in_chunk);
ps.setInt(3, rs.dist1);
ps.setInt(4, rs.dist2);
ps.setString(5, setToStr(rs.prep));
ps.setString(6, mapToStr(rs.prep2));
ps.setBoolean(7, rs.order);
ps.setInt(8, rs.pcount);
ps.setInt(9, rs.ecount);
ps.setInt(10, rs.t2count);
ps.setInt(11, rs.pcause);
ps.setInt(12, rs.ecause);
ps.setString(13, setToStr(rs.inchunk_prep));
ps.setInt(14, rs.in_front);
ps.setString(15, setToStr(rs.prep_1));
ps.setString(16, setToStr(rs.prep_2));
ps.setInt(17,rs.detected);
ps.setInt(18,rs.inchunk_count);
ps.setInt(19,rs.apply);
ps.executeUpdate();
}
ps.close();
System.out.println("---DONE---> Saving patterns");
} catch (Exception e) {
System.out.println("ERORR here ");
System.out.println(e.getLocalizedMessage());
}
}
public final static Set<String> preps = new HashSet<String>();
public final static Set<String> prepmap = SenSimplifier.prepmap;
static {
preps.add("by");
preps.add("through");
preps.add("after");
}
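// nppt/vppt: counts of NP-/VP-based patterns; vppt2: VP events with nesting level >= 2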
int nppt = 0, vppt = 0, vppt2 = 0;
EData curr_event = null;
}