// relations.TriggerLearner Maven / Gradle / Ivy
package relations;
import corpora.DataLoader;
import utils.DBUtils;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.*;
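/**
 * Learns trigger statistics from annotated event data: {@link #preparedData()} aggregates
 * trigger/event-type frequencies into the TRIGDATA table of the source database, and
 * {@link #generateKeyData()} derives the per-trigger dictionary that is stored in the KEYDATA
 * table of the destination database.
 *
 * Created: Oct 27, 2010; revised: Jan 22, 2011.
 *
 * @author Chinh
 */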
public class TriggerLearner {
// One map per trigger type (same index as trigger_type): trigger text -> Counter.
// Rebuilt from scratch by preparedData().
Map[] maptype = new HashMap[trigger_type.length];
// db_sr: source database (the constructor opens con/stmt on it);
// db_dest: destination database (saveKeyData() writes the KEYDATA table there).
DBUtils db_sr, db_dest;
// Connection and statement on the source database; they stay null if the constructor fails to open them.
Connection con;
Statement stmt;
// Result set of the aggregation query run by preparedData().
ResultSet rs;
// Insert statement for TRIGDATA; prepared by preparedData() and used by saveData().
PreparedStatement ps;
// Data loader built on the source database (proteins, triggers, events, PMIDs, dictionary).
SenSimplifier sim;
// NOTE(review): not referenced inside this class; getType() and sum() hard-code 9 instead.
int nr_event = 9;
// Public word lists; within this class they are only emptied by clearList().
public List proWord = new ArrayList();
public List prepWord = new ArrayList();
public List relWords = new ArrayList();
// Frequency threshold used when dropping rare triggers.
// NOTE(review): saveData() drops on "sum <= fthreshold" while saveKeyData() drops on
// "total < fthreshold" - confirm which comparison is intended.
int fthreshold = 2; // threshold to drop a trigger if its frequency is lower than this value
/**
 * Creates a learner that reads from {@code sr} and writes its dictionary to {@code dbs}.
 * <p>
 * Opens a connection and a statement on the source database, fills the
 * trigger-type-to-index map ({@code hashType}) and the non-trigger word map
 * ({@code notrigger}), and creates the {@link SenSimplifier} on the source database.
 * A failure to open the connection is reported but not thrown; {@code con}/{@code stmt}
 * then stay {@code null}.
 *
 * @param sr  source database
 * @param dbs destination database
 */
public TriggerLearner(DBUtils sr, DBUtils dbs) {
    db_sr = sr;
    db_dest = dbs;
    try {
        con = db_sr.getConnection();
        stmt = con.createStatement();
    } catch (Exception e) {
        // Report the exception itself: getCause() is normally null here, which
        // used to print just "null" and hide why the connection failed.
        System.out.println("Cannot open source database connection: " + e);
        e.printStackTrace();
    }
    // trigger type <-> index
    for (int i = 0; i < trigger_type.length; i++) {
        hashType.put(trigger_type[i], i);
    }
    // words that must never be treated as triggers (duplicates in none_rel collapse in the map)
    for (String word : none_rel) {
        notrigger.put(word, word);
    }
    sim = new SenSimplifier(db_sr);
}
/**
 * Empties the three public word lists ({@code proWord}, {@code prepWord}, {@code relWords}).
 */
public void clearList() {
    for (List words : Arrays.asList(proWord, prepWord, relWords)) {
        words.clear();
    }
}
/**
 * Reads the (event type, trigger text) pairs from the source database, aggregates their
 * frequencies per trigger text and stores the result in a freshly created TRIGDATA table.
 * <p>
 * Trigger texts are lower-cased and texts contained in {@code notrigger} are ignored.
 * Rows are written through {@link #saveData(String, int[])}, which also applies the
 * frequency threshold. Any failure is reported with a stack trace and ends the method;
 * nothing is thrown to the caller.
 */
public void preparedData() {
    String sql =
        "select t_type,txt, count(txt) as num from (select event.t_type, triggers.txt from event, triggers " + "where event.PMID = triggers.PMID and event.TRIG_ID = triggers.TID) group by t_type,txt order by num desc";
    int len = trigger_type.length;
    // Distinct trigger texts in first-seen order; replaces the former parallel list + map.
    Set<String> keys = new LinkedHashSet<String>();
    try {
        System.out.println("Creating trigger list .....");
        stmt.execute("DROP TABLE trigdata if exists");
        stmt.execute("CREATE CACHED TABLE TRIGDATA(KEY VARCHAR(80), expr INT, trans INT , " + "catabo int, phospho INT, local INT, bind int, pos_reg int, reg int, neg_reg int, ubi int, prot_mod int, deacet int, acet int, tt int) ");
        ps =
            con.prepareStatement("insert into trigdata(key,expr,trans,catabo,phospho,local," + "bind,pos_reg,reg,neg_reg,ubi,prot_mod,deacet,acet,tt) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)");
        for (int i = 0; i < len; i++) {
            maptype[i] = new HashMap<String, Counter>(); // trigger text -> frequency for type i
        }
        rs = stmt.executeQuery(sql);
        while (rs.next()) {
            String t_type = rs.getString(1);
            int idx = SenSimplifier.hashType.get(t_type);
            String txt = rs.getString(2).toLowerCase();
            if (notrigger.containsKey(txt)) {
                continue; // listed as a non-trigger word
            }
            int count = rs.getInt(3);
            @SuppressWarnings("unchecked")
            Map<String, Counter> item = maptype[idx];
            Counter seen = item.get(txt);
            if (seen == null) {
                item.put(txt, new Counter(count)); // first occurrence for this type
            } else {
                seen.add(count); // same text again (e.g. different casing in the table)
            }
            keys.add(txt);
        }
        rs.close();
        int[] data = new int[len];
        for (String key : keys) {
            for (int i = 0; i < len; i++) {
                Counter c = (Counter) maptype[i].get(key);
                data[i] = (c == null) ? 0 : c.getValue();
            }
            // key -> [count per event type]; saveData copies what it needs, so the array is reused
            saveData(key, data);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release JDBC resources on the failure path as well; closing twice is a no-op.
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (Exception ignored) {
            // nothing useful can be done if closing fails
        }
        try {
            if (ps != null) {
                ps.close();
            }
        } catch (Exception ignored) {
            // nothing useful can be done if closing fails
        }
    }
}
// NOTE(review): not referenced by any method of this class; possibly used by other classes in the package.
Map tmap = new HashMap(); // shared dictionary
/**
 * Selects the event types for which a trigger is frequent enough.
 * <p>
 * Only the first nine counters of {@code data} are inspected. A type qualifies when its
 * count is at least {@code fthreshold}. If exactly one type qualifies, it is returned as is.
 * Otherwise the trigger is a shared/mixed candidate and only types whose count is at least
 * {@code fthreshold + 3} are kept, so the result may be empty.
 *
 * @param data per-event-type frequencies of one trigger; must hold at least nine entries
 * @return indices into {@code trigger_type} of the retained event types, in ascending order
 */
private List<Integer> getType(int[] data) {
    int freq = fthreshold; // Change threshold here
    List<Integer> list = new ArrayList<Integer>();
    for (int i = 0; i < 9; i++) { // 9 event types
        if (data[i] >= freq) {
            list.add(i);
        }
    }
    if (list.size() != 1) {
        // shared/mixed trigger: apply the stricter threshold (freq + 3)
        for (Iterator<Integer> it = list.iterator(); it.hasNext();) {
            if (data[it.next()] < freq + 3) {
                it.remove();
            }
        }
    }
    return list;
}
/**
 * Recreates the KEYDATA table in the destination database and stores the dictionary in it.
 * <p>
 * Entries whose {@code found} value is below {@code fthreshold} are skipped. An entry with
 * {@code keytype == 1} produces one row; any other entry produces one row per value of its
 * {@code getMap()}. The {@code total} column always holds the top-level entry's
 * {@code found} value. Failures are reported with a stack trace and not thrown.
 *
 * @param map trigger text -> dictionary entry
 */
private void saveKeyData(Map<String, KeyData> map) {
    String sql =
        " Insert into keydata(key,ktype,type,freq,total, pcount, ecount,pcause,ecause,t2count,child, parent) " + "values(?,?,?,?,?,?,?,?,?,?,?,?)";
    System.out.println("Saving trigger data..........");
    Statement stmt1 = null;
    PreparedStatement ps1 = null;
    try {
        System.out.println("---> Saving dictionary, number of entries: " + map.size());
        // The connection belongs to db_dest and is not closed here.
        Connection con1 = db_dest.getConnection();
        stmt1 = con1.createStatement();
        stmt1.execute("DROP TABLE keydata if exists");
        stmt1.execute(" Create cached table KEYDATA(key varchar(80), ktype int, type varchar(50), freq int, total int, pcount int," + "ecount int, pcause int, ecause int, t2count int,child varchar(2000), parent varchar(2000))");
        ps1 = con1.prepareStatement(sql);
        List<KeyData> rows = new ArrayList<KeyData>();
        for (KeyData dt : map.values()) {
            int total = dt.found;
            if (total < fthreshold) {
                continue;
            }
            rows.clear();
            if (dt.keytype == 1) {
                rows.add(dt);
            } else {
                rows.addAll(dt.getMap().values());
            }
            for (KeyData k : rows) {
                ps1.setString(1, k.key);
                ps1.setInt(2, k.keytype);
                ps1.setString(3, k.type);
                ps1.setInt(4, k.freq);
                ps1.setInt(5, total);
                ps1.setInt(6, k.pcount);
                ps1.setInt(7, k.ecount);
                ps1.setInt(8, k.pcause);
                ps1.setInt(9, k.ecause);
                ps1.setInt(10, k.t2count);
                ps1.setString(11, k.set2String(k.child));
                ps1.setString(12, k.set2String(k.parent));
                ps1.executeUpdate();
            }
        }
    } catch (Exception e) {
        // Print the exception itself; getCause() is usually null and printed only "null".
        e.printStackTrace();
    } finally {
        // Release both statements on the failure path as well.
        try {
            if (ps1 != null) {
                ps1.close();
            }
        } catch (Exception ignored) {
            // nothing useful can be done if closing fails
        }
        try {
            if (stmt1 != null) {
                stmt1.close();
            }
        } catch (Exception ignored) {
            // nothing useful can be done if closing fails
        }
    }
    System.out.println("Saving trigger data... done!");
}
/**
 * Builds the trigger dictionary from the training data: first the TRIGDATA statistics
 * ({@link #preparedData()}), then the KEYDATA dictionary ({@link #generateKeyData()}).
 */
public void createTriggers() {
    this.preparedData();
    this.generateKeyData();
}
/**
 * Command-line entry point.
 *
 * @param args exactly two values: the source database location and the destination
 *             database location; any other argument count prints a message and exits
 *             with status 1
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.out.println("No input and output folder declared.");
        System.exit(1);
    }
    String sourcePath = args[0];
    String destPath = args[1];

    DBUtils source = new DBUtils();
    source.openDB(sourcePath); // source database
    DBUtils dest = new DBUtils();
    dest.openDB(destPath); // destination database

    // store all triggers (both train and dev) into train DB
    new TriggerLearner(source, dest).createTriggers();

    // Only the destination database is shut down; the source database is left open.
    dest.shutdownDB();
}
/**
 * Adds one trigger's per-type frequencies to the running totals in {@code score} and,
 * if the trigger is frequent enough, inserts its row into TRIGDATA through {@code ps}.
 * <p>
 * The row is (key, one count per trigger type, total). Triggers whose total is not above
 * {@code fthreshold} are not stored, but still contribute to {@code score}. Failures are
 * reported with a stack trace and not thrown.
 *
 * @param key  trigger text
 * @param data frequency per trigger type; must hold at least {@code trigger_type.length} entries
 */
private void saveData(String key, int[] data) {
    int len = trigger_type.length;
    int sum = 0;
    try {
        for (int i = 0; i < len; i++) {
            score[i] += data[i]; // total frequency per event type over all triggers
            sum += data[i];
        }
        if (sum <= fthreshold) {
            return;
        }
        ps.setString(1, key);
        for (int i = 0; i < len; i++) {
            ps.setInt(i + 2, data[i]);
        }
        ps.setInt(len + 2, sum);
        ps.executeUpdate();
    } catch (Exception e) {
        // Print the exception itself; getCause() is usually null and printed only "null".
        e.printStackTrace();
    }
}
/**
 * Loads proteins, triggers and events of one abstract and rebuilds the id lookup maps.
 * <p>
 * After all three maps are filled, every event is initialised with them via
 * {@code EData.init}. The list and map fields are declared as raw collections, so elements
 * are cast explicitly while iterating.
 *
 * @param pid identifier (PMID) of the abstract to load
 * @return always {@code true}
 */
public boolean initEventData(String pid) {
    plist = sim.loadProtein(pid); // load Protein list
    tlist = sim.loadTrigger(pid); // load Trigger list based on PMID
    elist = sim.loadEvent(pid); // load Event list
    // protein id -> protein
    mprotein.clear();
    for (Object o : plist) {
        TData protein = (TData) o;
        mprotein.put(protein.tid, protein);
    }
    // trigger id -> trigger
    mtrigger.clear();
    for (Object o : tlist) {
        TData trigger = (TData) o;
        mtrigger.put(trigger.tid, trigger);
    }
    // event id -> event
    mevent.clear();
    for (Object o : elist) {
        EData event = (EData) o;
        mevent.put(event.eid, event);
    }
    // Separate pass: init needs the complete maps.
    for (Object o : elist) {
        ((EData) o).init(mprotein, mtrigger, mevent);
    }
    return true;
}
/**
 * Tells whether an event lies completely inside the character range {@code [begin, end]}.
 * <p>
 * The trigger must be inside the range, and so must the theme ({@code data1}) and the
 * second argument: the cause ({@code ecause}) when present, otherwise the second theme
 * ({@code data2}) when present. Arguments that are themselves events are checked
 * recursively.
 *
 * @param begin start offset of the range
 * @param end   end offset of the range
 * @param ev    event to check
 * @return {@code true} if the trigger and all checked arguments are inside the range
 */
private boolean inSentence(int begin, int end, EData ev) {
    if (!spanWithin(begin, end, ev.trgdata)) {
        return false;
    }

    boolean themeInside;
    if (ev.data1 instanceof TData) {
        themeInside = spanWithin(begin, end, (TData) ev.data1);
    } else {
        themeInside = inSentence(begin, end, (EData) ev.data1);
    }

    boolean secondInside;
    if (ev.ecause == null) {
        // no cause: fall back to theme2, or accept when there is none
        secondInside = (ev.data2 == null) || spanWithin(begin, end, (TData) ev.data2);
    } else if (ev.ecause instanceof EData) {
        secondInside = inSentence(begin, end, (EData) ev.ecause);
    } else {
        secondInside = spanWithin(begin, end, (TData) ev.ecause);
    }

    return themeInside && secondInside;
}

/** Returns whether the span stored in {@code span.list[0..1]} lies inside {@code [begin, end]}. */
private boolean spanWithin(int begin, int end, TData span) {
    return span.list[0] >= begin && span.list[1] <= end;
}
/**
 * Looks up a trigger text in a dictionary, falling back to its individual words.
 *
 * @param tg  trigger text, words separated by single spaces
 * @param dic dictionary of known triggers
 * @return {@code tg} if the dictionary contains it; otherwise the last (right-most) word
 *         of {@code tg} found in the dictionary; {@code null} if none is found
 */
private String getKey(String tg, Set dic) {
    if (dic.contains(tg)) {
        return tg;
    }
    String[] words = tg.split(" ");
    int pos = words.length;
    while (--pos >= 0) {
        String word = words[pos];
        if (dic.contains(word)) {
            return word;
        }
    }
    return null;
}
/**
 * Builds the trigger dictionary from all annotated abstracts and stores it via
 * {@link #saveKeyData}.
 * <p>
 * Steps:
 * <ol>
 * <li>For every abstract and sentence, each event whose trigger and arguments lie inside the
 * sentence and whose (lower-cased) trigger text is in the simple dictionary is counted:
 * per event type in {@code keys}, and per (type, trigger) theme/cause statistics in
 * {@code allTrg}. A trigger span annotated with two different event types in one sentence
 * is counted in {@code sharedTrg}.</li>
 * <li>The extra last slot of each counter array counts how often the analyzer detected the
 * word in a sentence that also contains a detected protein.</li>
 * <li>Each trigger is classified with {@link #getType(int[])}: key type 1 for a single
 * event type, otherwise 2 (shared) or 3 (mixed) with one sub-entry per retained type.</li>
 * <li>Regulation triggers (type indices 6-8) are registered as parents of the
 * type-index-1 (Transcription) triggers they take as themes.</li>
 * <li>The statistics are copied into the dictionary entries and the result is saved.</li>
 * </ol>
 * Failures are reported with a stack trace and not thrown.
 */
public void generateKeyData() {
    final int typeCount = trigger_type.length;
    // Counter arrays have typeCount + 1 slots; the last one holds the detection count.
    final int detectedSlot = typeCount;
    List<String> pmids = sim.loadPMIDs();
    sim.loadSimpleDic(); // load trigger from trigger data
    SenAnalyzer analyzer = new SenAnalyzer(sim);
    Set<String> dict = sim.simpleDic;

    @SuppressWarnings("unchecked")
    Map<String, TriggerData>[] allTrg = new HashMap[typeCount];
    for (int i = 0; i < typeCount; i++) {
        allTrg[i] = new HashMap<String, TriggerData>();
    }
    Map<String, int[]> keys = new HashMap<String, int[]>(); // trigger -> counts per type (+ detection slot)
    Map<String, Counter> sharedTrg = new HashMap<String, Counter>(); // trigger -> times seen with two types
    Map<String, String> tempMap = new HashMap<String, String>(); // per sentence: trigger span -> event type
    Set<TData> usedTG = new HashSet<TData>(); // per sentence: triggers already counted
    try {
        System.out.println("Generating trigger related data.....");
        for (String id : pmids) { // list of abstracts
            if (!analyzer.initData(id)) {
                continue;
            }
            initEventData(id); // load events, triggers, proteins
            @SuppressWarnings("unchecked")
            List<EData>[] events = analyzer.splitEvents(elist);
            for (int i = 0; i < analyzer.shortsen.length; i++) {
                tempMap.clear();
                usedTG.clear();
                int sen_begin = analyzer.senpos[i];
                int sen_end = analyzer.senpos[i] + analyzer.longsen[i].length();
                for (EData ed : events[i]) {
                    int ev_type = (Integer) hashType.get(ed.type);
                    if (!inSentence(sen_begin, sen_end, ed)) {
                        continue; // part of the event belongs to another sentence
                    }
                    TData tg = ed.getTrigger();
                    Object data1 = ed.data1;
                    String tg_value = tg.name.toLowerCase();
                    // "foo-bar" -> "foobar" when the joined form is a known trigger
                    if (tg_value.contains("-") && tg_value.length() > 8) {
                        String[] ww = tg_value.split("-");
                        if (ww.length == 2 && dict.contains(ww[0] + ww[1])) {
                            tg_value = ww[0] + ww[1];
                        }
                    }
                    if (!dict.contains(tg_value)) {
                        continue; // unknown trigger
                    }
                    if (!usedTG.add(tg)) {
                        continue; // this trigger has been used
                    }
                    // setup shared triggers
                    String tg_key = tg_value + tg.list[0] + "" + tg.list[1];
                    String old_type = tempMap.get(tg_key);
                    if (!tempMap.containsKey(tg_key)) {
                        tempMap.put(tg_key, ed.type);
                    } else if (!old_type.equals(ed.type)) {
                        Counter c = sharedTrg.get(tg_value);
                        if (c == null) {
                            sharedTrg.put(tg_value, new Counter(1));
                        } else {
                            c.inc();
                        }
                    }
                    // count trigger frequency
                    int[] counter = keys.get(tg_value);
                    if (counter == null) {
                        counter = new int[typeCount + 1];
                        keys.put(tg_value, counter);
                    }
                    counter[ev_type]++;
                    // count theme/cause
                    TriggerData tgdt = allTrg[ev_type].get(tg_value);
                    if (tgdt == null) {
                        tgdt = new TriggerData(tg_value, tg.type);
                        allTrg[ev_type].put(tg_value, tgdt);
                    }
                    if (data1 instanceof TData) {
                        tgdt.pcount++; // theme is a protein
                    } else { // theme1 is an event
                        tgdt.ecount++;
                        EData obj = (EData) data1;
                        int idx = (Integer) hashType.get(obj.type);
                        String tg2_value = obj.getTrigger().name.toLowerCase();
                        if (dict.contains(tg2_value)) { // known trigger
                            tgdt.child[idx].add(tg2_value);
                        }
                    }
                    if (ed.data2 != null) { // binding event
                        tgdt.t2_count++;
                    }
                    if (ed.ecause != null) {
                        if (ed.ecause instanceof TData) {
                            tgdt.pcause++;
                        } else {
                            tgdt.ecause++;
                            EData obj = (EData) ed.ecause;
                            String tg2_value = obj.getTrigger().name.toLowerCase();
                            if (dict.contains(tg2_value)) { // known trigger
                                tgdt.parent.add(tg2_value);
                            }
                        }
                    }
                } // end event loop
                // count detected triggers (only words that already have a counter)
                if (analyzer.detectedTrg[i].size() > 0 && analyzer.detectedPro[i].size() > 0) {
                    for (Word w : analyzer.detectedTrg[i]) {
                        int[] counter = keys.get(w.word);
                        if (counter != null) {
                            counter[detectedSlot]++;
                        }
                    }
                }
            }
        }

        // Classify every trigger and create its dictionary entry.
        Map<String, KeyData> mdict = new HashMap<String, KeyData>();
        for (Map.Entry<String, int[]> entry : keys.entrySet()) {
            String s = entry.getKey();
            int[] counter = entry.getValue();
            List<Integer> list = getType(counter);
            if (list.isEmpty()) {
                System.out.print("Skip: " + s + " --> freq: ");
                for (int u = 0; u < counter.length; u++) {
                    System.out.print(counter[u] + " ");
                }
                System.out.println("");
                continue;
            }
            if (list.size() == 1) {
                int idx = list.get(0);
                // stand-alone key
                mdict.put(s, new KeyData(s, trigger_type[idx], counter[idx], 1, counter[detectedSlot]));
                continue;
            }
            int total = sum(counter);
            int ktype = 3;
            Counter shared = sharedTrg.get(s);
            if (shared != null && shared.count > fthreshold + 2) { // threshold + 2 -> avoid noise
                ktype = 2;
                for (int k : list) {
                    // a type holding less than 30% of the occurrences makes the key "mixed"
                    if (counter[k] * 1f / total < 0.3) {
                        ktype = 3;
                        break;
                    }
                }
            }
            KeyData item = new KeyData(s, null, total, ktype, counter[detectedSlot]); // shared/mix key
            mdict.put(s, item);
            for (int k : list) {
                // Detection count lives in the last slot; index 9 is an event-type count.
                item.addToMap(new KeyData(s, trigger_type[k], counter[k], ktype, counter[detectedSlot]));
            }
        }

        // Regulation events (pos, reg, neg): register them as parents of the
        // type-index-1 triggers they take as themes.
        for (int i = 6; i < 9; i++) {
            for (TriggerData regulation : allTrg[i].values()) {
                for (Object st : regulation.child[1]) {
                    TriggerData themeTrigger = allTrg[1].get(st);
                    if (themeTrigger != null) {
                        themeTrigger.parent.add(regulation.trigger);
                    }
                }
            }
        }

        // Copy the collected statistics into the dictionary entries.
        List<KeyData> kls = new ArrayList<KeyData>();
        for (Map.Entry<String, KeyData> entry : mdict.entrySet()) {
            String s = entry.getKey();
            KeyData kdt = entry.getValue();
            kls.clear();
            if (kdt.keytype == 1) {
                kls.add(kdt);
            } else {
                kls.addAll(kdt.getMap().values());
            }
            for (KeyData dt : kls) {
                int idx = (Integer) hashType.get(dt.type);
                TriggerData tgdt = allTrg[idx].get(s);
                if (tgdt == null) {
                    continue;
                }
                StringBuilder children = new StringBuilder();
                for (int j = 0; j < tgdt.child.length; j++) {
                    for (Object st : tgdt.child[j]) {
                        children.append(st).append(',');
                    }
                }
                // Appended once, not once per child type as before.
                StringBuilder parents = new StringBuilder();
                for (Object st : tgdt.parent) {
                    parents.append(st).append(',');
                }
                dt.pcount = tgdt.pcount;
                dt.ecount = tgdt.ecount;
                dt.pcause = tgdt.pcause;
                dt.ecause = tgdt.ecause;
                dt.t2count = tgdt.t2_count;
                dt.initData(children.toString(), parents.toString());
            }
        }
        saveKeyData(mdict); // store into Database
    } catch (Exception e) {
        e.printStackTrace();
    }
    System.out.println("Trigger data generating .... Done!");
}
/**
 * Adds up the frequencies of the first nine event types for one trigger.
 *
 * @param ls frequencies per event type for a given trigger; must hold at least nine
 *           entries, any further entries are ignored
 * @return the sum of {@code ls[0]} through {@code ls[8]}
 */
private int sum(int[] ls) {
    int total = 0;
    int pos = 0;
    while (pos < 9) {
        total += ls[pos];
        pos++;
    }
    return total;
}
/*
 * Per-abstract lookup data, rebuilt by initEventData(): the three maps are keyed by
 * protein id, trigger id and event id (TData.tid / EData.eid) and are handed to
 * EData.init().
 */
Map mprotein = new HashMap();
Map mtrigger = new HashMap();
Map mevent = new HashMap();
// Lists loaded by initEventData() from the SenSimplifier for the current abstract.
List plist, tlist; // protein list, trigger list
List elist; // event list
// NOTE(review): the fields below are not referenced by any method of this class.
String simp, simpsen[], longsen[];
TData tgr, prt;
int[] simp_pos, full_pos;// starting position of the sentence related to the abstract
int split_count = 0;
/**
 * Event type names. The position in this array is the type index used throughout the
 * class (hashType, maptype, counter arrays, TriggerData.child). getType() and sum() only
 * look at the first nine entries, and generateKeyData() treats indices 6-8 as the
 * regulation types.
 */
public static String[] trigger_type = { "Gene_expression", "Transcription", "Protein_catabolism",
"Phosphorylation", "Localization", "Binding", "Positive_regulation", "Regulation", "Negative_regulation",
"Ubiquitination", "Protein_modification", "Deacetylation", "Acetylation" };
// Running total per event type, accumulated by saveData() over all triggers.
int[] score = new int[trigger_type.length];
// Event type name -> index in trigger_type; filled by the constructor.
Map hashType = new HashMap();
// Words that are never accepted as triggers (word -> word); filled from none_rel by the constructor.
Map notrigger = new HashMap();
// Non-trigger words. A few entries appear twice ("low", "by", "of"); harmless, since they
// are only loaded into the notrigger map.
public final static String[] none_rel = { "over", "when", "by", "via", "after", "high", "lower", "under",
"transcripts", "transcript", "upon", "poor", "potent", "in", "low", "through", "a", "the", "are", "is",
"was", "for", "into", "not", "it", "that", "level", "levels", "negative", "higher", "low", "because",
"due", "to", "with", "without", "at", "from", "more", "pair", "both", "and", "on", "inhibitor",
"inhibitors", "receptors", "receptor", "complex", "complexes", "transcriptional", "heterodimers",
"heterodimer", "homodimer", "during", "crucial", "failed", "exist", "critical", "of", "due to",
"because of", "by", "an", "of", "positive", "mrna", "mrnas" };
/**
 * Statistics collected for one trigger text within one event type: how often its theme or
 * cause is a protein or another event, and which trigger texts appear as its
 * children (themes that are events, grouped by the child's event type index) and parents.
 */
public class TriggerData {
    String trigger; // trigger text (lower-cased by the caller)
    String type; // type value passed to the constructor (generateKeyData passes the trigger's TData.type)
    int ttype = -1; // NOTE(review): never assigned or read inside this file
    int pcount = 0; // protein count -> all events
    int ecount = 0; // event count -> reg events ; for simple events: number of reg events in front
    int pcause = 0; // cause as protein -> reg events
    int ecause = 0; // cause as event -> reg events
    int t2_count = 0; // theme2 count -> binding event
    // child[i]: trigger texts of theme events whose type index is i
    @SuppressWarnings("unchecked")
    public Set<String>[] child = new HashSet[trigger_type.length];
    // trigger texts of events acting as cause of, or regulating, this trigger
    public Set<String> parent = new HashSet<String>();

    /**
     * @param trg trigger text
     * @param tp  type value to record for the trigger
     */
    public TriggerData(String trg, String tp) {
        trigger = trg;
        type = tp;
        for (int i = 0; i < trigger_type.length; i++) {
            child[i] = new HashSet<String>();
        }
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy