package relations;

import parser.Parser;
import utils.DBUtils;
import utils.SentenceSplitter.BioSemSentence;

import java.util.*;

/**
 * Analyzes sentences: 1. Detect triggers 2. Detect proteins 3. Split a
 * paragraph or abstract into single sentences 4. Parse sentences into chunks
 * and form relations between chunks
 * 
 * Input: abstract or paragraph. Output: sentences with annotated data.
 * 
 * @author Chinh
 * @Date: Jun 10, 2011
 */
public class SenAnalyzer {

	SenSimplifier simp;
	Map<String, TData> proMap = new HashMap<String, TData>(); // ID -> Protein
	List<TData> proList; // given protein list
	String longtxt, current, shortsen[], longsen[];
	@Deprecated
	String shorttxt;
	TData trig, pro;
	int[] senpos; // offsets of sentences relative to the abstract/paragraph
	String current_id;
	public Map<String, String> proIDMap = new HashMap<String, String>(); // PRO (theme1) -> event ID
	List<Word> allTriggers, longTrg[], detectedTrg[], detectedPro[]; // all triggers; triggers per sentence
	List<TData> longPro[]; // given proteins from the long sentence, per sentence
	List<String[]> tokenList = new ArrayList<String[]>(); // tokens per sentence
	List<String[]> tagList = new ArrayList<String[]>(); // POS tags per sentence
	Parser parser = new Parser();
	boolean default_db = true;
	Set<String> simpleDic;
	DBUtils db_sr, db; // DB source: dictionary and patterns; DB destination: testing data
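	// Per-event-type score thresholds (conf) and minimum trigger frequencies
	// (tgfreq), indexed via SenSimplifier.hashType; this reading is inferred
	// from their use in analyze() and may not match the author's exact intent.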
	double conf[] = { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.15, 0.1, 0.1 };
	int tgfreq[] = { 5, 5, 3, 3, 3, 5, 5, 5, 5 };

	/**
	 * For creating a new dictionary only
	 * 
	 * @param simp
	 *            : SenSimplifier
	 */
	public SenAnalyzer(SenSimplifier simp) {
		this.simp = simp;
		simpleDic = simp.simpleDic;
	}

	/**
	 * Use the given databases
	 * 
	 * @param sr
	 *            : source DB (dictionary and patterns)
	 * @param db_dest
	 *            : destination DB (testing data)
	 */
	public SenAnalyzer(DBUtils sr, DBUtils db_dest) {
		db_sr = sr;
		db = db_dest;
		simp = new SenSimplifier(db);
	}

	/**
	 * Set default database
	 * 
	 * @param dbs
	 */
	public void setDB(DBUtils dbs) {
		db = dbs;
		simp.setDB(db);
	}

	/**
	 * Close database [never called]
	 */
	public void closeDB() {
		if (db != null) {
			db.shutdownDB();
		}
	}

	/**
	 * Loading dictionaries
	 */
	public void init() {
		simp.loadDict(db_sr);
		simpleDic = simp.simpleDic;
	}

	/*
	 * Init abstract or paragraph: 1. Break the long text into single
	 * sentences; 2. Load proteins, assign IDs; 3. Calculate offsets
	 */
	public boolean initAbstract(String pid) {
		proMap.clear();
		proIDMap.clear();
		longtxt = simp.loadSentence(pid); // load long sentence / abstract
		if (longtxt == null) {
			System.out.println("PMID: " + pid + " -> no text");
			return false;
		}

		BioSemSentence[] originalSentences = simp.doSentenceSplitting(longtxt, "\n");
		longsen = new String[originalSentences.length];
		senpos = new int[longsen.length];// offset of long sentences
		for (int i = 1; i < originalSentences.length; i++)
			senpos[i] = originalSentences[i].begin;
		for (int i = 0; i < originalSentences.length; i++)
			longsen[i] = originalSentences[i].text;

		proList = simp.loadProtein(pid); // load Protein list
		
		
		allTriggers = detectTrg(longtxt); // all triggers from the full text
		longPro = splitData(proList, senpos); // split given proteins into their
												// corresponding sentences
		longTrg = splitTrg(allTriggers, senpos);

		// Remove triggers embedded inside protein names
		List<Word> trgRemoveList = new ArrayList<Word>();
		List<TData> proRemoveList = new ArrayList<TData>();
		for (int i = 0; i < senpos.length; i++) {
			trgRemoveList.clear();
			proRemoveList.clear();
			for (Word w : longTrg[i]) { // triggers embedded inside a protein
										// should be removed
				for (TData dt : longPro[i]) {
					if ((w.locs[0] >= dt.list[0] && w.locs[0] < dt.list[1])
							|| (w.locs[1] > dt.list[0] && w.locs[1] <= dt.list[1])) {
						trgRemoveList.add(w);
						// protein removal currently doesn't work (causes
						// errors in RuleLearner during learning)
						// proRemoveList.add(dt);
					}
				}
			}
			for (Word w : trgRemoveList) {
				longTrg[i].remove(w);
			}
			for (TData td : proRemoveList) {
				longPro[i].remove(td);
				proList.remove(td);
			}
		}

		shortsen = simp.doSimplifySentenceWise(originalSentences, proList);
		if (shortsen.length != longsen.length) {
			System.out.println("Skip due to number of long sentences != short sentences---> " + pid);
			return false;
		}

		// init protein map
		for (TData dt : proList) {
			proMap.put(dt.tid, dt); // map ID -> protein data
			proIDMap.put(dt.new_name, dt.tid);
		}

		return true;
	}

	/**
	 * Workflow: 1. Split the abstract/paragraph into single sentences 2.
	 * Detect the triggers and proteins belonging to each sentence 3. Assign
	 * trigger locations (offsets into the long sentence) and protein IDs
	 * 
	 * @param id
	 *            : abstract/paragraph ID. Output: lists of triggers and
	 *            proteins stored in detectedTrg and detectedPro.
	 */
	public boolean initData(String id) {
		current_id = id;
		Word w1, w2;
		TData trg;
		// preparing data
		if (!initAbstract(id)) {
			System.out.println("Skip this abstract, init failed: " + id);
			return false;
		}
		detectedPro = new ArrayList[senpos.length]; // list of proteins per sentence
		detectedTrg = new ArrayList[senpos.length]; // list of triggers per sentence

		tokenList.clear();
		for (int i = 0; i < senpos.length; i++) {
			current = shortsen[i];
			String token[] = parser.splitWord(current);
			tokenList.add(token);// keeps track of all token
			if (current.length() < 5) {
				detectedPro[i] = new ArrayList();
				detectedTrg[i] = new ArrayList();
				continue;
			}
			processSentence(current, token, i); // detect triggers and proteins
												// for the current sentence
			if (detectedTrg[i].size() != longTrg[i].size()) {
				// Make triggers consistent between the simplified and the long
				// sentence, i.e. restrict simplified triggers to those
				// actually found in the long sentence
				Set<String> longTrgs = new HashSet<>();
				for (Word w : longTrg[i]) {
					longTrgs.add(w.word);
				}
				Iterator<Word> detectedIt = detectedTrg[i].iterator();
				while (detectedIt.hasNext()) {
					Word word = detectedIt.next();
					if (!longTrgs.contains(word.word))
						detectedIt.remove();
				}
			}
			// Map trigger from short sentence into long sentence
			if (detectedTrg[i].size() == longTrg[i].size()) {
				for (int f = 0; f < detectedTrg[i].size(); f++) {
					w1 = detectedTrg[i].get(f);
					w2 = longTrg[i].get(f);
					w1.locs = w2.locs;
				}
			} else {
				throw new IllegalStateException("Trigger count mismatch in sentence " + i + " of " + id
						+ ": detected " + detectedTrg[i].size() + ", given " + longTrg[i].size());
			}
			// Assign IDs to PROs
			if (detectedPro[i].size() != longPro[i].size()) {
				throw new IllegalStateException("Protein count mismatch in sentence " + i + " of " + id
						+ ": detected " + detectedPro[i].size() + ", given " + longPro[i].size());
			}


			for (Word pr : detectedPro[i]) {
				pr.TID = proIDMap.get(pr.word); // assign PRO ID
				if (pr.TID == null) {
					// If this happens, processSentence() got a wrong protein
					// name in the simplified sentence by calling
					// simp.getProteins(w1). This can occur when a gene is only
					// part of a longer token, e.g. "EPLIN100kb" (occurs this
					// way in PMID 10806352), which is simplified to
					// "PRO9100kb". The protein name should be "PRO9", but
					// getProteins() employs a plain string match and returns
					// "PRO91" and "PRO910". Thus we search for the longest
					// actually existing protein name that is a prefix of the
					// detected one to identify the correct protein.
					int maxLength = 0;
					String bestCandidate = null;
					for (String prName : proIDMap.keySet()) {
						if (pr.word.startsWith(prName) && prName.length() > maxLength) {
							maxLength = prName.length();
							bestCandidate = prName;
						}
					}
					pr.TID = proIDMap.get(bestCandidate);
				}
				pr.locs = proMap.get(pr.TID).list; // original location
			}
		}
		return true;
	}

	int ecount = 0;

	/**
	 * Main method to analyze text
	 * 
	 * @param id
	 *            : abstract ID
	 * @return : array of chunk lists (with triggers and proteins) per sentence
	 */
	public List<Chunk>[] analyze(String id) {
		List<Chunk>[] list = null;
		Chunk c;
		List<Word> pros, trgs;
		String tags[], tokens[];
		boolean init = initData(id);
		if (!init) {
			return list;
		} else {
			list = new ArrayList[senpos.length];
		}
		Word prot, trg;
		tagList.clear();
		boolean remove;
		Map<String, KeyData> dic = simp.sharedDic;
		KeyData kdt;
		int e_idx;
		int freq;
		double threshold;
		for (int i = 0; i < senpos.length; i++) {
			if (detectedPro[i].isEmpty() || detectedTrg[i].isEmpty()) {
				list[i] = null; // skip this sentence: no trigger or no protein
				tagList.add(new String[0]);
			} else {
				tokens = tokenList.get(i);
				tags = parser.POSTag(tokens);
				tagList.add(tags);
				parser.old_txt = shortsen[i];
				List<Chunk> chunks = parser.parse(tokens, tags); // parse sentence i;
																	// tokens obtained
																	// from initData
				int pidx = 0, tidx = 0, cidx = 0; // protein, trigger, and chunk indexes
				pros = detectedPro[i];
				trgs = detectedTrg[i];
				int pcount = 0, tcount = 0;
				while (cidx < chunks.size()) {
					c = chunks.get(cidx);
					// add detected triggers into current chunk
					while (tidx < trgs.size()) {
						trg = trgs.get(tidx);
						kdt = dic.get(trg.word);
						remove = false;
						if (kdt.keytype == 1) {
							e_idx = SenSimplifier.hashType.get(kdt.type);
							threshold = conf[e_idx];
							freq = tgfreq[e_idx];
						} else {
							threshold = 0.15;
							freq = 10;
						}
						if (kdt.score < threshold || kdt.freq < freq) {
							// remove = true; // confidence-based filtering is
							// currently disabled
						}
						if (c.contains(trg) && !trg.word.contains(" ")) {
							if (notTrg.containsKey(trg.word)) {
								if (trg.pos < tokens.length - 1) {
									String key = tokens[trg.pos + 1];
									if (notTrg.get(trg.word).contains(key)) { // skip this trigger
										remove = true;
									}
								}
							}
							if (!remove) {
								c.addTrigger(trg);
							}
							tcount++; // count triggers that have been assigned
							tidx++; // next trigger
							trg.pos_tag = tags[trg.pos]; // assign POS
							trg.chunk_type = c.type;
							continue;
						} else if (c.contains(trg) && trg.word.contains(" ")) {
							if (trg.pos + 1 > c.end) { // merge with next chunk
								if (cidx < chunks.size() - 1) {
									Chunk c1 = chunks.get(cidx + 1);
									c.merge(c1);
									chunks.remove(c);
								} else {
									System.out.println(shortsen[i]);
									printChunk(chunks);
									System.out.println(trg.word + " Pos: " + trg.pos);
									remove = true;
								}
							}
							if (!remove) {
								c.addTrigger(trg);
							}
							tcount++; // count triggers that have been assigned
							tidx++; // next trigger
							trg.pos_tag = tags[trg.pos + 1]; // assign POS
							trg.chunk_type = c.type;
							continue;
						}
						break;
					}
					while (pidx < pros.size()) {
						prot = pros.get(pidx);
						remove = false;
						if (prot.pos >= c.begin && prot.pos <= c.end) {
							if (prot.fullword != null) {
								if (prot.fullword.contains(prot.word + "+") || prot.fullword.endsWith("+")
										|| prot.fullword.startsWith("anti")) {
									remove = true;
								}
							}
							if (!remove) {
								c.addPro(prot);
							}
							pcount++;
							pidx++;
							continue;
						} else {
							break;
						}
					}
					cidx++;
				}
				if (pcount != pros.size() || tcount != trgs.size()) {
					System.out.println("----BUG---> Sen analyzer: protein or trigger is missing");
					if (pcount != pros.size()) {
						System.out.println("Protein missed");
					} else if (tcount != trgs.size()) {
						System.out.println("Trigger missed");
					}
					for (Chunk ch : chunks) {
						System.out.print(ch + " ");
					}
					System.out.println("");
					ecounter++;
				}
				list[i] = chunks;
			}
		}
		return list;
	}

	int ecounter = 0;

	public List<Word>[] splitTrg(List<Word> list, int[] pos) {
		List<Word> dlist[] = new List[pos.length];
		for (int i = 0; i < pos.length; i++) {
			dlist[i] = new ArrayList<Word>();
		}
		int loc, idx;
		for (Word dt : list) {
			loc = dt.locs[0];
			idx = pos2sen(pos, loc);
			dlist[idx].add(dt);
		}
		return dlist;
	}

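	/**
	 * Maps a character offset to its sentence index: returns the largest i
	 * with pos >= senpos[i]. Illustrative example (not from the original
	 * source): with senpos = {0, 80, 150}, offset 95 maps to sentence 1 and
	 * offset 40 to sentence 0.
	 */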
	private int pos2sen(int[] senpos, int pos) {
		for (int i = senpos.length - 1; i > 0; i--) {
			if (pos >= senpos[i]) {
				return i;
			}
		}
		return 0;
	}

	/**
	 * Split a list of data (triggers/proteins) across sentences
	 * 
	 * @param list
	 *            : triggers/proteins
	 * @param pos
	 *            : sentence offsets
	 * @return array of lists, one per sentence
	 */
	public List<TData>[] splitData(List<TData> list, int pos[]) {
		List<TData> dlist[] = new List[pos.length];
		for (int i = 0; i < pos.length; i++) {
			dlist[i] = new ArrayList<TData>();
		}
		int loc, idx;
		for (TData dt : list) {
			loc = dt.list[0];
			idx = pos2sen(pos, loc);
			dlist[idx].add(dt);
		}
		return dlist;
	}

	public void testSentence() {
		init();
		String txt = "This is surprising, since IFN-alpha-inducible signaling cascades are present in A3.01 T cells: we showed that the control plasmid harboring interferon-responsive GAS elements was markedly induced by IFN-alpha treatment.";
		List<Word> ls = detectTrg(txt);
		for (Word w : ls) {
			System.out.println(w.word + " Pos: " + w.pos + " --> " + txt.substring(w.locs[0], w.locs[1]));
		}
	}

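	/**
	 * Detects trigger candidates in raw text by dictionary lookup. Hyphenated
	 * tokens are also matched in concatenated form (e.g., illustratively, a
	 * token "up-regulated" is looked up as "upregulated"); for long hyphenated
	 * tokens the last segment is tried as well.
	 */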
	public List<Word> detectTrg(String txt) {
		List<String> tokens[] = utils.SentenceSplitter.wordSpliter(txt);
		String[] words = tokens[0].toArray(new String[0]);
		List<Word> trgList = new ArrayList<Word>();
		Word word;
		String temp, w1;
		int i = 0, len;
		int loc = 0;
		while (i < words.length) {
			loc += tokens[1].get(i).length();
			word = null;
			w1 = words[i];
			len = w1.length();
			temp = w1.toLowerCase();
			if (simpleDic.contains(temp)) {
				if (temp.contains("-")) {
					String[] ww = temp.split("-");
				if (ww.length == 2 && simpleDic.contains(ww[0] + ww[1])) {
						word = new Word(ww[0] + ww[1], i, loc);
					}
				}
				if (word == null) {
					word = new Word(temp, i, loc);
				}
				int pos[] = new int[2];
				pos[0] = loc;
				pos[1] = loc + len;
				word.locs = pos;
				trgList.add(word);
			} else if (temp.contains("-") && temp.length() >= 8) {
				String[] ww = temp.split("-");
				int pt = ww.length - 1;
				String s = ww[pt];
				if (ww.length == 2 && simpleDic.contains(ww[0] + ww[1])) {
					word = new Word(ww[0] + ww[1], i, loc);
					int pos[] = new int[2];
					pos[0] = loc;
					pos[1] = loc + len;
					word.locs = pos;
					trgList.add(word);
				} else if (simpleDic.contains(s)) {
					word = new Word(s, i, loc + temp.indexOf(s));
					int pos[] = new int[2];
					pos[0] = loc + temp.indexOf(s);
					pos[1] = pos[0] + s.length();
					word.locs = pos;
					trgList.add(word);
				}
			}
			i++;
			loc = loc + len;
		}
		return trgList;
	}

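	/**
	 * Detects triggers and proteins in the simplified sentence with index idx
	 * and stores them in detectedTrg[idx] and detectedPro[idx]. Sketch of the
	 * logic (inferred from the code below): tokens found in simpleDic become
	 * triggers, tokens found in proIDMap become proteins, and compound tokens
	 * such as protein-trigger hyphenations may contribute both.
	 */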
	public void processSentence(String txt, String token[], int idx) {
		String[] words = token;
		List<String> list;
		List<Word> trgList = new ArrayList<Word>();
		List<Word> protList = new ArrayList<Word>();
		Word word;
		String temp, w1;
		int i = 0;
		String pid;
		boolean combine = false;
		while (i < words.length) {
			combine = false;
			w1 = words[i];
			temp = w1.toLowerCase();
			word = null;
			if (simpleDic.contains(temp)) {
				if (temp.contains("-")) {
					String[] ww = temp.split("-");
					if (ww.length == 2 && simpleDic.contains(ww[0] + ww[1])) {
						word = new Word(ww[0] + ww[1], i, 0);
					}
				}
				if (word == null) {
					word = new Word(temp, i, 0);
				}
				trgList.add(word);
				i++;
				continue;
			}
			if (proIDMap.containsKey(w1)) {
				word = new Word(w1, i, 0);
				pid = proIDMap.get(w1);
				TData pr = proMap.get(pid);
				word.locs = pr.list;
				protList.add(word);
				i++;
				continue;
			}
			if ((list = simp.getProteins(w1)).size() > 0) { // compound or combined
				if (w1.contains("-") && w1.length() >= 8) {
					String[] ww = w1.split("-");
					int pt = ww.length - 1;
					if (simpleDic.contains(ww[pt].toLowerCase())) {
						word = new Word(ww[pt].toLowerCase(), i, 0);
						word.compound = true;
						if (proIDMap.containsKey(ww[0]) && ww.length == 2) {
							word.combined = true;
							combine = true;
						}
						word.fullword = w1;
						trgList.add(word);
					}
				}
				for (String s : list) {
					word = new Word(s, i, 0);
					word.compound = true;
					word.combined = combine;
					word.fullword = w1;
					pid = proIDMap.get(s);
					TData pr = proMap.get(pid);
					if (pr != null) {
						word.locs = pr.list;
					} else {
						// Remove the last digit, which stems from an unwanted
						// pattern match, e.g. PROXY
						String s1 = s.substring(0, s.length() - 1);
						word.word = s1;
						pid = proIDMap.get(s1);
						pr = proMap.get(pid);
						if (pr != null) {
							word.locs = pr.list;
						} else { // failed!
							word.locs = new int[2];
						}
					}
					protList.add(word);
				}
				i++;
				continue;
			}
			if (temp.contains("-") && temp.length() >= 8) {
				String[] ww = temp.split("-");
				int pt = ww.length - 1;
				if (ww.length == 2 && simpleDic.contains(ww[0] + ww[1])) {
					word = new Word(ww[0] + ww[1], i, 0);
					trgList.add(word);
				} else if (pt >= 0) {
					if (simpleDic.contains(ww[pt])) {
						word = new Word(ww[pt], i, 0);
						word.compound = true;
						word.fullword = w1;
						trgList.add(word);
					}
				}
			}
			i++;
		}
		detectedPro[idx] = protList;
		detectedTrg[idx] = trgList;
	}

	private void removePro(List<Word> pro, List<String> names) {
		List<Word> rm = new ArrayList<Word>();
		for (String s : names) {
			for (Word w : pro) {
				if (w.word.equals(s)) {
					rm.add(w);
				}
			}
		}
		for (Word w : rm) {
			pro.remove(w);
		}
	}

	/**
	 * DO NOT USE! This method is not correct due to some changes.
	 * 
	 * @param txt
	 * @return
	 */
	public int findEquiv(String txt) {
		int count = 0;
		StringBuilder sb = new StringBuilder(txt);
		int i = 0;
		int[] openP = new int[15];
		int index = -1;
		String sub;
		List<String> alter = new ArrayList<String>();
		List<String> foundList;
		while (i < sb.length()) {
			if (sb.charAt(i) == '(') {
				openP[++index] = i;
			} else {
				if (sb.charAt(i) == ')') {
					int k = i + 1;
					if (index >= 0) {
						sub = sb.substring(openP[index], k);
						alter = simp.getProteins(sub);
						if (alter.isEmpty()) {
							i = openP[index];
						} else if (alter.size() == 1) {
							// have proteins, now create a list and check
							// whether it belongs to the protein closest to it
							foundList = new ArrayList<String>();
							int pidx = simp.getPIndex(alter.get(0));
							if (pidx > 0) {
								String pr1 = "PRO" + pidx;
								String pro2 = "PRO" + (pidx - 1);
								// if (simp.findDistance(pro, pr1) <= 2) {
								foundList.add(pro2);
								// equiList.put(pr1, foundList);
								count++;
								// }
							}
						}
						index--;
					}
				}
			}
			i++;
		}
		return count;
	}

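	/**
	 * Returns the first word in relWord whose span matches loc at either the
	 * start or the end offset, or null if none matches.
	 */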
	public Word findTrigger(int loc[], List<Word> relWord) {
		for (Word w : relWord) {
			if (w.locs[0] == loc[0] || w.locs[1] == loc[1]) {
				return w;
			}
		}
		return null;
	}

	public List<EData>[] splitEvents(List<EData> list) {
		List<EData> dlist[] = new List[longsen.length];
		int pos, idx;
		TData tg;
		for (int i = 0; i < longsen.length; i++) {
			dlist[i] = new ArrayList<EData>();
		}
		for (EData dt : list) {
			tg = dt.trgdata;
			pos = tg.list[0];
			idx = pos2sen(senpos, pos);
			dlist[idx].add(dt);
		}
		return dlist;
	}

	public void testAll() {
		init();
		int total = 0, detected = 0, match = 0, recognized = 0;
		List<String> ids = simp.loadPMIDs();
		System.out.println("Total abstracts: " + ids.size());
		Map<String, String> TGmap = new HashMap<String, String>();
		List<EData> elist;
		List<TData> trgList, candidate = new ArrayList<TData>();
		Map<String, TData> map = new HashMap<String, TData>();
		Map<String, Counter> miss = new HashMap<String, Counter>();
		TData tg;
		String key = "";
		for (String id : ids) {
			map.clear();
			elist = simp.loadEvent(id);
			trgList = simp.loadTrigger(id);
			for (TData dt : trgList) { // prepare hash for trigger
				map.put(dt.tid, dt);
			}
			initData(id);
			List<EData> events[] = splitEvents(elist);
			for (int i = 0; i < senpos.length; i++) {
				TGmap.clear();
				for (EData ev : events[i]) {
					if (!TGmap.containsKey(ev.trigID)) {
						TGmap.put(ev.trigID, ev.trigID); // avoiding repetition
					}
				}
				total += TGmap.size();
				candidate.clear();
				for (String s : TGmap.keySet()) {
					tg = map.get(s);
					if (simp.sharedDic.containsKey(tg.name.toLowerCase())) {
						candidate.add(tg);// recognized trigger
					} else {
						Counter c = miss.get(tg.name.toLowerCase());
						if (c == null) {
							c = new Counter(1);
							miss.put(tg.name.toLowerCase(), c);
						} else {
							c.inc();
						}
					}
				}
				recognized += candidate.size();
				for (TData dt : candidate) {
					for (Word w : detectedTrg[i]) {
						if (dt.list[0] == w.locs[0] || dt.list[1] == w.locs[1]) {
							match++;
						}
					}
				}
				if (detectedTrg[i] != null) {
					detected += detectedTrg[i].size();
				}
			}
		}
		System.out.println("Total non-repeated triggers: " + total);
		System.out.println("Recognized triggers: " + recognized + " -> Recall: " + (1f * recognized / total));
		System.out.println("Matched triggers: " + match + " -> Recall: " + (1f * match / total));
		System.out.println("Unmatched detected triggers: " + (detected - match));
		int recover = 0;
		for (String s : miss.keySet()) {
			Counter c = miss.get(s);
			if (c.count >= 2) {
				System.out.println(s + " " + miss.get(s).count);
				recover += c.count;
			}
		}
		System.out.println("Number of triggers that can be recovered: " + recover);

	}

	/**
	 * Get list of prepositions of a sentence
	 * 
	 * @param tokens
	 * @return
	 */
	public List<Word> getPreps(String[] tokens) {
		String s;
		List<Word> list = new ArrayList<Word>();
		for (int i = 0; i < tokens.length; i++) {
			s = tokens[i];
			if (SenSimplifier.prepmap.contains(s)) {
				Word w = new Word(s, i, 0);
				list.add(w);
			}
		}
		return list;
	}

	/**
	 * Get list of modifiers of a sentence
	 * 
	 * @param tokens
	 * @return
	 */
	public List<Word> getModifier(String[] tokens) {
		String s;
		List<Word> list = new ArrayList<Word>();
		for (int i = 0; i < tokens.length; i++) {
			s = tokens[i];
			if (SenSimplifier.modifiers.containsKey(s)) {
				Word w = new Word(s, i, 0);
				list.add(w);
			}
		}
		return list;
	}

	public void printChunk(List<Chunk> ls) {
		for (Chunk c : ls) {
			System.out.print("[" + c.type + " " + c.txt + "]");
		}
		System.out.println("");
	}

	private void printChunkValue(List<Chunk> ls) {
		for (Chunk c : ls) {
			System.out.print(c.getValues());
		}
		System.out.println("");
	}

	public void testChunks() {
		init();
		ChunkAnalyzer op = new ChunkAnalyzer();
		List<String> ids = simp.loadPMIDs();
		List<Chunk>[] out;
		int count = 0;
		for (String id : ids) {
			out = analyze(id);
			for (int i = 0; i < out.length; i++) {
				ecount++;
				if (out[i] != null) {
					op.curr_text = shortsen[i];
					op.analyzeChunk(out[i], tagList.get(i), tokenList.get(i));
					if (op.verbList.size() > 0) {
						for (VerbChunk vb : op.verbList) {
							vb.subject.mergeNP();
							vb.object.mergeNP();
						}
					}

				}
			}
		}
		System.out.println("Total sentences: " + ecount);
		System.out.println("Total errors due to trigger detection: " + ecounter);
		System.out.println("Number of abstracts: " + ids.size() + ", number of sentences: " + count);
		System.out.println("---Number of unknown subject cases: " + op.sub_count);

	}

	public void printInChunks() {
		ChunkAnalyzer op = new ChunkAnalyzer();
		List<String> ids = simp.loadPMIDs();
		List<Chunk>[] out;
		int count = 0;
		for (String id : ids) {
			out = analyze(id);
			for (int i = 0; i < out.length; i++) {
				if (out[i] != null) {
					for (Chunk c : out[i]) {
						if (c.trigs.size() == 2) {
							System.out.println(c.getValues());
						}
					}
				}
			}
			count++;
		}
		System.out.println("Total cases: " + ecount);
		System.out.println("Total errors due to trigger detection: " + ecounter);
		System.out.println("Number of abstracts: " + ids.size() + ", number of sentences: " + count);
		System.out.println("---Number of unknown subject cases: " + op.sub_count);

	}

	static final Map<String, Set<String>> notTrg = new HashMap<String, Set<String>>();
	static final String notrigger[] = {
			"binding->site|sites|domain|element|elements|complexes|protein|proteins|subunit|subunits|complex",
			"transcription->factor|factors|initiation|sites|site|start" };

	static {
		for (String s : notrigger) {
			String st[] = s.split("->");
			String sub[] = st[1].split("\\|");
			Set<String> mapsub = new HashSet<String>();
			mapsub.addAll(Arrays.asList(sub));
			notTrg.put(st[0], mapsub);
		}
	}
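
	// Illustrative example of the notTrg filter above: a detected trigger
	// "binding" that is immediately followed by "site" is skipped in
	// analyze(), since "binding site" names a location rather than an event.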

	public static void main(String[] args) {
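		// Minimal usage sketch (illustrative, not part of the original code;
		// assumes two DBUtils instances opened elsewhere for the
		// dictionary/pattern DB and the testing-data DB):
		// SenAnalyzer analyzer = new SenAnalyzer(dictionaryDb, testDataDb);
		// analyzer.init();
		// List<Chunk>[] chunksPerSentence = analyzer.analyze("10806352");
		// analyzer.closeDB();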
	}
}