package corpora;
import utils.DBUtils;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* Loads BioNLP Shared Task corpus files (.txt, .a1, .a2) and stores their
* contents in the database via {@link DataSaver}.
*
* @author Chinh
* Date: Aug 30, 2010
*/
public class DataLoader {
DataSaver saver; // writes the parsed items into the database
String currentPid = null; // ID of the document currently being loaded
int countabs = 0;
public DataLoader() {
// saver = new DataSaver("D:/DB");
}
private void readProtein(String filename) {
File file = new File(filename);
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(file));
String text = null;
// repeat until all lines are read
while ((text = reader.readLine()) != null) {
readProteinLine(text);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (reader != null) {
reader.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* Reads, and stores in the DB, a single protein mention given in the BioNLP
* Shared Task 2011 format:
*
* ID<tab>Entity-Type[Protein]<tab>start<tab>end<tab>Mention name
*
* Example: T3 Protein 166 174 TGF-beta
*
* @param proteinLine a single protein annotation line in the above format
*/
private void readProteinLine(String proteinLine) {
String[] st;
st = proteinLine.split("\\t|\\s+", 5);
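// e.g. "T3<tab>Protein<tab>166<tab>174<tab>TGF-beta" splits into
// {"T3", "Protein", "166", "174", "TGF-beta"}; the limit of 5 keeps a
// mention name that itself contains spaces together in st[4]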
int lWs = 0, rWs = 0;
String name = st[4];
// count leading whitespace characters
for (int i = 0; i < name.length(); ++i) {
if (name.charAt(i) == ' ')
lWs++;
else
break;
}
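// count trailing whitespace characters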
for (int i = name.length() - 1; i > 0; --i) {
if (name.charAt(i) == ' ')
rWs++;
else
break;
}
// store with offsets corrected for leading or trailing whitespace
// around the mention name
saver.saveProtein(currentPid, st[0], Integer.parseInt(st[2]) + lWs,
Integer.parseInt(st[3]) - rWs, name.trim());
// just some code to quickly check the correction by whitespace removal
// if (rWs != 0 || lWs != 0) {
// System.out.println(st[4] + " " + st[2] + "-" + st[3]);
// System.out.println(name.trim() + " " + (Integer.parseInt(st[2])+lWs)
// + "-" + (Integer.parseInt(st[3])-rWs));
// }
}
private void readAbstract(String filename) {
File file = new File(filename);
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(file));
String text = null;
List<String> list = new ArrayList<String>();
// repeat until all lines are read
while ((text = reader.readLine()) != null) {
list.add(text);
}
// re-join the lines with '\n' to reconstruct the full abstract text
text = list.get(0);
if (list.size() > 1) {
for (int i = 1; i < list.size(); i++) {
text = text + "\n" + list.get(i);
}
}
saver.saveAbstract(currentPid, text);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (reader != null) {
reader.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
private void readTrigger(String filename) {
File file = new File(filename);
BufferedReader reader = null;
String[] st;
String[] sub1, sub2, sub3;
String cause_id = "", theme2 = "";
try {
reader = new BufferedReader(new FileReader(file));
String text = null;
// repeat until all lines are read
while ((text = reader.readLine()) != null) {
if (text.startsWith("T")) {
st = text.split("\\t|\\s+", 5);
saver.saveTrigger(currentPid, st[0], st[1],
Integer.parseInt(st[2]), Integer.parseInt(st[3]),
st[4]);
} else {
st = text.split("\\t|\\s+", 0);
}
if (st[0].startsWith("E")) { // Event
if (st.length >= 4) { // trigger, theme and cause
sub3 = st[3].split(":");
if (sub3[0].startsWith("Theme2")) {
theme2 = sub3[1];
cause_id = "";
} else if (sub3[0].startsWith("Cause")) {
cause_id = sub3[1];
theme2 = "";
} else {
cause_id = "";
theme2 = "";
}
} else {
cause_id = "";
theme2 = "";
}
sub1 = st[1].split(":"); // type and trigger id
sub2 = st[2].split(":"); // theme
saver.saveEvent(currentPid, st[0], sub1[0], sub1[1],
sub2[1], theme2, cause_id);
} else if (st[0].startsWith("M")) {
saver.saveModify(currentPid, st[0], st[1], st[2]);
} else if (st[0].startsWith("*")) {
st = text.split("\\t|\\s+", 4);
saver.saveEquiv(currentPid, st[2], st[3]);
}
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (reader != null) {
reader.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
public void loadData(String path, boolean train) {
// Get the list of all .txt files in the given folder.
// For each document: read the abstract from the .txt file and the
// protein mentions from the .a1 file (saved to the Protein table);
// for training data, also read triggers and events from the .a2 file
// and save them to the corresponding tables.
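// A document is expected as a set of files sharing one basename,
// e.g. 1234567.txt, 1234567.a1 and (for training data) 1234567.a2.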
String name, n, n1, n2, file_path;
int count = 0;
try {
File file = new File(path);
File[] list = file.listFiles();
for (File f : list) {
if (f.isFile()) {
name = f.getName();
file_path = f.getParent();
if (name.endsWith(".txt")) {
String[] ns = name.split("\\.txt");
currentPid = ns[0];
n = file_path + File.separatorChar + ns[0] + ".txt";
readAbstract(n);
n1 = file_path + File.separatorChar + ns[0] + ".a1";
readProtein(n1);
if (train) {
n2 = file_path + File.separatorChar + ns[0] + ".a2";
readTrigger(n2);
}
count++;
}
}
}
} catch (Exception e) {
throw new RuntimeException("Could not load data at " + path, e);
}
System.out.println("Number of abstracts: " + count);
}
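/**
* Reads all Shared Task documents (.txt, .a1 and, if {@code train} is
* true, .a2 files) below the given path and stores their contents in the
* database opened at {@code dest}.
*
* @param path folder containing the corpus files
* @param dest location of the database to open/create
* @param train whether .a2 files (triggers and events) should be loaded
*/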
public void Txt2Db(String path, String dest, boolean train) {
DBUtils db = new DBUtils();
db.openDB(dest);
System.out.println("Loading data .... !");
saver = new DataSaver(db);
loadData(path, train);
System.out.println("Loading data .... done!");
// Simplification is done at prediction time, so there is no specific
// use in doing it here. At prediction time, both the original text and
// the simplified text are sentence-segmented; the segments sometimes
// differ due to minor errors in gene tagging, or simply because the
// sentence splitter decides differently for the simplified text.
// Thus, we simplify at prediction time and do it sentence-wise.
// SenSimplifier simp = new SenSimplifier(db);
// simp.doSimplify();
// System.out.println("Simplifying data ... done!");
// System.out.println("Generating dictionary ... done!");
db.closeDB();
}
/**
* Creates a database for event extraction as a programmatic API call where
* all values are given directly rather than being read from files.
* In contrast to {@link #Txt2Db(String, String, boolean)}, the created
* in-memory database is returned for further processing and is not
* persisted to file.
* The protein lines have to match the Shared Task 2011 format:
*
* ID<tab>Entity-Type[Protein]<tab>start<tab>end<tab>Mention name
*
* Example: T3 Protein 166 174 TGF-beta
*
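* A minimal usage sketch (the ID, offsets and strings below are
* illustrative only):
*
* <pre>
* DataLoader loader = new DataLoader();
* String abstractText = "...";       // the document text
* List<String> proteins = new ArrayList<String>();
* proteins.add("T3\tProtein\t166\t174\tTGF-beta");
* DBUtils db = loader.Txt2Db("PMID-0000000", abstractText, proteins);
* // ... further processing on the in-memory database ...
* db.closeDB();
* </pre>
*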
* @param pid document ID under which all items are stored
* @param text the document (abstract) text
* @param proteins protein lines in the format described above
* @return the in-memory database containing the text and protein mentions
*/
public DBUtils Txt2Db(String pid, String text, List<String> proteins) {
currentPid = pid;
// open in-mem database
DBUtils db = new DBUtils(pid, "mem");
db.openDB();
// System.out.println("Loading data .... !");
saver = new DataSaver(db);
// store document text in the database
saver.saveAbstract(currentPid, text);
// store the protein mentions
for (String proteinLine : proteins)
readProteinLine(proteinLine);
// System.out.println("Loading data .... done!");
// SenSimplifier simp = new SenSimplifier(db);
// simp.doSimplify();
// System.out.println("Simplifying data ... done!");
// System.out.println("Generating dictionary ... done!");
return db;
}
public static void main(String[] args) {
// DataLoader data = new DataLoader();
// data.Txt2Db("D:/DataNLP/Data2011TestText", "D:/DataNLP/Data2011TestPrepared/Data", false);
DataLoader data = new DataLoader();
if (args.length == 3) {
boolean a2 = Boolean.parseBoolean(args[2]);
data.Txt2Db(args[0], args[1], a2);
} else if (args.length == 2) {
data.Txt2Db(args[0], args[1], true);
} else {
System.out.println("Usage: DataLoader <input folder> <output DB path> [<load .a2: true|false>]");
System.exit(1);
}
}
static final String new_line = System.getProperty("line.separator");
}