jmaxent.Data Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jmaxent;
import java.io.*;
import java.util.*;
import jvntextpro.util.StringUtils;
// TODO: Auto-generated Javadoc
/**
* The Class Data.
*/
public class Data {
/** The option. */
Option option = null;
/** The lb str2 int. */
public Map lbStr2Int = null;
/** The lb int2 str. */
public Map lbInt2Str = null;
/** The cp str2 int. */
public Map cpStr2Int = null;
/** The cp int2 str. */
public Map cpInt2Str = null;
/** The trn data. */
public List trnData = null;
/** The tst data. */
public List tstData = null;
/** The ulb data. */
public List ulbData = null;
/**
* Instantiates a new data.
*
* @param option the option
*/
public Data(Option option) {
this.option = option;
}
/**
* Read cp maps.
*
* @param fin the fin
* @throws IOException Signals that an I/O exception has occurred.
*/
public void readCpMaps(BufferedReader fin) throws IOException {
if (cpStr2Int != null) {
cpStr2Int.clear();
} else {
cpStr2Int = new HashMap();
}
if (cpInt2Str != null) {
cpInt2Str.clear();
} else {
cpInt2Str = new HashMap();
}
String line;
// get size of the map
if ((line = fin.readLine()) == null) {
System.out.println("No context predicate map size information");
return;
}
int numCps = Integer.parseInt(line);
if (numCps <= 0) {
System.out.println("Invalid context predicate mapping size");
return;
}
System.out.println("Reading the context predicate maps ...");
for (int i = 0; i < numCps; i++) {
line = fin.readLine();
if (line == null) {
System.out.println("Invalid context predicate mapping line");
return;
}
StringTokenizer strTok = new StringTokenizer(line, " \t\r\n");
if (strTok.countTokens() != 2) {
continue;
}
String cpStr = strTok.nextToken();
String cpInt = strTok.nextToken();
cpStr2Int.put(cpStr, new Integer(cpInt));
cpInt2Str.put(new Integer(cpInt), cpStr);
}
System.out.println("Reading context predicate maps (" +
Integer.toString(cpStr2Int.size()) + " entries) completed!");
// read the line ###...
line = fin.readLine();
option.numCps = cpStr2Int.size();
}
/**
* Num cps.
*
* @return the int
*/
public int numCps() {
if (cpStr2Int == null) {
return 0;
} else {
return cpStr2Int.size();
}
}
/**
* Write cp maps.
*
* @param dict the dict
* @param fout the fout
* @throws IOException Signals that an I/O exception has occurred.
*/
public void writeCpMaps(Dictionary dict, PrintWriter fout) throws IOException {
Iterator it = null;
if (cpStr2Int == null) {
return;
}
int count = 0;
for (it = cpStr2Int.keySet().iterator(); it.hasNext(); ) {
String cpStr = (String)it.next();
Integer cpInt = (Integer)cpStr2Int.get(cpStr);
Element elem = (Element)dict.dict.get(cpInt);
if (elem != null) {
if (elem.chosen == 1) {
count++;
}
}
}
// write the map size
fout.println(Integer.toString(count));
for (it = cpStr2Int.keySet().iterator(); it.hasNext(); ) {
String cpStr = (String)it.next();
Integer cpInt = (Integer)cpStr2Int.get(cpStr);
Element elem = (Element)dict.dict.get(cpInt);
if (elem != null) {
if (elem.chosen == 1) {
fout.println(cpStr + " " + cpInt.toString());
}
}
}
// write the line ###...
fout.println(Option.modelSeparator);
}
/**
* Read lb maps.
*
* @param fin the fin
* @throws IOException Signals that an I/O exception has occurred.
*/
public void readLbMaps(BufferedReader fin) throws IOException {
if (lbStr2Int != null) {
lbStr2Int.clear();
} else {
lbStr2Int = new HashMap();
}
if (lbInt2Str != null) {
lbInt2Str.clear();
} else {
lbInt2Str = new HashMap();
}
String line;
// get size of the map
if ((line = fin.readLine()) == null) {
System.out.println("No label map size information");
return;
}
int numLabels = Integer.parseInt(line);
if (numLabels <= 0) {
System.out.println("Invalid label mapping size");
return;
}
System.out.println("Reading the context predicate maps ...");
for (int i = 0; i < numLabels; i++) {
line = fin.readLine();
if (line == null) {
System.out.println("Invalid context predicate mapping line");
return;
}
StringTokenizer strTok = new StringTokenizer(line, " \t\r\n");
if (strTok.countTokens() != 2) {
continue;
}
String lbStr = strTok.nextToken();
String lbInt = strTok.nextToken();
lbStr2Int.put(lbStr, new Integer(lbInt));
lbInt2Str.put(new Integer(lbInt), lbStr);
}
System.out.println("Reading label maps (" +
Integer.toString(lbStr2Int.size()) + " entries) completed!");
// read the line ###...
line = fin.readLine();
option.numLabels = lbStr2Int.size();
}
/**
* Num labels.
*
* @return the int
*/
public int numLabels() {
if (lbStr2Int == null) {
return 0;
} else {
return lbStr2Int.size();
}
}
/**
* Write lb maps.
*
* @param fout the fout
* @throws IOException Signals that an I/O exception has occurred.
*/
public void writeLbMaps(PrintWriter fout) throws IOException {
if (lbStr2Int == null) {
return;
}
// write the map size
fout.println(Integer.toString(lbStr2Int.size()));
for (Iterator it = lbStr2Int.keySet().iterator(); it.hasNext(); ) {
String lbStr = (String)it.next();
Integer lbInt = (Integer)lbStr2Int.get(lbStr);
fout.println(lbStr + " " + lbInt.toString());
}
// write the line ###...
fout.println(Option.modelSeparator);
}
/**
* Read trn data.
*
* @param dataFile the data file
*/
public void readTrnData(String dataFile) {
if (cpStr2Int != null) {
cpStr2Int.clear();
} else {
cpStr2Int = new HashMap();
}
if (cpInt2Str != null) {
cpInt2Str.clear();
} else {
cpInt2Str = new HashMap();
}
if (lbStr2Int != null) {
lbStr2Int.clear();
} else {
lbStr2Int = new HashMap();
}
if (lbInt2Str != null) {
lbInt2Str.clear();
} else {
lbInt2Str = new HashMap();
}
if (trnData != null) {
trnData.clear();
} else {
trnData = new ArrayList();
}
// open data file
BufferedReader fin = null;
try {
fin = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), "UTF-8"));
// BufferedWriter flog = new BufferedWriter(new OutputStreamWriter(
// / new FileOutputStream((new File(dataFile)).getParent() + File.separator + "log.txt"), "UTF-8"));
System.out.println("Reading training data ...");
String line;
while ((line = fin.readLine()) != null) {
StringTokenizer strTok = new StringTokenizer(line, " \t\r\n");
int len = strTok.countTokens();
if (len <= 1) {
// skip this invalid line
continue;
}
List strCps = new ArrayList();
for (int i = 0; i < len - 1; i++) {
strCps.add(strTok.nextToken());
}
String labelStr = strTok.nextToken();
//
// String [] tags = {"N", "Np", "Nc", "Nu", "V", "A", "P", "L", "M",
// "R", "E", "C", "I", "T", "U", "Y", "X", "LBKT", "RBKT"};
//
//
// //System.out.println("--" + labelStr);
// //if (!StringUtils.isSign(labelStr)){
// boolean flag = false;
// for (String tag : tags){
// if (labelStr.equalsIgnoreCase(tag)){
// flag = true;
// }
// }
//
// if (!flag){
// // flog.write(line + "\n");
// //System.out.println("--" + labelStr);
// }
// //}
List intCps = new ArrayList();
for (int i = 0; i < strCps.size(); i++) {
String cpStr = (String)strCps.get(i);
Integer cpInt = (Integer)cpStr2Int.get(cpStr);
if (cpInt != null) {
intCps.add(cpInt);
} else {
intCps.add(new Integer(cpStr2Int.size()));
cpStr2Int.put(cpStr, new Integer(cpStr2Int.size()));
cpInt2Str.put(new Integer(cpInt2Str.size()), cpStr);
}
}
Integer labelInt = (Integer)lbStr2Int.get(labelStr);
if (labelInt == null) {
labelInt = new Integer(lbStr2Int.size());
// System.out.println("hey:" + labelStr);
// flog.write(labelStr + "\t" + line + "\n");
lbStr2Int.put(labelStr, labelInt);
lbInt2Str.put(labelInt, labelStr);
}
int[] cps = new int[intCps.size()];
for (int i = 0; i < cps.length; i++) {
cps[i] = ((Integer)intCps.get(i)).intValue();
}
Observation obsr = new Observation(labelInt.intValue(), cps);
// add this observation to the data
trnData.add(obsr);
}
System.out.println("Reading " + Integer.toString(trnData.size()) +
" training data examples completed!");
// flog.close();
} catch (IOException e) {
System.out.println(e.toString());
return;
}
option.numCps = cpStr2Int.size();
option.numLabels = lbStr2Int.size();
option.numTrainExps = trnData.size();
}
/**
* Read tst data.
*
* @param dataFile the data file
*/
public void readTstData(String dataFile) {
if (tstData != null) {
tstData.clear();
} else {
tstData = new ArrayList();
}
// open data file
BufferedReader fin = null;
try {
fin = new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), "UTF-8"));
System.out.println("Reading testing data ...");
String line;
while ((line = fin.readLine()) != null) {
StringTokenizer strTok = new StringTokenizer(line, " \t\r\n");
int len = strTok.countTokens();
if (len <= 1) {
// skip this invalid line
continue;
}
List strCps = new ArrayList();
for (int i = 0; i < len - 1; i++) {
strCps.add(strTok.nextToken());
}
String labelStr = strTok.nextToken();
List intCps = new ArrayList();
for (int i = 0; i < strCps.size(); i++) {
String cpStr = (String)strCps.get(i);
Integer cpInt = (Integer)cpStr2Int.get(cpStr);
if (cpInt != null) {
intCps.add(cpInt);
} else {
// do nothing
}
}
Integer labelInt = (Integer)lbStr2Int.get(labelStr);
if (labelInt == null) {
System.out.println("Reading testing observation, label not found or invalid");
return;
}
int[] cps = new int[intCps.size()];
for (int i = 0; i < cps.length; i++) {
cps[i] = ((Integer)intCps.get(i)).intValue();
}
Observation obsr = new Observation(labelInt.intValue(), cps);
// add this observation to the data
tstData.add(obsr);
}
System.out.println("Reading " + Integer.toString(tstData.size()) +
" testing data examples completed!");
} catch (IOException e) {
System.out.println(e.toString());
return;
}
option.numTestExps = tstData.size();
}
/*
public void writeTstData(String dataFile) {
}
public void readUlbData(String dataFile) {
}
public void writeUlbDataWithModelLabel(String dataFile) {
}
*/
} // end of class Data
© 2015 - 2024 Weber Informatics LLC | Privacy Policy