// jmaxent.Data Maven / Gradle / Ivy
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jmaxent;
import java.io.*;
import java.util.*;
import jvntextpro.util.StringUtils;
// TODO: Auto-generated Javadoc
/**
 * Holds the string/integer dictionaries for labels and context predicates
 * together with the training and testing observations read from data files.
 *
 * <p>The maps and lists are kept as raw (non-generic) public fields so that
 * existing callers of this class continue to compile unchanged. As used in
 * this class, the {@code *Str2Int} maps go from {@code String} to
 * {@code Integer}, the {@code *Int2Str} maps go the other way, and the data
 * lists hold {@code Observation} objects.
 */
public class Data {

    /** Characters that separate tokens on a map line or a data line. */
    private static final String TOKEN_DELIMS = " \t\r\n";

    /** Options object whose counters (numCps, numLabels, ...) are updated after reading. */
    Option option = null;

    /** Label string -> label id. */
    public Map lbStr2Int = null;

    /** Label id -> label string. */
    public Map lbInt2Str = null;

    /** Context predicate string -> context predicate id. */
    public Map cpStr2Int = null;

    /** Context predicate id -> context predicate string. */
    public Map cpInt2Str = null;

    /** Training observations, filled by {@link #readTrnData(String)}. */
    public List trnData = null;

    /** Testing observations, filled by {@link #readTstData(String)}. */
    public List tstData = null;

    /** Not read or written by any method of this class. */
    public List ulbData = null;

    /**
     * Instantiates a new data holder.
     *
     * @param option the option object that receives the sizes of the data read
     */
    public Data(Option option) {
        this.option = option;
    }

    /**
     * Reads the context predicate maps from a model stream.
     *
     * <p>Expected layout: one line holding the number of entries, then that
     * many lines of the form {@code <string> <id>}, then one separator line.
     * Lines that do not contain exactly two tokens are skipped. Both context
     * predicate maps are emptied first. {@code option.numCps} is updated only
     * when the whole section has been read; on a missing or invalid size line
     * or a premature end of stream a message is printed and the method
     * returns early.
     *
     * @param fin the reader positioned at the size line of the section
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public void readCpMaps(BufferedReader fin) throws IOException {
        cpStr2Int = resetMap(cpStr2Int);
        cpInt2Str = resetMap(cpInt2Str);

        if (readMaps(fin, cpStr2Int, cpInt2Str, "context predicate")) {
            option.numCps = cpStr2Int.size();
        }
    }

    /**
     * Returns the number of known context predicates.
     *
     * @return the size of the context predicate map, or 0 if it does not exist yet
     */
    public int numCps() {
        return (cpStr2Int == null) ? 0 : cpStr2Int.size();
    }

    /**
     * Writes the context predicate map, restricted to the predicates whose
     * dictionary element is marked as chosen ({@code chosen == 1}).
     *
     * <p>Output layout: the number of written entries, one
     * {@code <string> <id>} line per entry, then the model separator line.
     * Nothing is written when the context predicate map does not exist.
     *
     * @param dict the dictionary consulted for each context predicate id
     * @param fout the writer receiving the section
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public void writeCpMaps(Dictionary dict, PrintWriter fout) throws IOException {
        if (cpStr2Int == null) {
            return;
        }

        // Collect the lines first: the entry count has to be written before them.
        List chosenLines = new ArrayList();
        for (Iterator it = cpStr2Int.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry entry = (Map.Entry) it.next();
            String cpStr = (String) entry.getKey();
            Integer cpInt = (Integer) entry.getValue();

            Element elem = (Element) dict.dict.get(cpInt);
            if (elem != null && elem.chosen == 1) {
                chosenLines.add(cpStr + " " + cpInt.toString());
            }
        }

        // write the map size
        fout.println(Integer.toString(chosenLines.size()));
        for (int i = 0; i < chosenLines.size(); i++) {
            fout.println((String) chosenLines.get(i));
        }

        // write the separator line
        fout.println(Option.modelSeparator);
    }

    /**
     * Reads the label maps from a model stream.
     *
     * <p>Same layout and error handling as {@link #readCpMaps(BufferedReader)},
     * applied to the label maps. {@code option.numLabels} is updated only when
     * the whole section has been read.
     *
     * @param fin the reader positioned at the size line of the section
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public void readLbMaps(BufferedReader fin) throws IOException {
        lbStr2Int = resetMap(lbStr2Int);
        lbInt2Str = resetMap(lbInt2Str);

        if (readMaps(fin, lbStr2Int, lbInt2Str, "label")) {
            option.numLabels = lbStr2Int.size();
        }
    }

    /**
     * Returns the number of known labels.
     *
     * @return the size of the label map, or 0 if it does not exist yet
     */
    public int numLabels() {
        return (lbStr2Int == null) ? 0 : lbStr2Int.size();
    }

    /**
     * Writes the label map: the number of entries, one {@code <string> <id>}
     * line per label, then the model separator line. Nothing is written when
     * the label map does not exist.
     *
     * @param fout the writer receiving the section
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public void writeLbMaps(PrintWriter fout) throws IOException {
        if (lbStr2Int == null) {
            return;
        }

        // write the map size
        fout.println(Integer.toString(lbStr2Int.size()));

        for (Iterator it = lbStr2Int.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry entry = (Map.Entry) it.next();
            String lbStr = (String) entry.getKey();
            Integer lbInt = (Integer) entry.getValue();
            fout.println(lbStr + " " + lbInt.toString());
        }

        // write the separator line
        fout.println(Option.modelSeparator);
    }

    /**
     * Reads training data and builds the label and context predicate maps
     * from it.
     *
     * <p>Each line is split on whitespace; the last token is the label and all
     * preceding tokens are context predicates. Lines with fewer than two
     * tokens are skipped. Strings not seen before get the next free id (the
     * current size of the corresponding map). All four maps and the training
     * list are emptied first. The file is decoded as UTF-8 and is always
     * closed before returning.
     *
     * <p>On an I/O error the exception text is printed and the counters in
     * {@code option} are left untouched.
     *
     * @param dataFile path of the training data file
     */
    public void readTrnData(String dataFile) {
        cpStr2Int = resetMap(cpStr2Int);
        cpInt2Str = resetMap(cpInt2Str);
        lbStr2Int = resetMap(lbStr2Int);
        lbInt2Str = resetMap(lbInt2Str);
        trnData = resetList(trnData);

        BufferedReader fin = null;
        try {
            fin = openUtf8(dataFile);

            System.out.println("Reading training data ...");

            String line;
            while ((line = fin.readLine()) != null) {
                StringTokenizer strTok = new StringTokenizer(line, TOKEN_DELIMS);
                int len = strTok.countTokens();
                if (len <= 1) {
                    // skip this invalid line
                    continue;
                }

                // every token but the last one is a context predicate
                int[] cps = new int[len - 1];
                for (int i = 0; i < cps.length; i++) {
                    String cpStr = strTok.nextToken();
                    Integer cpInt = (Integer) cpStr2Int.get(cpStr);
                    if (cpInt == null) {
                        // unseen predicate: register it under the next free id
                        cpInt = Integer.valueOf(cpStr2Int.size());
                        cpStr2Int.put(cpStr, cpInt);
                        cpInt2Str.put(cpInt, cpStr);
                    }
                    cps[i] = cpInt.intValue();
                }

                String labelStr = strTok.nextToken();
                Integer labelInt = (Integer) lbStr2Int.get(labelStr);
                if (labelInt == null) {
                    // unseen label: register it under the next free id
                    labelInt = Integer.valueOf(lbStr2Int.size());
                    lbStr2Int.put(labelStr, labelInt);
                    lbInt2Str.put(labelInt, labelStr);
                }

                // add this observation to the data
                trnData.add(new Observation(labelInt.intValue(), cps));
            }

            System.out.println("Reading " + Integer.toString(trnData.size()) +
                    " training data examples completed!");

        } catch (IOException e) {
            System.out.println(e.toString());
            return;
        } finally {
            closeQuietly(fin);
        }

        option.numCps = cpStr2Int.size();
        option.numLabels = lbStr2Int.size();
        option.numTrainExps = trnData.size();
    }

    /**
     * Reads testing data using the already loaded label and context predicate
     * maps.
     *
     * <p>Each line is split on whitespace; the last token is the label and all
     * preceding tokens are context predicates. Lines with fewer than two
     * tokens are skipped and context predicates missing from the map are
     * dropped. The testing list is emptied first. The file is decoded as UTF-8
     * and is always closed before returning.
     *
     * <p>Reading stops at the first line whose label is not in the label map:
     * a message is printed and the method returns without updating
     * {@code option.numTestExps}, leaving the observations read so far in
     * {@code tstData}. The same happens (with the exception text printed) on
     * an I/O error. Maps that have not been created yet are treated as empty.
     *
     * @param dataFile path of the testing data file
     */
    public void readTstData(String dataFile) {
        tstData = resetList(tstData);

        BufferedReader fin = null;
        try {
            fin = openUtf8(dataFile);

            System.out.println("Reading testing data ...");

            String line;
            while ((line = fin.readLine()) != null) {
                StringTokenizer strTok = new StringTokenizer(line, TOKEN_DELIMS);
                int len = strTok.countTokens();
                if (len <= 1) {
                    // skip this invalid line
                    continue;
                }

                // keep only the context predicates that are present in the map
                List intCps = new ArrayList();
                for (int i = 0; i < len - 1; i++) {
                    Integer cpInt = lookup(cpStr2Int, strTok.nextToken());
                    if (cpInt != null) {
                        intCps.add(cpInt);
                    }
                }

                Integer labelInt = lookup(lbStr2Int, strTok.nextToken());
                if (labelInt == null) {
                    System.out.println("Reading testing observation, label not found or invalid");
                    return;
                }

                // add this observation to the data
                tstData.add(new Observation(labelInt.intValue(), toIntArray(intCps)));
            }

            System.out.println("Reading " + Integer.toString(tstData.size()) +
                    " testing data examples completed!");

        } catch (IOException e) {
            System.out.println(e.toString());
            return;
        } finally {
            closeQuietly(fin);
        }

        option.numTestExps = tstData.size();
    }

    /**
     * Reads one "size line, entries, separator line" section into a pair of
     * maps. Shared by the label and context predicate readers so that both
     * report problems with the same wording.
     *
     * @param fin     the reader positioned at the size line
     * @param str2Int map receiving string -> id entries
     * @param int2Str map receiving id -> string entries
     * @param kind    name of the map kind used in the printed messages
     * @return true if the whole section was read, false if reading was
     *         abandoned after printing a message
     * @throws IOException if reading from {@code fin} fails
     */
    private static boolean readMaps(BufferedReader fin, Map str2Int, Map int2Str, String kind)
            throws IOException {
        // get size of the map
        String line = fin.readLine();
        if (line == null) {
            System.out.println("No " + kind + " map size information");
            return false;
        }

        int size;
        try {
            size = Integer.parseInt(line.trim());
        } catch (NumberFormatException e) {
            // a non-numeric size line is reported like any other invalid size
            size = -1;
        }
        if (size <= 0) {
            System.out.println("Invalid " + kind + " mapping size");
            return false;
        }

        System.out.println("Reading the " + kind + " maps ...");

        for (int i = 0; i < size; i++) {
            line = fin.readLine();
            if (line == null) {
                System.out.println("Invalid " + kind + " mapping line");
                return false;
            }

            StringTokenizer strTok = new StringTokenizer(line, TOKEN_DELIMS);
            if (strTok.countTokens() != 2) {
                continue;
            }

            String name = strTok.nextToken();
            Integer id = Integer.valueOf(strTok.nextToken());
            str2Int.put(name, id);
            int2Str.put(id, name);
        }

        System.out.println("Reading " + kind + " maps (" +
                Integer.toString(str2Int.size()) + " entries) completed!");

        // consume the separator line that ends the section
        fin.readLine();
        return true;
    }

    /** Empties and returns the given map, or returns a new map when it is null. */
    private static Map resetMap(Map map) {
        if (map == null) {
            return new HashMap();
        }
        map.clear();
        return map;
    }

    /** Empties and returns the given list, or returns a new list when it is null. */
    private static List resetList(List list) {
        if (list == null) {
            return new ArrayList();
        }
        list.clear();
        return list;
    }

    /** Looks up the id stored for a string; a null map is treated as empty. */
    private static Integer lookup(Map map, String key) {
        return (map == null) ? null : (Integer) map.get(key);
    }

    /** Copies a list of Integer objects into an int array of the same length. */
    private static int[] toIntArray(List ints) {
        int[] result = new int[ints.size()];
        for (int i = 0; i < result.length; i++) {
            result[i] = ((Integer) ints.get(i)).intValue();
        }
        return result;
    }

    /** Opens a file for buffered reading with UTF-8 decoding. */
    private static BufferedReader openUtf8(String dataFile) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(dataFile), "UTF-8"));
    }

    /** Closes the reader if it was opened; a failure to close is only reported. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }

} // end of class Data
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy