jvnpostag.POSDataReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heideltime Show documentation
Show all versions of heideltime Show documentation
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnpostag;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import jvntextpro.data.DataReader;
import jvntextpro.data.Sentence;
import jvntextpro.util.StringUtils;
/**
 * Reads space-separated, optionally POS-tagged text into {@link Sentence} objects.
 *
 * <p>In training mode ({@code isTrainReading == true}) every token is expected to have
 * the form {@code word/tag}; otherwise every token is taken as an untagged word and is
 * added with a {@code null} tag.
 */
public class POSDataReader extends DataReader {

    /** Tag given to punctuation tokens and to the bare slash token. */
    private static final String MARK_TAG = "Mrk";

    /** Known tags, in their canonical spelling; input tags are matched case-insensitively. */
    protected String [] tags = {"N", "Np", "Nc", "Nu", "V", "A", "P", "L", "M", "R",
            "E", "C", "I", "T", "B", "Y", "X", "Ny", "Nb", "Vb", "Mrk"};

    /** Whether tokens are parsed as {@code word/tag} pairs (training data). */
    protected boolean isTrainReading = false;

    //-------------------------------------
    // Constructor
    //-------------------------------------

    /** Creates a reader for untagged input ({@code isTrainReading} stays {@code false}). */
    public POSDataReader() {
        // Do nothing
    }

    /**
     * Creates a reader.
     *
     * @param isTrainReading {@code true} to parse tokens as {@code word/tag} pairs
     */
    public POSDataReader(boolean isTrainReading) {
        this.isTrainReading = isTrainReading;
    }

    //-------------------------------------
    // Override methods
    //-------------------------------------

    /**
     * Reads a UTF-8 text file, one sentence per line. Lines starting with {@code #} are
     * skipped. In training mode a line containing a malformed token or an unknown tag is
     * left out of the result.
     *
     * @param datafile path of the file to read
     * @return the sentences read, or {@code null} if any exception occurred (the error is
     *         reported on standard output and the stack trace is printed)
     */
    @Override
    public List readFile(String datafile) {
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(datafile), "UTF-8"));
            try {
                List<Sentence> data = new ArrayList<Sentence>();
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.startsWith("#")) {
                        continue;
                    }
                    Sentence sentence = new Sentence();
                    if (parseLine(line, sentence)) {
                        data.add(sentence);
                    }
                }
                return data;
            } finally {
                // Close the stream even when reading or parsing fails part-way through.
                reader.close();
            }
        } catch (Exception e) {
            System.out.println("Error while reading data!");
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Splits a string on {@code '\n'} and turns every line into a sentence. Unlike
     * {@link #readFile(String)}, no line is skipped or rejected.
     *
     * <p>In training mode the word is the text before the first slash; the tag is the
     * second field only when the token has exactly two slash-separated fields, otherwise
     * it is {@code null}. Tokens that consist solely of slashes are dropped.
     * NOTE(review): this differs from {@code readFile}, which rebuilds words that contain
     * slashes and validates tags - confirm whether that difference is intended.
     *
     * @param dataStr the text to parse
     * @return one sentence per line of {@code dataStr}
     */
    @Override
    public List readString(String dataStr) {
        List<Sentence> data = new ArrayList<Sentence>();
        for (String line : dataStr.split("\n")) {
            Sentence sentence = new Sentence();
            StringTokenizer tk = new StringTokenizer(line, " ");
            while (tk.hasMoreTokens()) {
                String token = tk.nextToken();
                if (!isTrainReading) {
                    sentence.addTWord(token, null);
                    continue;
                }
                String [] fields = token.split("/");
                if (fields.length > 0) {
                    String tag = (fields.length == 2) ? fields[1] : null;
                    sentence.addTWord(fields[0], tag);
                }
            }
            data.add(sentence);
        }
        return data;
    }

    //-------------------------------------
    // Helpers
    //-------------------------------------

    /**
     * Adds the tokens of one line of a data file to {@code sentence}.
     *
     * @return {@code false} if the line must be discarded (training mode only)
     */
    private boolean parseLine(String line, Sentence sentence) {
        boolean valid = true;
        StringTokenizer tk = new StringTokenizer(line, " ");
        while (tk.hasMoreTokens()) {
            String token = tk.nextToken();
            if (!isTrainReading) {
                sentence.addTWord(token, null);
                continue;
            }

            // A bare slash, alone or written as slash/slash ("///"), cannot be handled by
            // split("/"), which returns an empty array for it. Strings must be compared
            // with equals(): == only tests reference identity.
            if (token.equals("/") || token.equals("///")) {
                sentence.addTWord("/", MARK_TAG);
                continue;
            }

            String [] fields = token.split("/");
            if (fields.length < 2) {
                // No tag present: stop reading this line.
                return false;
            }

            // The last field is the tag; everything before it is the word, which may
            // itself contain slashes (e.g. 20/9/08).
            String tag = fields[fields.length - 1];
            StringBuilder wordBuilder = new StringBuilder(fields[0]);
            for (int i = 1; i < fields.length - 1; ++i) {
                wordBuilder.append('/').append(fields[i]);
            }
            String word = wordBuilder.toString();

            if (StringUtils.isPunc(tag)) {
                sentence.addTWord(word, MARK_TAG);
                continue;
            }

            String canonical = canonicalTag(tag);
            if (canonical == null) {
                // Unknown tag: report it and keep scanning so that every unknown tag on
                // the line is reported; the line is discarded afterwards.
                valid = false;
                System.out.println("error");
                System.out.println(tag);
                sentence.addTWord(word, tag);
            } else {
                sentence.addTWord(word, canonical);
            }
        }
        return valid;
    }

    /**
     * Returns the entry of {@link #tags} equal to {@code tag} ignoring case, or
     * {@code null} if there is none.
     */
    private String canonicalTag(String tag) {
        for (int i = 0; i < tags.length; ++i) {
            if (tag.equalsIgnoreCase(tags[i])) {
                return tags[i];
            }
        }
        return null;
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy