All downloads are FREE. The search and download functionality uses the official Maven repository.

is2.io.CONLLReader09 Maven / Gradle / Ivy

The newest version!


package is2.io;

import is2.data.Instances;
import is2.data.SentenceData09;
import is2.util.DB;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;



/**
 * This class reads files in the CONLL-09 format.
 *  
 * @author Bernd Bohnet
 */
public class CONLLReader09 extends IOGenerals {


	/** Reader over the current input file; assigned by the file-taking constructors and by startReading. */
	private BufferedReader inputReader;

	/** Constant {@code true}; presumably meant as a readable value for the normalize constructor argument (confirm against callers). */
	public static final boolean NORMALIZE = true;

	/** Constant {@code false}; presumably the counterpart of {@link #NORMALIZE} (confirm against callers). */
	public static final boolean NO_NORMALIZE = false;

	/** When true, the one-line reader passes every token through normalize(...) before storing it as a form. */
	public  boolean normalizeOn =true;

	/**
	 * Feature filter shared by all readers (static). When non-empty, the CoNLL-09 reader reduces each token's
	 * ofeats entry: a value starting with "cz" keeps the SubPOS and Cas features, a value containing "ger"
	 * keeps Nom/Acc/Dat/Gen and Sg/Pl, and any other value is used as a regular expression matched against
	 * each feature.
	 */
	static public String joint ="";
	
	/** Selected input format; getNext() compares it with F_ONE_LINE. Changed through setInputFormat(int). */
	private int format = 0; 

	/** Number of lines read from the current input; reset to 0 whenever a file is opened and reported in error messages. */
	private int lineNumber = 0;


	/**
	 * Creates a reader without opening any input; only the normalization flag is stored.
	 *
	 * @param normalize value assigned to {@link #normalizeOn}
	 */
	public CONLLReader09(boolean normalize) {
		this.normalizeOn = normalize;
	}

	public CONLLReader09(String file){
		lineNumber=0;
		try {
			inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"),32768);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Creates a reader, opens {@code file} through {@link #CONLLReader09(String)} and stores the
	 * normalization flag.
	 *
	 * @param file      path of the input file
	 * @param normalize value assigned to {@link #normalizeOn}
	 */
	public CONLLReader09(String file, boolean normalize) {
		this(file);
		this.normalizeOn = normalize;
	}

	/**
	 * Selects the input format used by {@link #getNext()}.
	 *
	 * CONLL09 is the standard format; ONE_LINE is the alternative.
	 *
	 * @param format the format (see the constants starting with F_)
	 */
	public void setInputFormat(int format) {
		this.format = format;
	}
	
	

	/**
	 * Creates a reader with no input attached: nothing is opened here, so the internal reader stays
	 * unset until {@link #startReading(String)} is called.
	 */
	public CONLLReader09() {}

	/**
	 * Creates a reader and opens {@code testfile} by delegating to {@link #CONLLReader09(String)}.
	 *
	 * @param testfile path of the input file
	 * @param formatTask not used by this constructor.
	 *        NOTE(review): presumably intended to select the input format; confirm with callers
	 *        before wiring it to {@link #setInputFormat(int)}.
	 */
	public CONLLReader09(String testfile, int formatTask) {
		this(testfile);
	}

	/**
	 * Opens {@code file} as UTF-8 text (32768-character buffer), makes it the current input and
	 * resets the line counter.
	 *
	 * A reader left over from a previous file is closed once the new one has been opened, so that
	 * repeated calls do not leak file handles. If the new file cannot be opened, the stack trace is
	 * printed and the previously assigned reader (if any) is left untouched.
	 *
	 * @param file path of the input file
	 */
	public void startReading(String file) {
		lineNumber = 0;

		// Remember the reader being replaced; it is only released after the new one is in place.
		BufferedReader replaced = inputReader;
		try {
			inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 32768);
		} catch (Exception e) {
			// Best effort, as before: report the problem and keep whatever reader was already set.
			e.printStackTrace();
			return;
		}

		if (replaced != null) {
			try {
				// Closing a reader that was already closed at end of input has no effect.
				replaced.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Returns the next sentence from the current input, dispatching on the configured input format.
	 *
	 * @return the result of getNextOneLine() when the format equals F_ONE_LINE, otherwise the
	 *         result of getNextCoNLL09()
	 */
	public SentenceData09 getNext() {
		return (format == F_ONE_LINE) ? getNextOneLine() : getNextCoNLL09();
	}
	
	/**
	 * Reads one input line of space-separated tokens and wraps it in a sentence.
	 *
	 * Slot 0 of the sentence holds the artificial root entry; the tokens occupy slots 1..n. For the
	 * tokens only the id (the 1-based position) and the form are filled in; the form is passed
	 * through normalize(...) when {@link #normalizeOn} is set. All other per-token arrays are
	 * allocated but left at their defaults.
	 *
	 * @return the sentence built from the next line; {@code null} when the input is exhausted (the
	 *         reader is closed in that case) or when an exception occurred, which is reported on
	 *         standard output
	 */
	private SentenceData09 getNextOneLine() {

		// Declared outside the try block because the error message reports how far the token loop got.
		int tokenIndex = 0;
		try {
			final String rawLine = inputReader.readLine();
			lineNumber++;

			// End of input: release the reader and signal that no sentence is left.
			if (rawLine == null) {
				inputReader.close();
				return null;
			}

			final String[] words = rawLine.split(" ");
			// Splitting an empty line still yields one (empty) element, so force the count to zero.
			final int wordCount = rawLine.isEmpty() ? 0 : words.length;
			// One extra slot in front of the tokens for the root entry.
			final int slots = wordCount + 1;

			final SentenceData09 sentence = new SentenceData09();

			// Allocate every per-token array.
			sentence.id = new String[slots];
			sentence.forms = new String[slots];
			sentence.lemmas = new String[slots];
			sentence.plemmas = new String[slots];
			sentence.gpos = new String[slots];
			sentence.ppos = new String[slots];
			sentence.feats = new String[slots][];
			sentence.ofeats = new String[slots];
			sentence.pfeats = new String[slots];
			sentence.heads = new int[slots];
			sentence.pheads = new int[slots];
			sentence.labels = new String[slots];
			sentence.plabels = new String[slots];
			sentence.fillp = new String[slots];

			// Root entry at index 0.
			sentence.id[0] = "0";
			sentence.forms[0] = ROOT;
			sentence.lemmas[0] = ROOT_LEMMA;
			sentence.plemmas[0] = ROOT_LEMMA;
			sentence.gpos[0] = ROOT_POS;
			sentence.ppos[0] = ROOT_POS;
			sentence.ofeats[0] = NO_TYPE;
			sentence.heads[0] = -1;
			sentence.pheads[0] = -1;
			sentence.labels[0] = NO_TYPE;
			sentence.plabels[0] = NO_TYPE;
			sentence.fillp[0] = "N";

			// The root is 0, so the tokens start at 1.
			for (tokenIndex = 1; tokenIndex <= wordCount; tokenIndex++) {
				final String word = words[tokenIndex - 1];
				sentence.id[tokenIndex] = String.valueOf(tokenIndex);
				if (this.normalizeOn) {
					sentence.forms[tokenIndex] = normalize(word);
				} else {
					sentence.forms[tokenIndex] = word;
				}
			}

			return sentence;

		} catch (Exception e) {
			System.out.println("\n!!! Error in input file sentence before line: "+lineNumber+" (in sentence line "+tokenIndex+" ) "+e.toString());
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Reads an instance.
	 * @return an instance
	 * @throws Exception 
	 */
	
	public SentenceData09 getNextCoNLL09()  {

		String line=null;
		int i=0;
		try {

			ArrayList lineList = new ArrayList();

			line = inputReader.readLine();
			lineNumber++;

			while(line !=null && line.length()==0) {
				line = inputReader.readLine();
				lineNumber++;
				System.out.println("skip empty line at line "+lineNumber);
			} 

			while (line != null && line.length()!=0 &&  !line.startsWith(STRING) &&!line.startsWith(REGEX)) {
				lineList.add(line.split(REGEX));
				line = inputReader.readLine();
				lineNumber++;
			}



			int length = lineList.size();

			if(length == 0) {
				inputReader.close();
				return null;
			}

			SentenceData09 it = new SentenceData09();

			it.forms = new String[length+1];

			it.plemmas = new String[length+1];
			//	it.ppos = new String[length+1];
			it.gpos = new String[length+1];
			it.labels = new String[length+1];
			it.heads = new int[length+1];
			it.pheads = new int[length+1];
			it.plabels = new String[length+1];

			it.ppos = new String[length+1];
			it.lemmas = new String[length+1];
			it.fillp = new String[length+1];
			it.feats = new String[length+1][];
			it.ofeats = new String[length+1];
			it.pfeats = new String[length+1];
			it.id = new String[length+1];

			it.forms[0] = ROOT;
			it.plemmas[0] = ROOT_LEMMA;
			it.fillp[0] = "N";
			it.lemmas[0] = ROOT_LEMMA;

			it.gpos[0] = ROOT_POS;
			it.ppos[0] = ROOT_POS;
			it.labels[0] = NO_TYPE;
			it.heads[0] = -1;
			it.plabels[0] = NO_TYPE;
			it.pheads[0] = -1;
			it.ofeats[0] = NO_TYPE;
			it.id[0] ="0";

			// root is 0 therefore start with 1

			for(i = 1; i <= length; i++) {
				
				
				
				String[] info = lineList.get(i-1);

				it.id[i] = info[0];
				it.forms[i] = info[1]; //normalize(
				if (info.length<3) continue;

				it.lemmas[i] = info[2];
				it.plemmas[i] =info[3]; 
				it.gpos[i] = info[4];  

				if (info.length<5) continue;
				it.ppos[i] = info[5];//.split("\\|")[0];
				// feat 6
	
	
				// now we try underscore
				it.ofeats[i]=info[6].equals(CONLLWriter09.DASH)? "_" : info[6];

				if (joint.length()>0) {
					
					StringBuilder b = new StringBuilder();
//					b.append(it.gpos[i]);
					if (joint.startsWith("cz")) {
						
					//	boolean caseFound =false;
				
						String [] split = it.ofeats[i].split(PIPE);
				//		if (!caseFound)
						for(String s : split) {
							if (s.startsWith("SubPOS")) {
								if (b.length()>0 )b.append("|");
								b.append(s);
							}
						}
					
						for(String s : split) {
							if (s.startsWith("Cas")){
								if (b.length()>0 )b.append("|");
								b.append(s);
							}
						
						}

//						for(String s : split) {
//							if (s.startsWith("Num")) {
//								if (b.length()>0 )b.append("|");
//								b.append(s);
//							}
//						}

				
						
					} else if (joint.contains("ger")) {
					
						String [] split = it.ofeats[i].split(PIPE);
						for(String s : split) { 
							if ( s.matches("Nom|Acc|Dat|Gen")) {
								if (b.length()>0 )b.append("|");
								b.append(s);
							}
							if ( s.matches("Sg|Pl")) {
								if (b.length()>0 )b.append("|");
								b.append(s);
							}
						}
						
					} else {
						String [] split = it.ofeats[i].split(PIPE);
						for(String s : split) 
							if ( s.matches(joint)) b.append("|").append(s);
					}
					if (b.length()==0)b.append("_");
					it.ofeats[i] = b.toString();
				}
				
				if (info[7].equals(CONLLWriter09.DASH)) it.feats[i]=null;
				else {
					it.feats[i] =info[7].split(PIPE);
					it.pfeats[i] = info[7];
				}

				
				
				if (info[8].equals(US))it.heads[i]=-1;
				else it.heads[i] = Integer.parseInt(info[8]);// head
	
				it.pheads[i]=info[9].equals(US) ? it.pheads[i]=-1:  Integer.parseInt(info[9]);// head

				it.labels[i] = info[10];					
				it.plabels[i] = info[11];
				it.fillp[i]=info[12];

				if (info.length>13) {
					if (!info[13].equals(US)) it.addPredicate(i,info[13]);
					for(int k=14;k




© 2015 - 2024 Weber Informatics LLC | Privacy Policy