jvnsensegmenter.FeatureGenerator Maven / Gradle / Ivy
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
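To pull this artifact in through Maven, a dependency along these lines should work; the groupId and version below are assumptions based on the HeidelTime releases published to Maven Central (the page only confirms the artifactId), so check the artifact listing for the current coordinates:

    <dependency>
        <groupId>com.github.heideltime</groupId>
        <artifactId>heideltime</artifactId>
        <version>2.2.1</version>
    </dependency>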
/*
* Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* [email protected] or [email protected]
*
* Xuan-Hieu Phan
* [email protected]
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnsensegmenter;
import java.util.*;
import java.io.*;
import jvntextpro.util.StringUtils;
// TODO: Auto-generated Javadoc
/**
* The Class FeatureGenerator.
*
* @author TuNC
*/
public class FeatureGenerator {
/**
* The main method.
*
* @param args the arguments
*/
public static void main(String[] args){
if (args.length != 3){
printUsage();
System.exit(1);
}
boolean label = (args[0].toLowerCase().trim().equals("-lbl"));
try{
String inputWhat = args[1].toLowerCase().trim();
if (inputWhat.equals("-inputfile"))
{
BufferedReader in = new BufferedReader(new InputStreamReader(
new FileInputStream(args[2]), "UTF-8"));
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(args[2] + ".tagged"), "UTF-8"));
String text = "", line = "";
while ((line = in.readLine()) != null){
text += "\n" + line;
}
text = text.trim();
//text normalization
text = text.replaceAll("([\t\n\r ])+", "$1");
text = text.replaceAll("[\\[\\]]", "");
text = text.replaceAll("<[^<>]*>", "");
List markList = new ArrayList();
ArrayList recordList = (ArrayList) doFeatureGen(
        new HashMap(), text, markList, label);
for (int i = 0; i < recordList.size(); ++i){
out.write(recordList.get(i).toString());
out.write("\n");
}
in.close();
out.close();
}
else if (inputWhat.equals("-inputdir")){
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(args[2] + ".tagged"), "UTF-8"));
File inputDir = new File(args[2]);
File[] children = inputDir.listFiles();
for (int i = 0; i < children.length; ++i){
    BufferedReader in = new BufferedReader(new InputStreamReader(
            new FileInputStream(children[i]), "UTF-8"));
    String text = "", line = "";
    while ((line = in.readLine()) != null){
        text += "\n" + line;
    }
    text = text.trim();
    //text normalization (same as the single-file case above)
    text = text.replaceAll("([\t\n\r ])+", "$1");
    text = text.replaceAll("[\\[\\]]", "");
    text = text.replaceAll("<[^<>]*>", "");
    List markList = new ArrayList();
    ArrayList recordList = (ArrayList) doFeatureGen(
            new HashMap(), text, markList, label);
for (int j = 0; j < recordList.size(); ++j){
out.write(recordList.get(j).toString());
out.write("\n");
}
in.close();
}
out.close();
}
else printUsage();
}
catch (Exception e)
{
System.out.println("In feature generator main : " + e.getMessage());
return;
}
}
/**
* Prints the usage.
*/
public static void printUsage(){
System.out.println("Usage: FeatureGeneration -lbl/-unlbl -inputfile/-inputdir [input file/input dir]");
}
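/*
 * Illustrative invocation (file names are hypothetical): to generate
 * labeled training records from a UTF-8 text file, run
 *
 *   java jvnsensegmenter.FeatureGenerator -lbl -inputfile train.txt
 *
 * which writes one feature record per candidate boundary mark to
 * train.txt.tagged. Use -unlbl to omit the trailing y/n label, or
 * -inputdir to process every file in a directory into a single
 * .tagged output file.
 */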
/**
* Read abbr list.
*
* @param dataFile the data file
* @param map the map
* @throws IOException Signals that an I/O exception has occurred.
*/
public static void readAbbrList(String dataFile, Map map) throws IOException {
BufferedReader fin = new BufferedReader(new InputStreamReader(
        new FileInputStream(dataFile), "UTF-8")); // read as UTF-8, consistent with the rest of the class
String line;
while ((line = fin.readLine()) != null) {
StringTokenizer strTok = new StringTokenizer(line, " \t\r\n");
if (strTok.countTokens() <= 0) {
continue;
}
String token = strTok.nextToken();
map.put(token.toLowerCase(), token.toLowerCase());
}
}
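/*
 * Expected abbreviation-list format (entries are illustrative): one
 * abbreviation per line; only the first whitespace-delimited token of
 * each line is kept, lower-cased, e.g.
 *
 *   TS.
 *   GS.
 *   Mr.
 */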
/**
* Generates context predicates for the given text, returning one record string per candidate sentence-boundary mark.
*
* @param map the map
* @param text the text
* @param markList the mark list
* @param label the label
* @return the list
*/
public static List doFeatureGen(Map map, String text , List markList, boolean label){
markList.clear();
//Find out positions of .!? and store them in the markList
int nextPos = 0;
while( (nextPos = StringUtils.findFirstOf(text, ".!?", nextPos + 1)) != -1)
markList.add(new Integer(nextPos));
//Generate context predicates at those positions
List results = new ArrayList();
for (int i = 0; i < markList.size(); ++i){
int curPos = ((Integer) markList.get(i)).intValue();
String record = genCPs(map, text, curPos);
//Assign label to feature string if it is specified
if (label){
    int idx = StringUtils.findFirstNotOf(text, " \t", curPos + 1);
    if (idx == -1 || (text.charAt(idx) == '\n')){
        //end of sentence
        record += " y";
    }
    else record += " n";
}
results.add(record);
}
return results;
}
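/*
 * Illustrative record (hypothetical two-line input "Anh đã đến.\nTrời mưa."):
 * for the first '.', the returned string looks roughly like
 *
 *   01=đến. 02=đến. 06 09=1 ... 17=đã 18=đã ... 28=Trời ... y
 *
 * i.e. the numbered context predicates emitted by genCPs, with the
 * trailing "y" (label == true) because the mark is followed by a line
 * break; mid-sentence marks get "n" instead.
 */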
/**
* Gets the context predicates at a specified position in the text.
*
* @param map the map
* @param text the text
* @param position the position
* @return the string
*/
private static String genCPs(Map map, String text, int position){
//get the current token(containing this mark) and its suffix & prefix
String token = "", suffix = "", prefix = "";
int idx1 = -1, idx2 = -1, idx;
idx1 = StringUtils.findLastOf(text, " \t\n\r", position);
if (idx1 == -1) idx1 = 0;
idx2 = StringUtils.findFirstOf(text, " \t\n\r", position + 1);
if (idx2 == -1) idx2 = text.length();
token = text.substring(idx1 + 1, idx2);
if (position + 1 < idx2)
suffix = text.substring(position + 1, idx2).trim();
if (idx1 + 1 < position)
prefix = text.substring(idx1 + 1, position).trim();
idx = idx2; // save idx2 so the next-token search can resume from here
//get the previous token
String preToken = "";
if (idx1 != 0 ){
idx2 = StringUtils.findLastNotOf(text, " \t\n\r", idx1);
idx1 = StringUtils.findLastOf(text, " \t\n\r", idx2);
if (idx1 == -1) idx1 = 0;
if (idx2 != -1)
preToken = text.substring(idx1, idx2 + 1).trim();
}
//get the next token
String nexToken = "";
idx2 = idx;
if (idx2 != text.length()){
idx1 = StringUtils.findFirstNotOf(text, " \t\n\r", idx2 + 1);
idx2 = StringUtils.findFirstOf(text, " \t\n\r", idx1);
if (idx2 == -1) idx2 = text.length();
if (idx1 != -1)
nexToken = text.substring(idx1, idx2).trim();
}
//generating context predicates
String cps = "";
// 01:tok=
cps += " 01=" + token;
// 02:tok-lower
cps += " 02=" + token.toLowerCase();
if (StringUtils.isFirstCap(token)) {
// 03:tok-first-cap
cps += " 03";
}
if (map.containsKey(token.toLowerCase())) {
// 04:tok-in-abbrlist
cps += " 04";
}
if (StringUtils.containNumber(token)) {
// 05:tok-has-num
cps += " 05";
}
if (StringUtils.containLetter(token)) {
// 06:tok-has-let
cps += " 06";
}
if (StringUtils.containLetterAndDigit(token)) {
// 07:tok-has-let-num
cps += " 07";
}
if (StringUtils.isAllNumber(token)) {
// 08:tok-is-all-num
cps += " 08";
}
// 09:tok-countstop
cps += " 09=" + Integer.toString(StringUtils.countStops(token));
// 10:tok-countsign
cps += " 10=" + Integer.toString(StringUtils.countPuncs(token));
// 11:tok-pre
cps += " 11=" + prefix;
// 12:tok-pre-lower
cps += " 12=" + prefix.toLowerCase();
if (StringUtils.isFirstCap(prefix)) {
// 13:tok-pre-first-cap
cps += " 13";
}
// 14:tok-suf
cps += " 14=" + suffix;
// 15:tok-suf-lower
cps += " 15=" + suffix.toLowerCase();
if (StringUtils.isFirstCap(suffix)) {
// 16:tok-suf-first-cap
cps += " 16";
}
if (preToken != "") {
// 17:pre-tok
cps += " 17=" + preToken;
// 18:pre-tok-lower
cps += " 18=" + preToken.toLowerCase();
if (StringUtils.isFirstCap(preToken)) {
// 19:pre-tok-first-cap
cps += " 19";
}
if (map.containsKey(preToken.toLowerCase())) {
// 20:pre-tok-in-abbrlist
cps += " 20";
}
if (StringUtils.containNumber(preToken)) {
// 21:pre-tok-has-num
cps += " 21";
}
if (StringUtils.containLetter(preToken)) {
// 22:pre-tok-has-let
cps += " 22";
}
if (StringUtils.containLetterAndDigit(preToken)) {
// 23:pre-tok-has-let-num
cps += " 23";
}
if (StringUtils.isAllNumber(preToken)) {
// 24:pre-tok-is-allnum
cps += " 24";
}
// 25:pre-tok-countstop
cps += " 25=" + Integer.toString(StringUtils.countStops(preToken));
// 26:pre-tok-countsign
cps += " 26=" + Integer.toString(StringUtils.countPuncs(preToken));
} else {
// 27:pre-tok
cps += " 27=null";
}
if (nexToken != "") {
// 28:nex-tok
cps += " 28=" + nexToken;
// 29:nex-tok-lower
cps += " 29=" + nexToken.toLowerCase();
if (StringUtils.isFirstCap(nexToken)) {
// 30:nex-tok-first-cap
cps += " 30";
}
if (map.containsKey(nexToken.toLowerCase())) {
// 31:nex-tok-in-abbrlist
cps += " 31";
}
if (nexToken.startsWith("\"") || nexToken.startsWith("''") || nexToken.startsWith("``")
        || nexToken.startsWith("'") || nexToken.startsWith("`")) {
    // 39:nex-tok-starts-with-quote (code 39 is also reused for tok-has-@ below)
    cps += " 39";
}
if (StringUtils.isFirstCap(nexToken)) {
    // 40:nex-tok-first-cap (duplicates 30; code 40 is also reused for len-of-prefix below)
    cps += " 40";
}
if (StringUtils.containNumber(nexToken)) {
// 32:nex-tok-has-num
cps += " 32";
}
if (StringUtils.containLetter(nexToken)) {
// 33:nex-tok-has-let
cps += " 33";
}
if (StringUtils.containLetterAndDigit(nexToken)) {
// 34:nex-tok-has-let-num
cps += " 34";
}
if (StringUtils.isAllNumber(nexToken)) {
// 35:nex-tok-is-allnum
cps += " 35";
}
// 36:nex-tok-countstop
cps += " 36=" + Integer.toString(StringUtils.countStops(nexToken));
// 37:nex-tok-countsign
cps += " 37=" + Integer.toString(StringUtils.countPuncs(nexToken));
} else {
// 38:nex-tok
cps += " 38=null";
}
//extra context predicates for Vietnamese sentence segmentation
//39:tok-has-@
if (token.contains("@"))
cps += " 39";
//40:len-of-prefix
cps += " 40=" + prefix.length();
//41:len-of-suffix
cps += " 41=" + suffix.length();
//42:tok-has-slash
if (token.contains("/"))
cps += " 42";
//43:nex-tok-first_char
if (nexToken != "")
cps += " 43=" + nexToken.charAt(0);
return cps.trim();
}
}