
gate.creole.tokeniser.SimpleTokeniser Maven / Gradle / Ivy


ANNIE is a general purpose information extraction system that provides the building blocks of many other GATE applications.

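Before the source itself, here is a minimal, illustrative sketch of driving this class programmatically. It is not taken from the GATE distribution: the rules location file:/path/to/my.rules is a placeholder, and in a normal ANNIE pipeline the tokeniser would instead be created through the GATE Factory with its default DefaultTokeniser.rules file.

import java.net.URL;

import gate.Document;
import gate.Factory;
import gate.Gate;
import gate.creole.ResourceReference;
import gate.creole.tokeniser.SimpleTokeniser;

public class TokeniserSketch {
  public static void main(String[] args) throws Exception {
    Gate.init();                                  // prepare the GATE library

    SimpleTokeniser tokeniser = new SimpleTokeniser();
    // placeholder rules file, in the format documented in the class Javadoc below
    tokeniser.setRulesURL(new ResourceReference(new URL("file:/path/to/my.rules")));
    tokeniser.setEncoding("UTF-8");
    tokeniser.init();                             // builds the FSM from the rules

    Document doc = Factory.newDocument("Hello world, 42 times.");
    tokeniser.setDocument(doc);
    tokeniser.execute();                          // adds annotations of the types named by the rules' RHSs

    System.out.println("Annotations produced: " + doc.getAnnotations().size());
    Factory.deleteResource(doc);                  // release the document
  }
}
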
/*
 *  SimpleTokeniser.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 2000
 *
 *  $Id: SimpleTokeniser.java 20054 2017-02-02 06:44:12Z markagreenwood $
 */

package gate.creole.tokeniser;

import java.io.BufferedReader;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.AbstractSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;

/** Implementation of a Unicode rule-based tokeniser.
 * The tokeniser gets its rules from a file, an {@link java.io.InputStream
 * InputStream} or a {@link java.io.Reader Reader} which should be sent to one
 * of the constructors.
 * The implementation is based on a finite state machine that is built from
 * the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side
 * (RHS), which are separated by the ">" character. The LHS represents a
 * regular expression that will be matched against the input while the RHS
 * describes a Gate2 annotation in terms of annotation type and attribute-value
 * pairs.
 * The matching is done using the Unicode enumerated types defined by the {@link
 * java.lang.Character Character} class. At the time of writing this class the
 * supported Unicode categories were:
 * <ul>
 *   <li>UNASSIGNED
 *   <li>UPPERCASE_LETTER
 *   <li>LOWERCASE_LETTER
 *   <li>TITLECASE_LETTER
 *   <li>MODIFIER_LETTER
 *   <li>OTHER_LETTER
 *   <li>NON_SPACING_MARK
 *   <li>ENCLOSING_MARK
 *   <li>COMBINING_SPACING_MARK
 *   <li>DECIMAL_DIGIT_NUMBER
 *   <li>LETTER_NUMBER
 *   <li>OTHER_NUMBER
 *   <li>SPACE_SEPARATOR
 *   <li>LINE_SEPARATOR
 *   <li>PARAGRAPH_SEPARATOR
 *   <li>CONTROL
 *   <li>FORMAT
 *   <li>PRIVATE_USE
 *   <li>SURROGATE
 *   <li>DASH_PUNCTUATION
 *   <li>START_PUNCTUATION
 *   <li>END_PUNCTUATION
 *   <li>CONNECTOR_PUNCTUATION
 *   <li>OTHER_PUNCTUATION
 *   <li>MATH_SYMBOL
 *   <li>CURRENCY_SYMBOL
 *   <li>MODIFIER_SYMBOL
 *   <li>OTHER_SYMBOL
 * </ul>
 * The accepted operators for the LHS are "+", "*" and "|", having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
 * For instance this is a valid LHS:
 *
 * "UPPERCASE_LETTER" "LOWERCASE_LETTER"+
 *
 * meaning an uppercase letter followed by one or more lowercase letters.
 *
 * The RHS describes an annotation that is to be created and inserted in the
 * annotation set provided in case of a match. The new annotation will span the
 * text that has been recognised. The RHS consists of the annotation type
 * followed by pairs of attributes and associated values.
 * E.g. for the LHS above a possible RHS can be:
 *
 * Token;kind=upperInitial;
 *
 * representing an annotation of type "Token" having one attribute
 * named "kind" with the value "upperInitial".
 * The entire rule will be:
 *
 * "UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;
 *
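 * Further rules in the same format, shown here purely as an illustration of
 * the syntax (they resemble, but are not quoted from, the default ANNIE
 * rules file):
 *
 * "DECIMAL_DIGIT_NUMBER"+ > Token;kind=number;
 * "SPACE_SEPARATOR" > SpaceToken;kind=space;
 *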
 * The tokeniser ignores all the empty lines and the ones that start with
 * # or //.
 */
@CreoleResource(name="GATE Unicode Tokeniser",
    comment="A customisable Unicode tokeniser.",
    helpURL="http://gate.ac.uk/userguide/sec:annie:tokeniser",
    icon="tokeniser")
public class SimpleTokeniser extends AbstractLanguageAnalyser
    implements ANNIEConstants {

  private static final long serialVersionUID = 1411111968361716069L;

  public static final String SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

  public static final String SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  public static final String SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

  public static final String SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

  /**
   * Creates a tokeniser
   */
  public SimpleTokeniser(){
  }

  /**
   * Initialises this tokeniser by reading the rules from an external source
   * (provided through a URL) and building the finite state machine at the
   * core of the tokeniser.
   *
   * @exception ResourceInstantiationException
   */
  @Override
  public Resource init() throws ResourceInstantiationException{
    BufferedReader bRulesReader = null;
    try{
      if(rulesURL != null){
        bRulesReader = new BufferedReader(
            new BomStrippingInputStreamReader(rulesURL.openStream(), encoding));
      }else{
        //no init data, Scream!
        throw new ResourceInstantiationException(
          "No URL provided for the rules!");
      }
      initialState = new FSMState(this);
      String line = bRulesReader.readLine();
      ///String toParse = "";
      StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);

      while (line != null){
        if(line.endsWith("\\")){
          ///toParse += line.substring(0,line.length()-1);
          toParse.append(line.substring(0,line.length()-1));
        }else{
          /*toParse += line;
          parseRule(toParse);
          toParse = "";
          */
          toParse.append(line);
          parseRule(toParse.toString());
          toParse.delete(0,toParse.length());
        }
        line = bRulesReader.readLine();
      }
      eliminateVoidTransitions();
    }catch(java.io.IOException ioe){
      throw new ResourceInstantiationException(ioe);
    }catch(TokeniserException te){
      throw new ResourceInstantiationException(te);
    } finally {
      IOUtils.closeQuietly(bRulesReader);
    }
    return this;
  }

  /**
   * Prepares this Processing resource for a new run.
   */
  public void reset(){
    document = null;
  }

  /** Parses one input line containing a tokeniser rule.
   * This will create the necessary FSMState objects and the links
   * between them.
   *
   * @param line the string containing the rule
   */
  void parseRule(String line)throws TokeniserException{
    //ignore comments
    if(line.startsWith("#")) return;
    if(line.startsWith("//")) return;

    StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
    FSMState newState = new FSMState(this);

    initialState.put(null, newState);
    FSMState finalState = parseLHS(newState, st, LHStoRHS);
    String rhs = "";

    if(st.hasMoreTokens()) rhs = st.nextToken("\f");
    if(rhs.length() > 0)finalState.setRhs(rhs);
  } // parseRule

  /** Parses a part or the entire LHS.
   *
   * @param startState a FSMState object representing the initial state for
   * the small FSM that will recognise the (part of) the rule parsed by this
   * method.
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   * provides the input
   * @param until the string that marks the end of the section to be
   * recognised. This method will first be called by {@link
   * #parseRule(String)} with ">" in order to parse the entire LHS.
   * When necessary it will make a recursive call to {@link #parseLHS
   * parseLHS} to parse a region of the LHS (e.g. a "(",")" enclosed part).
   */
  FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
          throws TokeniserException{
    FSMState currentState = startState;
    boolean orFound = false;
    List<FSMState> orList = new LinkedList<FSMState>();
    String token;
    token = skipIgnoreTokens(st);

    if(null == token) return currentState;

    FSMState newState;
    Integer typeId;
    UnicodeType uType;

    bigwhile: while(!token.equals(until)){
      if(token.equals("(")){//(..)
        newState = parseLHS(currentState, st,")");
      } else if(token.equals("\"")){//"unicode_type"
        String sType = parseQuotedString(st, "\"");
        newState = new FSMState(this);
        typeId = stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      } else {// a type with no quotes
        String sType = token;
        newState = new FSMState(this);
        typeId = stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      }
      //treat the operators
      token = skipIgnoreTokens(st);

      if(null == token)
        throw new InvalidRuleException("Tokeniser rule ended too soon!");

      if(token.equals("|")) {
        orFound = true;
        orList.add(newState);
        token = skipIgnoreTokens(st);

        if(null == token)
          throw new InvalidRuleException("Tokeniser rule ended too soon!");

        continue bigwhile;
      } else if(orFound) {//done parsing the "|"
        orFound = false;
        orList.add(newState);
        newState = new FSMState(this);
        Iterator<FSMState> orListIter = orList.iterator();

        while(orListIter.hasNext())
          orListIter.next().put(null, newState);
        orList.clear();
      }

      if(token.equals("+")) {
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token)
          throw new InvalidRuleException("Tokeniser rule ended too soon!");
      } else if(token.equals("*")) {
        currentState.put(null,newState);
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token)
          throw new InvalidRuleException("Tokeniser rule ended too soon!");
      }
      currentState = newState;
    }
    return currentState;
  } // parseLHS

  /** Parses from the given string tokeniser until it finds a specific
   * delimiter.
   * One use for this method is to read everything until the first quote.
   *
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   * provides the input
   * @param until a String representing the end delimiter.
   */
  String parseQuotedString(StringTokenizer st, String until)
          throws TokeniserException {
    String token;

    if(st.hasMoreElements()) token = st.nextToken();
    else return null;

    ///String type = "";
    StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);

    while(!token.equals(until)){
      //type += token;
      type.append(token);
      if(st.hasMoreElements())token = st.nextToken();
      else throw new InvalidRuleException("Tokeniser rule ended too soon!");
    }
    return type.toString();
  } // parseQuotedString

  /** Skips the ignorable tokens from the input returning the first significant
   * token.
   * The ignorable tokens are defined by {@link #ignoreTokens a set}
   */
  protected static String skipIgnoreTokens(StringTokenizer st){
    Iterator<String> ignorables;
    boolean ignorableFound = false;
    String currentToken;

    while(true){
      if(st.hasMoreTokens()){
        currentToken = st.nextToken();
        ignorables = ignoreTokens.iterator();
        ignorableFound = false;

        while(!ignorableFound && ignorables.hasNext()){
          if(currentToken.equals(ignorables.next())) ignorableFound = true;
        }

        if(!ignorableFound) return currentToken;
      } else return null;
    }
  }//skipIgnoreTokens

  /* Computes the lambda-closure (aka epsilon closure) of the given set of
   * states, that is the set of states that are accessible from any of the
   * states in the given set using only unrestricted transitions.
   * @return a set containing all the states accessible from this state via
   * transitions that bear no restrictions.
   */
  /**
   * Converts the finite state machine to a deterministic one.
   *
   * @param s
   */
  private AbstractSet<FSMState> lambdaClosure(Set<FSMState> s){
    //the stack/queue used by the algorithm
    LinkedList<FSMState> list = new LinkedList<FSMState>(s);
    //the set to be returned
    AbstractSet<FSMState> lambdaClosure = new HashSet<FSMState>(s);
    FSMState top;
    FSMState currentState;
    Set<FSMState> nextStates;
    Iterator<FSMState> statesIter;

    while(!list.isEmpty()) {
      top = list.removeFirst();
      nextStates = top.nextSet(null);

      if(null != nextStates){
        statesIter = nextStates.iterator();

        while(statesIter.hasNext()) {
          currentState = statesIter.next();
          if(!lambdaClosure.contains(currentState)){
            lambdaClosure.add(currentState);
            list.addFirst(currentState);
          }//if(!lambdaClosure.contains(currentState))
        }//while(statesIter.hasNext())
      }//if(null != nextStates)
    }
    return lambdaClosure;
  } // lambdaClosure

  /** Converts the FSM from a non-deterministic to a deterministic one by
   * eliminating all the unrestricted transitions.
   */
  void eliminateVoidTransitions() throws TokeniserException {
    //kalina:clear() faster than init() which is called with init()
    newStates.clear();
    Set<Set<FSMState>> sdStates = new HashSet<Set<FSMState>>();
    LinkedList<Set<FSMState>> unmarkedDStates = new LinkedList<Set<FSMState>>();
    DFSMState dCurrentState = new DFSMState(this);
    Set<FSMState> sdCurrentState = new HashSet<FSMState>();

    sdCurrentState.add(initialState);
    sdCurrentState = lambdaClosure(sdCurrentState);
    newStates.put(sdCurrentState, dCurrentState);
    sdStates.add(sdCurrentState);

    //find out if the new state is a final one
    Iterator<FSMState> innerStatesIter = sdCurrentState.iterator();
    String rhs;
    FSMState currentInnerState;
    Set<String> rhsClashSet = new HashSet<String>();
    boolean newRhs = false;

    while(innerStatesIter.hasNext()){
      currentInnerState = innerStatesIter.next();
      if(currentInnerState.isFinal()){
        rhs = currentInnerState.getRhs();
        rhsClashSet.add(rhs);
        dCurrentState.rhs = rhs;
        newRhs = true;
      }
    }

    if(rhsClashSet.size() > 1){
      Err.println("Warning, rule clash: " + rhsClashSet +
                  "\nSelected last definition: " + dCurrentState.rhs);
    }

    if(newRhs)dCurrentState.buildTokenDesc();
    rhsClashSet.clear();
    unmarkedDStates.addFirst(sdCurrentState);
    dInitialState = dCurrentState;
    Set<FSMState> nextSet;

    while(!unmarkedDStates.isEmpty()){
      //Out.println("\n\n=====================" + unmarkedDStates.size());
      sdCurrentState = unmarkedDStates.removeFirst();
      for(int type = 0; type < maxTypeId; type++){
        //Out.print(type);
        nextSet = new HashSet<FSMState>();
        innerStatesIter = sdCurrentState.iterator();

        while(innerStatesIter.hasNext()){
          currentInnerState = innerStatesIter.next();
          Set<FSMState> tempSet = currentInnerState.nextSet(type);
          if(null != tempSet) nextSet.addAll(tempSet);
        }//while(innerStatesIter.hasNext())

        if(!nextSet.isEmpty()){
          nextSet = lambdaClosure(nextSet);
          dCurrentState = newStates.get(nextSet);

          if(dCurrentState == null){
            //we have a new DFSMState
            dCurrentState = new DFSMState(this);
            sdStates.add(nextSet);
            unmarkedDStates.add(nextSet);

            //check to see whether the new state is a final one
            innerStatesIter = nextSet.iterator();
            newRhs = false;

            while(innerStatesIter.hasNext()){
              currentInnerState = innerStatesIter.next();
              if(currentInnerState.isFinal()){
                rhs = currentInnerState.getRhs();
                rhsClashSet.add(rhs);
                dCurrentState.rhs = rhs;
                newRhs = true;
              }
            }

            if(rhsClashSet.size() > 1){
              Err.println("Warning, rule clash: " + rhsClashSet +
                          "\nSelected last definition: " + dCurrentState.rhs);
            }

            if(newRhs)dCurrentState.buildTokenDesc();
            rhsClashSet.clear();
            newStates.put(nextSet, dCurrentState);
          }
          newStates.get(sdCurrentState).put(type,dCurrentState);
        } // if(!nextSet.isEmpty())
      } // for(int type = 0; type < maxTypeId; type++)
    } // while(!unmarkedDStates.isEmpty())
  } // eliminateVoidTransitions

  /** Returns a string representation of the non-deterministic FSM graph using
   * GML (Graph modelling language).
   */
  public String getFSMgml(){
    String res = "graph[ \ndirected 1\n";
    ///String nodes = "", edges = "";
    StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
                 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
    Iterator<FSMState> fsmStatesIter = fsmStates.iterator();

    while (fsmStatesIter.hasNext()){
      FSMState currentState = fsmStatesIter.next();
      int stateIndex = currentState.getIndex();
      /*nodes += "node[ id " + stateIndex +
               " label \"" + stateIndex;
      */
      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        ///nodes += ",F\\n" + currentState.getRhs();
        nodes.append(",F\\n" + currentState.getRhs());
      }
      ///nodes += "\" ]\n";
      nodes.append("\" ]\n");
      ///edges += currentState.getEdgesGML();
      edges.append(currentState.getEdgesGML());
    }
    res += nodes.toString() + edges.toString() + "]\n";
    return res;
  } // getFSMgml

  /** Returns a string representation of the deterministic FSM graph using
   * GML.
   */
  public String getDFSMgml() {
    String res = "graph[ \ndirected 1\n";
    ///String nodes = "", edges = "";
    StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
                 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
    Iterator<DFSMState> dfsmStatesIter = dfsmStates.iterator();

    while (dfsmStatesIter.hasNext()) {
      DFSMState currentState = dfsmStatesIter.next();
      int stateIndex = currentState.getIndex();
      /* nodes += "node[ id " + stateIndex +
                " label \"" + stateIndex;
      */
      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        /// nodes += ",F\\n" + currentState.getRhs();
        nodes.append(",F\\n" + currentState.getRhs());
      }
      /// nodes += "\" ]\n";
      nodes.append("\" ]\n");
      /// edges += currentState.getEdgesGML();
      edges.append(currentState.getEdgesGML());
    }
    res += nodes.toString() + edges.toString() + "]\n";
    return res;
  } // getDFSMgml

  /**
   * The method that does the actual tokenisation.
   */
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    //check the input
    if(document == null) {
      throw new ExecutionException(
        "No document to tokenise!"
      );
    }

    if(annotationSetName == null ||
       annotationSetName.equals("")) annotationSet = document.getAnnotations();
    else annotationSet = document.getAnnotations(annotationSetName);

    fireStatusChanged(
        "Tokenising " + document.getName() + "...");

    String content = document.getContent().toString();
    int length = content.length();
    int currentChar;
    int charsInCurrentCP = 1;

    DFSMState graphPosition = dInitialState;

    //the index of the first character of the token trying to be recognised
    int tokenStart = 0;

    DFSMState lastMatchingState = null;
    DFSMState nextState;
    String tokenString;
    int charIdx = 0;
    int oldCharIdx = 0;
    FeatureMap newTokenFm;

    while(charIdx < length){
      currentChar = content.codePointAt(charIdx);
      // number of chars we have to advance after processing this code point.
      // 1 in the vast majority of cases, but 2 where the code point is a
      // supplementary character represented as a surrogate pair.
      charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1;

      // Out.println(
      //   currentChar + typesMnemonics[Character.getType(currentChar)+128]);
      nextState = graphPosition.next(typeIds.get(
          new Integer(Character.getType(currentChar))).intValue());

      if( null != nextState ) {
        graphPosition = nextState;
        if(graphPosition.isFinal()) {
          lastMatchingState = graphPosition;
        }
        charIdx += charsInCurrentCP;
      } else {//we have a match!
        newTokenFm = Factory.newFeatureMap();

        if (null == lastMatchingState) {
          // no rule matches this character, so create a single-char
          // DEFAULT_TOKEN annotation covering it and start again after it
          charIdx = tokenStart + charsInCurrentCP;
          tokenString = content.substring(tokenStart, charIdx);
          newTokenFm.put("type","UNKNOWN");
          newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
          newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                         Integer.toString(tokenString.length()));

          try {
            annotationSet.add(new Long(tokenStart),
                              new Long(charIdx),
                              "DEFAULT_TOKEN", newTokenFm);
          } catch (InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            ioe.printStackTrace(Err.getPrintWriter());
          }
          // Out.println("Default token: " + tokenStart +
          //             "->" + tokenStart + " :" + tokenString + ";");
        } else {
          // we've reached the end of a string that the FSM recognised
          tokenString = content.substring(tokenStart, charIdx);
          newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
          newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                         Integer.toString(tokenString.length()));

          for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
            newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                           lastMatchingState.getTokenDesc()[i][1]);
            //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
            //            lastMatchingState.getTokenDesc()[i][1]);
          }

          try {
            annotationSet.add(new Long(tokenStart),
                              new Long(charIdx),
                              lastMatchingState.getTokenDesc()[0][0], newTokenFm);
          } catch(InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            throw new GateRuntimeException(ioe.toString());
          }
          // Out.println(lastMatchingState.getTokenDesc()[0][0] +
          //             ": " + tokenStart + "->" + lastMatch +
          //             " :" + tokenString + ";");
          //charIdx = lastMatch + 1;
        }

        // reset to initial state and start looking again from here
        lastMatchingState = null;
        graphPosition = dInitialState;
        tokenStart = charIdx;
      }

      if((charIdx - oldCharIdx > 256)){
        fireProgressChanged((100 * charIdx )/ length );
        oldCharIdx = charIdx;
        if(isInterrupted()) throw new ExecutionInterruptedException();
      }
    } // while(charIdx < length)

    if (null != lastMatchingState) {
      // we dropped off the end having found a match, annotate it
      tokenString = content.substring(tokenStart, charIdx);
      newTokenFm = Factory.newFeatureMap();
      newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
      newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                     Integer.toString(tokenString.length()));

      for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
        newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                       lastMatchingState.getTokenDesc()[i][1]);
      }

      try {
        annotationSet.add(new Long(tokenStart),
                          new Long(charIdx),
                          lastMatchingState.getTokenDesc()[0][0], newTokenFm);
      } catch(InvalidOffsetException ioe) {
        //This REALLY shouldn't happen!
        throw new GateRuntimeException(ioe.toString());
      }
    }

    reset();
    fireProcessFinished();
    fireStatusChanged("Tokenisation complete!");
  } // execute

  /**
   * Sets the value of the rulesURL property, which holds a URL
   * to the file containing the rules for this tokeniser.
   * @param newRulesURL
   */
  @CreoleParameter(defaultValue="resources/tokeniser/DefaultTokeniser.rules",
      comment="The URL to the rules file", suffixes="rules")
  public void setRulesURL(ResourceReference newRulesURL) {
    rulesURL = newRulesURL;
  }

  @Deprecated
  public void setRulesURL(URL newRulesURL) {
    try {
      this.setRulesURL(new ResourceReference(newRulesURL));
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
  }

  /**
   * Gets the value of the rulesURL property, which holds a URL
   * to the file containing the rules for this tokeniser.
   */
  public ResourceReference getRulesURL() {
    return rulesURL;
  }

  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used for the generated annotations")
  public void setAnnotationSetName(String newAnnotationSetName) {
    annotationSetName = newAnnotationSetName;
  }

  /** Gets the name of the annotation set used for the generated annotations. */
  public String getAnnotationSetName() {
    return annotationSetName;
  }

  public void setRulesResourceName(String newRulesResourceName) {
    rulesResourceName = newRulesResourceName;
  }

  public String getRulesResourceName() {
    return rulesResourceName;
  }

  @CreoleParameter(defaultValue="UTF-8",
      comment="The encoding used for reading the definitions")
  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }

  public String getEncoding() {
    return encoding;
  }

  /** The annotation set where the new annotations will be added */
  protected String annotationSetName;

  /** The initial state of the non deterministic machine */
  protected FSMState initialState;

  /** A set containing all the states of the non deterministic machine */
  protected Set<FSMState> fsmStates = new HashSet<FSMState>();

  /** The initial state of the deterministic machine */
  protected DFSMState dInitialState;

  /** A set containing all the states of the deterministic machine */
  protected Set<DFSMState> dfsmStates = new HashSet<DFSMState>();

  /** The separator from LHS to RHS */
  static String LHStoRHS = ">";

  /** A set of strings representing tokens to be ignored (e.g.
   * blanks) */
  protected static final Set<String> ignoreTokens;

  /** Maps from int (the static value on {@link java.lang.Character}) to int
   * (the internal value used by the tokeniser). The int values used by the
   * tokeniser are consecutive values, starting from 0 and going as high as
   * necessary.
   * They map all the public static int members of {@link java.lang.Character}.
   */
  protected static final Map<Integer, Integer> typeIds;

  /** The maximum int value used internally as a type id */
  protected static final int maxTypeId;

  /** Maps the internal type ids to the type names */
  protected static final List<String> typeMnemonics;

  /** Maps from type names to internal type ids */
  protected static final Map<String, Integer> stringTypeIds;

  private String rulesResourceName;

  private ResourceReference rulesURL;

  private String encoding;

  //kalina: added this as a field to avoid too many init() calls
  protected transient Map<Set<FSMState>, DFSMState> newStates =
    new HashMap<Set<FSMState>, DFSMState>();

  /** The static initialiser will inspect the class {@link java.lang.Character}
   * using reflection to find all the public static members and will map them
   * to ids starting from 0.
   * After that it will build all the static data: {@link #typeIds}, {@link
   * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
   */
  static{
    Field[] characterClassFields;

    try{
      characterClassFields = Class.forName("java.lang.Character").getFields();
    }catch(ClassNotFoundException cnfe){
      throw new GateRuntimeException("Could not find the java.lang.Character class!");
    }

    Collection<Field> staticFields = new LinkedList<Field>();
    // JDK 1.4 introduced directionality constants that have the same values as
    // character types; we need to skip those as well
    for(int i = 0; i < characterClassFields.length; i++)
      if(Modifier.isStatic(characterClassFields[i].getModifiers()) &&
         characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1)
        staticFields.add(characterClassFields[i]);

    Map<Integer, Integer> tempTypeIds = new HashMap<Integer, Integer>();
    maxTypeId = staticFields.size() - 1;
    String[] mnemonics = new String[maxTypeId + 1];
    Map<String, Integer> tempStringTypeIds = new HashMap<String, Integer>();

    Iterator<Field> staticFieldsIter = staticFields.iterator();
    Field currentField;
    int currentId = 0;
    String fieldName;

    try {
      while(staticFieldsIter.hasNext()){
        currentField = staticFieldsIter.next();
        if(currentField.getType().toString().equals("byte")){
          fieldName = currentField.getName();
          tempTypeIds.put(new Integer(currentField.getInt(null)),
                          new Integer(currentId));
          mnemonics[currentId] = fieldName;
          tempStringTypeIds.put(fieldName, new Integer(currentId));
          currentId++;
        }
      }
    } catch(Exception e) {
      throw new GateRuntimeException(e.toString());
    }

    typeIds = Collections.unmodifiableMap(tempTypeIds);
    stringTypeIds = Collections.unmodifiableMap(tempStringTypeIds);

    Set<String> toIgnore = new HashSet<String>();
    toIgnore.add(" ");
    toIgnore.add("\t");
    toIgnore.add("\f");
    ignoreTokens = Collections.unmodifiableSet(toIgnore);

    typeMnemonics = Collections.unmodifiableList(Arrays.asList(mnemonics));
  }
} // class SimpleTokeniser
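
As a companion to the execute() method above, here is an equally illustrative sketch (again not part of the distribution) of reading back the annotations it creates, assuming the document has already been tokenised as in the example near the top of this page. The "kind" feature is only present where the matching rule's RHS sets it; "string" is the built-in TOKEN_STRING_FEATURE_NAME feature.

import gate.Annotation;
import gate.Document;

public class TokenDump {
  /** Print offsets and selected features of the Token annotations produced
   *  by SimpleTokeniser on an already-tokenised document. */
  public static void dumpTokens(Document doc) {
    for (Annotation token : doc.getAnnotations().get("Token")) {
      System.out.println(
          token.getStartNode().getOffset() + "-" + token.getEndNode().getOffset()
          + "  kind=" + token.getFeatures().get("kind")
          + "  string=" + token.getFeatures().get("string"));
    }
  }
}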



