gate.creole.tokeniser.SimpleTokeniser
ANNIE is a general purpose information extraction system that
provides the building blocks of many other GATE applications.
/*
 * SimpleTokeniser.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Valentin Tablan, 2000
*
* $Id: SimpleTokeniser.java 20054 2017-02-02 06:44:12Z markagreenwood $
*/
package gate.creole.tokeniser;
import java.io.BufferedReader;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.AbstractSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.commons.io.IOUtils;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
/** Implementation of a Unicode rule-based tokeniser.
 * The tokeniser gets its rules from a file, an {@link java.io.InputStream
 * InputStream} or a {@link java.io.Reader Reader} which should be sent to one
 * of the constructors.
 * The implementation is based on a finite state machine that is built from
 * the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side
 * (RHS), separated by the ">" character. The LHS is a regular
 * expression that will be matched against the input, while the RHS describes
 * a GATE annotation in terms of annotation type and attribute-value pairs.
 * The matching is done using the Unicode enumerated types defined by the
 * {@link java.lang.Character Character} class. At the time of writing this
 * class the supported Unicode categories were:
 * - UNASSIGNED
 * - UPPERCASE_LETTER
 * - LOWERCASE_LETTER
 * - TITLECASE_LETTER
 * - MODIFIER_LETTER
 * - OTHER_LETTER
 * - NON_SPACING_MARK
 * - ENCLOSING_MARK
 * - COMBINING_SPACING_MARK
 * - DECIMAL_DIGIT_NUMBER
 * - LETTER_NUMBER
 * - OTHER_NUMBER
 * - SPACE_SEPARATOR
 * - LINE_SEPARATOR
 * - PARAGRAPH_SEPARATOR
 * - CONTROL
 * - FORMAT
 * - PRIVATE_USE
 * - SURROGATE
 * - DASH_PUNCTUATION
 * - START_PUNCTUATION
 * - END_PUNCTUATION
 * - CONNECTOR_PUNCTUATION
 * - OTHER_PUNCTUATION
 * - MATH_SYMBOL
 * - CURRENCY_SYMBOL
 * - MODIFIER_SYMBOL
 * - OTHER_SYMBOL
 *
 * The accepted operators for the LHS are "+", "*" and "|", having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
 * For instance, this is a valid LHS:
 *   "UPPERCASE_LETTER" "LOWERCASE_LETTER"+
 * meaning an uppercase letter followed by one or more lowercase letters.
 *
 * The RHS describes an annotation that is to be created and inserted in the
 * annotation set provided in case of a match. The new annotation will span
 * the text that has been recognised. The RHS consists of the annotation type
 * followed by pairs of attributes and associated values.
 * E.g. for the LHS above a possible RHS can be:
 *   Token;kind=upperInitial;
 * representing an annotation of type "Token" having one attribute named
 * "kind" with the value "upperInitial".
 * The entire rule will be:
 *   "UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;
 *
 * The tokeniser ignores all empty lines and lines that start with # or //.
 */
@CreoleResource(name="GATE Unicode Tokeniser", comment="A customisable Unicode tokeniser.", helpURL="http://gate.ac.uk/userguide/sec:annie:tokeniser", icon="tokeniser")
public class SimpleTokeniser extends AbstractLanguageAnalyser implements ANNIEConstants {
private static final long serialVersionUID = 1411111968361716069L;
public static final String
SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";
public static final String
SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
public static final String
SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";
public static final String
SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";
/**
* Creates a tokeniser
*/
public SimpleTokeniser(){
}
/**
 * Initialises this tokeniser by reading the rules from an external source
 * (provided through a URL) and building the finite state machine at the core
 * of the tokeniser.
 *
 * @exception ResourceInstantiationException if the rules cannot be read or
 * are invalid
 */
@Override
public Resource init() throws ResourceInstantiationException{
BufferedReader bRulesReader = null;
try{
if(rulesURL != null){
bRulesReader = new BufferedReader(new BomStrippingInputStreamReader(rulesURL.openStream(), encoding));
}else{
//no init data, Scream!
throw new ResourceInstantiationException(
"No URL provided for the rules!");
}
initialState = new FSMState(this);
String line = bRulesReader.readLine();
///String toParse = "";
StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);
while (line != null){
if(line.endsWith("\\")){
///toParse += line.substring(0,line.length()-1);
toParse.append(line.substring(0,line.length()-1));
}else{
/*toParse += line;
parseRule(toParse);
toParse = "";
*/
toParse.append(line);
parseRule(toParse.toString());
toParse.delete(0,toParse.length());
}
line = bRulesReader.readLine();
}
eliminateVoidTransitions();
}catch(java.io.IOException ioe){
throw new ResourceInstantiationException(ioe);
}catch(TokeniserException te){
throw new ResourceInstantiationException(te);
}
finally {
IOUtils.closeQuietly(bRulesReader);
}
return this;
}
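// Illustrative sketch of a rules file as init() expects it (the rule shown
// is hypothetical, not taken from the default rules file): a trailing
// backslash joins a line with the next one before parseRule() sees it, and
// blank lines or lines starting with # or // are skipped.
//
//   # words with an initial uppercase letter
//   "UPPERCASE_LETTER" \
//   "LOWERCASE_LETTER"+ > Token;kind=upperInitial;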
/**
* Prepares this Processing resource for a new run.
*/
public void reset(){
document = null;
}
/** Parses one input line containing a tokeniser rule.
* This will create the necessary FSMState objects and the links
* between them.
*
* @param line the string containing the rule
*/
void parseRule(String line) throws TokeniserException {
//ignore comments
if(line.startsWith("#")) return;
if(line.startsWith("//")) return;
StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
FSMState newState = new FSMState(this);
initialState.put(null, newState);
FSMState finalState = parseLHS(newState, st, LHStoRHS);
String rhs = "";
if(st.hasMoreTokens()) rhs = st.nextToken("\f");
if(rhs.length() > 0)finalState.setRhs(rhs);
} // parseRule
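// For instance (hypothetical rule), given the line
//   "UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;
// the StringTokenizer above returns the delimiters ( ) + * | " and
// whitespace as tokens in their own right; parseLHS() consumes everything
// up to ">" and the remainder of the line becomes the RHS attached to the
// final state.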
/** Parses a part of or the entire LHS.
 *
 * @param startState an FSMState object representing the initial state for
 * the small FSM that will recognise the (part of the) rule parsed by this
 * method.
 * @param st a {@link java.util.StringTokenizer StringTokenizer} that
 * provides the input
 * @param until the string that marks the end of the section to be
 * recognised. This method will first be called by {@link
 * #parseRule(String)} with ">" in order to parse the entire
 * LHS. When necessary it will recursively call {@link #parseLHS
 * parseLHS} to parse a region of the LHS (e.g. a
 * "("..")" enclosed part).
 */
FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
throws TokeniserException{
FSMState currentState = startState;
boolean orFound = false;
List<FSMState> orList = new LinkedList<FSMState>();
String token;
token = skipIgnoreTokens(st);
if(null == token) return currentState;
FSMState newState;
Integer typeId;
UnicodeType uType;
bigwhile: while(!token.equals(until)){
if(token.equals("(")){//(..)
newState = parseLHS(currentState, st,")");
} else if(token.equals("\"")){//"unicode_type"
String sType = parseQuotedString(st, "\"");
newState = new FSMState(this);
typeId = stringTypeIds.get(sType);
if(null == typeId)
throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
else uType = new UnicodeType(typeId.intValue());
currentState.put(uType ,newState);
} else {// a type with no quotes
String sType = token;
newState = new FSMState(this);
typeId = stringTypeIds.get(sType);
if(null == typeId)
throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
else uType = new UnicodeType(typeId.intValue());
currentState.put(uType ,newState);
}
//treat the operators
token = skipIgnoreTokens(st);
if(null == token) throw
new InvalidRuleException("Tokeniser rule ended too soon!");
if(token.equals("|")) {
orFound = true;
orList.add(newState);
token = skipIgnoreTokens(st);
if(null == token) throw
new InvalidRuleException("Tokeniser rule ended too soon!");
continue bigwhile;
} else if(orFound) {//done parsing the "|"
orFound = false;
orList.add(newState);
newState = new FSMState(this);
Iterator<FSMState> orListIter = orList.iterator();
while(orListIter.hasNext())
orListIter.next().put(null, newState);
orList.clear();
}
if(token.equals("+")) {
newState.put(null,currentState);
currentState = newState;
newState = new FSMState(this);
currentState.put(null,newState);
token = skipIgnoreTokens(st);
if(null == token) throw
new InvalidRuleException("Tokeniser rule ended too soon!");
} else if(token.equals("*")) {
currentState.put(null,newState);
newState.put(null,currentState);
currentState = newState;
newState = new FSMState(this);
currentState.put(null,newState);
token = skipIgnoreTokens(st);
if(null == token) throw
new InvalidRuleException("Tokeniser rule ended too soon!");
}
currentState = newState;
}
return currentState;
} // parseLHS
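// How the operators are wired with epsilon (null-labelled) transitions,
// for a hypothetical atom X matched as s0 --X--> n1:
//   "X"+ : n1 --eps--> s0 (repeat) and n1 --eps--> n2 (exit), so X must be
//          matched at least once before reaching the exit state n2;
//   "X"* : additionally s0 --eps--> n1, so the exit is reachable with zero
//          occurrences of X.
// The epsilon transitions are removed later by eliminateVoidTransitions().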
/** Parses from the given string tokeniser until it finds a specific
* delimiter.
* One use for this method is to read everything until the first quote.
*
* @param st a {@link java.util.StringTokenizer StringTokenizer} that
* provides the input
* @param until a String representing the end delimiter.
*/
String parseQuotedString(StringTokenizer st, String until)
throws TokeniserException {
String token;
if(st.hasMoreElements()) token = st.nextToken();
else return null;
///String type = "";
StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);
while(!token.equals(until)){
//type += token;
type.append(token);
if(st.hasMoreElements())token = st.nextToken();
else throw new InvalidRuleException("Tokeniser rule ended too soon!");
}
return type.toString();
} // parseQuotedString
/** Skips the ignorable tokens from the input, returning the first
 * significant token.
 * The ignorable tokens are defined by the {@link #ignoreTokens} set.
 */
protected static String skipIgnoreTokens(StringTokenizer st){
Iterator<String> ignorables;
boolean ignorableFound = false;
String currentToken;
while(true){
if(st.hasMoreTokens()){
currentToken = st.nextToken();
ignorables = ignoreTokens.iterator();
ignorableFound = false;
while(!ignorableFound && ignorables.hasNext()){
if(currentToken.equals(ignorables.next()))
ignorableFound = true;
}
if(!ignorableFound) return currentToken;
} else return null;
}
}//skipIgnoreTokens
/** Computes the lambda-closure (aka epsilon closure) of the given set of
 * states, that is the set of states that are accessible from any of the
 * states in the given set using only unrestricted transitions.
 *
 * @param s the set of states to start from
 * @return a set containing all the states accessible from this state via
 * transitions that bear no restrictions.
 */
private AbstractSet<FSMState> lambdaClosure(Set<FSMState> s){
//the stack/queue used by the algorithm
LinkedList<FSMState> list = new LinkedList<FSMState>(s);
//the set to be returned
AbstractSet<FSMState> lambdaClosure = new HashSet<FSMState>(s);
FSMState top;
FSMState currentState;
Set<FSMState> nextStates;
Iterator<FSMState> statesIter;
while(!list.isEmpty()) {
top = list.removeFirst();
nextStates = top.nextSet(null);
if(null != nextStates){
statesIter = nextStates.iterator();
while(statesIter.hasNext()) {
currentState = statesIter.next();
if(!lambdaClosure.contains(currentState)){
lambdaClosure.add(currentState);
list.addFirst(currentState);
}//if(!lambdaClosure.contains(currentState))
}//while(statesIter.hasNext())
}//if(null != nextStates)
}
return lambdaClosure;
} // lambdaClosure
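// Worked example (states hypothetical): with transitions A --eps--> B and
// B --eps--> C, lambdaClosure({A}) returns {A, B, C}: B is discovered from
// A and pushed onto the list, then C is discovered when B is processed.
// States reachable only through typed (non-null) transitions are excluded.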
/** Converts the FSM from a non-deterministic to a deterministic one by
* eliminating all the unrestricted transitions.
*/
void eliminateVoidTransitions() throws TokeniserException {
//kalina: clear() is faster than re-creating the map in init()
newStates.clear();
Set<Set<FSMState>> sdStates = new HashSet<Set<FSMState>>();
LinkedList<Set<FSMState>> unmarkedDStates =
  new LinkedList<Set<FSMState>>();
DFSMState dCurrentState = new DFSMState(this);
Set<FSMState> sdCurrentState = new HashSet<FSMState>();
sdCurrentState.add(initialState);
sdCurrentState = lambdaClosure(sdCurrentState);
newStates.put(sdCurrentState, dCurrentState);
sdStates.add(sdCurrentState);
//find out if the new state is a final one
Iterator<FSMState> innerStatesIter = sdCurrentState.iterator();
String rhs;
FSMState currentInnerState;
Set<String> rhsClashSet = new HashSet<String>();
boolean newRhs = false;
while(innerStatesIter.hasNext()){
currentInnerState = innerStatesIter.next();
if(currentInnerState.isFinal()){
rhs = currentInnerState.getRhs();
rhsClashSet.add(rhs);
dCurrentState.rhs = rhs;
newRhs = true;
}
}
if(rhsClashSet.size() > 1){
Err.println("Warning, rule clash: " + rhsClashSet +
"\nSelected last definition: " + dCurrentState.rhs);
}
if(newRhs)dCurrentState.buildTokenDesc();
rhsClashSet.clear();
unmarkedDStates.addFirst(sdCurrentState);
dInitialState = dCurrentState;
Set<FSMState> nextSet;
while(!unmarkedDStates.isEmpty()){
//Out.println("\n\n=====================" + unmarkedDStates.size());
sdCurrentState = unmarkedDStates.removeFirst();
for(int type = 0; type < maxTypeId; type++){
//Out.print(type);
nextSet = new HashSet();
innerStatesIter = sdCurrentState.iterator();
while(innerStatesIter.hasNext()){
currentInnerState = innerStatesIter.next();
Set<FSMState> tempSet = currentInnerState.nextSet(type);
if(null != tempSet) nextSet.addAll(tempSet);
}//while(innerStatesIter.hasNext())
if(!nextSet.isEmpty()){
nextSet = lambdaClosure(nextSet);
dCurrentState = newStates.get(nextSet);
if(dCurrentState == null){
//we have a new DFSMState
dCurrentState = new DFSMState(this);
sdStates.add(nextSet);
unmarkedDStates.add(nextSet);
//check to see whether the new state is a final one
innerStatesIter = nextSet.iterator();
newRhs =false;
while(innerStatesIter.hasNext()){
currentInnerState = innerStatesIter.next();
if(currentInnerState.isFinal()){
rhs = currentInnerState.getRhs();
rhsClashSet.add(rhs);
dCurrentState.rhs = rhs;
newRhs = true;
}
}
if(rhsClashSet.size() > 1){
Err.println("Warning, rule clash: " + rhsClashSet +
"\nSelected last definition: " + dCurrentState.rhs);
}
if(newRhs)dCurrentState.buildTokenDesc();
rhsClashSet.clear();
newStates.put(nextSet, dCurrentState);
}
newStates.get(sdCurrentState).put(type,dCurrentState);
} // if(!nextSet.isEmpty())
} // for(int type = 0; type < maxTypeId; type++)
} // while(!unmarkedDStates.isEmpty())
} // eliminateVoidTransitions
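// Sketch of the subset construction performed above, on a hypothetical
// non-deterministic machine:
//   n0 --UPPERCASE_LETTER--> n1,  n1 --eps--> n2 (final, rhs R)
// yields the deterministic states d0 = lambdaClosure({n0}) = {n0} and
// d1 = lambdaClosure({n1}) = {n1, n2}; d1 is final with rhs R because it
// contains the final state n2, and d0 --UPPERCASE_LETTER--> d1.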
/** Returns a string representation of the non-deterministic FSM graph using
* GML (Graph modelling language).
*/
public String getFSMgml(){
String res = "graph[ \ndirected 1\n";
///String nodes = "", edges = "";
StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
Iterator<FSMState> fsmStatesIter = fsmStates.iterator();
while (fsmStatesIter.hasNext()){
FSMState currentState = fsmStatesIter.next();
int stateIndex = currentState.getIndex();
/*nodes += "node[ id " + stateIndex +
" label \"" + stateIndex;
*/
nodes.append("node[ id ");
nodes.append(stateIndex);
nodes.append(" label \"");
nodes.append(stateIndex);
if(currentState.isFinal()){
///nodes += ",F\\n" + currentState.getRhs();
nodes.append(",F\\n" + currentState.getRhs());
}
///nodes += "\" ]\n";
nodes.append("\" ]\n");
///edges += currentState.getEdgesGML();
edges.append(currentState.getEdgesGML());
}
res += nodes.toString() + edges.toString() + "]\n";
return res;
} // getFSMgml
/** Returns a string representation of the deterministic FSM graph using
* GML.
*/
public String getDFSMgml() {
String res = "graph[ \ndirected 1\n";
///String nodes = "", edges = "";
StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
Iterator<DFSMState> dfsmStatesIter = dfsmStates.iterator();
while (dfsmStatesIter.hasNext()) {
DFSMState currentState = dfsmStatesIter.next();
int stateIndex = currentState.getIndex();
/* nodes += "node[ id " + stateIndex +
" label \"" + stateIndex;
*/
nodes.append("node[ id ");
nodes.append(stateIndex);
nodes.append(" label \"");
nodes.append(stateIndex);
if(currentState.isFinal()){
/// nodes += ",F\\n" + currentState.getRhs();
nodes.append(",F\\n" + currentState.getRhs());
}
/// nodes += "\" ]\n";
nodes.append("\" ]\n");
/// edges += currentState.getEdgesGML();
edges.append(currentState.getEdgesGML());
}
res += nodes.toString() + edges.toString() + "]\n";
return res;
} // getDFSMgml
/**
* The method that does the actual tokenisation.
*/
@Override
public void execute() throws ExecutionException {
interrupted = false;
AnnotationSet annotationSet;
//check the input
if(document == null) {
throw new ExecutionException(
"No document to tokenise!"
);
}
if(annotationSetName == null ||
annotationSetName.equals("")) annotationSet = document.getAnnotations();
else annotationSet = document.getAnnotations(annotationSetName);
fireStatusChanged(
"Tokenising " + document.getName() + "...");
String content = document.getContent().toString();
int length = content.length();
int currentChar;
int charsInCurrentCP = 1;
DFSMState graphPosition = dInitialState;
//the index of the first character of the token trying to be recognised
int tokenStart = 0;
DFSMState lastMatchingState = null;
DFSMState nextState;
String tokenString;
int charIdx = 0;
int oldCharIdx = 0;
FeatureMap newTokenFm;
while(charIdx < length){
currentChar = content.codePointAt(charIdx);
// number of chars we have to advance after processing this code point.
// 1 in the vast majority of cases, but 2 where the code point is a
// supplementary character represented as a surrogate pair.
charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1;
// Out.println(
// currentChar + typesMnemonics[Character.getType(currentChar)+128]);
nextState = graphPosition.next(typeIds.get(
new Integer(Character.getType(currentChar))).intValue());
if( null != nextState ) {
graphPosition = nextState;
if(graphPosition.isFinal()) {
lastMatchingState = graphPosition;
}
charIdx += charsInCurrentCP;
} else {//we have a match!
newTokenFm = Factory.newFeatureMap();
if (null == lastMatchingState) {
// no rule matches this character, so create a single-char
// DEFAULT_TOKEN annotation covering it and start again after it
charIdx = tokenStart + charsInCurrentCP;
tokenString = content.substring(tokenStart, charIdx);
newTokenFm.put("type","UNKNOWN");
newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
Integer.toString(tokenString.length()));
try {
annotationSet.add(new Long(tokenStart),
new Long(charIdx),
"DEFAULT_TOKEN", newTokenFm);
} catch (InvalidOffsetException ioe) {
//This REALLY shouldn't happen!
ioe.printStackTrace(Err.getPrintWriter());
}
// Out.println("Default token: " + tokenStart +
// "->" + tokenStart + " :" + tokenString + ";");
} else {
// we've reached the end of a string that the FSM recognised
tokenString = content.substring(tokenStart, charIdx);
newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
Integer.toString(tokenString.length()));
for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
lastMatchingState.getTokenDesc()[i][1]);
//Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
// lastMatchingState.getTokenDesc()[i][1]);
}
try {
annotationSet.add(new Long(tokenStart),
new Long(charIdx),
lastMatchingState.getTokenDesc()[0][0], newTokenFm);
} catch(InvalidOffsetException ioe) {
//This REALLY shouldn't happen!
throw new GateRuntimeException(ioe.toString());
}
// Out.println(lastMatchingState.getTokenDesc()[0][0] +
// ": " + tokenStart + "->" + lastMatch +
// " :" + tokenString + ";");
//charIdx = lastMatch + 1;
}
// reset to initial state and start looking again from here
lastMatchingState = null;
graphPosition = dInitialState;
tokenStart = charIdx;
}
if((charIdx - oldCharIdx > 256)){
fireProgressChanged((100 * charIdx )/ length );
oldCharIdx = charIdx;
if(isInterrupted()) throw new ExecutionInterruptedException();
}
} // while(charIdx < length)
if (null != lastMatchingState) {
// we dropped off the end having found a match, annotate it
tokenString = content.substring(tokenStart, charIdx);
newTokenFm = Factory.newFeatureMap();
newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
Integer.toString(tokenString.length()));
for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
lastMatchingState.getTokenDesc()[i][1]);
}
try {
annotationSet.add(new Long(tokenStart),
new Long(charIdx),
lastMatchingState.getTokenDesc()[0][0], newTokenFm);
} catch(InvalidOffsetException ioe) {
//This REALLY shouldn't happen!
throw new GateRuntimeException(ioe.toString());
}
}
reset();
fireProcessFinished();
fireStatusChanged("Tokenisation complete!");
} // execute
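// Matching sketch: on the input "Ab9" with the single hypothetical rule
//   "UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;
// the machine consumes 'A' and 'b'; '9' has no outgoing transition, so the
// last final state seen yields a Token annotation over "Ab", matching
// restarts at '9', and since no rule covers it, '9' is emitted as a
// single-character DEFAULT_TOKEN with type UNKNOWN.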
/**
 * Sets the value of the rulesURL property, which holds a URL
 * to the file containing the rules for this tokeniser.
 * @param newRulesURL
 */
@CreoleParameter(defaultValue="resources/tokeniser/DefaultTokeniser.rules", comment="The URL to the rules file", suffixes="rules")
public void setRulesURL(ResourceReference newRulesURL) {
rulesURL = newRulesURL;
}
@Deprecated
public void setRulesURL(URL newRulesURL) {
try {
this.setRulesURL(new ResourceReference(newRulesURL));
} catch (URISyntaxException e) {
throw new RuntimeException("Error converting URL to ResourceReference", e);
}
}
/**
 * Gets the value of the rulesURL property, which holds a
 * URL to the file containing the rules for this tokeniser.
 */
public ResourceReference getRulesURL() {
return rulesURL;
}
@RunTime
@Optional
@CreoleParameter(comment="The annotation set to be used for the generated annotations")
public void setAnnotationSetName(String newAnnotationSetName) {
annotationSetName = newAnnotationSetName;
}
/** Gets the name of the annotation set used for the generated annotations. */
public String getAnnotationSetName() {
return annotationSetName;
}
public void setRulesResourceName(String newRulesResourceName) {
rulesResourceName = newRulesResourceName;
}
public String getRulesResourceName() {
return rulesResourceName;
}
@CreoleParameter(defaultValue="UTF-8", comment="The encoding used for reading the definitions")
public void setEncoding(String newEncoding) {
encoding = newEncoding;
}
public String getEncoding() {
return encoding;
}
/** The name of the annotation set where the new annotations will be added.
 */
protected String annotationSetName;
/** The initial state of the non-deterministic machine.
 */
protected FSMState initialState;
/** A set containing all the states of the non-deterministic machine.
 */
protected Set<FSMState> fsmStates = new HashSet<FSMState>();
/** The initial state of the deterministic machine
*/
protected DFSMState dInitialState;
/** A set containing all the states of the deterministic machine.
 */
protected Set<DFSMState> dfsmStates = new HashSet<DFSMState>();
/** The separator from LHS to RHS.
 */
static String LHStoRHS = ">";
/** A set of strings representing tokens to be ignored (e.g. blanks).
 */
protected static final Set<String> ignoreTokens;
/** Maps from int (the static value on {@link java.lang.Character}) to int
 * (the internal value used by the tokeniser). The int values used by the
 * tokeniser are consecutive values, starting from 0 and going as high as
 * necessary.
 * They map all the public static int members of {@link java.lang.Character}.
 */
protected static final Map<Integer, Integer> typeIds;
/** The maximum int value used internally as a type id.
 */
protected static final int maxTypeId;
/** Maps the internal type ids to the type names.
 */
protected static final List<String> typeMnemonics;
/** Maps from type names to type internal ids.
 */
protected static final Map<String, Integer> stringTypeIds;
private String rulesResourceName;
private ResourceReference rulesURL;
private String encoding;
//kalina: added this as a field to minimise repeated init() calls
protected transient Map<Set<FSMState>, DFSMState> newStates =
  new HashMap<Set<FSMState>, DFSMState>();
/** The static initialiser will inspect the class {@link java.lang.Character}
* using reflection to find all the public static members and will map them
* to ids starting from 0.
* After that it will build all the static data: {@link #typeIds}, {@link
* #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
*/
static{
Field[] characterClassFields;
try{
characterClassFields = Class.forName("java.lang.Character").getFields();
}catch(ClassNotFoundException cnfe){
throw new GateRuntimeException("Could not find the java.lang.Character class!");
}
Collection<Field> staticFields = new LinkedList<Field>();
// JDK 1.4 introduced directionality constants that have the same values
// as character types; we need to skip those as well
for(int i = 0; i< characterClassFields.length; i++)
if(Modifier.isStatic(characterClassFields[i].getModifiers()) &&
characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1)
staticFields.add(characterClassFields[i]);
Map<Integer, Integer> tempTypeIds = new HashMap<Integer, Integer>();
maxTypeId = staticFields.size() -1;
String[] mnemonics = new String[maxTypeId+1];
Map<String, Integer> tempStringTypeIds = new HashMap<String, Integer>();
Iterator<Field> staticFieldsIter = staticFields.iterator();
Field currentField;
int currentId = 0;
String fieldName;
try {
while(staticFieldsIter.hasNext()){
currentField = staticFieldsIter.next();
if(currentField.getType().toString().equals("byte")){
fieldName = currentField.getName();
tempTypeIds.put(new Integer(currentField.getInt(null)),
new Integer(currentId));
mnemonics[currentId]= fieldName;
tempStringTypeIds.put(fieldName, new Integer(currentId));
currentId++;
}
}
} catch(Exception e) {
throw new GateRuntimeException(e.toString());
}
typeIds = Collections.unmodifiableMap(tempTypeIds);
stringTypeIds = Collections.unmodifiableMap(tempStringTypeIds);
Set<String> toIgnore = new HashSet<String>();
toIgnore.add(" ");
toIgnore.add("\t");
toIgnore.add("\f");
ignoreTokens = Collections.unmodifiableSet(toIgnore);
typeMnemonics = Collections.unmodifiableList(Arrays.asList(mnemonics));
}
} // class SimpleTokeniser
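// A minimal usage sketch (not part of the original source). It assumes the
// ANNIE plugin, which supplies the default rules file, has been loaded into
// GATE; the document text and the class name SimpleTokeniserExample are
// illustrative.
class SimpleTokeniserExample {
  public static void main(String[] args) throws Exception {
    gate.Gate.init(); // initialise the GATE framework
    gate.Document doc = gate.Factory.newDocument("Hello GATE");
    // create the tokeniser with its default parameters (rulesURL, encoding)
    SimpleTokeniser tok = (SimpleTokeniser) gate.Factory.createResource(
        "gate.creole.tokeniser.SimpleTokeniser");
    tok.setDocument(doc);
    tok.execute(); // adds Token annotations to the default annotation set
    System.out.println(doc.getAnnotations());
    gate.Factory.deleteResource(tok);
    gate.Factory.deleteResource(doc);
  }
}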