
gate.creole.tokeniser.SimpleTokeniser Maven / Gradle / Ivy


ANNIE is a general purpose information extraction system that provides the building blocks of many other GATE applications.

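Before the source itself, here is a minimal, illustrative sketch of driving this class programmatically. It is not taken from the GATE distribution: the rules location file:/path/to/my.rules is a placeholder, and in a normal ANNIE pipeline the tokeniser would instead be created through the GATE Factory with its default DefaultTokeniser.rules file.

import java.net.URL;

import gate.Document;
import gate.Factory;
import gate.Gate;
import gate.creole.ResourceReference;
import gate.creole.tokeniser.SimpleTokeniser;

public class TokeniserSketch {
  public static void main(String[] args) throws Exception {
    Gate.init();                                  // prepare the GATE library

    SimpleTokeniser tokeniser = new SimpleTokeniser();
    // placeholder rules file, in the format documented in the class Javadoc below
    tokeniser.setRulesURL(new ResourceReference(new URL("file:/path/to/my.rules")));
    tokeniser.setEncoding("UTF-8");
    tokeniser.init();                             // builds the FSM from the rules

    Document doc = Factory.newDocument("Hello world, 42 times.");
    tokeniser.setDocument(doc);
    tokeniser.execute();                          // adds annotations of the types named by the rules' RHSs

    System.out.println("Annotations produced: " + doc.getAnnotations().size());
    Factory.deleteResource(doc);                  // release the document
  }
}
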
/*
 *  SimpleTokeniser.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 2000
 *
 *  $Id: SimpleTokeniser.java 20054 2017-02-02 06:44:12Z markagreenwood $
 */

package gate.creole.tokeniser;

import java.io.BufferedReader;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.AbstractSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;

/** Implementation of a Unicode rule-based tokeniser.
 * The tokeniser gets its rules from a file, an {@link java.io.InputStream
 * InputStream} or a {@link java.io.Reader Reader} which should be sent to one
 * of the constructors.
 * The implementation is based on a finite state machine that is built from
 * the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side
 * (RHS), which are separated by the ">" character. The LHS represents a
 * regular expression that will be matched against the input while the RHS
 * describes a Gate2 annotation in terms of annotation type and attribute-value
 * pairs.
 * The matching is done using the Unicode enumerated types defined by the {@link
 * java.lang.Character Character} class. At the time of writing this class the
 * supported Unicode categories were:
 * <ul>
 *   <li>UNASSIGNED
 *   <li>UPPERCASE_LETTER
 *   <li>LOWERCASE_LETTER
 *   <li>TITLECASE_LETTER
 *   <li>MODIFIER_LETTER
 *   <li>OTHER_LETTER
 *   <li>NON_SPACING_MARK
 *   <li>ENCLOSING_MARK
 *   <li>COMBINING_SPACING_MARK
 *   <li>DECIMAL_DIGIT_NUMBER
 *   <li>LETTER_NUMBER
 *   <li>OTHER_NUMBER
 *   <li>SPACE_SEPARATOR
 *   <li>LINE_SEPARATOR
 *   <li>PARAGRAPH_SEPARATOR
 *   <li>CONTROL
 *   <li>FORMAT
 *   <li>PRIVATE_USE
 *   <li>SURROGATE
 *   <li>DASH_PUNCTUATION
 *   <li>START_PUNCTUATION
 *   <li>END_PUNCTUATION
 *   <li>CONNECTOR_PUNCTUATION
 *   <li>OTHER_PUNCTUATION
 *   <li>MATH_SYMBOL
 *   <li>CURRENCY_SYMBOL
 *   <li>MODIFIER_SYMBOL
 *   <li>OTHER_SYMBOL
 * </ul>
 * The accepted operators for the LHS are "+", "*" and "|", having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
 * For instance this is a valid LHS:
 *
 * "UPPERCASE_LETTER" "LOWERCASE_LETTER"+
 *
 * meaning an uppercase letter followed by one or more lowercase letters.
 *
 * The RHS describes an annotation that is to be created and inserted in the
 * annotation set provided in case of a match. The new annotation will span the
 * text that has been recognised. The RHS consists of the annotation type
 * followed by pairs of attributes and associated values.
 * E.g. for the LHS above a possible RHS can be:
 *
 * Token;kind=upperInitial;
 *
 * representing an annotation of type "Token" having one attribute
 * named "kind" with the value "upperInitial".
 * The entire rule will be:
 *
 * "UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;
 *
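 * Further rules in the same format, shown here purely as an illustration of
 * the syntax (they resemble, but are not quoted from, the default ANNIE
 * rules file):
 *
 * "DECIMAL_DIGIT_NUMBER"+ > Token;kind=number;
 * "SPACE_SEPARATOR" > SpaceToken;kind=space;
 *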
 * The tokeniser ignores all the empty lines and the ones that start with
 * # or //.
 */
@CreoleResource(name="GATE Unicode Tokeniser",
    comment="A customisable Unicode tokeniser.",
    helpURL="http://gate.ac.uk/userguide/sec:annie:tokeniser",
    icon="tokeniser")
public class SimpleTokeniser extends AbstractLanguageAnalyser
    implements ANNIEConstants {

  private static final long serialVersionUID = 1411111968361716069L;

  public static final String SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

  public static final String SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  public static final String SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

  public static final String SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

  /**
   * Creates a tokeniser
   */
  public SimpleTokeniser(){
  }

  /**
   * Initialises this tokeniser by reading the rules from an external source
   * (provided through a URL) and building the finite state machine at the
   * core of the tokeniser.
   *
   * @exception ResourceInstantiationException
   */
  @Override
  public Resource init() throws ResourceInstantiationException{
    BufferedReader bRulesReader = null;
    try{
      if(rulesURL != null){
        bRulesReader = new BufferedReader(
            new BomStrippingInputStreamReader(rulesURL.openStream(), encoding));
      }else{
        //no init data, Scream!
        throw new ResourceInstantiationException(
          "No URL provided for the rules!");
      }
      initialState = new FSMState(this);
      String line = bRulesReader.readLine();
      ///String toParse = "";
      StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);

      while (line != null){
        if(line.endsWith("\\")){
          ///toParse += line.substring(0,line.length()-1);
          toParse.append(line.substring(0,line.length()-1));
        }else{
          /*toParse += line;
          parseRule(toParse);
          toParse = "";
          */
          toParse.append(line);
          parseRule(toParse.toString());
          toParse.delete(0,toParse.length());
        }
        line = bRulesReader.readLine();
      }
      eliminateVoidTransitions();
    }catch(java.io.IOException ioe){
      throw new ResourceInstantiationException(ioe);
    }catch(TokeniserException te){
      throw new ResourceInstantiationException(te);
    } finally {
      IOUtils.closeQuietly(bRulesReader);
    }
    return this;
  }

  /**
   * Prepares this Processing resource for a new run.
   */
  public void reset(){
    document = null;
  }

  /** Parses one input line containing a tokeniser rule.
   * This will create the necessary FSMState objects and the links
   * between them.
   *
   * @param line the string containing the rule
   */
  void parseRule(String line)throws TokeniserException{
    //ignore comments
    if(line.startsWith("#")) return;
    if(line.startsWith("//")) return;

    StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
    FSMState newState = new FSMState(this);

    initialState.put(null, newState);
    FSMState finalState = parseLHS(newState, st, LHStoRHS);
    String rhs = "";

    if(st.hasMoreTokens()) rhs = st.nextToken("\f");
    if(rhs.length() > 0)finalState.setRhs(rhs);
  } // parseRule

  /** Parses a part or the entire LHS.
   *
   * @param startState a FSMState object representing the initial state for
   * the small FSM that will recognise the (part of) the rule parsed by this
   * method.
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   * provides the input
   * @param until the string that marks the end of the section to be
   * recognised. This method will first be called by {@link
   * #parseRule(String)} with ">" in order to parse the entire LHS.
   * When necessary it will make a recursive call to {@link #parseLHS
   * parseLHS} to parse a region of the LHS (e.g. a "(",")" enclosed part).
   */
  FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
          throws TokeniserException{
    FSMState currentState = startState;
    boolean orFound = false;
    List<FSMState> orList = new LinkedList<FSMState>();
    String token;
    token = skipIgnoreTokens(st);

    if(null == token) return currentState;

    FSMState newState;
    Integer typeId;
    UnicodeType uType;

    bigwhile: while(!token.equals(until)){
      if(token.equals("(")){//(..)
        newState = parseLHS(currentState, st,")");
      } else if(token.equals("\"")){//"unicode_type"
        String sType = parseQuotedString(st, "\"");
        newState = new FSMState(this);
        typeId = stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      } else {// a type with no quotes
        String sType = token;
        newState = new FSMState(this);
        typeId = stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      }
      //treat the operators
      token = skipIgnoreTokens(st);

      if(null == token)
        throw new InvalidRuleException("Tokeniser rule ended too soon!");

      if(token.equals("|")) {
        orFound = true;
        orList.add(newState);
        token = skipIgnoreTokens(st);

        if(null == token)
          throw new InvalidRuleException("Tokeniser rule ended too soon!");

        continue bigwhile;
      } else if(orFound) {//done parsing the "|"
        orFound = false;
        orList.add(newState);
        newState = new FSMState(this);
        Iterator<FSMState> orListIter = orList.iterator();

        while(orListIter.hasNext())
          orListIter.next().put(null, newState);
        orList.clear();
      }

      if(token.equals("+")) {
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token)
          throw new InvalidRuleException("Tokeniser rule ended too soon!");
      } else if(token.equals("*")) {
        currentState.put(null,newState);
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token)
          throw new InvalidRuleException("Tokeniser rule ended too soon!");
      }
      currentState = newState;
    }
    return currentState;
  } // parseLHS

  /** Parses from the given string tokeniser until it finds a specific
   * delimiter.
   * One use for this method is to read everything until the first quote.
   *
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   * provides the input
   * @param until a String representing the end delimiter.
   */
  String parseQuotedString(StringTokenizer st, String until)
          throws TokeniserException {
    String token;

    if(st.hasMoreElements()) token = st.nextToken();
    else return null;

    ///String type = "";
    StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);

    while(!token.equals(until)){
      //type += token;
      type.append(token);
      if(st.hasMoreElements())token = st.nextToken();
      else throw new InvalidRuleException("Tokeniser rule ended too soon!");
    }
    return type.toString();
  } // parseQuotedString

  /** Skips the ignorable tokens from the input returning the first significant
   * token.
   * The ignorable tokens are defined by {@link #ignoreTokens a set}
   */
  protected static String skipIgnoreTokens(StringTokenizer st){
    Iterator<String> ignorables;
    boolean ignorableFound = false;
    String currentToken;

    while(true){
      if(st.hasMoreTokens()){
        currentToken = st.nextToken();
        ignorables = ignoreTokens.iterator();
        ignorableFound = false;

        while(!ignorableFound && ignorables.hasNext()){
          if(currentToken.equals(ignorables.next())) ignorableFound = true;
        }

        if(!ignorableFound) return currentToken;
      } else return null;
    }
  }//skipIgnoreTokens

  /* Computes the lambda-closure (aka epsilon closure) of the given set of
   * states, that is the set of states that are accessible from any of the
   * states in the given set using only unrestricted transitions.
   * @return a set containing all the states accessible from this state via
   * transitions that bear no restrictions.
   */
  /**
   * Converts the finite state machine to a deterministic one.
   *
   * @param s
   */
  private AbstractSet<FSMState> lambdaClosure(Set<FSMState> s){
    //the stack/queue used by the algorithm
    LinkedList<FSMState> list = new LinkedList<FSMState>(s);
    //the set to be returned
    AbstractSet<FSMState> lambdaClosure = new HashSet<FSMState>(s);
    FSMState top;
    FSMState currentState;
    Set<FSMState> nextStates;
    Iterator<FSMState> statesIter;

    while(!list.isEmpty()) {
      top = list.removeFirst();
      nextStates = top.nextSet(null);

      if(null != nextStates){
        statesIter = nextStates.iterator();

        while(statesIter.hasNext()) {
          currentState = statesIter.next();
          if(!lambdaClosure.contains(currentState)){
            lambdaClosure.add(currentState);
            list.addFirst(currentState);
          }//if(!lambdaClosure.contains(currentState))
        }//while(statesIter.hasNext())
      }//if(null != nextStates)
    }
    return lambdaClosure;
  } // lambdaClosure

  /** Converts the FSM from a non-deterministic to a deterministic one by
   * eliminating all the unrestricted transitions.
   */
  void eliminateVoidTransitions() throws TokeniserException {
    //kalina:clear() faster than init() which is called with init()
    newStates.clear();
    Set<Set<FSMState>> sdStates = new HashSet<Set<FSMState>>();
    LinkedList<Set<FSMState>> unmarkedDStates = new LinkedList<Set<FSMState>>();
    DFSMState dCurrentState = new DFSMState(this);
    Set<FSMState> sdCurrentState = new HashSet<FSMState>();

    sdCurrentState.add(initialState);
    sdCurrentState = lambdaClosure(sdCurrentState);
    newStates.put(sdCurrentState, dCurrentState);
    sdStates.add(sdCurrentState);

    //find out if the new state is a final one
    Iterator<FSMState> innerStatesIter = sdCurrentState.iterator();
    String rhs;
    FSMState currentInnerState;
    Set<String> rhsClashSet = new HashSet<String>();
    boolean newRhs = false;

    while(innerStatesIter.hasNext()){
      currentInnerState = innerStatesIter.next();
      if(currentInnerState.isFinal()){
        rhs = currentInnerState.getRhs();
        rhsClashSet.add(rhs);
        dCurrentState.rhs = rhs;
        newRhs = true;
      }
    }

    if(rhsClashSet.size() > 1){
      Err.println("Warning, rule clash: " + rhsClashSet +
                  "\nSelected last definition: " + dCurrentState.rhs);
    }

    if(newRhs)dCurrentState.buildTokenDesc();
    rhsClashSet.clear();
    unmarkedDStates.addFirst(sdCurrentState);
    dInitialState = dCurrentState;
    Set<FSMState> nextSet;

    while(!unmarkedDStates.isEmpty()){
      //Out.println("\n\n=====================" + unmarkedDStates.size());
      sdCurrentState = unmarkedDStates.removeFirst();
      for(int type = 0; type < maxTypeId; type++){
        //Out.print(type);
        nextSet = new HashSet<FSMState>();
        innerStatesIter = sdCurrentState.iterator();

        while(innerStatesIter.hasNext()){
          currentInnerState = innerStatesIter.next();
          Set<FSMState> tempSet = currentInnerState.nextSet(type);
          if(null != tempSet) nextSet.addAll(tempSet);
        }//while(innerStatesIter.hasNext())

        if(!nextSet.isEmpty()){
          nextSet = lambdaClosure(nextSet);
          dCurrentState = newStates.get(nextSet);

          if(dCurrentState == null){
            //we have a new DFSMState
            dCurrentState = new DFSMState(this);
            sdStates.add(nextSet);
            unmarkedDStates.add(nextSet);

            //check to see whether the new state is a final one
            innerStatesIter = nextSet.iterator();
            newRhs = false;

            while(innerStatesIter.hasNext()){
              currentInnerState = innerStatesIter.next();
              if(currentInnerState.isFinal()){
                rhs = currentInnerState.getRhs();
                rhsClashSet.add(rhs);
                dCurrentState.rhs = rhs;
                newRhs = true;
              }
            }

            if(rhsClashSet.size() > 1){
              Err.println("Warning, rule clash: " + rhsClashSet +
                          "\nSelected last definition: " + dCurrentState.rhs);
            }

            if(newRhs)dCurrentState.buildTokenDesc();
            rhsClashSet.clear();
            newStates.put(nextSet, dCurrentState);
          }
          newStates.get(sdCurrentState).put(type,dCurrentState);
        } // if(!nextSet.isEmpty())
      } // for(int type = 0; type < maxTypeId; type++)
    } // while(!unmarkedDStates.isEmpty())
  } // eliminateVoidTransitions

  /** Returns a string representation of the non-deterministic FSM graph using
   * GML (Graph modelling language).
   */
  public String getFSMgml(){
    String res = "graph[ \ndirected 1\n";
    ///String nodes = "", edges = "";
    StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
                 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
    Iterator<FSMState> fsmStatesIter = fsmStates.iterator();

    while (fsmStatesIter.hasNext()){
      FSMState currentState = fsmStatesIter.next();
      int stateIndex = currentState.getIndex();
      /*nodes += "node[ id " + stateIndex +
               " label \"" + stateIndex;
      */
      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        ///nodes += ",F\\n" + currentState.getRhs();
        nodes.append(",F\\n" + currentState.getRhs());
      }
      ///nodes += "\" ]\n";
      nodes.append("\" ]\n");
      ///edges += currentState.getEdgesGML();
      edges.append(currentState.getEdgesGML());
    }
    res += nodes.toString() + edges.toString() + "]\n";
    return res;
  } // getFSMgml

  /** Returns a string representation of the deterministic FSM graph using
   * GML.
   */
  public String getDFSMgml() {
    String res = "graph[ \ndirected 1\n";
    ///String nodes = "", edges = "";
    StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
                 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
    Iterator<DFSMState> dfsmStatesIter = dfsmStates.iterator();

    while (dfsmStatesIter.hasNext()) {
      DFSMState currentState = dfsmStatesIter.next();
      int stateIndex = currentState.getIndex();
      /* nodes += "node[ id " + stateIndex +
                " label \"" + stateIndex;
      */
      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        /// nodes += ",F\\n" + currentState.getRhs();
        nodes.append(",F\\n" + currentState.getRhs());
      }
      /// nodes += "\" ]\n";
      nodes.append("\" ]\n");
      /// edges += currentState.getEdgesGML();
      edges.append(currentState.getEdgesGML());
    }
    res += nodes.toString() + edges.toString() + "]\n";
    return res;
  } // getDFSMgml

  /**
   * The method that does the actual tokenisation.
   */
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    //check the input
    if(document == null) {
      throw new ExecutionException(
        "No document to tokenise!"
      );
    }

    if(annotationSetName == null ||
       annotationSetName.equals("")) annotationSet = document.getAnnotations();
    else annotationSet = document.getAnnotations(annotationSetName);

    fireStatusChanged(
        "Tokenising " + document.getName() + "...");

    String content = document.getContent().toString();
    int length = content.length();
    int currentChar;
    int charsInCurrentCP = 1;

    DFSMState graphPosition = dInitialState;

    //the index of the first character of the token trying to be recognised
    int tokenStart = 0;

    DFSMState lastMatchingState = null;
    DFSMState nextState;
    String tokenString;
    int charIdx = 0;
    int oldCharIdx = 0;
    FeatureMap newTokenFm;

    while(charIdx < length){
      currentChar = content.codePointAt(charIdx);
      // number of chars we have to advance after processing this code point.
      // 1 in the vast majority of cases, but 2 where the code point is a
      // supplementary character represented as a surrogate pair.
      charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1;

      // Out.println(
      //   currentChar + typesMnemonics[Character.getType(currentChar)+128]);
      nextState = graphPosition.next(typeIds.get(
          new Integer(Character.getType(currentChar))).intValue());

      if( null != nextState ) {
        graphPosition = nextState;
        if(graphPosition.isFinal()) {
          lastMatchingState = graphPosition;
        }
        charIdx += charsInCurrentCP;
      } else {//we have a match!
        newTokenFm = Factory.newFeatureMap();

        if (null == lastMatchingState) {
          // no rule matches this character, so create a single-char
          // DEFAULT_TOKEN annotation covering it and start again after it
          charIdx = tokenStart + charsInCurrentCP;
          tokenString = content.substring(tokenStart, charIdx);
          newTokenFm.put("type","UNKNOWN");
          newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
          newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                         Integer.toString(tokenString.length()));

          try {
            annotationSet.add(new Long(tokenStart),
                              new Long(charIdx),
                              "DEFAULT_TOKEN", newTokenFm);
          } catch (InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            ioe.printStackTrace(Err.getPrintWriter());
          }
          // Out.println("Default token: " + tokenStart +
          //             "->" + tokenStart + " :" + tokenString + ";");
        } else {
          // we've reached the end of a string that the FSM recognised
          tokenString = content.substring(tokenStart, charIdx);
          newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
          newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                         Integer.toString(tokenString.length()));

          for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
            newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                           lastMatchingState.getTokenDesc()[i][1]);
            //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
            //            lastMatchingState.getTokenDesc()[i][1]);
          }

          try {
            annotationSet.add(new Long(tokenStart),
                              new Long(charIdx),
                              lastMatchingState.getTokenDesc()[0][0], newTokenFm);
          } catch(InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            throw new GateRuntimeException(ioe.toString());
          }
          // Out.println(lastMatchingState.getTokenDesc()[0][0] +
          //             ": " + tokenStart + "->" + lastMatch +
          //             " :" + tokenString + ";");
          //charIdx = lastMatch + 1;
        }

        // reset to initial state and start looking again from here
        lastMatchingState = null;
        graphPosition = dInitialState;
        tokenStart = charIdx;
      }

      if((charIdx - oldCharIdx > 256)){
        fireProgressChanged((100 * charIdx )/ length );
        oldCharIdx = charIdx;
        if(isInterrupted()) throw new ExecutionInterruptedException();
      }
    } // while(charIdx < length)

    if (null != lastMatchingState) {
      // we dropped off the end having found a match, annotate it
      tokenString = content.substring(tokenStart, charIdx);
      newTokenFm = Factory.newFeatureMap();
      newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
      newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                     Integer.toString(tokenString.length()));

      for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
        newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                       lastMatchingState.getTokenDesc()[i][1]);
      }

      try {
        annotationSet.add(new Long(tokenStart),
                          new Long(charIdx),
                          lastMatchingState.getTokenDesc()[0][0], newTokenFm);
      } catch(InvalidOffsetException ioe) {
        //This REALLY shouldn't happen!
        throw new GateRuntimeException(ioe.toString());
      }
    }

    reset();
    fireProcessFinished();
    fireStatusChanged("Tokenisation complete!");
  } // execute

  /**
   * Sets the value of the rulesURL property, which holds a URL
   * to the file containing the rules for this tokeniser.
   * @param newRulesURL
   */
  @CreoleParameter(defaultValue="resources/tokeniser/DefaultTokeniser.rules",
      comment="The URL to the rules file", suffixes="rules")
  public void setRulesURL(ResourceReference newRulesURL) {
    rulesURL = newRulesURL;
  }

  @Deprecated
  public void setRulesURL(URL newRulesURL) {
    try {
      this.setRulesURL(new ResourceReference(newRulesURL));
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
  }

  /**
   * Gets the value of the rulesURL property, which holds a URL
   * to the file containing the rules for this tokeniser.
   */
  public ResourceReference getRulesURL() {
    return rulesURL;
  }

  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used for the generated annotations")
  public void setAnnotationSetName(String newAnnotationSetName) {
    annotationSetName = newAnnotationSetName;
  }

  /** Gets the name of the annotation set used for the generated annotations. */
  public String getAnnotationSetName() {
    return annotationSetName;
  }

  public void setRulesResourceName(String newRulesResourceName) {
    rulesResourceName = newRulesResourceName;
  }

  public String getRulesResourceName() {
    return rulesResourceName;
  }

  @CreoleParameter(defaultValue="UTF-8",
      comment="The encoding used for reading the definitions")
  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }

  public String getEncoding() {
    return encoding;
  }

  /** The annotation set where the new annotations will be added */
  protected String annotationSetName;

  /** The initial state of the non deterministic machine */
  protected FSMState initialState;

  /** A set containing all the states of the non deterministic machine */
  protected Set<FSMState> fsmStates = new HashSet<FSMState>();

  /** The initial state of the deterministic machine */
  protected DFSMState dInitialState;

  /** A set containing all the states of the deterministic machine */
  protected Set<DFSMState> dfsmStates = new HashSet<DFSMState>();

  /** The separator from LHS to RHS */
  static String LHStoRHS = ">";

  /** A set of strings representing tokens to be ignored (e.g.
   * blanks) */
  protected static final Set<String> ignoreTokens;

  /** Maps from int (the static value on {@link java.lang.Character}) to int
   * (the internal value used by the tokeniser). The int values used by the
   * tokeniser are consecutive values, starting from 0 and going as high as
   * necessary.
   * They map all the public static int members of {@link java.lang.Character}.
   */
  protected static final Map<Integer, Integer> typeIds;

  /** The maximum int value used internally as a type id */
  protected static final int maxTypeId;

  /** Maps the internal type ids to the type names */
  protected static final List<String> typeMnemonics;

  /** Maps from type names to internal type ids */
  protected static final Map<String, Integer> stringTypeIds;

  private String rulesResourceName;

  private ResourceReference rulesURL;

  private String encoding;

  //kalina: added this as a field to avoid too many init() calls
  protected transient Map<Set<FSMState>, DFSMState> newStates =
    new HashMap<Set<FSMState>, DFSMState>();

  /** The static initialiser will inspect the class {@link java.lang.Character}
   * using reflection to find all the public static members and will map them
   * to ids starting from 0.
   * After that it will build all the static data: {@link #typeIds}, {@link
   * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
   */
  static{
    Field[] characterClassFields;

    try{
      characterClassFields = Class.forName("java.lang.Character").getFields();
    }catch(ClassNotFoundException cnfe){
      throw new GateRuntimeException("Could not find the java.lang.Character class!");
    }

    Collection<Field> staticFields = new LinkedList<Field>();
    // JDK 1.4 introduced directionality constants that have the same values as
    // character types; we need to skip those as well
    for(int i = 0; i < characterClassFields.length; i++)
      if(Modifier.isStatic(characterClassFields[i].getModifiers()) &&
         characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1)
        staticFields.add(characterClassFields[i]);

    Map<Integer, Integer> tempTypeIds = new HashMap<Integer, Integer>();
    maxTypeId = staticFields.size() - 1;
    String[] mnemonics = new String[maxTypeId + 1];
    Map<String, Integer> tempStringTypeIds = new HashMap<String, Integer>();

    Iterator<Field> staticFieldsIter = staticFields.iterator();
    Field currentField;
    int currentId = 0;
    String fieldName;

    try {
      while(staticFieldsIter.hasNext()){
        currentField = staticFieldsIter.next();
        if(currentField.getType().toString().equals("byte")){
          fieldName = currentField.getName();
          tempTypeIds.put(new Integer(currentField.getInt(null)),
                          new Integer(currentId));
          mnemonics[currentId] = fieldName;
          tempStringTypeIds.put(fieldName, new Integer(currentId));
          currentId++;
        }
      }
    } catch(Exception e) {
      throw new GateRuntimeException(e.toString());
    }

    typeIds = Collections.unmodifiableMap(tempTypeIds);
    stringTypeIds = Collections.unmodifiableMap(tempStringTypeIds);

    Set<String> toIgnore = new HashSet<String>();
    toIgnore.add(" ");
    toIgnore.add("\t");
    toIgnore.add("\f");
    ignoreTokens = Collections.unmodifiableSet(toIgnore);

    typeMnemonics = Collections.unmodifiableList(Arrays.asList(mnemonics));
  }
} // class SimpleTokeniser
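
As a companion to the execute() method above, here is an equally illustrative sketch (again not part of the distribution) of reading back the annotations it creates, assuming the document has already been tokenised as in the example near the top of this page. The "kind" feature is only present where the matching rule's RHS sets it; "string" is the built-in TOKEN_STRING_FEATURE_NAME feature.

import gate.Annotation;
import gate.Document;

public class TokenDump {
  /** Print offsets and selected features of the Token annotations produced
   *  by SimpleTokeniser on an already-tokenised document. */
  public static void dumpTokens(Document doc) {
    for (Annotation token : doc.getAnnotations().get("Token")) {
      System.out.println(
          token.getStartNode().getOffset() + "-" + token.getEndNode().getOffset()
          + "  kind=" + token.getFeatures().get("kind")
          + "  string=" + token.getFeatures().get("string"));
    }
  }
}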



