All downloads are free. Search and download functionality uses the official Maven repository.

gate.jape.parser.ParseCpsl.jj Maven / Gradle / Ivy

Go to download

ANNIE is a general purpose information extraction system that provides the building blocks of many other GATE applications.

The newest version!
/* ParseCpsl.jj - parser for CPSL grammars
   Hamish, 8/7/98
   $Id: ParseCpsl.jj 20054 2017-02-02 06:44:12Z markagreenwood $
 */

options {
  CACHE_TOKENS = true;
  IGNORE_CASE = false;
  DEBUG_PARSER = false;
  DEBUG_TOKEN_MANAGER = false;
  LOOKAHEAD = 1;
  FORCE_LA_CHECK = false;
  CHOICE_AMBIGUITY_CHECK = 2;
  OTHER_AMBIGUITY_CHECK = 1;
  STATIC = false;
  DEBUG_LOOKAHEAD = false;
  ERROR_REPORTING = true;
  JAVA_UNICODE_ESCAPE = false;
  UNICODE_INPUT = true;
  USER_TOKEN_MANAGER = false;
  USER_CHAR_STREAM = false;
  BUILD_PARSER = true;
  BUILD_TOKEN_MANAGER = true;
  SANITY_CHECK = true;
}

PARSER_BEGIN(ParseCpsl)

package gate.jape.parser;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;

import gate.Factory;
import gate.util.*;
import gate.jape.*;
import gate.jape.constraint.*;
import gate.event.*;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
  * A parser for the CPSL language. Generated using JavaCC.
  * @author Hamish Cunningham
  */
public class ParseCpsl implements JapeConstants {

  private static final long serialVersionUID = -2754817550046808372L;

  private static final Logger log = LoggerFactory.getLogger(ParseCpsl.class);

  /**
    * Construct from a URL and an encoding, starting with a fresh, empty
    * macro table (and, via the delegated constructor, a fresh template table).
    *
    * @param url location of the grammar to read
    * @param encoding character encoding used to decode the stream
    * @throws IOException propagated from the delegated constructor, which
    *         opens the URL's stream
    */
  public ParseCpsl(URL url, String encoding) throws IOException {
    this(url, encoding, new HashMap());
  }

  /**
    * Construct from a URL, an encoding and an existing macro table; a fresh,
    * empty template table is created.
    *
    * @param url location of the grammar to read
    * @param encoding character encoding used to decode the stream
    * @param existingMacros macro table to use for this parser
    * @throws IOException propagated from the delegated constructor, which
    *         opens the URL's stream
    */
  public ParseCpsl(URL url, String encoding, Map existingMacros) throws IOException {
    this(url, encoding, existingMacros, new HashMap());
  }

  /**
    * Construct from a URL, an encoding and existing macro and template tables.
    * The URL's stream is opened and wrapped in a BomStrippingInputStreamReader
    * using the given encoding; the URL and encoding are then remembered in
    * {@code baseURL} and {@code encoding}.
    *
    * @param url location of the grammar to read; becomes the base URL
    * @param encoding character encoding used to decode the stream
    * @param existingMacros macro table to use for this parser
    * @param existingTemplates template table to use for this parser
    * @throws IOException if the stream cannot be opened or the reader
    *         cannot be created
    */
  public ParseCpsl(URL url, String encoding, Map existingMacros, Map existingTemplates) throws IOException {
    this(new BomStrippingInputStreamReader(url.openStream(), encoding), 
         existingMacros, existingTemplates);
    baseURL = url;
    this.encoding = encoding;
  }

  /**
    * Construct from a reader and an existing macro table; a fresh, empty
    * template table is created.  No base URL or encoding is set.
    *
    * @param stream reader supplying the grammar text
    * @param existingMacros macro table to use for this parser
    */
  public ParseCpsl(java.io.Reader stream, Map existingMacros) {
    this(stream, existingMacros, new HashMap());
  }
  
  /**
    * Construct from a reader and existing macro and template tables.  The
    * maps are stored by reference (not copied), so entries added while
    * parsing are visible to whoever supplied them.
    *
    * @param stream reader supplying the grammar text; handed to the
    *        single-argument constructor (not visible in this class body,
    *        presumably generated by JavaCC)
    * @param existingMacros macro table to use for this parser
    * @param existingTemplates template table to use for this parser
    */
  public ParseCpsl(java.io.Reader stream, Map existingMacros, Map existingTemplates) {
    this(stream);
    macrosMap = existingMacros;
    templatesMap = existingTemplates;
  }

  //StatusReporter Implementation

  /**
   * Registers a listener to be notified of status messages fired by this
   * parser.
   *
   * @param listener the listener to add
   */
  public void addStatusListener(StatusListener listener){
    myStatusListeners.add(listener);
  }

  /**
   * Unregisters a previously added status listener.
   *
   * @param listener the listener to remove
   */
  public void removeStatusListener(StatusListener listener){
    myStatusListeners.remove(listener);
  }

  /**
   * Sends the given status text to every registered listener, in
   * registration order.
   *
   * @param text the status message to deliver
   */
  protected void fireStatusChangedEvent(String text){
    // The explicit cast keeps this compiling whether or not the listener
    // list carries a type argument.
    for(Object listener : myStatusListeners) {
      ((StatusListener)listener).statusChanged(text);
    }
  }

  /**
   * Reflectively creates a transducer of the class configured via
   * {@link #setSptClass}, using that class's public constructor that takes
   * a single String.
   *
   * @param name the name passed to the transducer constructor
   * @return the newly created transducer
   * @throws RuntimeException wrapping any reflection failure, including an
   *         exception thrown by the constructor itself
   * @throws ClassCastException if the configured class is not a
   *         SinglePhaseTransducer
   */
  protected SinglePhaseTransducer createSinglePhaseTransducer(String name){
    try {
      Constructor<?> c = sptClass.getConstructor(String.class);
      // newInstance is only known to return Object here, so cast explicitly
      return (SinglePhaseTransducer)c.newInstance(name);
    } catch (NoSuchMethodException e) { // Shouldn't happen
      throw new RuntimeException(e);
    } catch (IllegalArgumentException e) { // Shouldn't happen
      throw new RuntimeException(e);
    } catch (InstantiationException e) { // Shouldn't happen
      throw new RuntimeException(e);
    } catch (IllegalAccessException e) { // Shouldn't happen
      throw new RuntimeException(e);
    } catch (InvocationTargetException e) { // Happens if the constructor throws an exception
      throw new RuntimeException(e);
    }
  }

  /**
   * Creates a parser for another grammar file that shares this parser's
   * encoding, macro table, template table and transducer class.
   *
   * @param sptURL location of the grammar the new parser should read
   * @return the new parser
   * @throws IOException if the new parser cannot be constructed from the URL
   */
  protected ParseCpsl spawn(URL sptURL) throws IOException{
    ParseCpsl child = new ParseCpsl(sptURL, encoding, macrosMap, templatesMap);
    child.setSptClass(this.sptClass);
    return child;
  }

  /**
   * Final checks on a single phase transducer: rejects a phase for which the
   * rule counter is still zero, otherwise records this parser's base URL on
   * the transducer.
   *
   * @param t the transducer to check
   * @throws ParseException if no rules were defined
   */
  protected void finishSPT(SinglePhaseTransducer t) throws ParseException {
    boolean noRules = (ruleNumber == 0);
    if(noRules) {
      throw new ParseException("no rules defined in transducer " + t.getName());
    }
    t.setBaseURL(baseURL);
  }

  /**
   * Extension hook for a basic pattern element; this implementation does
   * nothing.
   * NOTE(review): call sites are not visible in this part of the file -
   * presumably invoked once a BasicPatternElement has been built; confirm.
   *
   * @param bpe the pattern element
   */
  protected void finishBPE(BasicPatternElement bpe) {
  }

  /**
   * Parses a multi phase transducer from the current input and always closes
   * the underlying grammar reader afterwards, regardless of whether parsing
   * succeeded or threw.  A failure to close is logged as a warning and does
   * not replace the parse result or the parse exception.
   *
   * @return the parsed transducer
   * @throws ParseException if the grammar cannot be parsed
   */
  public MultiPhaseTransducer MultiPhaseTransducer() throws ParseException {
    try {
      return _MultiPhaseTransducer();
    }
    finally {
      // not pretty: the char stream's field is the only handle we have on
      // the underlying Reader
      java.io.Reader source = jj_input_stream.inputStream;
      if(source != null) {
        try {
          source.close();
        }
        catch(IOException closeFailure) {
          log.warn("Couldn't close input stream while parsing " + baseURL, closeFailure);
        }
      }
    }
  }
  
  /**
   * Converts a JAPE identifier into one usable in Java source by turning
   * every dash into an underscore.
   *
   * @param japeIdentifier the identifier as written in the grammar
   * @return the identifier with each '-' replaced by '_'
   */
  protected String toJavaIdentifier(String japeIdentifier) {
    return japeIdentifier.replace('-', '_');
  }

  /**
   * Returns the textual value of a token, treating quoted and unquoted forms
   * alike: for a token of kind {@code string} the first and last characters
   * of the image (the quotes) are dropped; any other token's image is
   * returned unchanged.
   *
   * @param tok the token to read
   * @return the token text without surrounding quotes
   */
  protected String stringValueOf(Token tok) {
    String text = tok.image;
    return (tok.kind == string)
        ? text.substring(1, text.length() - 1)
        : text;
  }

  /**
   * Append the given string to the end of the given buffer as a Java string
   * literal.  If str is null, we append the four
   * characters n, u, l, l.  Otherwise, we append the contents of str surrounded
   * by double quotes, except that characters in str are escaped as necessary
   * to be a legal Java string literal: backspace, formfeed, tab, newline and
   * return are replaced by their escape sequences \b, \f, etc.; single and double
   * quote and backslash are preceded by an extra backslash; other non-ASCII
   * and non-printing characters (anything below space, and DEL upwards) are
   * rendered as Unicode escapes (backslash-u followed by four upper-case hex
   * digits).
   *
   * @param buf the buffer to append to
   * @param str the string to render, may be null
   */
  protected void appendJavaStringLiteral(StringBuffer buf, String str) {
    if(str == null) {
      buf.append("null");
      return;
    }

    buf.append("\"");
    for(int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      switch(c) {
        case '\b':
          buf.append("\\b");
          break;
        case '\f':
          buf.append("\\f");
          break;
        case '\n':
          buf.append("\\n");
          break;
        case '\r':
          buf.append("\\r");
          break;
        case '\t':
          buf.append("\\t");
          break;
        case '\"':
          buf.append("\\\"");
          break;
        case '\'':
          buf.append("\\\'");
          break;
        case '\\':
          buf.append("\\\\");
          break;

        default:
          // 127 (DEL) is a control character too, so the printable range
          // ends at 126
          if(c < 32 || c > 126) {
            buf.append(String.format("\\u%04X", Integer.valueOf(c)));
          }
          else {
            buf.append(c);
          }
          break;
      }
    }
    buf.append("\"");
  }

  /**
   * Appends to the buffer the Java source for adding a new annotation of the
   * given type to outputAS, spanning the annotation set named by
   * annotSetName.  The generated code uses that set's first and last nodes
   * directly when outputAS and inputAS are the same object, and otherwise
   * uses the nodes' offsets, converting an InvalidOffsetException into a
   * GateRuntimeException.  The generated code refers to a variable called
   * "features".
   *
   * @param blockBuffer the buffer receiving the generated source
   * @param newAnnotType the annotation type, emitted as a Java string literal
   * @param annotSetName name of the annotation set variable in the generated
   *        code; emitted verbatim
   */
  protected void appendAnnotationAdd(StringBuffer blockBuffer, String newAnnotType, String annotSetName)
  {
      final String eol = Strings.getNl();

      // same annotation set: reuse the nodes
      blockBuffer.append("      if(outputAS == inputAS) { // use nodes directly").append(eol)
                 .append("        outputAS.add(").append(eol)
                 .append("          ").append(annotSetName).append(".firstNode(), ")
                 .append(annotSetName).append(".lastNode(), ").append(eol)
                 .append("          ");
      appendJavaStringLiteral(blockBuffer, newAnnotType);
      blockBuffer.append(", features").append(eol)
                 .append("        );").append(eol)
                 .append("      }").append(eol);

      // different annotation sets: go through offsets
      blockBuffer.append("      else { // use offsets").append(eol)
                 .append("        try {").append(eol)
                 .append("          outputAS.add(").append(eol)
                 .append("            ").append(annotSetName).append(".firstNode().getOffset(), ")
                 .append(annotSetName).append(".lastNode().getOffset(), ").append(eol)
                 .append("            ");
      appendJavaStringLiteral(blockBuffer, newAnnotType);
      blockBuffer.append(", features").append(eol)
                 .append("          );").append(eol)
                 .append("        }").append(eol)
                 .append("        catch(gate.util.InvalidOffsetException ioe) {").append(eol)
                 .append("          throw new gate.util.GateRuntimeException(\"Invalid offset exception generated \" +").append(eol)
                 .append("               \"from offsets taken from same document!\");").append(eol)
                 .append("        }").append(eol)
                 .append("      }").append(eol)
                 .append("      // end of RHS assignment block");
  }
  
  /**
   * Matches ${key} placeholders in a template string; group 1 is the key.
   * Compiled once rather than on every substitution.
   */
  private static final Pattern TEMPLATE_PLACEHOLDER =
          Pattern.compile("\\$\\{([^\\}]+)\\}");

  /**
   * Takes a string containing ${key} placeholders and substitutes
   * in the corresponding values from the given map.  If there is
   * no value in the map for a particular placeholder it is left
   * un-resolved, i.e. given a template of "${key1}/${key2}" and
   * a values map of just [key1: "hello"], this method would return
   * "hello/${key2}".
   * <p>
   * Only templates whose kind is {@code string} are substituted; a template
   * of any other kind is returned as stored.  In both cases every key in
   * the values map must have been consumed by a placeholder, so supplying
   * any parameter to a non-string template is an error.
   *
   * @param templateNameTok token whose image is the template name
   * @param values parameter values keyed by placeholder name
   * @return a Pair of token kind and value
   * @throws ParseException if the template name is unknown, or if values
   *         contains a key that matches no placeholder
   */
  protected Pair substituteTemplate(Token templateNameTok,
          Map values) throws ParseException {
    // explicit cast: the map lookup is only known to yield Object here
    Pair template = (Pair)templatesMap.get(templateNameTok.image);
    if(template == null) {
      throw new ParseException(errorMsgPrefix(templateNameTok) +
              "unknown template name " + templateNameTok.image);
    }

    // keys are struck off as placeholders consume them
    Set unusedParams = new HashSet(values.keySet());
    Pair returnVal = template;

    if(((Integer)template.first).intValue() == string) {
      if(log.isDebugEnabled()) {
        log.debug("Substituting template " + templateNameTok.image + " with map "
                + values + ". Template is " + template);
      }
      StringBuffer buf = new StringBuffer();
      Matcher mat = TEMPLATE_PLACEHOLDER.matcher((String)template.second);
      while(mat.find()) {
        String key = mat.group(1);
        if(values.containsKey(key)) {
          mat.appendReplacement(buf,
                  Matcher.quoteReplacement(String.valueOf(values.get(key))));
          unusedParams.remove(key);
        }
        else {
          // no value supplied: re-emit the placeholder untouched ("\\${"
          // is the escaped replacement for a literal "${")
          mat.appendReplacement(buf, "\\${");
          buf.append(key);
          buf.append("}");
        }
      }
      mat.appendTail(buf);

      returnVal = new Pair();
      returnVal.first = Integer.valueOf(string);
      returnVal.second = buf.toString();
      if(log.isDebugEnabled()) {
        log.debug("Template substitution produced " + returnVal.second);
      }
    }

    // check that there were no invalid parameters
    if(!unusedParams.isEmpty()) {
      throw new ParseException(errorMsgPrefix(templateNameTok) +
              "invalid parameters " + unusedParams +
              " for template " + templateNameTok.image);
    }
    return returnVal;
  }

  /**
   * Sets the base URL.  Within this class it is used to prefix error
   * messages, to resolve the paths of external phase files, and is recorded
   * on the transducers that get built.
   *
   * @param newURL the new base URL
   */
  public void setBaseURL (URL newURL) {
    baseURL = newURL;
  }

  /**
   * Sets the character encoding that is handed on to parsers created by
   * {@link #spawn}.
   *
   * @param newEncoding the encoding name
   */
  public void setEncoding (String newEncoding) {
    encoding = newEncoding;
  }
  
  /**
   * Sets the class instantiated by {@link #createSinglePhaseTransducer}.
   * That method looks up a public constructor taking a single String, so
   * the class must provide one.
   *
   * @param sptClass the transducer class to instantiate
   */
  public void setSptClass(Class sptClass) {
    this.sptClass = sptClass;
  }

  /**
   * Builds the location prefix for an error message: the base URL in
   * external form (or "(No URL)" when there is none), followed by
   * ":line:column: " taken from the token's start position, or by a single
   * space when no token is given.
   *
   * @param t the token locating the error, may be null
   * @return the prefix to put in front of the error text
   */
  private String errorMsgPrefix(Token t) {
    String location = (baseURL == null) ? "(No URL)" : baseURL.toExternalForm();
    if(t == null) {
      return location + " ";
    }
    return location + ":" + t.beginLine + ":" + t.beginColumn + ": ";
  }

  private transient List myStatusListeners = new LinkedList();

  /** Position of the current rule */
  private int ruleNumber;

  /** A list of all the bindings we made this time, for checking
    * the RHS during parsing.
    */
  private Set bindingNameSet = null;

  /** A table of macro definitions. */
  protected Map macrosMap;
  
  /**
   * A table of template definitions. Keys are template names,
   * values are Pairs of token kind and value, as returned by
   * AttrVal.
   */
  protected Map templatesMap;

  protected URL baseURL;
  protected String encoding;
  
  protected Class sptClass =
      SinglePhaseTransducer.class;

  protected SinglePhaseTransducer curSPT;
} // class ParseCpsl

PARSER_END(ParseCpsl)


///////////////////
// lexical analysis
///////////////////

////////////////
// utility stuff

 TOKEN: {
  <#space: ("\n" | "\r" | "\t" | "\f" | " ")>
}
 TOKEN: { <#spaces: ("\n" | "\r" | "\t" | "\f" | " ")+> }
 TOKEN: { <#newline: ("\n" | "\r" | "\n\r" | "\r\n")> }
 TOKEN: { <#digits: (["0"-"9"])+> }
 TOKEN: { <#letter: ["A"-"Z", "a"-"z"]> }
 TOKEN: { <#letterOrUnderscore: ["A"-"Z", "a"-"z", "_"]> }
 TOKEN: { <#letters: (["A"-"Z", "a"-"z"])+> }
 TOKEN: { <#lettersAndDigits: (["A"-"Z", "a"-"z", "0"-"9"])+> }
 TOKEN: {
  <#letterOrDigitOrDash: ["A"-"Z", "a"-"z", "0"-"9", "-", "_"]>
}
 TOKEN: {
  <#lettersAndDigitsAndDashes: (["A"-"Z", "a"-"z", "0"-"9", "-", "_"])+>
}

////////////////
// parsed tokens

 TOKEN [IGNORE_CASE]: {  }

// phases has its own lexical state so we can deal with relative paths
// pointing to grammar files
 TOKEN [IGNORE_CASE]: {  :IN_PHASES }
 TOKEN: {
  
}
 SPECIAL_TOKEN: { // ignore whitespace
  )+>
}
 SPECIAL_TOKEN: { // single-line C++/Java style comments
  )? >
}
 SPECIAL_TOKEN: { // single-line comments CPSL style
  )? >
}
 SPECIAL_TOKEN: {
   : PHASES_WITHIN_COMMENT
}
 MORE: {  }
 SPECIAL_TOKEN: {
   : IN_PHASES
}

 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN [IGNORE_CASE]: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: { " | "<" | ">=" | "<=" | "=~" | "!~" | "==~" | "!=~"> }
 TOKEN: {  }
 TOKEN: {  }


//starts a string
   MORE:
   {
      "\"" : IN_STRING
   }

//reads the contents of the string
    MORE :
   {
      "\\n"  { image.setLength(image.length() - 2); image.append("\n"); }
    |
      "\\r"  { image.setLength(image.length() - 2); image.append("\r"); }
    |
      "\\t"  { image.setLength(image.length() - 2); image.append("\t"); }
    |
      "\\b"  { image.setLength(image.length() - 2); image.append("\b"); }
    |
      "\\f"  { image.setLength(image.length() - 2); image.append("\f"); }
    |
      "\\\""  { image.setLength(image.length() - 2); image.append("\""); }
    |
      "\\\'"  { image.setLength(image.length() - 2); image.append("\'"); }
    |
      "\\\\"  { image.setLength(image.length() - 2); image.append("\\"); }
    |
      <"\\u" (["0"-"9","A"-"F","a"-"f"]) (["0"-"9","A"-"F","a"-"f"])
             (["0"-"9","A"-"F","a"-"f"]) (["0"-"9","A"-"F","a"-"f"])>
             {
               String digits = image.substring(image.length() - 4, image.length());
               image.setLength(image.length() - 6);
               image.append((char)Integer.parseInt(digits, 16));
             }
    |
      < ~["\"", "\\"] >  // Disallow backslashes that weren't caught by previous rules
                         // Note that here you don't need any action.
   }

//finishes the string
    TOKEN :
   {
      
      {
        // image.setLength(image.length() - 1);
         matchedToken.image = image.toString();
      } : DEFAULT
   }

 TOKEN: {  }
 TOKEN: {  ()* > }
 TOKEN: {
  )? (["f","F","d","D"])?
    | "." (["0"-"9"])+ ()? (["f","F","d","D"])?
    | (["0"-"9"])+  (["f","F","d","D"])?
    | (["0"-"9"])+ ()? ["f","F","d","D"]
   )
  >
}
 TOKEN: { <#exponent: ["e","E"] (["+","-"])? (["0"-"9"])+ > }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
 TOKEN: {  }
//  TOKEN: {  |   > }
// TOKEN: {  }
/* SPECIAL_TOKEN: { // catch all for Java block processing
  
}*/

////////////////////
// non-parsed tokens

// we make comments and spaces special tokens to support an editor
 SPECIAL_TOKEN: { // ignore whitespace
  )+>
}
 SPECIAL_TOKEN: { // single-line C++/Java style comments
  )? >
}
 SPECIAL_TOKEN: { // single-line comments CPSL style
  )? >
}
 SPECIAL_TOKEN: {  : WITHIN_COMMENT }
 MORE: {  }
 SPECIAL_TOKEN: {  : DEFAULT }

 TOKEN: { // catch all for Java block processing
  
}


//////////////
// the grammar
//////////////

/**
 * Top-level production: builds a MultiPhaseTransducer either from the phases
 * defined in this file, or from a list of external phase files.  Each
 * external phase name is resolved as "name.jape" against the base URL and
 * parsed by a spawned parser, whose phases are then added to the result.
 *
 * Failures are reported as ParseExceptions; where another exception
 * triggered the failure it is attached as the cause so the original stack
 * trace is not lost.
 */
MultiPhaseTransducer _MultiPhaseTransducer() :
{
  // macrosMap = new HashMap();
  SinglePhaseTransducer s = null;
  MultiPhaseTransducer m = new MultiPhaseTransducer();
  m.setBaseURL(baseURL);
  Token mptNameTok = null;
  Token phaseNameTok = null;
  String javaimportblock = null;
  String controllerstartedblock = null;
  String controllerfinishedblock = null;
  String controllerabortedblock = null;
  boolean haveControllerStartedBlock = false;
  boolean haveControllerFinishedBlock = false;
  boolean haveControllerAbortedBlock = false;
}
{
  // transducer name
  (
    // NOTE(review): the token references on the next line look truncated
    // (angle-bracketed text missing) - restore from the upstream grammar.
     mptNameTok=
    { m.setName(mptNameTok.image); }
  )?

  // spts
  (
    // sptrannies in this file
    (
      (javaimportblock=JavaImportBlock())
      (
        (  controllerstartedblock=ControllerStartedBlock()
           { if(haveControllerStartedBlock)
               throw new ParseException("Only one ControllerStarted block allowed");
             else
               haveControllerStartedBlock = true;
           }
        ) |
        (controllerfinishedblock=ControllerFinishedBlock()
           { if(haveControllerFinishedBlock)
               throw new ParseException("Only one ControllerFinished block allowed");
             else
               haveControllerFinishedBlock = true;
           }
        ) |
        (controllerabortedblock=ControllerAbortedBlock()
           { if(haveControllerAbortedBlock)
               throw new ParseException("Only one ControllerAborted block allowed");
             else
               haveControllerAbortedBlock = true;
           }
        )
      )*
      (
        
        try {
            s=SinglePhaseTransducer(javaimportblock) {
                m.addPhase(s.getName(), s);
                s.setBaseURL(baseURL);
                s.setControllerEventBlocks(controllerstartedblock,
                  controllerfinishedblock,controllerabortedblock,javaimportblock);
              // only the first SPT in a MPT file should define/execute the blocks
              controllerstartedblock = null;
              controllerfinishedblock = null;
              controllerabortedblock = null;
            }
        } catch (Throwable e) {
            // wrap the exception with info about what file/resource it
            // occurred in, keeping the original as the cause
            ParseException wrapped =
              new ParseException("Cannot parse a phase in " +
                  baseURL + ": " + e.getMessage()
              );
            wrapped.initCause(e);
            throw wrapped;
          }
      )+
    )
    |
    // sptrannies in external files
    (
      // NOTE(review): a token reference appears to be missing on the next
      // (blank) line and after "phaseNameTok=" - restore from upstream.
      
      (
        phaseNameTok=
        {
          ParseCpsl parser = null;

            // resolve the phase file relative to the base URL
            String sptPath = phaseNameTok.image + ".jape";
            URL sptURL = null;
            try{
              sptURL = new URL(baseURL, sptPath);
            }catch(MalformedURLException mue){
              ParseException urlFailure =
                new ParseException(errorMsgPrefix(phaseNameTok)+
                  "Read error " + mue.toString());
              urlFailure.initCause(mue);
              throw urlFailure;
            }

            // construct a parser and parse it
            fireStatusChangedEvent("Reading " + phaseNameTok.image + "...");
            try {
              parser = spawn(sptURL);
            } catch (IOException e) {
              ParseException openFailure =
                new ParseException(errorMsgPrefix(phaseNameTok)+
                  "Cannot open URL " + sptURL.toExternalForm()
                );
              openFailure.initCause(e);
              throw openFailure;
            } 

          // adding the resultant spt to m
          if(parser != null) {
            // NOTE(review): the list's element type argument looks lost; the
            // calls below need the elements typed as transducers - confirm.
            List phases = parser.MultiPhaseTransducer().getPhases();

            //s = parser.SinglePhaseTransducer();
            //if(s != null)
            //  m.addPhase(s.getName(), s);

            if(phases != null) {
              for(int i=0; i < phases.size(); i++) {
                m.addPhase(
                  phases.get(i).getName(),
                  phases.get(i)
                  );
              }
            }
          }
        } // an SPT in an external file
      )+  // external file phase identifiers
    )     // external file phases declaration
  )          // SPTs
  
  {
//move this out of here so the input file gets closed properly
//    m.finish(); // swap the various JGL types for Java arrays
    return m;
  }

} // _MultiPhaseTransducer


SinglePhaseTransducer SinglePhaseTransducer(String javaimportblock) :
{
  ruleNumber = 0;
  Token phaseNameTok = null;
  String phaseName = null;
  Token inputTok = null;
  SinglePhaseTransducer t = null;
  Rule newRule = null;
  bindingNameSet = new HashSet();
  Token optionNameTok = null;
  Token optionValueTok = null;
}
{
  
   phaseNameTok=
  { phaseName = toJavaIdentifier(phaseNameTok.image);
    t = createSinglePhaseTransducer(phaseName); curSPT = t; }

  (
    (
      
      ( ( inputTok =  | inputTok =  ) {t.addInput(stringValueOf(inputTok));})*
    )
    |
    (