/*
 * gate.jape.parser.ParseCpsl.jj, as shipped in the GATE "annie" artifact.
 *
 * ANNIE is a general-purpose information extraction system that
 * provides the building blocks of many other GATE applications.
 */
/* ParseCpsl.jj - parser for CPSL grammars
Hamish, 8/7/98
$Id: ParseCpsl.jj 20054 2017-02-02 06:44:12Z markagreenwood $
*/
options {
CACHE_TOKENS = true;
IGNORE_CASE = false;
DEBUG_PARSER = false;
DEBUG_TOKEN_MANAGER = false;
LOOKAHEAD = 1;
FORCE_LA_CHECK = false;
CHOICE_AMBIGUITY_CHECK = 2;
OTHER_AMBIGUITY_CHECK = 1;
STATIC = false;
DEBUG_LOOKAHEAD = false;
ERROR_REPORTING = true;
JAVA_UNICODE_ESCAPE = false;
UNICODE_INPUT = true;
USER_TOKEN_MANAGER = false;
USER_CHAR_STREAM = false;
BUILD_PARSER = true;
BUILD_TOKEN_MANAGER = true;
SANITY_CHECK = true;
}
PARSER_BEGIN(ParseCpsl)
package gate.jape.parser;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import gate.Factory;
import gate.util.*;
import gate.jape.*;
import gate.jape.constraint.*;
import gate.event.*;
import org.apache.log4j.Logger;
/**
* A parser for the CPSL language. Generated using JavaCC.
* @author Hamish Cunningham
*/
public class ParseCpsl implements JapeConstants {
private static final long serialVersionUID = -2754817550046808372L;
private static final Logger log = Logger.getLogger(ParseCpsl.class);
/**
* Construct a parser that reads the grammar at the given URL using the
* given character encoding. Delegates to the three-argument constructor
* with a new, empty macros map.
*/
public ParseCpsl(URL url, String encoding) throws IOException {
this(url, encoding, new HashMap());
}
/**
* Construct from a URL and an encoding, using the supplied map as the
* macros table. Delegates to the four-argument constructor with a new,
* empty templates map.
*/
public ParseCpsl(URL url, String encoding, Map existingMacros) throws IOException {
this(url, encoding, existingMacros, new HashMap());
}
/**
* Construct from a URL and an encoding, using the supplied maps as the
* macros and templates tables. The URL's stream is opened and wrapped in a
* BomStrippingInputStreamReader created with the given encoding; afterwards
* the URL is stored as baseURL and the encoding is stored as well.
*/
public ParseCpsl(URL url, String encoding, Map existingMacros, Map existingTemplates) throws IOException {
this(new BomStrippingInputStreamReader(url.openStream(), encoding),
existingMacros, existingTemplates);
baseURL = url;
this.encoding = encoding;
}
/**
* Construct from a Reader, using the supplied map as the macros table and
* a new, empty templates map. baseURL and encoding are not set here.
*/
public ParseCpsl(java.io.Reader stream, Map existingMacros) {
this(stream, existingMacros, new HashMap());
}
/**
* Construct from a Reader, storing the supplied maps (by reference, not
* copied) as the macros and templates tables.
* NOTE(review): this(stream) refers to a single-Reader constructor that is
* not declared in this class body - presumably the one JavaCC generates;
* confirm.
*/
public ParseCpsl(java.io.Reader stream, Map existingMacros, Map existingTemplates) {
this(stream);
macrosMap = existingMacros;
templatesMap = existingTemplates;
}
// StatusReporter implementation

/**
 * Registers a listener that will receive the status messages sent through
 * {@link #fireStatusChangedEvent(String)}.
 *
 * @param listener the listener to add
 */
public void addStatusListener(StatusListener listener) {
  myStatusListeners.add(listener);
}

/**
 * Unregisters a previously added status listener.
 *
 * @param listener the listener to remove
 */
public void removeStatusListener(StatusListener listener) {
  myStatusListeners.remove(listener);
}

/**
 * Passes the given status text to every registered listener.
 *
 * @param text the status message to deliver
 */
protected void fireStatusChangedEvent(String text) {
  // Work on a snapshot: a listener may add or remove listeners from inside
  // its callback, which would invalidate an iterator over the live list.
  // The explicit cast keeps this independent of how the list field is
  // parameterised.
  Object[] snapshot = myStatusListeners.toArray();
  for (Object listener : snapshot) {
    ((StatusListener) listener).statusChanged(text);
  }
}
protected SinglePhaseTransducer createSinglePhaseTransducer(String name){
try {
Constructor extends SinglePhaseTransducer> c = sptClass.getConstructor
(String.class);
return c.newInstance(name);
} catch (NoSuchMethodException e) { // Shouldn't happen
throw new RuntimeException(e);
} catch (IllegalArgumentException e) { // Shouldn't happen
throw new RuntimeException(e);
} catch (InstantiationException e) { // Shouldn't happen
throw new RuntimeException(e);
} catch (IllegalAccessException e) { // Shouldn't happen
throw new RuntimeException(e);
} catch (InvocationTargetException e) { // Happens if the constructor throws an exception
throw new RuntimeException(e);
}
}
/**
 * Creates a parser for another grammar file. The new parser is given this
 * parser's encoding, macros table, templates table and transducer class.
 *
 * @param sptURL location of the grammar file the new parser will read
 * @return the new parser
 * @throws IOException if the parser cannot be constructed for that URL
 */
protected ParseCpsl spawn(URL sptURL) throws IOException {
  final ParseCpsl child =
      new ParseCpsl(sptURL, encoding, macrosMap, templatesMap);
  child.setSptClass(sptClass);
  return child;
}
/**
 * Final step for a single phase transducer: rejects a phase for which the
 * rule counter is still zero, otherwise records this parser's base URL on
 * the transducer.
 *
 * @param t the transducer to finish
 * @throws ParseException if no rules were counted for this transducer
 */
protected void finishSPT(SinglePhaseTransducer t) throws ParseException {
  final boolean hasRules = ruleNumber != 0;
  if (!hasRules) {
    throw new ParseException("no rules defined in transducer " + t.getName());
  }
  t.setBaseURL(baseURL);
}
/**
* Does nothing in this class: the body is empty and the argument is unused.
* NOTE(review): presumably a hook that subclasses override to post-process
* a BasicPatternElement - confirm against the callers in the grammar.
*/
protected void finishBPE(BasicPatternElement bpe) {
}
/**
 * Parses a multi phase transducer from the current input by delegating to
 * {@code _MultiPhaseTransducer()}. Whether parsing succeeds or throws, the
 * reader underneath the character stream is closed afterwards; a failure
 * to close it is only logged as a warning.
 */
public MultiPhaseTransducer MultiPhaseTransducer() throws ParseException {
  try {
    return _MultiPhaseTransducer();
  } finally {
    // Not pretty, but reaching into the character stream is the only
    // handle we have on the underlying reader.
    final Closeable underlying = jj_input_stream.inputStream;
    if (underlying != null) {
      try {
        underlying.close();
      } catch (IOException e) {
        log.warn("Couldn't close input stream while parsing " + baseURL, e);
      }
    }
  }
}
/**
 * Converts a JAPE identifier into a form usable as a Java identifier by
 * turning every dash into an underscore. No other character is changed.
 *
 * @param japeIdentifier the identifier as written in the grammar
 * @return the identifier with each '-' replaced by '_'
 */
protected String toJavaIdentifier(String japeIdentifier) {
  return japeIdentifier.replace('-', '_');
}
/**
 * Returns the textual value of a token regardless of whether it was written
 * quoted or unquoted: for a token of kind {@code string} the first and last
 * characters of its image (the quotes) are dropped, any other token's image
 * is returned unchanged.
 */
protected String stringValueOf(Token tok) {
  final String image = tok.image;
  return (tok.kind == string)
      ? image.substring(1, image.length() - 1) // drop surrounding quotes
      : image;
}
/**
 * Append the given string to the end of the given buffer as a Java string
 * literal. If {@code str} is {@code null}, we append the four characters
 * n, u, l, l. Otherwise, we append the contents of {@code str} surrounded
 * by double quotes, except that characters in {@code str} are escaped as
 * necessary to be a legal Java string literal: backspace, formfeed, tab,
 * newline and return are replaced by their two-character escape sequences;
 * single and double quote and backslash are preceded by an extra backslash;
 * all other non-printing and non-ASCII characters (below 32, or 127 and
 * above) are rendered as Unicode escapes (backslash-u followed by four
 * upper-case hex digits).
 *
 * @param buf the buffer to append to
 * @param str the string to render, may be {@code null}
 */
protected void appendJavaStringLiteral(StringBuffer buf, String str) {
  if(str == null) {
    buf.append("null");
    return;
  }
  // created lazily: most strings contain nothing that needs a Unicode escape
  Formatter formatter = null;
  buf.append("\"");
  for(int i = 0; i < str.length(); i++) {
    char c = str.charAt(i);
    switch(c) {
      case '\b':
        buf.append("\\b");
        break;
      case '\f':
        buf.append("\\f");
        break;
      case '\n':
        buf.append("\\n");
        break;
      case '\r':
        buf.append("\\r");
        break;
      case '\t':
        buf.append("\\t");
        break;
      case '\"':
        buf.append("\\\"");
        break;
      case '\'':
        buf.append("\\\'");
        break;
      case '\\':
        buf.append("\\\\");
        break;
      default:
        // 127 (DEL) is a control character too, so it is escaped along
        // with everything outside the printable ASCII range.
        if(c < 32 || c >= 127) {
          if(formatter == null) formatter = new Formatter(buf);
          formatter.format("\\u%04X", Integer.valueOf(c));
        }
        else {
          buf.append(c);
        }
        break;
    }
  }
  buf.append("\"");
}
/**
 * Appends to the given buffer the Java source that adds a new annotation
 * of type {@code newAnnotType} spanning the annotation set referred to by
 * {@code annotSetName}. The emitted code adds by node when outputAS and
 * inputAS are the same object and by offset otherwise, turning an
 * InvalidOffsetException into a GateRuntimeException.
 *
 * @param blockBuffer the buffer receiving the generated source
 * @param newAnnotType annotation type, written as a Java string literal
 * @param annotSetName expression text naming the bound annotation set
 */
protected void appendAnnotationAdd(StringBuffer blockBuffer, String newAnnotType, String annotSetName)
{
  final String eol = Strings.getNl();
  final StringBuffer out = blockBuffer;

  // same annotation set for input and output: reuse the nodes
  out.append(" if(outputAS == inputAS) { // use nodes directly").append(eol);
  out.append(" outputAS.add(").append(eol);
  out.append(" ").append(annotSetName).append(".firstNode(), ");
  out.append(annotSetName).append(".lastNode(), ").append(eol);
  out.append(" ");
  appendJavaStringLiteral(out, newAnnotType);
  out.append(", features").append(eol);
  out.append(" );").append(eol);
  out.append(" }").append(eol);

  // different sets: go through the offsets
  out.append(" else { // use offsets").append(eol);
  out.append(" try {").append(eol);
  out.append(" outputAS.add(").append(eol);
  out.append(" ").append(annotSetName).append(".firstNode().getOffset(), ");
  out.append(annotSetName).append(".lastNode().getOffset(), ").append(eol);
  out.append(" ");
  appendJavaStringLiteral(out, newAnnotType);
  out.append(", features").append(eol);
  out.append(" );").append(eol);
  out.append(" }").append(eol);
  out.append(" catch(gate.util.InvalidOffsetException ioe) {").append(eol);
  out.append(" throw new gate.util.GateRuntimeException(\"Invalid offset exception generated \" +").append(eol);
  out.append(" \"from offsets taken from same document!\");").append(eol);
  out.append(" }").append(eol);
  out.append(" }").append(eol);
  out.append(" // end of RHS assignment block");
}
/** Matches a ${key} placeholder; group 1 captures the key. */
private static final Pattern TEMPLATE_PLACEHOLDER =
    Pattern.compile("\\$\\{([^\\}]+)\\}");

/**
 * Looks up the template named by the given token and, if its value is a
 * string, substitutes the ${key} placeholders it contains with the
 * corresponding values from the given map. If there is no value in the map
 * for a particular placeholder it is left un-resolved, i.e. given a
 * template of "${key1}/${key2}" and a values map of just [key1: "hello"],
 * this method would return "hello/${key2}". A template whose value is not
 * a string is returned as-is.
 *
 * @param templateNameTok token whose image is the template name
 * @param values parameter values keyed by placeholder name
 * @return a Pair of token kind and value; for string templates a new Pair
 *         holding the substituted text, otherwise the stored template Pair
 * @throws ParseException if no template of that name is defined, or if
 *         {@code values} contains a key that was not used by the template
 *         (for a non-string template that is every key supplied)
 */
protected Pair substituteTemplate(Token templateNameTok,
    Map values) throws ParseException {
  // explicit cast so this compiles whether or not the templates table is
  // declared with type parameters
  Pair template = (Pair)templatesMap.get(templateNameTok.image);
  if(template == null) {
    throw new ParseException(errorMsgPrefix(templateNameTok) +
        "unknown template name " + templateNameTok.image);
  }
  Pair returnVal;
  // every supplied parameter starts out unused; substitution ticks them off
  Set unusedParams = new HashSet(values.keySet());
  if(((Integer)template.first).intValue() == string) {
    if(log.isDebugEnabled()) {
      log.debug("Substituting template " + templateNameTok.image + " with map "
          + values + ". Template is " + template);
    }
    // StringBuffer because Matcher.appendReplacement requires one
    StringBuffer buf = new StringBuffer();
    Matcher mat = TEMPLATE_PLACEHOLDER.matcher((String)template.second);
    while(mat.find()) {
      String key = mat.group(1);
      if(values.containsKey(key)) {
        mat.appendReplacement(buf,
            Matcher.quoteReplacement(String.valueOf(values.get(key))));
        unusedParams.remove(key);
      }
      else {
        // no value: write the placeholder back unchanged
        mat.appendReplacement(buf, "\\${");
        buf.append(key);
        buf.append("}");
      }
    }
    mat.appendTail(buf);
    returnVal = new Pair();
    returnVal.first = Integer.valueOf(string);
    returnVal.second = buf.toString();
    if(log.isDebugEnabled()) {
      log.debug("Template substitution produced " + returnVal.second);
    }
  }
  else {
    returnVal = template;
  }
  // check that there were no invalid parameters
  if(!unusedParams.isEmpty()) {
    throw new ParseException(errorMsgPrefix(templateNameTok) +
        "invalid parameters " + unusedParams +
        " for template " + templateNameTok.image);
  }
  return returnVal;
}
/**
 * Replaces the base URL held by this parser.
 *
 * @param newURL the new base URL
 */
public void setBaseURL(URL newURL) {
  this.baseURL = newURL;
}

/**
 * Replaces the character encoding name held by this parser.
 *
 * @param newEncoding the new encoding name
 */
public void setEncoding(String newEncoding) {
  this.encoding = newEncoding;
}
public void setSptClass(Class extends SinglePhaseTransducer> sptClass) {
this.sptClass = sptClass;
}
/**
 * Builds the prefix used in front of parse error messages: the external
 * form of the base URL (or "(No URL)" when there is none), followed by
 * ":line:column: " taken from the token, or by a single space when no
 * token is given.
 */
private String errorMsgPrefix(Token t) {
  final String location;
  if(baseURL == null) {
    location = "(No URL)";
  } else {
    location = baseURL.toExternalForm();
  }
  if(t == null) {
    return location + " ";
  }
  return location + ":" + t.beginLine + ":" + t.beginColumn + ": ";
}
private transient List myStatusListeners = new LinkedList();
/** Position of the current rule */
private int ruleNumber;
/** A list of all the bindings we made this time, for checking
* the RHS during parsing.
*/
private Set bindingNameSet = null;
/** A table of macro definitions. */
protected Map macrosMap;
/**
* A table of template definitions. Keys are template names,
* values are Pairs of token kind and value, as returned by
* AttrVal.
*/
protected Map templatesMap;
protected URL baseURL;
protected String encoding;
protected Class extends SinglePhaseTransducer> sptClass =
SinglePhaseTransducer.class;
protected SinglePhaseTransducer curSPT;
} // class ParseCpsl
PARSER_END(ParseCpsl)
///////////////////
// lexical analysis
///////////////////
////////////////
// utility stuff
// Private helper expressions. In JavaCC a '#' before the name makes the
// regular expression private: it can be referenced from other token
// definitions but is never itself matched as a token.
// A single whitespace character: newline, return, tab, formfeed or space.
TOKEN: {
<#space: ("\n" | "\r" | "\t" | "\f" | " ")>
}
// One or more whitespace characters.
TOKEN: { <#spaces: ("\n" | "\r" | "\t" | "\f" | " ")+> }
// One line terminator; both two-character orders ("\n\r" and "\r\n") are listed.
TOKEN: { <#newline: ("\n" | "\r" | "\n\r" | "\r\n")> }
// One or more decimal digits.
TOKEN: { <#digits: (["0"-"9"])+> }
// A single ASCII letter.
TOKEN: { <#letter: ["A"-"Z", "a"-"z"]> }
// A single ASCII letter or underscore.
TOKEN: { <#letterOrUnderscore: ["A"-"Z", "a"-"z", "_"]> }
// One or more ASCII letters.
TOKEN: { <#letters: (["A"-"Z", "a"-"z"])+> }
// One or more ASCII letters or digits.
TOKEN: { <#lettersAndDigits: (["A"-"Z", "a"-"z", "0"-"9"])+> }
// A single ASCII letter, digit, dash or (despite the name) underscore.
TOKEN: {
<#letterOrDigitOrDash: ["A"-"Z", "a"-"z", "0"-"9", "-", "_"]>
}
// One or more ASCII letters, digits, dashes or (despite the name) underscores.
TOKEN: {
<#lettersAndDigitsAndDashes: (["A"-"Z", "a"-"z", "0"-"9", "-", "_"])+>
}
////////////////
// parsed tokens
// NOTE(review): the productions in this section are incomplete as they stand.
// Their "<name: expression>" bodies are missing - the braces are empty or
// contain only the tail of an expression - and no "<STATE>" prefixes are
// present, although the comment below says phases has its own lexical state
// and states IN_PHASES and PHASES_WITHIN_COMMENT are switched to. JavaCC
// requires at least one regular expression between the braces. Presumably
// the angle-bracketed text was lost when this file was extracted; restore it
// from the upstream ParseCpsl.jj before use.
TOKEN [IGNORE_CASE]: { }
// phases has its own lexical state so we can deal with relative paths
// pointing to grammar files
TOKEN [IGNORE_CASE]: { :IN_PHASES }
TOKEN: {
}
SPECIAL_TOKEN: { // ignore whitespace
)+>
}
SPECIAL_TOKEN: { // single-line C++/Java style comments
)? >
}
SPECIAL_TOKEN: { // single-line comments CPSL style
)? >
}
// start of a comment: switches to PHASES_WITHIN_COMMENT
SPECIAL_TOKEN: {
: PHASES_WITHIN_COMMENT
}
MORE: { }
// end of the comment: switches back to IN_PHASES
SPECIAL_TOKEN: {
: IN_PHASES
}
// case-insensitive tokens (presumably the grammar keywords - confirm)
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN [IGNORE_CASE]: { }
TOKEN: { }
TOKEN: { }
// comparison / regular-expression operators; the start of this definition is missing
TOKEN: { " | "<" | ">=" | "<=" | "=~" | "!~" | "==~" | "!=~"> }
TOKEN: { }
TOKEN: { }
// String literals are lexed in three steps: an opening quote switches to
// IN_STRING, the MORE rules below accumulate (and un-escape) the contents in
// "image", and a final TOKEN production returns to DEFAULT.
// NOTE(review): no "<IN_STRING>" prefix appears on the two productions that
// follow the first one, and the final TOKEN has no "<name: expression>" in
// front of its action; presumably both were lost when this file was
// extracted - confirm against the upstream ParseCpsl.jj.
//starts a string
MORE:
{
"\"" : IN_STRING
}
//reads the contents of the string
MORE :
{
// each two-character escape is removed from the end of the accumulated
// image and replaced by the single character it stands for
"\\n" { image.setLength(image.length() - 2); image.append("\n"); }
|
"\\r" { image.setLength(image.length() - 2); image.append("\r"); }
|
"\\t" { image.setLength(image.length() - 2); image.append("\t"); }
|
"\\b" { image.setLength(image.length() - 2); image.append("\b"); }
|
"\\f" { image.setLength(image.length() - 2); image.append("\f"); }
|
"\\\"" { image.setLength(image.length() - 2); image.append("\""); }
|
"\\\'" { image.setLength(image.length() - 2); image.append("\'"); }
|
"\\\\" { image.setLength(image.length() - 2); image.append("\\"); }
|
// backslash, 'u' and exactly four hex digits
<"\\u" (["0"-"9","A"-"F","a"-"f"]) (["0"-"9","A"-"F","a"-"f"])
(["0"-"9","A"-"F","a"-"f"]) (["0"-"9","A"-"F","a"-"f"])>
{
// the last four characters are the hex digits; drop all six characters of
// the escape and append the character with that code
String digits = image.substring(image.length() - 4, image.length());
image.setLength(image.length() - 6);
image.append((char)Integer.parseInt(digits, 16));
}
|
< ~["\"", "\\"] > // Disallow backslashes that weren't caught by previous rules
// Note that here you don't need any action.
}
//finishes the string
TOKEN :
{
{
// the token's image becomes the accumulated, un-escaped text
// image.setLength(image.length() - 1);
matchedToken.image = image.toString();
} : DEFAULT
}
// NOTE(review): as in the earlier token section, most productions here have
// lost their "<name: expression>" bodies (empty braces, or only the tail of
// an expression remains) and any "<STATE>" prefixes. Only the private
// exponent expression is complete. Presumably the angle-bracketed text was
// lost when this file was extracted; restore it from the upstream
// ParseCpsl.jj before use.
TOKEN: { }
TOKEN: { ()* > }
// what remains are alternatives with optional exponent and f/F/d/D suffix
// (presumably a floating point literal - confirm)
TOKEN: {
)? (["f","F","d","D"])?
| "." (["0"-"9"])+ ()? (["f","F","d","D"])?
| (["0"-"9"])+ (["f","F","d","D"])?
| (["0"-"9"])+ ()? ["f","F","d","D"]
)
>
}
// private expression: 'e' or 'E', an optional sign, then one or more digits
TOKEN: { <#exponent: ["e","E"] (["+","-"])? (["0"-"9"])+ > }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
TOKEN: { }
// TOKEN: { | > }
// TOKEN: { }
/* SPECIAL_TOKEN: { // catch all for Java block processing
}*/
////////////////////
// non-parsed tokens
// we make comments and spaces special tokens to support an editor
SPECIAL_TOKEN: { // ignore whitespace
)+>
}
SPECIAL_TOKEN: { // single-line C++/Java style comments
)? >
}
SPECIAL_TOKEN: { // single-line comments CPSL style
)? >
}
// start of a comment: switches to WITHIN_COMMENT
SPECIAL_TOKEN: { : WITHIN_COMMENT }
MORE: { }
// end of the comment: switches back to DEFAULT
SPECIAL_TOKEN: { : DEFAULT }
TOKEN: { // catch all for Java block processing
}
//////////////
// the grammar
//////////////
// Production for a whole grammar file. It builds a MultiPhaseTransducer from
// an optional name followed by either (a) phases defined in this file -
// a Java import block, any number of controller event blocks, then one or
// more single phase transducers - or (b) a list of phase names, each of
// which is parsed from "<name>.jape" resolved against baseURL.
// NOTE(review): the token references that should follow "mptNameTok=" and
// "phaseNameTok=" (and any keyword tokens in front of them) are missing from
// this copy; presumably lost when the file was extracted - restore from the
// upstream ParseCpsl.jj.
MultiPhaseTransducer _MultiPhaseTransducer() :
{
// macrosMap = new HashMap();
SinglePhaseTransducer s = null;
MultiPhaseTransducer m = new MultiPhaseTransducer();
m.setBaseURL(baseURL);
Token mptNameTok = null;
Token phaseNameTok = null;
String javaimportblock = null;
String controllerstartedblock = null;
String controllerfinishedblock = null;
String controllerabortedblock = null;
// each kind of controller event block may appear at most once
boolean haveControllerStartedBlock = false;
boolean haveControllerFinishedBlock = false;
boolean haveControllerAbortedBlock = false;
}
{
// transducer name
(
mptNameTok=
{ m.setName(mptNameTok.image); }
)?
// spts
(
// sptrannies in this file
(
// parsed once and handed to every phase defined in this file
(javaimportblock=JavaImportBlock())
// controller event blocks, in any order; a repeated kind is an error
(
( controllerstartedblock=ControllerStartedBlock()
{ if(haveControllerStartedBlock)
throw new ParseException("Only one ControllerStarted block allowed");
else
haveControllerStartedBlock = true;
}
) |
(controllerfinishedblock=ControllerFinishedBlock()
{ if(haveControllerFinishedBlock)
throw new ParseException("Only one ControllerFinished block allowed");
else
haveControllerFinishedBlock = true;
}
) |
(controllerabortedblock=ControllerAbortedBlock()
{ if(haveControllerAbortedBlock)
throw new ParseException("Only one ControllerAborted block allowed");
else
haveControllerAbortedBlock = true;
}
)
)*
(
try {
s=SinglePhaseTransducer(javaimportblock) {
m.addPhase(s.getName(), s);
s.setBaseURL(baseURL);
s.setControllerEventBlocks(controllerstartedblock,
controllerfinishedblock,controllerabortedblock,javaimportblock);
// only the first SPT in a MPT file should define/execute the blocks
controllerstartedblock = null;
controllerfinishedblock = null;
controllerabortedblock = null;
}
} catch (Throwable e) {
// try to wrap the exception with info about what file/resource
// it occurred in.
// NOTE(review): this catches every Throwable and keeps only
// e.getMessage(); the original exception is not attached as a cause,
// so its type and stack trace are lost.
throw(
new ParseException("Cannot parse a phase in " +
baseURL + ": " + e.getMessage()
));
}
)+
)
|
// sptrannies in external files
(
(
phaseNameTok=
{
ParseCpsl parser = null;
// check file exists
// the phase name plus ".jape", resolved relative to baseURL
String sptPath = phaseNameTok.image + ".jape";
URL sptURL = null;
try{
sptURL = new URL(baseURL, sptPath);
}catch(MalformedURLException mue){
throw(new ParseException(errorMsgPrefix(phaseNameTok)+
"Read error " + mue.toString()));
}
// sptURL can never be null at this point because the only way that could
// happen would be if an exception occurred above, but that would trigger
// the ParserException above
// (so the check below is unreachable; note it would also fail on a
// null baseURL because of baseURL.toString())
if(sptURL == null){
throw(new ParseException(errorMsgPrefix(phaseNameTok)+
"Resource not found: base = " + baseURL.toString() +
" path = " + sptPath
));
}
// construct a parser and parse it
fireStatusChangedEvent("Reading " + phaseNameTok.image + "...");
try {
parser = spawn(sptURL);
} catch (IOException e) {
// NOTE(review): the IOException itself (message and cause) is dropped here
throw(
new ParseException(errorMsgPrefix(phaseNameTok)+
"Cannot open URL " + sptURL.toExternalForm()
)
);
}
// adding the resultant spt to m
if(parser != null) {
// the external file is parsed as a multi phase transducer and each of
// its phases is copied into m under the phase's own name
List phases = parser.MultiPhaseTransducer().getPhases();
//s = parser.SinglePhaseTransducer();
//if(s != null)
// m.addPhase(s.getName(), s);
if(phases != null) {
for(int i=0; i < phases.size(); i++) {
m.addPhase(
phases.get(i).getName(),
phases.get(i)
);
}
}
}
} // an SPT in an external file
)+ // external file phase identifiers
) // external file phases declaration
) // SPTs
{
//move this out of here so the input file gets closed properly
// m.finish(); // swap the various JGL types for Java arrays
return m;
}
} // _MultiPhaseTransducer
SinglePhaseTransducer SinglePhaseTransducer(String javaimportblock) :
{
ruleNumber = 0;
Token phaseNameTok = null;
String phaseName = null;
Token inputTok = null;
SinglePhaseTransducer t = null;
Rule newRule = null;
bindingNameSet = new HashSet();
Token optionNameTok = null;
Token optionValueTok = null;
}
{
phaseNameTok=
{ phaseName = toJavaIdentifier(phaseNameTok.image);
t = createSinglePhaseTransducer(phaseName); curSPT = t; }
(
(
( ( inputTok = | inputTok = ) {t.addInput(stringValueOf(inputTok));})*
)
|
(