resources.tokeniser.DefaultTokeniser.rules Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of annie Show documentation
Show all versions of annie Show documentation
ANNIE is a general purpose information extraction system that
provides the building blocks of many other GATE applications.
The newest version!
#DefaultTokeniser.rules#
#diana 28/6/00#
#update 9/7/00#
# updated by MAG 25/05/21
# Replaced all uses of X_LETTER with (X_LETTER (NON_SPACING_MARK)*)
# to allow for cases where two unicode characters are used to create
# an accented character rather than the single combined character.
# There were easier ways of doing this for the lowercase, upperInitial,
# and allCaps but I couldn't figure out a similar way of doing it for the mixedCaps
# rules so stuck with a consistent approach throughout.
#Tokeniser rule file
#Each rule should be on one line
#Lines that end with "\" are appended with the next one. This facility \
is used for longer rules that cannot be written on a single line
#
#Lines starting with "#" are treated as comment
//Lines starting with "//" are treated as comment
# Empty lines are ignored.
#A rule has a left hand side (LHS) and a right hand side (RHS);
#the LHS is a regular expression that has to be matched on the input
#the RHS describes the annotations to be added to the AnnotationSet.
#LHS is separated from the RHS by '>'
#LHS knows about the following operators:
# + (1..n)
# * (0..n)
# | (boolean OR)
#
#RHS uses as separator ';' and has the following format
#{LHS} > {Annotation type};{attribute1}={value1};...;{attribute n}={value n}
#The primitive constructs are:
#UNASSIGNED
#UPPERCASE_LETTER
#LOWERCASE_LETTER
#TITLECASE_LETTER
#MODIFIER_LETTER
#OTHER_LETTER
#NON_SPACING_MARK
#ENCLOSING_MARK
#COMBINING_SPACING_MARK
#DECIMAL_DIGIT_NUMBER
#LETTER_NUMBER
#OTHER_NUMBER
#SPACE_SEPARATOR
#LINE_SEPARATOR
#PARAGRAPH_SEPARATOR
#CONTROL
#FORMAT
#PRIVATE_USE
#SURROGATE
#DASH_PUNCTUATION
#START_PUNCTUATION
#END_PUNCTUATION
#INITIAL_QUOTE_PUNCTUATION
#FINAL_QUOTE_PUNCTUATION
#CONNECTOR_PUNCTUATION
#OTHER_PUNCTUATION
#MATH_SYMBOL
#CURRENCY_SYMBOL
#MODIFIER_SYMBOL
#OTHER_SYMBOL
#...representing the corresponding enumerated Unicode category types
# See java.lang.Character for the Java version you are using
#------- The rules start here -----------------
#words#
// a word can be any combination of letters, including hyphens,
// but excluding symbols and punctuation, e.g. apostrophes
// Note that there is an alternative version of the tokeniser that
// treats hyphens as separate tokens
//
// In the three rules of this section every letter may be followed by any
// number of NON_SPACING_MARK characters, so a base letter written with
// separate combining accents is handled like a single letter.
//
// upperInitial: one uppercase letter, then zero or more runs that each start
// with a lowercase letter and continue with lowercase letters,
// DASH_PUNCTUATION or FORMAT characters. Because the trailing part is
// optional, a lone uppercase letter also matches this rule.
("UPPERCASE_LETTER" (NON_SPACING_MARK)*) ((LOWERCASE_LETTER (NON_SPACING_MARK)*) ((LOWERCASE_LETTER (NON_SPACING_MARK)*)|DASH_PUNCTUATION|FORMAT)*)* > Token;orth=upperInitial;kind=word;
// allCaps: one uppercase letter, optional DASH_PUNCTUATION/FORMAT characters,
// then at least one more uppercase letter, DASH_PUNCTUATION or FORMAT
// character (the trailing group is "+", so at least two characters in total).
("UPPERCASE_LETTER" (NON_SPACING_MARK)*) (DASH_PUNCTUATION|FORMAT)* ((UPPERCASE_LETTER (NON_SPACING_MARK)*)|DASH_PUNCTUATION|FORMAT)+ > Token;orth=allCaps;kind=word;
// lowercase: one lowercase letter followed by any mix of lowercase letters,
// DASH_PUNCTUATION and FORMAT characters.
("LOWERCASE_LETTER" (NON_SPACING_MARK)*) ((LOWERCASE_LETTER (NON_SPACING_MARK)*)|DASH_PUNCTUATION|FORMAT)* > Token;orth=lowercase;kind=word;
// MixedCaps is any mixture of caps and small letters that doesn't
// fit in the preceding categories
//
// This is ONE rule spread over several lines with "\" continuations, so no
// comment may be placed between its lines. Every letter below may be followed
// by any number of NON_SPACING_MARK characters. The LHS is an OR of five
// alternatives, in this order:
//   1. two or more lowercase letters, one or more uppercase letters, then any
//      mix of uppercase and lowercase letters (no dashes);
//   2. one or more lowercase letters, one or more uppercase letters, then any
//      mix of uppercase/lowercase letters, DASH_PUNCTUATION and FORMAT;
//   3. an uppercase letter, optional DASH_PUNCTUATION characters, another
//      uppercase letter, any mix of letters/DASH_PUNCTUATION/FORMAT, at least
//      one lowercase letter, then again any mix of
//      letters/DASH_PUNCTUATION/FORMAT;
//   4. an uppercase letter, one or more lowercase letters, then one or more
//      repetitions of (one or more uppercase, one or more lowercase letters);
//   5. one or more uppercase, one or more lowercase, one or more uppercase
//      letters.
// NOTE(review): everything matched by alternative 1 also appears to be matched
// by alternative 2 - confirm before simplifying.
// NOTE(review): alternative 3 allows only DASH_PUNCTUATION (not FORMAT)
// between its first two uppercase letters, unlike the allCaps rule - confirm
// whether this is intentional.
(("LOWERCASE_LETTER" (NON_SPACING_MARK)*) ("LOWERCASE_LETTER" (NON_SPACING_MARK)*)+("UPPERCASE_LETTER" (NON_SPACING_MARK)*)+ \
((UPPERCASE_LETTER (NON_SPACING_MARK)*)|(LOWERCASE_LETTER (NON_SPACING_MARK)*))*)|\
(("LOWERCASE_LETTER" (NON_SPACING_MARK)*) ("LOWERCASE_LETTER" (NON_SPACING_MARK)*)*("UPPERCASE_LETTER" (NON_SPACING_MARK)*)+\
((UPPERCASE_LETTER (NON_SPACING_MARK)*)|(LOWERCASE_LETTER (NON_SPACING_MARK)*)|DASH_PUNCTUATION|FORMAT)*)|\
(("UPPERCASE_LETTER" (NON_SPACING_MARK)*) (DASH_PUNCTUATION)* ("UPPERCASE_LETTER" (NON_SPACING_MARK)*) ((UPPERCASE_LETTER (NON_SPACING_MARK)*)|(LOWERCASE_LETTER (NON_SPACING_MARK)*)|DASH_PUNCTUATION|FORMAT)*\
(("LOWERCASE_LETTER" (NON_SPACING_MARK)*))+ ((UPPERCASE_LETTER (NON_SPACING_MARK)*)|(LOWERCASE_LETTER (NON_SPACING_MARK)*)|DASH_PUNCTUATION|FORMAT)*)|\
(((UPPERCASE_LETTER (NON_SPACING_MARK)*))+ ((LOWERCASE_LETTER (NON_SPACING_MARK)*))+ ((UPPERCASE_LETTER (NON_SPACING_MARK)*))+)\
> Token;orth=mixedCaps;kind=word;
// Words made of caseless letters: one or more OTHER_LETTER,
// COMBINING_SPACING_MARK or NON_SPACING_MARK characters become a single word
// Token with type=other and no orth attribute.
(OTHER_LETTER|COMBINING_SPACING_MARK|NON_SPACING_MARK)+ >Token;kind=word;type=other;
#numbers#
// a number is any combination of digits
// The two categories are handled by separate rules: a run of one or more
// DECIMAL_DIGIT_NUMBER characters, or a run of one or more OTHER_NUMBER
// characters. Neither rule mixes the two categories in a single match.
// NOTE(review): no rule in this part of the file mentions LETTER_NUMBER -
// confirm whether that is intended.
"DECIMAL_DIGIT_NUMBER"+ >Token;kind=number;
"OTHER_NUMBER"+ >Token;kind=number;
#whitespace#
// Each rule matches exactly one character (there is no + or * operator) and
// produces a SpaceToken rather than a Token: kind=space for a
// SPACE_SEPARATOR character, kind=control for a CONTROL character.
// NOTE(review): LINE_SEPARATOR and PARAGRAPH_SEPARATOR have no rule in this
// part of the file - confirm whether that is intended.
(SPACE_SEPARATOR) >SpaceToken;kind=space;
(CONTROL) >SpaceToken;kind=control;
#symbols#
// Each rule matches a single character. MODIFIER_SYMBOL, MATH_SYMBOL and
// OTHER_SYMBOL become Token kind=symbol; CURRENCY_SYMBOL additionally gets
// symbolkind=currency.
(MODIFIER_SYMBOL|MATH_SYMBOL|OTHER_SYMBOL) > Token;kind=symbol;
CURRENCY_SYMBOL > Token;kind=symbol;symbolkind=currency;
#punctuation#
// Each rule matches a single character and produces Token kind=punctuation.
// A DASH_PUNCTUATION or FORMAT character on its own gets subkind=dashpunct
// (the word rules also accept these two categories inside a word).
(DASH_PUNCTUATION|FORMAT) >Token;kind=punctuation;subkind=dashpunct;
// Connector and other punctuation carry no extra attribute.
(CONNECTOR_PUNCTUATION|OTHER_PUNCTUATION)>Token;kind=punctuation;
// Opening brackets and opening quotes are marked position=startpunct.
("START_PUNCTUATION"|"INITIAL_QUOTE_PUNCTUATION") >Token;kind=punctuation;position=startpunct;
// Closing brackets and closing quotes are marked position=endpunct.
("END_PUNCTUATION"|"FINAL_QUOTE_PUNCTUATION") >Token;kind=punctuation;position=endpunct;
© 2015 - 2024 Weber Informatics LLC | Privacy Policy