gate.plugins.Lang_French.tokeniser.FrenchTokeniser.rules Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of gate-extraction Show documentation

Gate based component, that can process the Text units to extract informations using Gate's tools (such as grammars, gazetteers, tokenizer or POS Taggers). This project contains two versions, a simple component and webservice one.

There is a newer version: 2.0

Show newest version

#DiTokeniser.rules#
#diana 28/6/00#
#update 9/7/00#

#Tokeniser rule file
#Each rule should be on one line
#Lines that end with "\" are appended with the next one. This facility \
 is used for longer rules that cannot be written on a single line
#
#Lines starting with "#" are treated as comment
//Lines starting with "//" are treated as comment
# Empty lines are ignored.

#A rule has a left hand side (LHS) and a right hand side (RHS);
#the LHS is a regular expression tha has to be matched on the input
#the RHS describes the annotations to be added to the AnnotationSet.
#LHS is separated from the RHS by '>'
#LHS knows about the following operators:
#	+ (1..n)
#	* (0..n)
#	| (boolean OR)
#
#RHS uses as separator ';' and has the following format
#{LHS} > {Annotation type};{attribute1}={value1};...;{attribute n}={value n}


#The primitive constructs are:
#UNASSIGNED
#UPPERCASE_LETTER
#LOWERCASE_LETTER
#TITLECASE_LETTER
#MODIFIER_LETTER
#OTHER_LETTER
#NON_SPACING_MARK
#ENCLOSING_MARK
#COMBINING_SPACING_MARK
#DECIMAL_DIGIT_NUMBER
#LETTER_NUMBER
#OTHER_NUMBER
#SPACE_SEPARATOR
#LINE_SEPARATOR
#PARAGRAPH_SEPARATOR
#CONTROL
#FORMAT
#PRIVATE_USE
#SURROGATE
#DASH_PUNCTUATION
#START_PUNCTUATION
#END_PUNCTUATION
#CONNECTOR_PUNCTUATION
#OTHER_PUNCTUATION
#MATH_SYMBOL
#CURRENCY_SYMBOL
#MODIFIER_SYMBOL
#OTHER_SYMBOL
#...representing the corresponding enumerated Unicode category types

#------- The rules start here -----------------

#words#
// a word can be any combination of letters, including hyphens,
// but excluding symbols and punctuation, e.g. apostrophes


"UPPERCASE_LETTER" (LOWERCASE_LETTER (LOWERCASE_LETTER|DASH_PUNCTUATION)*)* > Token;orth=upperInitial;kind=word;
"UPPERCASE_LETTER" (DASH_PUNCTUATION)* (UPPERCASE_LETTER|DASH_PUNCTUATION)+ > Token;orth=allCaps;kind=word;
"LOWERCASE_LETTER" (LOWERCASE_LETTER|DASH_PUNCTUATION)* > Token;orth=lowercase;kind=word;

UPPERCASE_LETTER (LOWERCASE_LETTER)+ DASH_PUNCTUATION (LOWERCASE_LETTER)+ \
DASH_PUNCTUATION UPPERCASE_LETTER (LOWERCASE_LETTER)+ \
>Token;orth=upperInitial;kind=word;

// MixedCaps is any mixture of caps and small letters that doesn't
// fit in the preceding categories

("LOWERCASE_LETTER" "LOWERCASE_LETTER"+"UPPERCASE_LETTER"+ \
 (UPPERCASE_LETTER|LOWERCASE_LETTER)*)|\
("LOWERCASE_LETTER" "LOWERCASE_LETTER"*"UPPERCASE_LETTER"+\
 (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION)*)|\
("UPPERCASE_LETTER" "UPPERCASE_LETTER" (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION)*\
 ("LOWERCASE_LETTER")+ (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION)*)|\
("UPPERCASE_LETTER" "LOWERCASE_LETTER"+ ("UPPERCASE_LETTER"+ "LOWERCASE_LETTER"+)+)|\
 ((UPPERCASE_LETTER)+ (LOWERCASE_LETTER)+ (UPPERCASE_LETTER)+)\
> Token;orth=mixedCaps;kind=word;

(OTHER_LETTER|COMBINING_SPACING_MARK|NON_SPACING_MARK)+ >Token;kind=word;type=other;

#numbers#
// a number is any combination of digits
"DECIMAL_DIGIT_NUMBER"+ >Token;kind=number;

#whitespace#
(SPACE_SEPARATOR) >SpaceToken;kind=space;
(CONTROL) >SpaceToken;kind=control;

#symbols#
(MODIFIER_SYMBOL|MATH_SYMBOL|OTHER_SYMBOL) > Token;kind=symbol;
CURRENCY_SYMBOL > Token;kind=symbol;symbolkind=currency;

#punctuation#
"DASH_PUNCTUATION" >Token;kind=punctuation;subkind=dashpunct;
(CONNECTOR_PUNCTUATION|OTHER_PUNCTUATION)>Token;kind=punctuation;
"START_PUNCTUATION" >Token;kind=punctuation;position=startpunct;
"END_PUNCTUATION" >Token;kind=punctuation;position=endpunct;