jtok-user.jtok.en.en_punct.xml Maven / Gradle / Ivy
The newest version!
<?xml version="1.0" encoding="UTF-8"?>
<!-- This contains the punctuation description for English. -->
<PUNCTUATION_DESCRIPTION>

  <!-- This defines the specific punctuation elements. The regexp -->
  <!-- attribute is the regular expression used to match -->
  <!-- the punctuation. -->
  <!-- ATTENTION! Because the regex package we use always -->
  <!-- stops at the first disjunct that could be matched, the -->
  <!-- order of the definition with a class is critical! So make -->
  <!-- sure that punctuations with the potentially longer match -->
  <!-- stand BEFORE others, e.g. use the order `abc|ab|a' -->
  <!-- instead of `a|ab|abc' -->
  <!-- XML NOTE: inside attribute values the characters <, & and the -->
  <!-- delimiting double quote must be written as the entities -->
  <!-- &lt; &amp; and &quot; respectively. The XML parser turns them -->
  <!-- back into the literal characters, so the regular expression -->
  <!-- seen by the application is unchanged. -->
  <DEFINITIONS>

    <!-- possible terminal punctuation -->
    <PERIOD regexp="\." class="TERM_PUNCT_P"/>

    <!-- terminal punctuation -->
    <MULTI_PERIOD regexp="\.{2,}" class="TERM_PUNCT"/>
    <EXCLAM regexp="!+" class="TERM_PUNCT"/>
    <QUEST regexp="\?+" class="TERM_PUNCT"/>

    <!-- opening punctuation -->
    <OINF regexp="&lt;&lt;+" class="OPEN_PUNCT"/>
    <RSQUOTE_L regexp="``?" class="OPEN_PUNCT"/>
    <OGUILLEMET regexp="«" class="OPEN_PUNCT"/>
    <ODQUO1 regexp="“" class="OPEN_PUNCT"/> <!-- Unicode U+201C -->
    <ODQUO2 regexp="„" class="OPEN_PUNCT"/> <!-- Unicode U+201E -->
    <ODQUO3 regexp="‟" class="OPEN_PUNCT"/> <!-- Unicode U+201F -->
    <OSQUO1 regexp="‘" class="OPEN_PUNCT"/> <!-- Unicode U+2018 -->
    <OSQUO2 regexp="‚" class="OPEN_PUNCT"/> <!-- Unicode U+201A -->
    <OSQUO3 regexp="‛" class="OPEN_PUNCT"/> <!-- Unicode U+201B -->
    <OSAQUO regexp="‹" class="OPEN_PUNCT"/> <!-- Unicode U+2039 -->

    <!-- opening brackets -->
    <OPAR regexp="\(" class="OPEN_BRACKET"/>
    <OPAR2 regexp="\{" class="OPEN_BRACKET"/>
    <OCROCHE regexp="\[" class="OPEN_BRACKET"/>

    <!-- closing punctuation -->
    <CSUP regexp="&gt;&gt;+" class="CLOSE_PUNCT"/>
    <CGUILLEMET regexp="»" class="CLOSE_PUNCT"/>
    <CDQUO regexp="”" class="CLOSE_PUNCT"/> <!-- Unicode U+201D -->
    <CSQUO regexp="’" class="CLOSE_PUNCT"/> <!-- Unicode U+2019 -->
    <CSAQUO regexp="›" class="CLOSE_PUNCT"/> <!-- Unicode U+203A -->

    <!-- closing brackets -->
    <CPAR regexp="\)" class="CLOSE_BRACKET"/>
    <CPAR2 regexp="\}" class="CLOSE_BRACKET"/>
    <CCROCHE regexp="\]" class="CLOSE_BRACKET"/>

    <!-- ambiguous opening/closing punctuation -->
    <OCQUOTE regexp="\&quot;" class="OPENCLOSE_PUNCT"/>
    <RSQUOTE_R regexp="''?" class="OPENCLOSE_PUNCT"/>

    <!-- this is a matcher for the special case when two internal -->
    <!-- punctuations don't cause a split; it's used for -->
    <!-- enclitics after abbreviations like in L.A.'s -->
    <!-- and abbreviations in compounds. -->
    <SPECIAL_INT regexp="\.'|\.-" class="PUNCTUATION"/>

    <!-- other punctuation -->
    <LT regexp="&lt;" class="PUNCTUATION"/>
    <GT regexp="&gt;" class="PUNCTUATION"/>
    <EQ regexp="=" class="PUNCTUATION"/>
    <OCHYPHEN regexp="--+" class="PUNCTUATION"/>
    <FIGDASH regexp="‒" class="PUNCTUATION"/> <!-- Unicode U+2012 -->
    <ENDASH regexp="–" class="PUNCTUATION"/> <!-- Unicode U+2013 -->
    <EMDASH regexp="—" class="PUNCTUATION"/> <!-- Unicode U+2014 -->
    <HBAR regexp="―" class="PUNCTUATION"/> <!-- Unicode U+2015 -->
    <BULLET regexp="•|■|□" class="PUNCTUATION"/>
    <PERCENT regexp="%" class="PUNCTUATION"/>
    <TM regexp="©|®|™" class="PUNCTUATION"/>
    <SECTION regexp="§" class="PUNCTUATION"/>
    <!-- only match a hyphen when it's not followed by a digit -->
    <!-- because it's part of a signed digit then, like in -1 -->
    <HYPHEN regexp="-(?!\d)" class="PUNCTUATION"/>
    <SCOLON regexp=";" class="PUNCTUATION"/>
    <COLON regexp=":" class="PUNCTUATION"/>
    <!-- only match a comma when it's not followed by a digit -->
    <!-- because it's then digit internal, like in 3,5 -->
    <COMMA regexp=",(?!\d)" class="PUNCTUATION"/>
    <SLASH regexp="\/" class="PUNCTUATION"/>
    <BSLASH regexp="\\" class="PUNCTUATION"/>
    <PIPE regexp="\|" class="PUNCTUATION"/>
    <STAR regexp="\*" class="PUNCTUATION"/>
    <TILDE regexp="\~" class="PUNCTUATION"/>
    <AMP regexp="\&amp;" class="PUNCTUATION"/>
    <NBL regexp="@" class="PUNCTUATION"/>

  </DEFINITIONS>

  <!-- These are the mandatory rules defined as -->
  <!-- regular expressions with the definitions above. -->
  <!-- Rule names may not be altered! -->
  <!-- The default behavior is to treat the punctuation -->
  <!-- defined above as breaking punctuation, that is -->
  <!-- these punctuations are always used as separators -->
  <!-- and never as part of clitics or abbreviations. -->
  <!-- XML NOTE: the rule bodies below are mixed content (text plus -->
  <!-- child elements), so whitespace inside them is delivered to the -->
  <!-- application. Each body is therefore kept on one line with the -->
  <!-- same single-space padding as before. TODO confirm whether the -->
  <!-- loader trims rule text before reformatting these elements. -->
  <RULES>

    <!-- The all-rule is used for basic splitting of strings -->
    <!-- into punctuation and non-punctuation parts. -->
    <!-- It's basically a disjunction of the punctuations -->
    <!-- defined above. -->
    <!-- ATTENTION! Because the regex package we use always -->
    <!-- stops at the first disjunct that could be matched, the -->
    <!-- order of the disjuncts in the rule is critical! So make -->
    <!-- sure that punctuations with the potentially longer match -->
    <!-- stand BEFORE others, e.g. use the order `abc|ab|a' -->
    <!-- instead of `a|ab|abc' -->
    <ALL_PUNCT_RULE> (<SPECIAL_INT/>)(?=\w)|<MULTI_PERIOD/>|<PERIOD/>|<OINF/>|<LT/>|<CSUP/>|<GT/>|<EQ/>|<OCHYPHEN/>|<HYPHEN/>|<RSQUOTE_L/>|<RSQUOTE_R/>|<OGUILLEMET/>|<CGUILLEMET/>|<CDQUO/>|<CSAQUO/>|<CSQUO/>|<ODQUO1/>|<ODQUO2/>|<ODQUO3/>|<OSAQUO/>|<OSQUO1/>|<OSQUO2/>|<OSQUO3/>|<OPAR/>|<CPAR/>|<OPAR2/>|<CPAR2/>|<OCROCHE/>|<CCROCHE/>|<OCQUOTE/>|<EXCLAM/>|<QUEST/>|<SCOLON/>|<COLON/>|<COMMA/>|<SLASH/>|<BSLASH/>|<PIPE/>|<STAR/>|<TILDE/>|<AMP/>|<NBL/>|<FIGDASH/>|<ENDASH/>|<EMDASH/>|<HBAR/>|<BULLET/>|<PERCENT/>|<TM/>|<SECTION/> </ALL_PUNCT_RULE>

    <!-- punctuation which can mark clitics; this rule is used -->
    <!-- to check if closer clitics examination is needed; -->
    <!-- using '.' here has the effect that ALL tokens are -->
    <!-- checked for special prefixes and suffixes. -->
    <CLITIC_PUNCT_RULE> . </CLITIC_PUNCT_RULE>

    <!-- punctuation which can be found in compounds -->
    <INTERNAL_PUNCT_RULE> <SPECIAL_INT/>|<RSQUOTE_R/>|<HYPHEN/>|<PERIOD/>|<AMP/>|<SLASH/>|<COLON/>|<NBL/> </INTERNAL_PUNCT_RULE>

    <!-- non-breaking punctuation on the left side of a token; -->
    <!-- use the beginning-of-input matcher \A to easily -->
    <!-- identify this punctuation -->
    <NON_BREAKING_LEFT_PUNCT_RULE class="PUNCTUATION"> \A(<NBL/>) </NON_BREAKING_LEFT_PUNCT_RULE>

    <!-- non-breaking punctuation on the right side of a token; -->
    <!-- use the end-of-input matcher \Z to easily -->
    <!-- identify this punctuation -->
    <NON_BREAKING_RIGHT_PUNCT_RULE class="TERM_PUNCT_P"> <PERIOD/>\Z </NON_BREAKING_RIGHT_PUNCT_RULE>

    <!-- punctuation which can be found only within sentences -->
    <INTERNAL_TU_PUNCT_RULE> <COMMA/>|<SCOLON/>|<COLON/> </INTERNAL_TU_PUNCT_RULE>

  </RULES>

</PUNCTUATION_DESCRIPTION>
© 2015 - 2025 Weber Informatics LLC | Privacy Policy