All Downloads are FREE. The search and download functionality uses the official Maven repository.

connectors.okapi-connector-moses.1.46.0.source-code.word_break_rules.txt Maven / Gradle / Ivy

# ============= Custom Rules ================
# Tokenizer-specific patterns defined as variables. Each one is emitted as a
# rule, with its own status value, in the "Custom Forwards" part of the
# !!forward section.
#
# Abbreviation: an uppercase letter followed by one or more ".X" groups
# (X = one uppercase letter or digit), optionally ending in periods, e.g. U.S.A.
$Abbreviation = [A-Z](\.[A-Z0-9])+(\.)*;
# Hyphenated Word: a letter/digit sequence followed by one or more groups of
# (a single joiner from [-+&_], then another letter/digit sequence).
# Note: "/" is NOT a joiner here; it is not in the character class below.
$HyphenatedWord = [A-Za-z0-9]+([\-\+\&_][A-Za-z0-9]+)+;
# Email address: local part of letters, digits, "_", "-", "." followed by @,
# then a domain label (letter first, at least two characters), a period and a
# lowercase alphabetic suffix.
$EmailAddress = [A-Za-z0-9_\-\.]+\@[A-Za-z][A-Za-z0-9_]+\.[a-z]+;
# Internet Addresses: scheme://host with an OPTIONAL single path segment,
# e.g. http://www.foo.com or http://www.foo.com/bar
# The trailing "?" makes the path optional; without it a bare
# http://www.foo.com could never match this rule. When present, the path
# segment must be at least two characters long.
$InternetAddress = [a-z]+\:\/\/[a-z0-9]+(\.[a-z0-9]+)+(\/[a-z0-9][a-z0-9\.]+)?;
# XML markup: a run that begins with < and ends at the first following >
$XmlMarkup = \<[^\>]+\>;
# Emoticon: one start character from : ; B 8 { [ followed by one or more
# characters from - = / { } ) (
$Emoticon = [B8\:\;\{\[][-=\/\{\}\)\(]+;
# Internet IP Address: four runs of decimal digits separated by periods.
# Note: the length of each run is not limited by this rule (it is [0-9]+),
# so the pattern does not by itself enforce the 3-digit maximum of an octet.
$InternetIpAddress = [0-9]+\.[0-9]+\.[0-9]+\.[0-9]+;
# Internet Site Address: dot-separated lowercase labels such as www.ibm.com
# Each label after a period may be one or more characters long ("+" inside
# the group); with a single-character label the rule stopped at "www.i".
$InternetSiteAddress = [a-z][a-z0-9]*(\.[a-z0-9]+)+;

# Option directive: turn on rule chaining for this rule set.
# NOTE(review): in ICU RBBI, chaining lets a match continue from the last
# character of one rule into the start of another - confirm against the ICU
# version this file is compiled with.
!!chain;
# --- Character classes built from Unicode properties ---
# Most classes select one value of the Word_Break property; Hiragana and Han
# are selected by script, and Control by Grapheme_Cluster_Break.
$CR           = [\p{Word_Break = CR}];
$LF           = [\p{Word_Break = LF}];
$Newline      = [\p{Word_Break = Newline}];
$Extend       = [\p{Word_Break = Extend}];
$Format       = [\p{Word_Break = Format}];
$Hiragana     = [:Hiragana:];
$Katakana     = [\p{Word_Break = Katakana}];
$Han          = [:Han:];
$ALetter      = [\p{Word_Break = ALetter}];
$MidNumLet    = [\p{Word_Break = MidNumLet}];
$MidLetter    = [\p{Word_Break = MidLetter}];
$MidNum       = [\p{Word_Break = MidNum}];
$Numeric      = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Control        = [\p{Grapheme_Cluster_Break = Control}];
# Explicit code point range U+AC00..U+D7A3.
$HangulSyllable = [\uac00-\ud7a3];
# Characters whose Line_Break property is Complex_Context.
$ComplexContext = [:LineBreak = Complex_Context:];
# --- Composite sets ---
# $KanaKanji: union of Han, Hiragana and Katakana.
$KanaKanji      = [$Han $Hiragana $Katakana];
# $dictionaryCJK: $KanaKanji plus the Hangul syllable range.
$dictionaryCJK  = [$KanaKanji $HangulSyllable];
# $dictionary: complex-context characters plus $dictionaryCJK.
# NOTE(review): the name suggests these are the characters handed to
# dictionary-based segmentation - confirm; this file only uses the set in the
# safe_reverse / safe_forward sections.
$dictionary     = [$ComplexContext $dictionaryCJK];
# $ALetterPlus: ALetter with the CJK dictionary characters removed, plus the
# complex-context characters that are neither Extend nor Control.
$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
# --- "Ex" variants ---
# Each *Ex variable is the base class followed by any number of Extend or
# Format characters, so trailing Extend/Format characters stay attached to the
# preceding base character in the forward rules.
$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
$MidNumEx       = $MidNum       ($Extend |  $Format)*;
$NumericEx      = $Numeric      ($Extend |  $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
# Characters with the Ideographic binary property.
$Ideographic    = [\p{Ideographic}];
$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;

# Forward rules: each rule describes a sequence that is kept together (no
# boundary inside the match). A number in braces is the status value attached
# to the rule.
# NOTE(review): the values 100 / 200 / 400 look like ICU's word status ranges
# (number / letter / ideographic) - confirm against the consumer of this file.
!!forward;
# CR followed by LF is a single unit.
$CR $LF;
# An optional non-line-break character followed by one or more Extend/Format
# characters stays together.
[^$CR $LF $Newline]? ($Extend |  $Format)+;
# Single-character starters, used to assign a status to each token class.
# Kana and ideographs all carry 400 here.
$NumericEx {100};
$ALetterEx {200};
$HangulSyllable {200};
$KatakanaEx {400};
$HiraganaEx {400};
$IdeographicEx {400};
# Letter followed by letter.
$ALetterEx $ALetterEx {200};
# Letter, one mid-word punctuation character, letter.
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
# Digit/letter combinations.
$NumericEx $NumericEx {100};
$ALetterEx $NumericEx {200};
$NumericEx $ALetterEx {200};
# Digit, one mid-number punctuation character, digit.
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
# Katakana followed by Katakana.
$KatakanaEx  $KatakanaEx {400};
# ExtendNumLet characters join to letters, digits, Katakana and to each other,
# on either side.
$ALetterEx      $ExtendNumLetEx {200};
$NumericEx      $ExtendNumLetEx {100};
$KatakanaEx     $ExtendNumLetEx {400};
$ExtendNumLetEx $ExtendNumLetEx {200};
$ExtendNumLetEx $ALetterEx  {200};
$ExtendNumLetEx $NumericEx  {100};
$ExtendNumLetEx $KatakanaEx {400};
# Two regional-indicator characters stay together (no status value given).
$Regional_IndicatorEx $Regional_IndicatorEx;
# Adjacent Hangul syllables, and adjacent Han/Hiragana/Katakana characters,
# are kept together by these rules.
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400};

# =========== Custom Forwards ====================
# Emit each custom pattern variable as a forward rule with a distinct status
# value in the range 500-507, one value per pattern type.
# NOTE(review): these patterns have no counterparts in the !!reverse section
# of this file - confirm that the ICU version in use does not require them.
$Abbreviation {500};
$HyphenatedWord {501};
$EmailAddress {502};
$InternetAddress {503};
$XmlMarkup {504};
$Emoticon {505};
$InternetIpAddress {506};
$InternetSiteAddress {507};

# Reverse rules: the same groupings as the standard forward rules, written
# for matching while moving backward through the text. None of them carries a
# status value.
!!reverse;
# Back*Ex variables mirror the forward *Ex variables: the Extend/Format run
# comes first and the base character last.
$BackALetterEx            = ($Format | $Extend)* $ALetterPlus;
$BackMidNumLetEx          = ($Format | $Extend)* $MidNumLet;
$BackNumericEx            = ($Format | $Extend)* $Numeric;
$BackMidNumEx             = ($Format | $Extend)* $MidNum;
$BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx           = ($Format | $Extend)* $Katakana;
# $BackHiraganaEx is defined but not referenced by any rule in this file.
$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
$BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
# LF then CR: the CR LF pair seen in reverse order.
$LF $CR;
# Extend/Format run followed by an optional non-line-break character.
($Format | $Extend)*  [^$CR $LF $Newline]?;
# Letter / digit / mid-punctuation groupings, mirrored from the forward rules.
$BackALetterEx $BackALetterEx;
$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
$BackNumericEx $BackNumericEx;
$BackNumericEx $BackALetterEx;
$BackALetterEx $BackNumericEx;
$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
$BackKatakanaEx $BackKatakanaEx;
# ExtendNumLet joins to letters, digits, Katakana and itself on either side.
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
# Adjacent Hangul syllables and adjacent Han/Hiragana/Katakana characters.
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji;

# Safe-reverse rules.
# NOTE(review): in ICU RBBI these are used to back up from an arbitrary text
# position to one from which forward matching can safely restart - confirm
# for the ICU version in use.
!!safe_reverse;
# A run of Extend/Format characters plus at most one more character.
($Extend | $Format)+ .?;
# Mid-word / mid-number punctuation next to the letter or digit it may join.
($MidLetter | $MidNumLet) $BackALetterEx;
($MidNum | $MidNumLet) $BackNumericEx;
# Two adjacent characters from the $dictionary set.
$dictionary $dictionary;

# Safe-forward rules: the forward-direction counterparts of the safe-reverse
# rules, written with the forward *Ex variables.
# NOTE(review): presumably used to move forward from an arbitrary position to
# a safe restart point - confirm for the ICU version in use.
!!safe_forward;
# A run of Extend/Format characters plus at most one more character.
($Extend | $Format)+ .?;
# Mid-word / mid-number punctuation followed by the letter or digit it may join.
($MidLetterEx | $MidNumLetEx) $ALetterEx;
($MidNumEx | $MidNumLetEx) $NumericEx;
# Two adjacent characters from the $dictionary set.
$dictionary $dictionary;





© 2015 - 2025 Weber Informatics LLC | Privacy Policy