org.antlr.v4.unicode.UnicodeDataTemplateController Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of antlr4 Show documentation
Show all versions of antlr4 Show documentation
The ANTLR 4 grammar compiler.
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.unicode;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.RangeValueIterator;
import org.antlr.v4.runtime.misc.IntervalSet;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
/**
* StringTemplate controller used to generate parameters to feed
* to {@code unicodedata.st} to code-generate {@code UnicodeData.java},
* used by the tool for Unicode property escapes like {@code \\p\{Lu\}}.
*
* Uses ICU to iterate over Unicode character categories, properties,
* and script codes, as well as aliases for those codes.
*
* This class exists in its own Maven module to avoid adding a
* dependency from the tool onto the (large) ICU runtime.
*/
public abstract class UnicodeDataTemplateController {
private static void addIntervalForCategory(
Map categoryMap,
String categoryName,
int start,
int finish) {
IntervalSet intervalSet = categoryMap.get(categoryName);
if (intervalSet == null) {
intervalSet = new IntervalSet();
categoryMap.put(categoryName, intervalSet);
}
intervalSet.add(start, finish);
}
private static void addPropertyAliases(
Map propertyAliases,
String propertyName,
int property) {
int nameChoice = UProperty.NameChoice.LONG;
while (true) {
String alias;
try {
alias = UCharacter.getPropertyName(property, nameChoice);
} catch (IllegalArgumentException e) {
// No more aliases.
break;
}
assert alias != null;
addPropertyAlias(propertyAliases, alias, propertyName);
nameChoice++;
}
}
private static void addPropertyAlias(
Map propertyAliases,
String alias,
String propertyName) {
propertyAliases.put(alias, propertyName);
}
public static Map getProperties() {
Map propertyCodePointRanges = new LinkedHashMap<>();
addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeIntPropertyCodesToCodePointRanges(propertyCodePointRanges);
addTR35ExtendedPictographicPropertyCodesToCodePointRanges(propertyCodePointRanges);
addEmojiPresentationPropertyCodesToCodePointRanges(propertyCodePointRanges);
Map propertyAliases = new LinkedHashMap<>();
addUnicodeCategoryCodesToNames(propertyAliases);
addUnicodeBinaryPropertyCodesToNames(propertyAliases);
addUnicodeScriptCodesToNames(propertyAliases);
addUnicodeBlocksToNames(propertyAliases);
addUnicodeIntPropertyCodesToNames(propertyAliases);
propertyAliases.put("EP", "Extended_Pictographic");
Map properties = new LinkedHashMap<>();
properties.put("propertyCodePointRanges", propertyCodePointRanges);
properties.put("propertyAliases", propertyAliases);
return properties;
}
private static String getShortPropertyName(int property) {
String propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.SHORT);
// For some reason, a few properties only have long names.
if (propertyName == null) {
propertyName = UCharacter.getPropertyName(property, UProperty.NameChoice.LONG);
}
return propertyName;
}
private static void addUnicodeCategoryCodesToCodePointRanges(Map propertyCodePointRanges) {
RangeValueIterator iter = UCharacter.getTypeIterator();
RangeValueIterator.Element element = new RangeValueIterator.Element();
while (iter.next(element)) {
String categoryName = UCharacter.getPropertyValueName(
UProperty.GENERAL_CATEGORY_MASK,
1 << element.value,
UProperty.NameChoice.SHORT);
addIntervalForCategory(propertyCodePointRanges, categoryName, element.start, element.limit - 1);
// Add short category so Ll, Lu, Lo, etc. all show up under L
String shortCategoryName = categoryName.substring(0, 1);
addIntervalForCategory(propertyCodePointRanges, shortCategoryName, element.start, element.limit - 1);
}
}
private static void addUnicodeCategoryCodesToNames(Map propertyAliases) {
RangeValueIterator iter = UCharacter.getTypeIterator();
RangeValueIterator.Element element = new RangeValueIterator.Element();
while (iter.next(element)) {
int generalCategoryMask = 1 << element.value;
String categoryName = UCharacter.getPropertyValueName(
UProperty.GENERAL_CATEGORY_MASK,
generalCategoryMask,
UProperty.NameChoice.SHORT);
int nameChoice = UProperty.NameChoice.LONG;
while (true) {
String alias;
try {
alias = UCharacter.getPropertyValueName(
UProperty.GENERAL_CATEGORY_MASK,
generalCategoryMask,
nameChoice);
} catch (IllegalArgumentException e) {
// No more aliases.
break;
}
assert alias != null;
addPropertyAlias(propertyAliases, alias, categoryName);
nameChoice++;
}
}
// Add short categories
addPropertyAlias(propertyAliases, "Control", "C");
addPropertyAlias(propertyAliases, "Letter", "L");
addPropertyAlias(propertyAliases, "Number", "N");
addPropertyAlias(propertyAliases, "Mark", "M");
addPropertyAlias(propertyAliases, "Punctuation", "P");
addPropertyAlias(propertyAliases, "Symbol", "S");
addPropertyAlias(propertyAliases, "Space", "Z");
}
private static void addUnicodeBinaryPropertyCodesToCodePointRanges(Map propertyCodePointRanges) {
for (int property = UProperty.BINARY_START;
property < UProperty.BINARY_LIMIT;
property++) {
String propertyName = getShortPropertyName(property);
IntervalSet intervalSet = new IntervalSet();
UnicodeSet unicodeSet = new UnicodeSet();
unicodeSet.applyIntPropertyValue(property, 1);
for (UnicodeSet.EntryRange range : unicodeSet.ranges()) {
intervalSet.add(range.codepoint, range.codepointEnd);
}
propertyCodePointRanges.put(propertyName, intervalSet);
}
}
private static void addUnicodeBinaryPropertyCodesToNames(Map propertyAliases) {
for (int property = UProperty.BINARY_START;
property < UProperty.BINARY_LIMIT;
property++) {
String propertyName = getShortPropertyName(property);
addPropertyAliases(propertyAliases, propertyName, property);
}
}
private static void addIntPropertyRanges(int property, String namePrefix, Map propertyCodePointRanges) {
for (int propertyValue = UCharacter.getIntPropertyMinValue(property);
propertyValue <= UCharacter.getIntPropertyMaxValue(property);
propertyValue++) {
UnicodeSet set = new UnicodeSet();
set.applyIntPropertyValue(property, propertyValue);
String propertyName = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
IntervalSet intervalSet = propertyCodePointRanges.get(propertyName);
if (intervalSet == null) {
intervalSet = new IntervalSet();
propertyCodePointRanges.put(propertyName, intervalSet);
}
addUnicodeSetToIntervalSet(set, intervalSet);
}
}
private static void addUnicodeSetToIntervalSet(UnicodeSet unicodeSet, IntervalSet intervalSet) {
for (UnicodeSet.EntryRange range : unicodeSet.ranges()) {
intervalSet.add(range.codepoint, range.codepointEnd);
}
}
private static void addUnicodeIntPropertyCodesToCodePointRanges(Map propertyCodePointRanges) {
for (int property = UProperty.INT_START;
property < UProperty.INT_LIMIT;
property++) {
String propertyName = getShortPropertyName(property);
addIntPropertyRanges(property, propertyName + "=", propertyCodePointRanges);
}
}
private static void addTR35ExtendedPictographicPropertyCodesToCodePointRanges(Map propertyCodePointRanges) {
IntervalSet set = new IntervalSet();
// Generated using scripts/parse-extended-pictographic/parse.py
set.add(0x1F774, 0x1F77F);
set.add(0x2700, 0x2701);
set.add(0x2703, 0x2704);
set.add(0x270E);
set.add(0x2710, 0x2711);
set.add(0x2765, 0x2767);
set.add(0x1F030, 0x1F093);
set.add(0x1F094, 0x1F09F);
set.add(0x1F10D, 0x1F10F);
set.add(0x1F12F);
set.add(0x1F16C, 0x1F16F);
set.add(0x1F1AD, 0x1F1E5);
set.add(0x1F260, 0x1F265);
set.add(0x1F203, 0x1F20F);
set.add(0x1F23C, 0x1F23F);
set.add(0x1F249, 0x1F24F);
set.add(0x1F252, 0x1F25F);
set.add(0x1F266, 0x1F2FF);
set.add(0x1F7D5, 0x1F7FF);
set.add(0x1F000, 0x1F003);
set.add(0x1F005, 0x1F02B);
set.add(0x1F02C, 0x1F02F);
set.add(0x1F322, 0x1F323);
set.add(0x1F394, 0x1F395);
set.add(0x1F398);
set.add(0x1F39C, 0x1F39D);
set.add(0x1F3F1, 0x1F3F2);
set.add(0x1F3F6);
set.add(0x1F4FE);
set.add(0x1F53E, 0x1F548);
set.add(0x1F54F);
set.add(0x1F568, 0x1F56E);
set.add(0x1F571, 0x1F572);
set.add(0x1F57B, 0x1F586);
set.add(0x1F588, 0x1F589);
set.add(0x1F58E, 0x1F58F);
set.add(0x1F591, 0x1F594);
set.add(0x1F597, 0x1F5A3);
set.add(0x1F5A6, 0x1F5A7);
set.add(0x1F5A9, 0x1F5B0);
set.add(0x1F5B3, 0x1F5BB);
set.add(0x1F5BD, 0x1F5C1);
set.add(0x1F5C5, 0x1F5D0);
set.add(0x1F5D4, 0x1F5DB);
set.add(0x1F5DF, 0x1F5E0);
set.add(0x1F5E2);
set.add(0x1F5E4, 0x1F5E7);
set.add(0x1F5E9, 0x1F5EE);
set.add(0x1F5F0, 0x1F5F2);
set.add(0x1F5F4, 0x1F5F9);
set.add(0x2605);
set.add(0x2607, 0x260D);
set.add(0x260F, 0x2610);
set.add(0x2612);
set.add(0x2616, 0x2617);
set.add(0x2619, 0x261C);
set.add(0x261E, 0x261F);
set.add(0x2621);
set.add(0x2624, 0x2625);
set.add(0x2627, 0x2629);
set.add(0x262B, 0x262D);
set.add(0x2630, 0x2637);
set.add(0x263B, 0x2647);
set.add(0x2654, 0x265F);
set.add(0x2661, 0x2662);
set.add(0x2664);
set.add(0x2667);
set.add(0x2669, 0x267A);
set.add(0x267C, 0x267E);
set.add(0x2680, 0x2691);
set.add(0x2695);
set.add(0x2698);
set.add(0x269A);
set.add(0x269D, 0x269F);
set.add(0x26A2, 0x26A9);
set.add(0x26AC, 0x26AF);
set.add(0x26B2, 0x26BC);
set.add(0x26BF, 0x26C3);
set.add(0x26C6, 0x26C7);
set.add(0x26C9, 0x26CD);
set.add(0x26D0);
set.add(0x26D2);
set.add(0x26D5, 0x26E8);
set.add(0x26EB, 0x26EF);
set.add(0x26F6);
set.add(0x26FB, 0x26FC);
set.add(0x26FE, 0x26FF);
set.add(0x2388);
set.add(0x1FA00, 0x1FFFD);
set.add(0x1F0A0, 0x1F0AE);
set.add(0x1F0B1, 0x1F0BF);
set.add(0x1F0C1, 0x1F0CF);
set.add(0x1F0D1, 0x1F0F5);
set.add(0x1F0AF, 0x1F0B0);
set.add(0x1F0C0);
set.add(0x1F0D0);
set.add(0x1F0F6, 0x1F0FF);
set.add(0x1F80C, 0x1F80F);
set.add(0x1F848, 0x1F84F);
set.add(0x1F85A, 0x1F85F);
set.add(0x1F888, 0x1F88F);
set.add(0x1F8AE, 0x1F8FF);
set.add(0x1F900, 0x1F90B);
set.add(0x1F91F);
set.add(0x1F928, 0x1F92F);
set.add(0x1F931, 0x1F932);
set.add(0x1F94C);
set.add(0x1F95F, 0x1F96B);
set.add(0x1F992, 0x1F997);
set.add(0x1F9D0, 0x1F9E6);
set.add(0x1F90C, 0x1F90F);
set.add(0x1F93F);
set.add(0x1F94D, 0x1F94F);
set.add(0x1F96C, 0x1F97F);
set.add(0x1F998, 0x1F9BF);
set.add(0x1F9C1, 0x1F9CF);
set.add(0x1F9E7, 0x1F9FF);
set.add(0x1F6C6, 0x1F6CA);
set.add(0x1F6D3, 0x1F6D4);
set.add(0x1F6E6, 0x1F6E8);
set.add(0x1F6EA);
set.add(0x1F6F1, 0x1F6F2);
set.add(0x1F6F7, 0x1F6F8);
set.add(0x1F6D5, 0x1F6DF);
set.add(0x1F6ED, 0x1F6EF);
set.add(0x1F6F9, 0x1F6FF);
propertyCodePointRanges.put("Extended_Pictographic", set);
UnicodeSet emojiRKUnicodeSet = new UnicodeSet("[\\p{GCB=Regional_Indicator}\\*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]");
IntervalSet emojiRKIntervalSet = new IntervalSet();
addUnicodeSetToIntervalSet(emojiRKUnicodeSet, emojiRKIntervalSet);
propertyCodePointRanges.put("EmojiRK", emojiRKIntervalSet);
UnicodeSet emojiNRKUnicodeSet = new UnicodeSet("[\\p{Emoji=Yes}]");
emojiNRKUnicodeSet.removeAll(emojiRKUnicodeSet);
IntervalSet emojiNRKIntervalSet = new IntervalSet();
addUnicodeSetToIntervalSet(emojiNRKUnicodeSet, emojiNRKIntervalSet);
propertyCodePointRanges.put("EmojiNRK", emojiNRKIntervalSet);
}
private static void addEmojiPresentationPropertyCodesToCodePointRanges(Map propertyCodePointRanges) {
UnicodeSet emojiDefaultUnicodeSet = new UnicodeSet("[[\\p{Emoji=Yes}]&[\\p{Emoji_Presentation=Yes}]]");
IntervalSet emojiDefaultIntervalSet = new IntervalSet();
addUnicodeSetToIntervalSet(emojiDefaultUnicodeSet, emojiDefaultIntervalSet);
propertyCodePointRanges.put("EmojiPresentation=EmojiDefault", emojiDefaultIntervalSet);
UnicodeSet textDefaultUnicodeSet = new UnicodeSet("[[\\p{Emoji=Yes}]&[\\p{Emoji_Presentation=No}]]");
IntervalSet textDefaultIntervalSet = new IntervalSet();
addUnicodeSetToIntervalSet(textDefaultUnicodeSet, textDefaultIntervalSet);
propertyCodePointRanges.put("EmojiPresentation=TextDefault", textDefaultIntervalSet);
UnicodeSet textUnicodeSet = new UnicodeSet("[\\p{Emoji=No}]");
IntervalSet textIntervalSet = new IntervalSet();
addUnicodeSetToIntervalSet(textUnicodeSet, textIntervalSet);
propertyCodePointRanges.put("EmojiPresentation=Text", textIntervalSet);
}
private static void addIntPropertyAliases(int property, String namePrefix, Map propertyAliases) {
String propertyName = getShortPropertyName(property);
for (int propertyValue = UCharacter.getIntPropertyMinValue(property);
propertyValue <= UCharacter.getIntPropertyMaxValue(property);
propertyValue++) {
String aliasTarget = propertyName + "=" + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
int nameChoice = UProperty.NameChoice.SHORT;
String alias;
while (true) {
try {
alias = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, nameChoice);
} catch (IllegalArgumentException e) {
// No more aliases.
break;
}
assert alias != null;
addPropertyAlias(propertyAliases, alias, aliasTarget);
nameChoice++;
}
}
}
private static void addUnicodeScriptCodesToNames(Map propertyAliases) {
addIntPropertyAliases(UProperty.SCRIPT, "", propertyAliases);
}
private static void addUnicodeBlocksToNames(Map propertyAliases) {
addIntPropertyAliases(UProperty.BLOCK, "In", propertyAliases);
}
private static void addUnicodeIntPropertyCodesToNames(Map propertyAliases) {
for (int property = UProperty.INT_START;
property < UProperty.INT_LIMIT;
property++) {
int nameChoice = UProperty.NameChoice.SHORT + 1;
while (true) {
String propertyNameAlias;
try {
propertyNameAlias = UCharacter.getPropertyName(property, nameChoice);
} catch (IllegalArgumentException e) {
// No more aliases.
break;
}
addIntPropertyAliases(property, propertyNameAlias + "=", propertyAliases);
nameChoice++;
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy