All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.analysis.in.IndicNormalizer Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.in;

import static java.lang.Character.UnicodeBlock.*;
import static org.apache.lucene.analysis.util.StemmerUtil.*;

import java.util.BitSet;
import java.util.IdentityHashMap;

/**
 * Normalizes the Unicode representation of text in Indian languages.
 *
 * 

Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I and graphical * decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html */ class IndicNormalizer { private static class ScriptData { final int flag; final int base; BitSet decompMask; ScriptData(int flag, int base) { this.flag = flag; this.base = base; } } private static final IdentityHashMap scripts = new IdentityHashMap<>(9); private static int flag(Character.UnicodeBlock ub) { return scripts.get(ub).flag; } static { scripts.put(DEVANAGARI, new ScriptData(1, 0x0900)); scripts.put(BENGALI, new ScriptData(2, 0x0980)); scripts.put(GURMUKHI, new ScriptData(4, 0x0A00)); scripts.put(GUJARATI, new ScriptData(8, 0x0A80)); scripts.put(ORIYA, new ScriptData(16, 0x0B00)); scripts.put(TAMIL, new ScriptData(32, 0x0B80)); scripts.put(TELUGU, new ScriptData(64, 0x0C00)); scripts.put(KANNADA, new ScriptData(128, 0x0C80)); scripts.put(MALAYALAM, new ScriptData(256, 0x0D00)); } /** * Decompositions according to Unicode 5.2, and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html * *

Most of these are not handled by unicode normalization anyway. * *

The numbers here represent offsets into the respective codepages, with -1 representing null * and 0xFF representing zero-width joiner. * *

the columns are: ch1, ch2, ch3, res, flags ch1, ch2, and ch3 are the decomposition res is * the composition, and flags are the scripts to which it applies. */ private static final int[][] decompositions = { /* devanagari, gujarati vowel candra O */ {0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari short O */ {0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI)}, /* devanagari, gujarati letter O */ {0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari letter AI, gujarati letter AU */ {0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari, bengali, gurmukhi, gujarati, oriya AA */ { 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) }, /* devanagari letter candra A */ {0x05, 0x45, -1, 0x72, flag(DEVANAGARI)}, /* gujarati vowel candra E */ {0x05, 0x45, -1, 0x0D, flag(GUJARATI)}, /* devanagari letter short A */ {0x05, 0x46, -1, 0x04, flag(DEVANAGARI)}, /* gujarati letter E */ {0x05, 0x47, -1, 0x0F, flag(GUJARATI)}, /* gurmukhi, gujarati letter AI */ {0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI)}, /* devanagari, gujarati vowel candra O */ {0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari short O */ {0x05, 0x4A, -1, 0x12, flag(DEVANAGARI)}, /* devanagari, gujarati letter O */ {0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari letter AI, gurmukhi letter AU, gujarati letter AU */ {0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI)}, /* devanagari, gujarati vowel candra O */ {0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari short O */ {0x06, 0x46, -1, 0x12, flag(DEVANAGARI)}, /* devanagari, gujarati letter O */ {0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari letter AI, gujarati letter AU */ {0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI)}, /* malayalam letter II */ {0x07, 0x57, -1, 0x08, flag(MALAYALAM)}, /* devanagari letter UU */ {0x09, 0x41, -1, 0x0A, flag(DEVANAGARI)}, /* tamil, malayalam letter UU (some styles) */ {0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM)}, /* malayalam letter AI */ {0x0E, 0x46, -1, 0x10, flag(MALAYALAM)}, /* devanagari candra E */ {0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI)}, /* devanagari short E */ {0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI)}, /* devanagari AI */ {0x0F, 0x47, -1, 0x10, flag(DEVANAGARI)}, /* oriya AI */ {0x0F, 0x57, -1, 0x10, flag(ORIYA)}, /* malayalam letter OO */ {0x12, 0x3E, -1, 0x13, flag(MALAYALAM)}, /* telugu, kannada letter AU */ {0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA)}, /* telugu letter OO */ {0x12, 0x55, -1, 0x13, flag(TELUGU)}, /* tamil, malayalam letter AU */ {0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM)}, /* oriya letter AU */ {0x13, 0x57, -1, 0x14, flag(ORIYA)}, /* devanagari qa */ {0x15, 0x3C, -1, 0x58, flag(DEVANAGARI)}, /* devanagari, gurmukhi khha */ {0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI)}, /* devanagari, gurmukhi ghha */ {0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI)}, /* devanagari, gurmukhi za */ {0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI)}, /* devanagari dddha, bengali, oriya rra */ {0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA)}, /* devanagari, bengali, oriya rha */ {0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA)}, /* malayalam chillu nn */ {0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM)}, /* bengali khanda ta */ {0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI)}, /* devanagari nnna */ {0x28, 0x3C, -1, 0x29, flag(DEVANAGARI)}, /* malayalam chillu n */ {0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM)}, /* devanagari, gurmukhi fa */ {0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI)}, /* devanagari, bengali yya */ {0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI)}, /* telugu letter vocalic R */ {0x2C, 0x41, 0x41, 0x0B, flag(TELUGU)}, /* devanagari rra */ {0x30, 0x3C, -1, 0x31, flag(DEVANAGARI)}, /* malayalam chillu rr */ {0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM)}, /* malayalam chillu l */ {0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM)}, /* devanagari llla */ {0x33, 0x3C, -1, 0x34, flag(DEVANAGARI)}, /* malayalam chillu ll */ {0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM)}, /* telugu letter MA */ {0x35, 0x41, -1, 0x2E, flag(TELUGU)}, /* devanagari, gujarati vowel sign candra O */ {0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari vowel sign short O */ {0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI)}, /* devanagari, gujarati vowel sign O */ {0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI)}, /* devanagari, gujarati vowel sign AU */ {0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI)}, /* kannada vowel sign II */ {0x3F, 0x55, -1, 0x40, flag(KANNADA)}, /* gurmukhi vowel sign UU (when stacking) */ {0x41, 0x41, -1, 0x42, flag(GURMUKHI)}, /* tamil, malayalam vowel sign O */ {0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM)}, /* kannada vowel sign OO */ {0x46, 0x42, 0x55, 0x4B, flag(KANNADA)}, /* kannada vowel sign O */ {0x46, 0x42, -1, 0x4A, flag(KANNADA)}, /* malayalam vowel sign AI (if reordered twice) */ {0x46, 0x46, -1, 0x48, flag(MALAYALAM)}, /* telugu, kannada vowel sign EE */ {0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA)}, /* telugu, kannada vowel sign AI */ {0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA)}, /* tamil, malayalam vowel sign AU */ {0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM)}, /* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */ {0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM)}, /* bengali, oriya vowel sign AU */ {0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA)}, /* kannada vowel sign OO */ {0x4A, 0x55, -1, 0x4B, flag(KANNADA)}, /* gurmukhi letter I */ {0x72, 0x3F, -1, 0x07, flag(GURMUKHI)}, /* gurmukhi letter II */ {0x72, 0x40, -1, 0x08, flag(GURMUKHI)}, /* gurmukhi letter EE */ {0x72, 0x47, -1, 0x0F, flag(GURMUKHI)}, /* gurmukhi letter U */ {0x73, 0x41, -1, 0x09, flag(GURMUKHI)}, /* gurmukhi letter UU */ {0x73, 0x42, -1, 0x0A, flag(GURMUKHI)}, /* gurmukhi letter OO */ {0x73, 0x4B, -1, 0x13, flag(GURMUKHI)}, }; static { for (ScriptData sd : scripts.values()) { sd.decompMask = new BitSet(0x7F); for (int i = 0; i < decompositions.length; i++) { final int ch = decompositions[i][0]; final int flags = decompositions[i][4]; if ((flags & sd.flag) != 0) sd.decompMask.set(ch); } } } /** * Normalizes input text, and returns the new length. The length will always be less than or equal * to the existing length. * * @param text input text * @param len valid length * @return normalized length */ int normalize(char[] text, int len) { for (int i = 0; i < len; i++) { final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]); final ScriptData sd = scripts.get(block); if (sd != null) { final int ch = text[i] - sd.base; if (sd.decompMask.get(ch)) len = compose(ch, block, sd, text, i, len); } } return len; } /** Compose into standard form any compositions in the decompositions table. */ private int compose( int ch0, Character.UnicodeBlock block0, ScriptData sd, char[] text, int pos, int len) { if (pos + 1 >= len) /* need at least 2 chars! */ return len; final int ch1 = text[pos + 1] - sd.base; final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]); if (block1 != block0) /* needs to be the same writing system */ return len; int ch2 = -1; if (pos + 2 < len) { ch2 = text[pos + 2] - sd.base; Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]); if (text[pos + 2] == '\u200D') // ZWJ ch2 = 0xFF; else if (block2 != block1) // still allow a 2-char match ch2 = -1; } for (int i = 0; i < decompositions.length; i++) if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) { if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) { text[pos] = (char) (sd.base + decompositions[i][3]); len = delete(text, pos + 1, len); if (decompositions[i][2] >= 0) len = delete(text, pos + 1, len); return len; } } return len; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy