org.apache.lucene.analysis.bn.BengaliNormalizer Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import static org.apache.lucene.analysis.util.StemmerUtil.delete;
/**
* Normalizer for Bengali.
*
* Implements the Bengali-language specific algorithm specified in:
* A Double Metaphone encoding for Bangla and its application in spelling checker
* Naushad UzZaman and Mumit Khan.
* http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
*
*/
public class BengaliNormalizer {
/**
* Normalize an input buffer of Bengali text
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
*/
public int normalize(char s[], int len) {
for (int i = 0; i < len; i++) {
switch (s[i]) {
// delete Chandrabindu
case '\u0981':
len = delete(s, i, len);
i--;
break;
// DirghoI kar -> RosshoI kar
case '\u09C0':
s[i] = '\u09BF';
break;
// DirghoU kar -> RosshoU kar
case '\u09C2':
s[i] = '\u09C1';
break;
// Khio (Ka + Hoshonto + Murdorno Sh)
case '\u0995':
if(i + 2 < len && s[i+1] == '\u09CD' && s[i+2] == '\u09BF') {
if (i == 0) {
s[i] = '\u0996';
len = delete(s, i + 2, len);
len = delete(s, i + 1, len);
} else {
s[i+1] = '\u0996';
len = delete(s, i + 2, len);
}
}
break;
// Nga to Anusvara
case '\u0999':
s[i] = '\u0982';
break;
// Ja Phala
case '\u09AF':
if(i - 2 == 0 && s[i-1] == '\u09CD') {
s[i - 1] = '\u09C7';
if(i + 1 < len && s[i+1] == '\u09BE') {
len = delete(s, i+1, len);
}
len = delete(s, i, len);
i --;
} else if(i - 1 >= 0 && s[i-1] == '\u09CD' ){
len = delete(s, i, len);
len = delete(s, i-1, len);
i -=2;
}
break;
// Ba Phalaa
case '\u09AC':
if((i >= 1 && s[i-1] != '\u09CD') || i == 0)
break;
if(i - 2 == 0) {
len = delete(s, i, len);
len = delete(s, i - 1, len);
i -= 2;
} else if(i - 5 >= 0 && s[i - 3] == '\u09CD') {
len = delete(s, i, len);
len = delete(s, i-1, len);
i -=2;
} else if(i - 2 >= 0){
s[i - 1] = s[i - 2];
len = delete(s, i, len);
i --;
}
break;
// Visarga
case '\u0983':
if(i == len -1) {
if(len <= 3) {
s[i] = '\u09B9';
} else {
len = delete(s, i, len);
}
} else {
s[i] = s[i+1];
}
break;
//All sh
case '\u09B6':
case '\u09B7':
s[i] = '\u09B8';
break;
//check na
case '\u09A3':
s[i] = '\u09A8';
break;
//check ra
case '\u09DC':
case '\u09DD':
s[i] = '\u09B0';
break;
case '\u09CE':
s[i] = '\u09A4';
break;
default:
break;
}
}
return len;
}
}