// org.elasticsearch.index.analysis.GreeklishGenerator Maven / Gradle / Ivy
package org.elasticsearch.index.analysis;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.ESLoggerFactory;
/**
 * Generates greeklish (Latin-alphabet) tokens for each element of a list of
 * Greek tokens.
 *
 * <p>Every Greek word is processed in two steps: the known digraphs are first
 * collapsed into single placeholder characters, then every character of the
 * result is expanded into each of its possible Latin spellings. The number of
 * variants kept per word is capped by {@code maxExpansions}.
 *
 * <p>An instance keeps its working state in fields that are mutated without
 * any synchronization by {@link #generateGreeklishWords(List)}, so a single
 * instance must not be used by several threads at the same time.
 *
 * @author Tasos Stathopoulos
 */
public class GreeklishGenerator {

    /**
     * Elasticsearch logger, named after this class.
     */
    private static final Logger logger = ESLoggerFactory.getLogger(
            GreeklishGenerator.class.getName());

    /**
     * Constant variables that represent the character that substitutes a
     * digraph.
     */
    private static final String AI = "Α";
    private static final String EI = "Ε";
    private static final String OI = "Ο";
    private static final String OY = "Υ";
    private static final String EY = "Φ";
    private static final String AY = "Β";
    private static final String MP = "Μ";
    private static final String GG = "Γ";
    private static final String GK = "Κ";
    private static final String NT = "Ν";

    /**
     * The possible digraph cases: each pair is the digraph followed by the
     * special capital Greek character that replaces it.
     */
    private static final String[][] digraphCases = new String[][] {
        { "αι", AI }, { "ει", EI }, { "οι", OI }, { "ου", OY },
        { "ευ", EY }, { "αυ", AY }, { "μπ", MP }, { "γγ", GG },
        { "γκ", GK }, { "ντ", NT } };

    /**
     * The possible string conversions for each case: the first element is the
     * (single character) key, the remaining elements are the Latin strings
     * that can replace it.
     */
    private static final String[][] convertStrings = new String[][] {
        { AI, "ai", "e" }, { EI, "ei", "i" }, { OI, "oi", "i" },
        { OY, "ou", "oy", "u" }, { EY, "eu", "ef", "ev", "ey" },
        { AY, "au", "af", "av", "ay" }, { MP, "mp", "b" },
        { GG, "gg", "g" }, { GK, "gk", "g" }, { NT, "nt", "d" },
        { "α", "a" }, { "β", "b", "v" }, { "γ", "g" }, { "δ", "d" },
        { "ε", "e" }, { "ζ", "z" }, { "η", "h", "i" }, { "θ", "th" },
        { "ι", "i" }, { "κ", "k" }, { "λ", "l" }, { "μ", "m" },
        { "ν", "n" }, { "ξ", "ks", "x" }, { "ο", "o" }, { "π", "p" },
        { "ρ", "r" }, { "σ", "s" }, { "τ", "t" }, { "υ", "y", "u", "i" },
        { "φ", "f", "ph" }, { "χ", "x", "h", "ch" }, { "ψ", "ps" },
        { "ω", "w", "o", "v" } };

    /**
     * Each digraph is replaced by a special capital Greek character.
     */
    private final Map<String, String> digraphs = new HashMap<String, String>();

    /**
     * This hash has keys all the possible conversions that can be applied and
     * values the strings that can replace the corresponding Greek character.
     */
    private final Map<Character, String[]> conversions =
            new HashMap<Character, String[]>();

    /**
     * The maximum greeklish expansions per greek token.
     */
    private final int maxExpansions;

    /**
     * The greeklish tokens generated so far for the word being processed.
     * A CopyOnWriteArrayList is used because {@code addCharacter} appends new
     * tokens to this list while it is iterating over it; the iterator works
     * on a snapshot, so the tokens added during a pass are not visited in
     * that same pass.
     */
    private final List<StringBuilder> perWordGreeklish;

    /**
     * Keep the generated strings of all words in a list. The populated list
     * is returned to the caller.
     */
    private final List<StringBuilder> greeklishList;

    /**
     * The word currently being processed, before any digraph substitution.
     * It is used for logging the processing token.
     */
    private String initialToken;

    /**
     * Creates a generator and fills the digraph and conversion lookup tables.
     *
     * @param maxExpansions
     *            the maximum number of greeklish tokens kept per Greek word;
     *            with a value of zero or less no tokens are produced
     */
    public GreeklishGenerator(int maxExpansions) {
        this.maxExpansions = maxExpansions;
        this.greeklishList = new ArrayList<StringBuilder>();
        this.perWordGreeklish = new CopyOnWriteArrayList<StringBuilder>();

        // populate digraphs
        for (String[] digraphCase : digraphCases) {
            digraphs.put(digraphCase[0], digraphCase[1]);
        }

        // populate conversions: key is the single character, value is the
        // array of its Latin alternatives (everything after the key)
        for (String[] convertString : convertStrings) {
            conversions.put(convertString[0].charAt(0),
                    Arrays.copyOfRange(convertString, 1, convertString.length));
        }
    }

    /**
     * Gets a list of greek words and generates the greeklish version of
     * each word.
     *
     * <p>The returned list is the generator's internal result list: it is
     * cleared and refilled by the next call to this method.
     *
     * @param greekWords
     *            a list of greek words; a {@code null} list is treated as
     *            empty and {@code null} elements are skipped
     * @return a list of greeklish words
     */
    public List<StringBuilder> generateGreeklishWords(final List<String> greekWords) {
        greeklishList.clear();
        if (greekWords == null) {
            return greeklishList;
        }

        for (String greekWord : greekWords) {
            if (greekWord == null) {
                continue;
            }
            perWordGreeklish.clear();
            initialToken = greekWord;

            // Allocate space that is twice the length of the input token in
            // order to cover the worst case scenario where each Greek
            // character is replaced by two latin characters.
            int allocatedSpace = 2 * greekWord.length();

            // Collapse every digraph into its one-character placeholder.
            // The keys are plain literals, so a literal replace is enough
            // and no regular expression has to be compiled.
            String substituted = greekWord;
            for (Map.Entry<String, String> digraph : digraphs.entrySet()) {
                substituted = substituted.replace(digraph.getKey(),
                        digraph.getValue());
            }

            // Iterate through the characters of the token and generate
            // greeklish words.
            for (char greekChar : substituted.toCharArray()) {
                String[] latinStrings = conversions.get(greekChar);
                if (latinStrings == null) {
                    // No conversion is known for this character: keep it
                    // unchanged instead of failing on a null conversion.
                    latinStrings = new String[] { String.valueOf(greekChar) };
                }
                addCharacter(latinStrings, allocatedSpace);
            }
            greeklishList.addAll(perWordGreeklish);
        }
        return greeklishList;
    }

    /**
     * Add the matching latin characters to the generated greeklish tokens for a
     * specific Greek character. For each different combination of latin
     * characters, a new token is generated, as long as the number of tokens
     * stays below {@code maxExpansions}.
     *
     * @param convertStrings
     *            The latin characters that will be added to the tokens; must
     *            contain at least one element
     * @param bufferSize
     *            The size of the buffer that will be allocated in case of new
     *            StringBuilder
     */
    private void addCharacter(String[] convertStrings, int bufferSize) {
        // If the token list is empty, create a new StringBuilder for each
        // latin alternative of this first character.
        if (perWordGreeklish.isEmpty()) {
            for (String convertString : convertStrings) {
                if (perWordGreeklish.size() >= maxExpansions) {
                    logger.debug("Skipping for token [{}]", initialToken);
                    break;
                }
                StringBuilder greeklishWord = new StringBuilder(bufferSize);
                greeklishWord.append(convertString);
                perWordGreeklish.add(greeklishWord);
            }
            return;
        }

        // Add the latin characters to each saved greeklish token, and
        // generate new ones when the combinations are more than one.
        // The loop runs over a snapshot of the list, so the tokens added
        // below are not extended again during this pass.
        for (StringBuilder atoken : perWordGreeklish) {
            // Alternatives after the first one fork a copy of the token.
            for (int i = 1; i < convertStrings.length; i++) {
                if (perWordGreeklish.size() >= maxExpansions) {
                    logger.debug("Skipping for token [{}]", initialToken);
                    break;
                }
                StringBuilder newToken = new StringBuilder(atoken);
                newToken.append(convertStrings[i]);
                perWordGreeklish.add(newToken);
            }
            // The first alternative extends the existing token in place;
            // this happens last so the copies above start from the
            // unextended prefix.
            atoken.append(convertStrings[0]);
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy