All downloads are free. Search and download functionality uses the official Maven repository.

org.elasticsearch.index.analysis.GreekReverseStemmer Maven / Gradle / Ivy

package org.elasticsearch.index.analysis;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.logging.ESLoggerFactory;

/**
 * Generates singular/plural variants of a Greek word by swapping a
 * recognised ending for each related ending listed in a fixed rule table.
 *
 * <p>Instances hold no mutable state: every call to
 * {@link #generateGreekVariants(String)} builds and returns its own list,
 * so a result obtained earlier is never modified by a later call.
 *
 * @author Tasos Stathopoulos
 */
public class GreekReverseStemmer {

	/*
	 * Endings recognised at the end of a token. The table spells the final
	 * sigma as "σ", and matching is an exact endsWith, so a token written
	 * with "ς" does not match these rows.
	 */
	private static final String SUFFIX_MATOS = "ματοσ";
	private static final String SUFFIX_MATA = "ματα";
	private static final String SUFFIX_MATWN = "ματων";
	private static final String SUFFIX_AS = "ασ";
	private static final String SUFFIX_EIA = "εια";
	private static final String SUFFIX_EIO = "ειο";
	private static final String SUFFIX_EIOY = "ειου";
	private static final String SUFFIX_EIWN = "ειων";
	private static final String SUFFIX_IOY = "ιου";
	private static final String SUFFIX_IA = "ια";
	private static final String SUFFIX_IWN = "ιων";
	private static final String SUFFIX_OS = "οσ";
	private static final String SUFFIX_OI = "οι";
	private static final String SUFFIX_EIS = "εισ";
	private static final String SUFFIX_ES = "εσ";
	private static final String SUFFIX_HS = "ησ";
	private static final String SUFFIX_WN = "ων";
	private static final String SUFFIX_OY = "ου";
	private static final String SUFFIX_O = "ο";
	private static final String SUFFIX_H = "η";
	private static final String SUFFIX_A = "α";
	private static final String SUFFIX_I = "ι";

	/**
	 * The rewrite rules. In every row, element 0 is the ending to look for
	 * and the remaining elements are the endings that replace it, in the
	 * order the variants are emitted.
	 *
	 * <p>Row order matters: only the first row whose ending matches is
	 * applied, and longer endings are listed before the shorter endings
	 * they themselves end with (e.g. "ματων" before "ων").
	 */
	private static final String[][] SUFFIX_RULES = new String[][] {
		{SUFFIX_MATOS, "μα", "ματων", "ματα"},  // e.g. κουρεματος, ασυρματος
		{SUFFIX_MATA, "μα", "ματων", "ματοσ"},  // e.g. ενδυματα
		{SUFFIX_MATWN, "μα", "ματα", "ματοσ"},  // e.g. ασυρματων, ενδυματων
		{SUFFIX_AS, "α", "ων", "εσ"},  // e.g. πορτας, χαρτοφυλακας
		{SUFFIX_EIA, "ειο", "ειων", "ειου", "ειασ"},  // e.g. γραφεια, ενεργεια
		{SUFFIX_EIO, "εια", "ειων", "ειου"},  // e.g. γραφειο
		// NOTE(review): "ειου" maps to itself here, so the input word is
		// emitted twice for this ending - confirm whether that is intended.
		{SUFFIX_EIOY, "εια", "ειου", "ειο", "ειων"},  // e.g. γραφειου
		{SUFFIX_EIWN, "εια", "ειου", "ειο", "ειασ"},  // e.g. ασφαλειων, γραφειων
		{SUFFIX_IOY, "ι", "ια", "ιων", "ιο"},  // e.g. πεδιου, κυνηγιου
		{SUFFIX_IA, "ιου", "ι", "ιων", "ιασ", "ιο"},  // e.g. πεδία, αρμονια
		{SUFFIX_IWN, "ιου", "ια", "ι", "ιο"},  // e.g. καλωδιων, κατοικιδιων
		{SUFFIX_OS, "η", "ουσ", "ου", "οι", "ων"},  // e.g. κλιματισμος
		{SUFFIX_OI, "οσ", "ου", "ων"},  // e.g. μυλοι, οδηγοι, σταθμοι
		{SUFFIX_EIS, "η", "ησ", "εων"},  // e.g. συνδεσεις, τηλεορασεις
		{SUFFIX_ES, "η", "ασ", "ων", "ησ", "α"},  // e.g. αλυσιδες
		{SUFFIX_HS, "ων", "εσ", "η", "εων"},  // e.g. γυμναστικης, εκτυπωσης
		// NOTE(review): "α" is listed twice in this row, so that variant is
		// emitted twice - confirm whether the second entry should differ.
		{SUFFIX_WN, "οσ", "εσ", "α", "η", "ησ", "ου", "οι", "ο", "α"},  // e.g. ινων, καπνιστων, καρτων, κατασκευων
		{SUFFIX_OY, "ων", "α", "ο", "οσ"},  // e.g. λαδιου, μοντελισμου, παιδικου
		{SUFFIX_O, "α", "ου", "εων", "ων"},  // e.g. αυτοκινητο, δισκος
		{SUFFIX_H, "οσ", "ουσ", "εων", "εισ", "ησ", "ων"},  // e.g. βελη, ψυξη, τηλεοραση, αποτριχωση
		{SUFFIX_A, "ο" , "ου", "ων", "ασ", "εσ"},  // e.g. γιλεκα, εσωρουχα, ομπρελλα
		{SUFFIX_I, "ιου", "ια", "ιων"}  // e.g. γιαουρτι, γραναζι
	};

	/**
	 * Creates a stemmer. All rules live in a static table, so there is
	 * nothing to initialise per instance.
	 */
	public GreekReverseStemmer() {
	}

	/**
	 * Generates the variants of the given Greek token.
	 *
	 * <p>The returned list always starts with the token itself. If the
	 * token ends with one of the known endings, the first matching rule
	 * is applied and one variant per replacement ending is appended, in
	 * table order. Tokens that match no rule yield a single-element list.
	 *
	 * @param tokenString the greek word
	 * @return a new, caller-owned list holding the token followed by its
	 *         generated variations
	 * @throws NullPointerException if {@code tokenString} is {@code null}
	 */
	public List<String> generateGreekVariants(String tokenString) {
		// A fresh list per call: handing out one shared, reused list would
		// let a later call overwrite results the caller still holds.
		List<String> variants = new ArrayList<>();
		variants.add(tokenString);

		for (String[] rule : SUFFIX_RULES) {
			String matchedSuffix = rule[0];
			if (!tokenString.endsWith(matchedSuffix)) {
				continue;
			}
			// Strip the ending once; a plain substring does the same job
			// as an end-anchored regex replace without compiling a pattern.
			String stem = tokenString.substring(
					0, tokenString.length() - matchedSuffix.length());
			for (int i = 1; i < rule.length; i++) {
				variants.add(stem + rule[i]);
			}
			// Only the first matching rule applies.
			break;
		}
		return variants;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy