All downloads are free. Search and download functionality uses the official Maven repository.

com.thihy.es.analysis.paoding.ThihyPaodingTokenizerProvider Maven / Gradle / Ivy

package com.thihy.es.analysis.paoding;

import java.io.IOException;
import java.io.Reader;

import net.paoding.analysis.analyzer.PaodingTokenizer;
import net.paoding.analysis.analyzer.TokenCollector;
import net.paoding.analysis.analyzer.impl.MaxWordLengthTokenCollector;
import net.paoding.analysis.analyzer.impl.MostWordsTokenCollector;
import net.paoding.analysis.knife.Dictionaries;
import net.paoding.analysis.knife.Knife;
import net.paoding.analysis.knife.Paoding;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
import org.elasticsearch.index.settings.IndexSettings;

import com.thihy.es.analysis.paoding.dict.DictionariesLoadContext;
import com.thihy.es.analysis.paoding.dict.DictionariesService;
import com.thihy.es.analysis.paoding.knife.KnifesService;

/**
 * Tokenizer factory that builds {@link PaodingTokenizer} instances from index analysis settings.
 *
 * <p>Settings read from the tokenizer's own {@code settings}:
 * <ul>
 * <li>{@code mode} - {@code most_tokens} (default) or {@code max_length}; selects the
 * {@link TokenCollector} implementation handed to each tokenizer.</li>
 * <li>{@code dict.type} - name of the dictionary type passed to {@link DictionariesService};
 * defaults to {@code file}.</li>
 * <li>{@code dict.<type>} - sub-settings forwarded to the dictionary loader for the chosen type.</li>
 * <li>{@code knife} - array of knife type names; defaults to {@code letter}, {@code number},
 * {@code cjk}.</li>
 * </ul>
 *
 * <p>One {@link Paoding} instance is built in the constructor and handed to every tokenizer this
 * factory creates.
 */
public class ThihyPaodingTokenizerProvider extends AbstractTokenizerFactory {
	private static final String MODE_MOST_TOKENS = "most_tokens";
	private static final String MODE_MAX_LENGTH = "max_length";
	private static final String[] DEFAULT_KNIFE_TYPES = { "letter", "number", "cjk" };

	/** Knife set shared by all tokenizers produced by this factory. */
	private final Paoding paoding;
	/** Token collection strategy selected by the {@code mode} setting. */
	private final ThihyPaodingMode mode;

	/**
	 * Reads the tokenizer settings, loads the dictionaries and assembles the {@link Paoding}
	 * instance used by {@link #create(Reader)}.
	 *
	 * @param index the index this tokenizer belongs to
	 * @param indexSettings the index-level settings
	 * @param dictionariesService service used to load the dictionaries for the configured type
	 * @param knifesService service used to create one knife per configured knife type
	 * @param name the tokenizer name; also passed to the dictionary load context as the analyzer owner
	 * @param settings the tokenizer-level settings (see the class documentation for the keys read)
	 * @throws IOException declared by this constructor; presumably raised while loading
	 *         dictionaries (the throwing call is not identifiable from this file alone)
	 * @throws IllegalArgumentException if the {@code mode} setting is not a supported value
	 */
	@Inject
	public ThihyPaodingTokenizerProvider(Index index, @IndexSettings Settings indexSettings, DictionariesService dictionariesService,
			KnifesService knifesService, @Assisted String name, @Assisted Settings settings) throws IOException {
		super(index, indexSettings, name, settings);
		// 1. read settings
		mode = readMode(settings);

		String dictType = settings.get("dict.type", "file");
		Settings dictSettings = settings.getAsSettings("dict." + dictType);
		String[] knifeTypes = settings.getAsArray("knife", DEFAULT_KNIFE_TYPES);
		// 2.1 create dictionaries
		DictionariesLoadContext dictionariesLoadContext = DictionariesLoadContext.builder().index(index, indexSettings).analyzerOwner(name)
				.dictSettings(dictSettings).build();
		Dictionaries dictionaries = dictionariesService.load(dictType, dictionariesLoadContext);
		// 2.2 create knives and 2.3 create paoding
		paoding = new Paoding();
		paoding.setKnives(createKnives(knifesService, knifeTypes, dictionaries));
	}

	/**
	 * Creates one knife per entry of {@code knifeTypes}, preserving the configured order.
	 */
	private static Knife[] createKnives(KnifesService knifesService, String[] knifeTypes, Dictionaries dictionaries) {
		Knife[] knives = new Knife[knifeTypes.length];
		for (int knifeNo = 0; knifeNo < knifeTypes.length; knifeNo++) {
			knives[knifeNo] = knifesService.createKnife(knifeTypes[knifeNo], dictionaries);
		}
		return knives;
	}

	/**
	 * Maps the {@code mode} setting to a {@link ThihyPaodingMode}; a missing setting is treated
	 * as {@code most_tokens}.
	 *
	 * @throws IllegalArgumentException if the configured value is neither {@code most_tokens}
	 *         nor {@code max_length}
	 */
	private ThihyPaodingMode readMode(Settings settings) {
		String strMode = settings.get("mode", MODE_MOST_TOKENS);
		if (MODE_MOST_TOKENS.equals(strMode)) {
			return ThihyPaodingMode.MOST_WORDS;
		}
		if (MODE_MAX_LENGTH.equals(strMode)) {
			return ThihyPaodingMode.MAX_WORD_LENGTH;
		}
		// Report the rejected value so a misconfigured index can be diagnosed from the message alone.
		throw new IllegalArgumentException("The 'mode' [" + strMode + "] is invalid. It should be '" + MODE_MOST_TOKENS + "' or '"
				+ MODE_MAX_LENGTH + "'.");
	}

	/**
	 * Creates a new {@link TokenCollector} matching the configured mode.
	 *
	 * @return a {@link MostWordsTokenCollector} or a {@link MaxWordLengthTokenCollector}
	 */
	protected TokenCollector createTokenCollector() {
		switch (mode) {
		case MOST_WORDS:
			return new MostWordsTokenCollector();
		case MAX_WORD_LENGTH:
			return new MaxWordLengthTokenCollector();
		default:
			// Reachable only if a new ThihyPaodingMode constant is added without a case here.
			throw new AssertionError("Unhandled paoding mode: " + mode);
		}
	}

	/**
	 * Creates a new {@link PaodingTokenizer} over {@code reader}, using the shared
	 * {@link Paoding} instance and a newly created token collector.
	 */
	@Override
	public Tokenizer create(Reader reader) {
		return new PaodingTokenizer(reader, paoding, createTokenCollector());
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy