All downloads are free. Search and download functionality uses the official Maven repository.

io.earcam.utilitarian.site.search.offline.SimpleTokenizer Maven / Gradle / Ivy

/*-
 * #%L
 * io.earcam.utilitarian.site.search.offline
 * %%
 * Copyright (C) 2017 earcam
 * %%
 * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
 *
 * You must choose to accept, in full - any individual or combination of
 * the following licenses:
 * 
 * #L%
 */
package io.earcam.utilitarian.site.search.offline;

import static java.util.Collections.emptyList;

import java.io.IOException;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.earcam.unexceptional.Closing;

/**
 * A {@link Processor} that splits a document's raw text into tokens using a
 * Lucene {@link Analyzer} (by default the {@link SimpleAnalyzer}).
 */
public class SimpleTokenizer implements Processor {

	private static final Logger LOG = LoggerFactory.getLogger(SimpleTokenizer.class);


	/**
	 * Tokenizes the given input text using the analyzer from {@link #createAnalyzer()}.
	 * <p>
	 * Any {@link UncheckedIOException} raised while tokenizing is logged (message at
	 * WARN, cause at DEBUG) and an empty list is returned, so callers never see the failure.
	 *
	 * @param input the raw text to tokenize
	 * @return the list of tokens, or an empty list if tokenization failed
	 */
	public List<String> tokenize(String input)
	{
		try {
			// Closing.closeAfterApplying ensures the Analyzer is closed after use
			return Closing.closeAfterApplying(createAnalyzer(), input, this::tokens);
		} catch(UncheckedIOException e) {
			LOG.warn("Failed to tokenize '{}', due to {}", input, e.getMessage());
			LOG.debug("Failed to tokenize", e.getCause());
		}
		return emptyList();
	}


	// Opens a TokenStream over the input (field name is irrelevant here, hence null)
	// and drains it; the stream is closed by Closing.closeAfterApplying.
	private List<String> tokens(Analyzer analyzer, String input)
	{
		return Closing.closeAfterApplying(analyzer.tokenStream(null, new StringReader(input)), this::streamTokens);
	}


	// Consumes the TokenStream, collecting each term's text.
	// Caller is responsible for closing the stream.
	private List<String> streamTokens(TokenStream stream) throws IOException
	{
		stream.reset();
		List<String> tokens = new ArrayList<>();
		while(stream.incrementToken()) {
			tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
		}
		return tokens;
	}


	/**
	 * Override this method to return a custom {@link Analyzer}.
	 * <p>
	 * Note; Use of Lucene for stemming, stopword filtering, etc must match
	 * whatever is configured for lunrjs.
	 *
	 * @return an {@link Analyzer} for tokenizing
	 */
	protected Analyzer createAnalyzer()
	{
		return new SimpleAnalyzer();
	}


	/**
	 * Populates the document's token list from its raw text, but only when raw
	 * text is present and the document has not already been tokenized.
	 *
	 * @param document the document to process
	 */
	@Override
	public void process(Document document)
	{
		if(document.hasRaw() && !document.hasTokens()) {
			List<String> tokenized = tokenize(document.raw());
			document.tokens().addAll(tokenized);
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy