// org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker Maven / Gradle / Ivy
// The newest version!
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.common.io.FastCharArrayReader;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
// TODO: this class is public only so that tests can access it.
public final class NoisyChannelSpellChecker {
public static final double REAL_WORD_LIKELYHOOD = 0.95d;
public static final int DEFAULT_TOKEN_LIMIT = 10;
private final double realWordLikelihood;
private final boolean requireUnigram;
private final int tokenLimit;
public NoisyChannelSpellChecker() {
this(REAL_WORD_LIKELYHOOD);
}
public NoisyChannelSpellChecker(double nonErrorLikelihood) {
this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT);
}
public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
this.realWordLikelihood = nonErrorLikelihood;
this.requireUnigram = requireUnigram;
this.tokenLimit = tokenLimit;
}
public Result getCorrections(TokenStream stream, final CandidateGenerator generator,
float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
final List candidateSetsList = new ArrayList<>();
SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
CandidateSet currentSet = null;
private TypeAttribute typeAttribute;
private final BytesRefBuilder termsRef = new BytesRefBuilder();
private boolean anyUnigram = false;
private boolean anyTokens = false;
@Override
public void reset(TokenStream stream) {
super.reset(stream);
typeAttribute = stream.addAttribute(TypeAttribute.class);
}
@Override
public void nextToken() throws IOException {
anyTokens = true;
BytesRef term = fillBytesRef(termsRef);
if (requireUnigram && typeAttribute.type() == ShingleFilter.DEFAULT_TOKEN_TYPE) {
return;
}
anyUnigram = true;
if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) {
assert currentSet != null;
long freq = 0;
if ((freq = generator.frequency(term)) > 0) {
currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
}
} else {
if (currentSet != null) {
candidateSetsList.add(currentSet);
}
currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
}
}
@Override
public void end() {
if (currentSet != null) {
candidateSetsList.add(currentSet);
}
if (requireUnigram && !anyUnigram && anyTokens) {
throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
}
}
});
if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
return Result.EMPTY;
}
for (CandidateSet candidateSet : candidateSetsList) {
generator.drawCandidates(candidateSet);
}
double cutoffScore = Double.MIN_VALUE;
CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
if (confidence > 0.0) {
Candidate[] candidates = new Candidate[candidateSets.length];
for (int i = 0; i < candidates.length; i++) {
candidates[i] = candidateSets[i].originalTerm;
}
double inputPhraseScore = scorer.score(candidates, candidateSets);
cutoffScore = inputPhraseScore * confidence;
}
Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
return new Result(bestCandidates, cutoffScore);
}
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException {
return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize);
}
public TokenStream tokenStream(Analyzer analyzer, BytesRef query, CharsRefBuilder spare, String field) throws IOException {
spare.copyUTF8Bytes(query);
return analyzer.tokenStream(field, new FastCharArrayReader(spare.chars(), 0, spare.length()));
}
public static class Result {
public static final Result EMPTY = new Result(Correction.EMPTY, Double.MIN_VALUE);
public final Correction[] corrections;
public final double cutoffScore;
public Result(Correction[] corrections, double cutoffScore) {
this.corrections = corrections;
this.cutoffScore = cutoffScore;
}
}
}