Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.Lucene.EarlyTerminatingCollector;
import org.elasticsearch.common.text.StringText;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.ParsedQuery;
import org.elasticsearch.script.CompiledScript;
import org.elasticsearch.script.ExecutableScript;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.search.suggest.Suggest.Suggestion;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
import org.elasticsearch.search.suggest.*;
import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public final class PhraseSuggester extends Suggester {
private final BytesRef SEPARATOR = new BytesRef(" ");
private static final String SUGGESTION_TEMPLATE_VAR_NAME = "suggestion";
private final ScriptService scriptService;
@Inject
public PhraseSuggester(ScriptService scriptService) {
this.scriptService = scriptService;
}
/*
* More Ideas:
* - add ability to find whitespace problems -> we can build a poor mans decompounder with our index based on a automaton?
* - add ability to build different error models maybe based on a confusion matrix?
* - try to combine a token with its subsequent token to find / detect word splits (optional)
* - for this to work we need some way to defined the position length of a candidate
* - phonetic filters could be interesting here too for candidate selection
*/
@Override
public Suggestion> innerExecute(String name, PhraseSuggestionContext suggestion, IndexSearcher searcher,
CharsRefBuilder spare) throws IOException {
double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood();
final PhraseSuggestion response = new PhraseSuggestion(name, suggestion.getSize());
final IndexReader indexReader = searcher.getIndexReader();
List generators = suggestion.generators();
final int numGenerators = generators.size();
final List gens = new ArrayList<>(generators.size());
for (int i = 0; i < numGenerators; i++) {
PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i);
DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(generator);
Terms terms = MultiFields.getTerms(indexReader, generator.field());
if (terms != null) {
gens.add(new DirectCandidateGenerator(directSpellChecker, generator.field(), generator.suggestMode(),
indexReader, realWordErrorLikelihood, generator.size(), generator.preFilter(), generator.postFilter(), terms));
}
}
final String suggestField = suggestion.getField();
final Terms suggestTerms = MultiFields.getTerms(indexReader, suggestField);
if (gens.size() > 0 && suggestTerms != null) {
final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood, suggestion.getRequireUnigram(), suggestion.getTokenLimit());
final BytesRef separator = suggestion.separator();
TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());
WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestTerms, suggestField, realWordErrorLikelihood, separator);
Result checkerResult = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(),
gens.toArray(new CandidateGenerator[gens.size()])), suggestion.maxErrors(),
suggestion.getShardSize(), wordScorer, suggestion.confidence(), suggestion.gramSize());
PhraseSuggestion.Entry resultEntry = buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
response.addTerm(resultEntry);
final BytesRefBuilder byteSpare = new BytesRefBuilder();
final EarlyTerminatingCollector collector = Lucene.createExistsCollector();
final CompiledScript collateScript;
if (suggestion.getCollateQueryScript() != null) {
collateScript = suggestion.getCollateQueryScript();
} else if (suggestion.getCollateFilterScript() != null) {
collateScript = suggestion.getCollateFilterScript();
} else {
collateScript = null;
}
final boolean collatePrune = (collateScript != null) && suggestion.collatePrune();
for (int i = 0; i < checkerResult.corrections.length; i++) {
Correction correction = checkerResult.corrections[i];
spare.copyUTF8Bytes(correction.join(SEPARATOR, byteSpare, null, null));
boolean collateMatch = true;
if (collateScript != null) {
// Checks if the template query collateScript yields any documents
// from the index for a correction, collateMatch is updated
final Map vars = suggestion.getCollateScriptParams();
vars.put(SUGGESTION_TEMPLATE_VAR_NAME, spare.toString());
final ExecutableScript executable = scriptService.executable(collateScript, vars);
final BytesReference querySource = (BytesReference) executable.run();
final ParsedQuery parsedQuery;
if (suggestion.getCollateFilterScript() != null) {
parsedQuery = suggestion.getQueryParserService().parse(
QueryBuilders.constantScoreQuery(FilterBuilders.bytesFilter(querySource)));
} else {
parsedQuery = suggestion.getQueryParserService().parse(querySource);
}
collateMatch = Lucene.exists(searcher, parsedQuery.query(), collector);
}
if (!collateMatch && !collatePrune) {
continue;
}
Text phrase = new StringText(spare.toString());
Text highlighted = null;
if (suggestion.getPreTag() != null) {
spare.copyUTF8Bytes(correction.join(SEPARATOR, byteSpare, suggestion.getPreTag(), suggestion.getPostTag()));
highlighted = new StringText(spare.toString());
}
if (collatePrune) {
resultEntry.addOption(new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score), collateMatch));
} else {
resultEntry.addOption(new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score)));
}
}
} else {
response.addTerm(buildResultEntry(suggestion, spare, Double.MIN_VALUE));
}
return response;
}
private PhraseSuggestion.Entry buildResultEntry(PhraseSuggestionContext suggestion, CharsRefBuilder spare, double cutoffScore) {
spare.copyUTF8Bytes(suggestion.getText());
return new PhraseSuggestion.Entry(new StringText(spare.toString()), 0, spare.length(), cutoffScore);
}
ScriptService scriptService() {
return scriptService;
}
@Override
public String[] names() {
return new String[] {"phrase"};
}
@Override
public SuggestContextParser getContextParser() {
return new PhraseSuggestParser(this);
}
}