All downloads are free. The search and download features use the official Maven repository.

org.apache.lucene.search.suggest.fst.WFSTCompletionLookup Maven / Gradle / Ivy

There is a newer version: 3.6.2
Show newest version
package org.apache.lucene.search.suggest.fst;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.MinResult;

/**
 * Suggester based on a weighted FST: it first traverses the prefix, 
 * then walks the n shortest paths to retrieve top-ranked
 * suggestions.
 * 

* NOTE: Although the {@link TermFreqIterator} API specifies * floating point weights, input weights should be whole numbers. * Input weights will be cast to a java integer, and any * negative, infinite, or NaN values will be rejected. * * @see Util#shortestPaths(FST, FST.Arc, Comparator, int) * @lucene.experimental */ public class WFSTCompletionLookup extends Lookup { /** * FST, weights are encoded as costs: (Integer.MAX_VALUE-weight) */ // NOTE: like FSTSuggester, this is really a WFSA, if you want to // customize the code to add some output you should use PairOutputs. private FST fst = null; /** * True if exact match suggestions should always be returned first. */ private final boolean exactFirst; /** * Calls {@link #WFSTCompletionLookup(boolean) WFSTCompletionLookup(true)} */ public WFSTCompletionLookup() { this(true); } /** * Creates a new suggester. * * @param exactFirst true if suggestions that match the * prefix exactly should always be returned first, regardless * of score. This has no performance impact, but could result * in low-quality suggestions. 
*/ public WFSTCompletionLookup(boolean exactFirst) { this.exactFirst = exactFirst; } @Override public void build(TermFreqIterator iterator) throws IOException { BytesRef scratch = new BytesRef(); TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); } fst = builder.finish(); } @Override public boolean store(OutputStream output) throws IOException { try { if (fst == null) { return false; } fst.save(new OutputStreamDataOutput(output)); } finally { IOUtils.close(output); } return true; } @Override public boolean load(InputStream input) throws IOException { try { this.fst = new FST(new InputStreamDataInput(input), PositiveIntOutputs.getSingleton(true)); } finally { IOUtils.close(input); } return true; } @Override public List lookup(CharSequence key, boolean onlyMorePopular, int num) { assert num > 0; BytesRef scratch = new BytesRef(key); int prefixLength = scratch.length; Arc arc = new Arc(); // match the prefix portion exactly Long prefixOutput = null; try { prefixOutput = lookupPrefix(scratch, arc); } catch (IOException bogus) { throw new RuntimeException(bogus); } if (prefixOutput == null) { return Collections.emptyList(); } List results = new ArrayList(num); CharsRef spare = new CharsRef(); if (exactFirst && arc.isFinal()) { spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + 
arc.nextFinalOutput))); if (--num == 0) { return results; // that was quick } } // complete top-N MinResult completions[] = null; try { completions = Util.shortestPaths(fst, arc, weightComparator, num); } catch (IOException bogus) { throw new RuntimeException(bogus); } BytesRef suffix = new BytesRef(8); for (MinResult completion : completions) { scratch.length = prefixLength; // append suffix Util.toBytesRef(completion.input, suffix); scratch.append(suffix); spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add(new LookupResult(spare.toString(), decodeWeight(prefixOutput + completion.output))); } return results; } private Long lookupPrefix(BytesRef scratch, Arc arc) throws /*Bogus*/IOException { assert 0 == fst.outputs.getNoOutput().longValue(); long output = 0; BytesReader bytesReader = fst.getBytesReader(0); fst.getFirstArc(arc); byte[] bytes = scratch.bytes; int pos = scratch.offset; int end = pos + scratch.length; while (pos < end) { if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) { return null; } else { output += arc.output.longValue(); } } return output; } /** * Returns the weight associated with an input string, * or null if it does not exist. 
*/ public Object get(CharSequence key) { Arc arc = new Arc(); Long result = null; try { result = lookupPrefix(new BytesRef(key), arc); } catch (IOException bogus) { throw new RuntimeException(bogus); } if (result == null || !arc.isFinal()) { return null; } else { return Integer.valueOf(decodeWeight(result + arc.nextFinalOutput)); } } /** cost -> weight */ private static int decodeWeight(long encoded) { return (int)(Integer.MAX_VALUE - encoded); } /** weight -> cost */ private static int encodeWeight(long value) { if (value < 0 || value > Integer.MAX_VALUE) { throw new UnsupportedOperationException("cannot encode value: " + value); } return Integer.MAX_VALUE - (int)value; } private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper { WFSTTermFreqIteratorWrapper(TermFreqIterator source, Comparator comparator) throws IOException { super(source, comparator, true); } @Override protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException { if (spare.length + 5 >= buffer.length) { buffer = ArrayUtil.grow(buffer, spare.length + 5); } output.reset(buffer); output.writeBytes(spare.bytes, spare.offset, spare.length); output.writeByte((byte)0); // separator: not used, just for sort order output.writeInt(encodeWeight(weight)); writer.write(buffer, 0, output.getPosition()); } @Override protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) { tmpInput.reset(scratch.bytes); tmpInput.skipBytes(scratch.length - 4); // suggestion + separator scratch.length -= 5; // sep + long return tmpInput.readInt(); } } static final Comparator weightComparator = new Comparator () { public int compare(Long left, Long right) { return left.compareTo(right); } }; }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy