All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.suggest.FileDictionary Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;

/**
 * Dictionary represented by a text file.
 *
 * 

Format allowed: 1 entry per line:
* An entry can be:
* *

    *
  • suggestion *
  • suggestion fieldDelimiter weight *
  • suggestion fieldDelimiter weight fieldDelimiter payload *
* * where the default fieldDelimiter is {@value #DEFAULT_FIELD_DELIMITER}
* *

NOTE: * *

    *
  • In order to have payload enabled, the first entry has to have a payload *
  • If the weight for an entry is not specified then a value of 1 is used *
  • A payload cannot be specified without having the weight specified for an entry *
  • If the payload for an entry is not specified (assuming payload is enabled) then an empty * payload is returned *
  • An entry cannot have more than two fieldDelimiter *
* *

Example:
* word1 word2 TAB 100 TAB payload1
* word3 TAB 101
* word4 word3 TAB 102
*/ public class FileDictionary implements Dictionary { /** * Tab-delimited fields are most common thus the default, but one can override this via the * constructor */ public static final String DEFAULT_FIELD_DELIMITER = "\t"; private BufferedReader in; private String line; private boolean done = false; private final String fieldDelimiter; /** * Creates a dictionary based on an inputstream. Using {@link #DEFAULT_FIELD_DELIMITER} as the * field separator in a line. * *

NOTE: content is treated as UTF-8 */ public FileDictionary(InputStream dictFile) { this(dictFile, DEFAULT_FIELD_DELIMITER); } /** * Creates a dictionary based on a reader. Using {@link #DEFAULT_FIELD_DELIMITER} as the field * separator in a line. */ public FileDictionary(Reader reader) { this(reader, DEFAULT_FIELD_DELIMITER); } /** * Creates a dictionary based on a reader. Using fieldDelimiter to separate out the * fields in a line. */ public FileDictionary(Reader reader, String fieldDelimiter) { in = new BufferedReader(reader); this.fieldDelimiter = fieldDelimiter; } /** * Creates a dictionary based on an inputstream. Using fieldDelimiter to separate out * the fields in a line. * *

NOTE: content is treated as UTF-8 */ public FileDictionary(InputStream dictFile, String fieldDelimiter) { in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8)); this.fieldDelimiter = fieldDelimiter; } @Override public InputIterator getEntryIterator() { try { return new FileIterator(); } catch (IOException e) { throw new RuntimeException(e); } } final class FileIterator implements InputIterator { private long curWeight; private final BytesRefBuilder spare = new BytesRefBuilder(); private BytesRefBuilder curPayload = new BytesRefBuilder(); private boolean isFirstLine = true; private boolean hasPayloads = false; private FileIterator() throws IOException { line = in.readLine(); if (line == null) { done = true; IOUtils.close(in); } else { String[] fields = line.split(fieldDelimiter); if (fields.length > 3) { throw new IllegalArgumentException("More than 3 fields in one line"); } else if (fields.length == 3) { // term, weight, payload hasPayloads = true; spare.copyChars(fields[0]); readWeight(fields[1]); curPayload.copyChars(fields[2]); } else if (fields.length == 2) { // term, weight spare.copyChars(fields[0]); readWeight(fields[1]); } else { // only term spare.copyChars(fields[0]); curWeight = 1; } } } @Override public long weight() { return curWeight; } @Override public BytesRef next() throws IOException { if (done) { return null; } if (isFirstLine) { isFirstLine = false; return spare.get(); } line = in.readLine(); if (line != null) { String[] fields = line.split(fieldDelimiter); if (fields.length > 3) { throw new IllegalArgumentException("More than 3 fields in one line"); } else if (fields.length == 3) { // term, weight and payload spare.copyChars(fields[0]); readWeight(fields[1]); if (hasPayloads) { curPayload.copyChars(fields[2]); } } else if (fields.length == 2) { // term, weight spare.copyChars(fields[0]); readWeight(fields[1]); if (hasPayloads) { // have an empty payload curPayload = new BytesRefBuilder(); } } else { // only term spare.copyChars(fields[0]); curWeight = 1; if (hasPayloads) { curPayload = new BytesRefBuilder(); } } return spare.get(); } else { done = true; IOUtils.close(in); return null; } } @Override public BytesRef payload() { return (hasPayloads) ? curPayload.get() : null; } @Override public boolean hasPayloads() { return hasPayloads; } private void readWeight(String weight) { // keep reading floats for bw compat try { curWeight = Long.parseLong(weight); } catch ( @SuppressWarnings("unused") NumberFormatException e) { curWeight = (long) Double.parseDouble(weight); } } @Override public Set contexts() { return null; } @Override public boolean hasContexts() { return false; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy