org.apache.lucene.search.suggest.FileDictionary Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;

/**
 * Dictionary represented by a text file.
 *
 * <p>Format allowed: 1 entry per line:<br>
 * An entry can be: <br>
 *
 * <ul>
 *   <li>suggestion
 *   <li>suggestion <code>fieldDelimiter</code> weight
 *   <li>suggestion <code>fieldDelimiter</code> weight <code>fieldDelimiter</code> payload
 * </ul>
 *
 * where the default <code>fieldDelimiter</code> is {@value #DEFAULT_FIELD_DELIMITER}<br>
 *
 * <p><b>NOTE:</b>
 *
 * <ul>
 *   <li>In order to have payload enabled, the first entry has to have a payload
 *   <li>If the weight for an entry is not specified then a value of 1 is used
 *   <li>A payload cannot be specified without having the weight specified for an entry
 *   <li>If the payload for an entry is not specified (assuming payload is enabled) then an empty
 *       payload is returned
 *   <li>An entry cannot have more than two <code>fieldDelimiter</code>
 * </ul>
 *
 * <p><b>Example:</b><br>
 * word1 word2 TAB 100 TAB payload1<br>
 * word3 TAB 101<br>
 * word4 word3 TAB 102<br>
 */
public class FileDictionary implements Dictionary {

  /**
   * Tab-delimited fields are most common thus the default, but one can override this via the
   * constructor
   */
  public static final String DEFAULT_FIELD_DELIMITER = "\t";

  /** Source of the dictionary lines; closed by the iterator once the end of input is reached. */
  private final BufferedReader in;

  /** The most recently read raw line, or {@code null} once the input is exhausted. */
  private String line;

  /** Set once the end of the input has been reached and the reader has been closed. */
  private boolean done = false;

  /** Separator handed to {@link String#split(String)}, hence interpreted as a regular expression. */
  private final String fieldDelimiter;

  /**
   * Creates a dictionary based on an inputstream. Using {@link #DEFAULT_FIELD_DELIMITER} as the
   * field separator in a line.
   *
   * <p><b>NOTE:</b> content is treated as UTF-8
   *
   * @param dictFile stream to read the dictionary entries from
   */
  public FileDictionary(InputStream dictFile) {
    this(dictFile, DEFAULT_FIELD_DELIMITER);
  }

  /**
   * Creates a dictionary based on a reader. Using {@link #DEFAULT_FIELD_DELIMITER} as the field
   * separator in a line.
   *
   * @param reader reader to read the dictionary entries from
   */
  public FileDictionary(Reader reader) {
    this(reader, DEFAULT_FIELD_DELIMITER);
  }

  /**
   * Creates a dictionary based on a reader. Using <code>fieldDelimiter</code> to separate out the
   * fields in a line.
   *
   * @param reader reader to read the dictionary entries from
   * @param fieldDelimiter separator between the fields of a line; it is passed to {@link
   *     String#split(String)} and is therefore treated as a regular expression
   */
  public FileDictionary(Reader reader, String fieldDelimiter) {
    in = new BufferedReader(reader);
    this.fieldDelimiter = fieldDelimiter;
  }

  /**
   * Creates a dictionary based on an inputstream. Using <code>fieldDelimiter</code> to separate out
   * the fields in a line.
   *
   * <p><b>NOTE:</b> content is treated as UTF-8
   *
   * @param dictFile stream to read the dictionary entries from
   * @param fieldDelimiter separator between the fields of a line; it is passed to {@link
   *     String#split(String)} and is therefore treated as a regular expression
   */
  public FileDictionary(InputStream dictFile, String fieldDelimiter) {
    in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
    this.fieldDelimiter = fieldDelimiter;
  }

  /**
   * Returns an iterator over the entries of this dictionary. Creating the iterator already reads
   * the first line from the underlying reader.
   *
   * @return an iterator positioned before the first entry
   * @throws RuntimeException wrapping the {@link IOException} if reading the first line fails
   * @throws IllegalArgumentException if the first line has more than three fields
   */
  @Override
  public InputIterator getEntryIterator() {
    try {
      return new FileIterator();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Iterator over the lines of the enclosing dictionary. It reads from the enclosing instance's
   * reader and closes that reader when the end of the input is reached.
   */
  final class FileIterator implements InputIterator {
    private long curWeight;
    private final BytesRefBuilder spare = new BytesRefBuilder();
    private BytesRefBuilder curPayload = new BytesRefBuilder();
    private boolean isFirstLine = true;
    private boolean hasPayloads = false;

    /**
     * Reads and parses the first line eagerly, because whether payloads are enabled for the whole
     * iteration is decided by the first entry.
     */
    private FileIterator() throws IOException {
      line = in.readLine();
      if (line == null) {
        done = true;
        IOUtils.close(in);
      } else {
        parseLine(line, true);
      }
    }

    @Override
    public long weight() {
      return curWeight;
    }

    /**
     * Advances to the next entry.
     *
     * @return the suggestion of the next entry, or {@code null} when the input is exhausted
     * @throws IOException if reading from or closing the underlying reader fails
     * @throws IllegalArgumentException if the line has more than three fields
     */
    @Override
    public BytesRef next() throws IOException {
      if (done) {
        return null;
      }
      if (isFirstLine) {
        // The first entry was already parsed by the constructor.
        isFirstLine = false;
        return spare.get();
      }
      line = in.readLine();
      if (line == null) {
        done = true;
        IOUtils.close(in);
        return null;
      }
      parseLine(line, false);
      return spare.get();
    }

    @Override
    public BytesRef payload() {
      return (hasPayloads) ? curPayload.get() : null;
    }

    @Override
    public boolean hasPayloads() {
      return hasPayloads;
    }

    /**
     * Splits one line into its fields and stores the suggestion, weight and payload as the current
     * entry.
     *
     * @param entry the raw line to parse
     * @param firstEntry {@code true} for the first line, which alone decides whether payloads are
     *     enabled
     * @throws IllegalArgumentException if the line has more than three fields
     * @throws NumberFormatException if the weight field is neither a long nor a double
     */
    private void parseLine(String entry, boolean firstEntry) {
      String[] fields = entry.split(fieldDelimiter);
      if (fields.length > 3) {
        throw new IllegalArgumentException("More than 3 fields in one line");
      }
      if (firstEntry && fields.length == 3) {
        hasPayloads = true;
      }

      // String.split drops trailing empty strings, so a line made up only of delimiters yields
      // a zero-length array; treat it as an empty suggestion instead of failing on fields[0].
      spare.copyChars(fields.length == 0 ? "" : fields[0]);

      if (fields.length >= 2) {
        readWeight(fields[1]);
      } else {
        curWeight = 1; // weight not specified
      }

      if (hasPayloads) {
        if (fields.length == 3) {
          curPayload.copyChars(fields[2]);
        } else {
          // payload not specified for this entry: expose an empty one
          curPayload = new BytesRefBuilder();
        }
      }
    }

    /** Parses the weight field as a long, falling back to a truncated double. */
    private void readWeight(String weight) {
      // keep reading floats for bw compat
      try {
        curWeight = Long.parseLong(weight);
      } catch (
          @SuppressWarnings("unused")
          NumberFormatException e) {
        curWeight = (long) Double.parseDouble(weight);
      }
    }

    /** Contexts are not supported by this dictionary; always returns {@code null}. */
    @Override
    public Set<BytesRef> contexts() {
      return null;
    }

    @Override
    public boolean hasContexts() {
      return false;
    }
  }
}