org.apache.lucene.analysis.miscellaneous.FingerprintFilter

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Filter outputs a single token which is a concatenation of the sorted and
 * de-duplicated set of input tokens. This can be useful for clustering/linking
 * use cases.
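 * <p>
 * For example, with the default separator the input tokens "B A B C" are
 * collapsed into the single output token "A B C".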
 */
public class FingerprintFilter extends TokenFilter {

  public static final int DEFAULT_MAX_OUTPUT_TOKEN_SIZE = 1024;
  public static final char DEFAULT_SEPARATOR = ' ';
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  private CharArraySet uniqueTerms = null;
  private final int maxOutputTokenSize;
  private AttributeSource.State finalState;

  private final char separator;
  private boolean inputEnded = false;


  /**
   * Create a new FingerprintFilter with default settings
   */
  public FingerprintFilter(TokenStream input) {
    this(input, DEFAULT_MAX_OUTPUT_TOKEN_SIZE, DEFAULT_SEPARATOR);
  }

  /**
   * Create a new FingerprintFilter with control over all settings
   * 
   * @param input
   *          the source of tokens to be summarized into a single token
   * @param maxOutputTokenSize
   *          the maximum length of the summarized output token. If exceeded, no
   *          output token is emitted
   * @param separator
   *          the character used to separate tokens combined into the single
   *          output token
   */
  public FingerprintFilter(TokenStream input, int maxOutputTokenSize,
      char separator) {
    super(input);
    this.maxOutputTokenSize = maxOutputTokenSize;
    this.separator = separator;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (inputEnded) {
      return false;
    }
    boolean result = buildSingleOutputToken();
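    // Capture the attribute state so end() can restore the final offset and position data.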
    finalState = captureState();
    return result;
  }

  /**
   * Gathers all tokens from input, de-duplicates, sorts, then concatenates.
   * 
   * @return false for end of stream; true otherwise
   */
  private final boolean buildSingleOutputToken() throws IOException {
    inputEnded = false;

    char clonedLastTerm[] = null;
    uniqueTerms = new CharArraySet(8, false);
    int outputTokenSize = 0;
    while (input.incrementToken()) {
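      // Already past the size limit: skip further accumulation but keep consuming so the input is fully drained.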
      if (outputTokenSize > maxOutputTokenSize) {
        continue;
      }

      final char term[] = termAttribute.buffer();
      final int length = termAttribute.length();

      if (!uniqueTerms.contains(term, 0, length)) {
        // clone the term, and add to the set of seen terms.
        clonedLastTerm = new char[length];
        System.arraycopy(term, 0, clonedLastTerm, 0, length);
        if (uniqueTerms.size() > 0) {
          outputTokenSize++; //Add 1 for the separator char we will output
        }
        uniqueTerms.add(clonedLastTerm);
        outputTokenSize += length;
      }
    }
    //Force end-of-stream operations to get the final state.
    input.end();
    inputEnded = true;

    //Gathering complete - now output exactly zero or one token:

    //Set the attributes for the single output token
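    // endOffset() now holds the final offset recorded by input.end() above.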
    offsetAtt.setOffset(0, offsetAtt.endOffset());
    posLenAtt.setPositionLength(1);
    posIncrAtt.setPositionIncrement(1);
    typeAtt.setType("fingerprint");

    //No tokens gathered - no output
    if (uniqueTerms.size() < 1) {
      termAttribute.setEmpty();
      return false;
    }

    //Tokens gathered are too large - no output
    if (outputTokenSize > maxOutputTokenSize) {
      termAttribute.setEmpty();
      uniqueTerms.clear();
      return false;
    }

    // Special case - faster option when we have a single token
    if (uniqueTerms.size() == 1) {
      termAttribute.setEmpty().append(new String(clonedLastTerm));
      uniqueTerms.clear();
      return true;
    }

    // Sort the set of deduplicated tokens and combine 
    Object[] items = uniqueTerms.toArray();

    Arrays.sort(items, new Comparator<Object>() {
      @Override
      public int compare(Object o1, Object o2) {
        char v1[] = (char[]) o1;
        char v2[] = (char[]) o2;
        int len1 = v1.length;
        int len2 = v2.length;
        int lim = Math.min(len1, len2);

        int k = 0;
        while (k < lim) {
          char c1 = v1[k];
          char c2 = v2[k];
          if (c1 != c2) {
            return c1 - c2;
          }
          k++;
        }
        return len1 - len2;
      }
    });

    //TODO lets append directly to termAttribute?
    StringBuilder sb = new StringBuilder();
    for (Object item : items) {
      if (sb.length() >= 1) {
        sb.append(separator);
      }
      sb.append((char[]) item);
    }
    termAttribute.setEmpty().append(sb);
    uniqueTerms.clear();
    return true;

  }

  @Override
  public final void end() throws IOException {
    if (!inputEnded) {
      // Rare case - if an IOException occurred in buildSingleOutputToken,
      // input.end() may not have been called yet
      input.end();
      inputEnded = true;
    }

    if (finalState != null) {
      restoreState(finalState);
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    inputEnded = false;
    uniqueTerms = null;
  }

}
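
Below is a minimal usage sketch, not part of the Lucene source: the analyzer wiring, the field name "field", and the sample text are illustrative assumptions; only FingerprintFilter itself (with its default separator and size limit) comes from the class above.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FingerprintFilterExample {
  public static void main(String[] args) throws IOException {
    // Whitespace-tokenize, lowercase, then collapse everything into one fingerprint token.
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream sink = new LowerCaseFilter(source);
        sink = new FingerprintFilter(sink); // default separator ' ' and 1024-char limit
        return new TokenStreamComponents(source, sink);
      }
    };

    try (TokenStream ts = analyzer.tokenStream("field",
        "The quick brown fox jumped over the lazy dog The")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // Prints the de-duplicated, sorted, space-joined fingerprint:
        // brown dog fox jumped lazy over quick the
        System.out.println(term.toString());
      }
      ts.end();
    }
    analyzer.close();
  }
}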