org.apache.lucene.analysis.path.PathHierarchyTokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-common Show documentation
Apache Lucene (module: common)
There is a newer version: 10.1.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.path;

import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IgnoreRandomChains;

/**
 * Tokenizer for path-like hierarchies.
 *
 * <p>Given an input such as:
 *
 * <pre>
 *  /something/something/else
 * </pre>
 *
 * it emits one token per path prefix:
 *
 * <pre>
 *  /something
 *  /something/something
 *  /something/something/else
 * </pre>
 *
 * <p>Each emitted token is the previously emitted token extended by the next path segment. A token
 * built on an empty accumulated path gets a position increment of 1; a token that extends a
 * previously emitted one gets a position increment of 0. All tokens share the same start offset.
 */
@IgnoreRandomChains(reason = "broken offsets")
public class PathHierarchyTokenizer extends Tokenizer {

  private static final int DEFAULT_BUFFER_SIZE = 1024;

  /** Delimiter used when none is given. */
  public static final char DEFAULT_DELIMITER = '/';

  /** Number of leading segments skipped when none is given. */
  public static final int DEFAULT_SKIP = 0;

  /** Character that separates path segments in the input. */
  private final char delimiter;

  /** Character written to the emitted terms in place of each delimiter. */
  private final char replacement;

  /** Number of leading path segments that are left out of the emitted terms. */
  private final int skip;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);

  /** Number of input characters dropped so far; used as the start offset of every token. */
  private int startPosition = 0;

  /** Count of segment starts seen so far; characters are kept only once it exceeds {@link #skip}. */
  private int skipped = 0;

  /** True when the previous token stopped at a delimiter that the next token must begin with. */
  private boolean endDelimiter = false;

  /** Text of the most recently emitted token; every new token starts as a copy of it. */
  private final StringBuilder resultToken;

  /** Total number of characters consumed from the reader since the last {@link #reset()}. */
  private int charsRead = 0;

  /** Creates a tokenizer using the default buffer size, delimiter and skip. */
  public PathHierarchyTokenizer() {
    this(DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
  }

  /**
   * Creates a tokenizer using the default buffer size and delimiter.
   *
   * @param skip number of leading path segments to leave out of the emitted tokens
   */
  public PathHierarchyTokenizer(int skip) {
    this(DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, skip);
  }

  /**
   * Creates a tokenizer whose delimiter is also used as the replacement, with the default skip.
   *
   * @param bufferSize initial size requested for the term buffer and the internal path buffer
   * @param delimiter character separating path segments
   */
  public PathHierarchyTokenizer(int bufferSize, char delimiter) {
    this(bufferSize, delimiter, delimiter, DEFAULT_SKIP);
  }

  /**
   * Creates a tokenizer using the default buffer size and skip.
   *
   * @param delimiter character separating path segments in the input
   * @param replacement character written to the tokens in place of the delimiter
   */
  public PathHierarchyTokenizer(char delimiter, char replacement) {
    this(DEFAULT_BUFFER_SIZE, delimiter, replacement, DEFAULT_SKIP);
  }

  /**
   * Creates a tokenizer using the default buffer size.
   *
   * @param delimiter character separating path segments in the input
   * @param replacement character written to the tokens in place of the delimiter
   * @param skip number of leading path segments to leave out of the emitted tokens
   */
  public PathHierarchyTokenizer(char delimiter, char replacement, int skip) {
    this(DEFAULT_BUFFER_SIZE, delimiter, replacement, skip);
  }

  /**
   * Creates a tokenizer using the default buffer size and the given attribute factory.
   *
   * @param factory attribute factory handed to the superclass
   * @param delimiter character separating path segments in the input
   * @param replacement character written to the tokens in place of the delimiter
   * @param skip number of leading path segments to leave out of the emitted tokens
   */
  public PathHierarchyTokenizer(
      AttributeFactory factory, char delimiter, char replacement, int skip) {
    this(factory, DEFAULT_BUFFER_SIZE, delimiter, replacement, skip);
  }

  /**
   * Creates a tokenizer using the default token attribute factory.
   *
   * @param bufferSize initial size requested for the term buffer and the internal path buffer
   * @param delimiter character separating path segments in the input
   * @param replacement character written to the tokens in place of the delimiter
   * @param skip number of leading path segments to leave out of the emitted tokens
   */
  public PathHierarchyTokenizer(int bufferSize, char delimiter, char replacement, int skip) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, bufferSize, delimiter, replacement, skip);
  }

  /**
   * Creates a fully configured tokenizer; every other constructor delegates here.
   *
   * @param factory attribute factory handed to the superclass
   * @param bufferSize initial size requested for the term buffer and the internal path buffer
   * @param delimiter character separating path segments in the input
   * @param replacement character written to the tokens in place of the delimiter
   * @param skip number of leading path segments to leave out of the emitted tokens
   * @throws IllegalArgumentException if {@code bufferSize} or {@code skip} is negative
   */
  public PathHierarchyTokenizer(
      AttributeFactory factory, int bufferSize, char delimiter, char replacement, int skip) {
    super(factory);
    if (bufferSize < 0) {
      throw new IllegalArgumentException("bufferSize cannot be negative");
    }
    if (skip < 0) {
      throw new IllegalArgumentException("skip cannot be negative");
    }
    termAtt.resizeBuffer(bufferSize);

    this.delimiter = delimiter;
    this.replacement = replacement;
    this.skip = skip;
    this.resultToken = new StringBuilder(bufferSize);
  }

  /**
   * Produces the next path prefix by extending the previously emitted one with the next segment.
   *
   * @return {@code true} if a token was produced, {@code false} once the input is exhausted
   * @throws IOException if reading from the input fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    // Every token starts out as a copy of the path emitted so far.
    termAtt.append(resultToken);
    posAtt.setPositionIncrement(resultToken.length() == 0 ? 1 : 0);

    int appended = 0;
    // The previous token stopped right at a delimiter: this token begins with its replacement.
    boolean started = endDelimiter;
    if (started) {
      termAtt.append(replacement);
      appended = 1;
      endDelimiter = false;
    }

    while (true) {
      final int ch = input.read();
      if (ch < 0) {
        // End of input: nothing to report if every character read so far was skipped.
        if (skipped <= skip) {
          return false;
        }
        commitTerm(appended, started);
        return started;
      }
      charsRead++;

      final boolean atDelimiter = ch == delimiter;
      if (!started) {
        // The first character of a token always opens a new segment, delimiter or not.
        started = true;
        skipped++;
      } else if (atDelimiter) {
        if (skipped > skip) {
          // The segment is complete; the delimiter itself is deferred to the next token.
          endDelimiter = true;
          break;
        }
        skipped++;
      }

      if (skipped > skip) {
        termAtt.append(atDelimiter ? replacement : (char) ch);
        appended++;
      } else {
        // Still inside a skipped segment: the character only moves the start offset forward.
        startPosition++;
      }
    }

    commitTerm(appended, true);
    return true;
  }

  /**
   * Fixes the term length and offsets for the current token and optionally records the term as the
   * base of the next token.
   *
   * @param appended number of characters added to the term on top of the accumulated path
   * @param remember whether to store the resulting term as the new accumulated path
   */
  private void commitTerm(int appended, boolean remember) {
    final int total = appended + resultToken.length();
    termAtt.setLength(total);
    offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + total));
    if (remember) {
      resultToken.setLength(0);
      resultToken.append(termAtt.buffer(), 0, total);
    }
  }

  /**
   * Sets the final offset to the corrected count of characters read from the input.
   *
   * @throws IOException if the superclass fails to finish the stream
   */
  @Override
  public final void end() throws IOException {
    super.end();
    final int endOffset = correctOffset(charsRead);
    offsetAtt.setOffset(endOffset, endOffset);
  }

  /**
   * Clears all per-stream state so the tokenizer can process a new input.
   *
   * @throws IOException if the superclass fails to reset
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    startPosition = 0;
    skipped = 0;
    endDelimiter = false;
    charsRead = 0;
    resultToken.setLength(0);
  }
}