org.apache.lucene.analysis.util.SegmentingTokenizerBase Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-common Show documentation
Additional Analyzers
There is a newer version: 8.11.3
Show newest version
package org.apache.lucene.analysis.util;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;

/**
 * Breaks text into sentences with a {@link BreakIterator} and
 * allows subclasses to decompose these sentences into words.
 * <p>
 * This can be used by subclasses that need sentence context
 * for tokenization purposes, such as CJK segmenters.
 * <p>
 * Additionally it can be used by subclasses that want to mark
 * sentence boundaries (with a custom attribute, extra token, position
 * increment, etc.) for downstream processing.
 *
 * @lucene.experimental
 */
public abstract class SegmentingTokenizerBase extends Tokenizer {
  /** Capacity, in chars, of the internal text buffer. */
  protected static final int BUFFERMAX = 1024;
  /** Chunk of input text currently being segmented. */
  protected final char[] buffer = new char[BUFFERMAX];
  /** Number of chars in {@link #buffer} that actually hold text. */
  private int length;
  /** Prefix of {@link #buffer} that is safe to evaluate; ends at a safe break when one was found. */
  private int usableLength;
  /** Total usable length of all previous buffers for this reader; used to compute the final offset. */
  protected int offset;

  /** Sentence-level break iterator supplied by the subclass. */
  private final BreakIterator iterator;
  /** Adapter that exposes {@link #buffer} to {@link #iterator}. */
  private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /**
   * Creates a new SegmentingTokenizerBase that splits sentences with the
   * given BreakIterator.
   * <p>
   * A BreakIterator must never be shared between different TokenStreams;
   * always pass a freshly created or cloned instance.
   *
   * @param reader   source of the text to tokenize
   * @param iterator sentence break iterator dedicated to this tokenizer
   */
  public SegmentingTokenizerBase(Reader reader, BreakIterator iterator) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, reader, iterator);
  }

  /**
   * Creates a new SegmentingTokenizerBase with an explicit AttributeFactory.
   *
   * @param factory  attribute factory passed to the superclass
   * @param reader   source of the text to tokenize
   * @param iterator sentence break iterator dedicated to this tokenizer
   */
  public SegmentingTokenizerBase(AttributeFactory factory, Reader reader, BreakIterator iterator) {
    super(factory, reader);
    this.iterator = iterator;
  }

  /**
   * Advances to the next word. The current sentence is tried first; when it
   * has no more words, following sentences are tried, refilling the buffer
   * from the reader whenever the sentence iterator runs dry.
   *
   * @return true if a word was produced, false once the input is used up
   */
  @Override
  public final boolean incrementToken() throws IOException {
    // Buffer holds text and the current sentence still has a word to give.
    if (length != 0 && incrementWord()) {
      return true;
    }

    while (!incrementSentence()) {
      refill();
      if (length <= 0) {
        // refill left the buffer empty: no more chars to read
        return false;
      }
    }
    return true;
  }

  /** Points the iterator at an empty range and zeroes all buffer bookkeeping. */
  @Override
  public void reset() throws IOException {
    super.reset();
    wrapper.setText(buffer, 0, 0);
    iterator.setText(wrapper);
    offset = 0;
    usableLength = 0;
    length = 0;
  }

  /** Sets both start and end offset to the corrected end-of-text position. */
  @Override
  public final void end() throws IOException {
    super.end();
    int endOfText = offset;
    if (length >= 0) {
      endOfText += length;
    }
    final int finalOffset = correctOffset(endOfText);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /**
   * Scans the buffered text backwards for the last unambiguous break.
   *
   * @return the index just past the last safe-end char, or -1 if there is none
   */
  private int findSafeEnd() {
    int pos = length;
    while (pos > 0) {
      if (isSafeEnd(buffer[pos - 1])) {
        return pos;
      }
      pos--;
    }
    return -1;
  }

  /**
   * Tells whether a char is an unambiguous break position for sentence
   * tokenization. This implementation accepts U+000D, U+000A, U+0085,
   * U+2028 and U+2029.
   *
   * @param ch the char to test
   * @return true if the buffer may safely be cut right after {@code ch}
   */
  protected boolean isSafeEnd(char ch) {
    return ch == 0x000D
        || ch == 0x000A
        || ch == 0x0085
        || ch == 0x2028
        || ch == 0x2029;
  }

  /**
   * Refills the buffer: accumulates the offset, moves the unprocessed tail to
   * the front, reads more text behind it, and sets usableLength to the last
   * unambiguous break position.
   */
  private void refill() throws IOException {
    final int carried = length - usableLength;
    offset += usableLength;
    // Keep the text that was beyond the previous safe end.
    System.arraycopy(buffer, usableLength, buffer, 0, carried);

    final int wanted = buffer.length - carried;
    final int got = read(input, buffer, carried, wanted);
    length = (got < 0) ? carried : carried + got;

    // Default: use everything. This covers the case where the reader has been
    // emptied, and the case where the whole buffer contains no safe break
    // (tokens may then possibly be truncated).
    int safeEnd = length;
    if (got >= wanted) {
      // Still more data to be read: stop at a safe place if there is one.
      final int lastBreak = findSafeEnd();
      if (lastBreak >= 0) {
        safeEnd = lastBreak;
      }
    }
    usableLength = safeEnd;

    wrapper.setText(buffer, 0, Math.max(0, usableLength));
    iterator.setText(wrapper);
  }

  // TODO: refactor to a shared readFully somewhere
  // (NGramTokenizer does this too):
  /**
   * Reads repeatedly until {@code length} chars have been stored starting at
   * {@code buffer[offset]} or the reader reports end of stream (-1).
   *
   * @return the number of chars actually stored
   */
  private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
    assert length >= 0 : "length must not be negative: " + length;

    int filled = 0;
    while (filled < length) {
      final int count = input.read(buffer, offset + filled, length - filled);
      if (count == -1) { // EOF
        break;
      }
      filled += count;
    }
    return filled;
  }

  /**
   * Steps through the remaining sentences of the buffer until one yields a
   * word.
   *
   * @return true if a token was produced from the buffer, false if the buffer
   *         is empty or the sentence iterator is exhausted
   */
  private boolean incrementSentence() throws IOException {
    if (length == 0) {
      // nothing buffered; the caller must refill
      return false;
    }

    do {
      final int from = iterator.current();
      if (from == BreakIterator.DONE) {
        return false; // BreakIterator exhausted
      }

      // find the next set of boundaries
      final int to = iterator.next();
      if (to == BreakIterator.DONE) {
        return false; // BreakIterator exhausted
      }

      setNextSentence(from, to);
    } while (!incrementWord());

    return true;
  }

  /**
   * Provides the next input sentence for analysis.
   *
   * @param sentenceStart boundary reported by the sentence iterator as the start of the sentence
   * @param sentenceEnd   boundary reported by the sentence iterator as the end of the sentence
   */
  protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);

  /** Returns true if another word is available */
  protected abstract boolean incrementWord();
}