de.digitalcollections.solrocr.iter.FileBytesCharIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-ocrhighlighting Show documentation
Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.
The newest version!
package de.digitalcollections.solrocr.iter;

import de.digitalcollections.solrocr.model.SourcePointer;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

/**
 * ATTENTION: This breaks the semantics of {@link java.text.CharacterIterator} and {@link
 * java.lang.CharSequence} since all indices are byte offsets into the underlying file,
 * not character indices. All methods that don't operate on indices should work as
 * expected.
 *
 * Please note that this means that this type will only work with {@link java.text.BreakIterator}
 * types that don't mess with the index themselves.
 */
public class FileBytesCharIterator implements IterableCharSequence, AutoCloseable {
  private final byte[] copyBuf = new byte[128 * 1024];
  private final Path filePath; // For copy-constructor
  private final FileChannel chan;
  private final MappedByteBuffer buf;
  private final int numBytes;
  private final SourcePointer ptr;
  private final Charset charset;

  private int current;

  public FileBytesCharIterator(Path path, SourcePointer ptr) throws IOException {
    this(path, StandardCharsets.UTF_8, ptr);
  }

  public FileBytesCharIterator(Path path, Charset charset, SourcePointer ptr) throws IOException {
    this.ptr = ptr;
    this.charset = charset;
    this.filePath = path;
    chan = (FileChannel) Files.newByteChannel(path, StandardOpenOption.READ);
    this.numBytes = (int) chan.size();
    this.buf = chan.map(MapMode.READ_ONLY, 0, chan.size());
    if (this.charset == StandardCharsets.UTF_8) {
      byte[] b = new byte[4];
      buf.get(b);
      int[] validationBuf = new int[4];
      for (int i = 0; i < b.length; i++) {
        validationBuf[i] = b[i] & 0xFF;
      }
      // TODO: This is a pretty spotty heuristic, maybe there's something in the stdlib?
      if (!(validationBuf[0] == 0xEF && validationBuf[1] == 0xBB && validationBuf[2] == 0xBF)
          && ((validationBuf[0] >> 3) != 0b11110)
          && ((validationBuf[0] >> 4) != 0b1110)
          && ((validationBuf[0] >> 5) != 0b110)
          && ((validationBuf[0] >> 7) != 0)) {
        throw new IllegalArgumentException("File is not UTF-8 encoded");
      }
    }
  }

  public FileBytesCharIterator(FileBytesCharIterator other) throws IOException {
    this(other.filePath, other.charset, other.ptr);
    this.current = other.current;
  }

  @Override
  public int length() {
    return numBytes;
  }

  /** Move offset to the left until we're on an UTF8 starting byte * */
  private int adjustOffset(int b, int offset) {
    while ((b >> 6) == 0b10) {
      offset -= 1;
      b = this.buf.get(offset) & 0xFF;
    }
    return offset;
  }

  private int adjustOffset(int offset) {
    if (offset == numBytes) {
      return offset;
    }
    int b = this.buf.get(offset) & 0xFF;
    return adjustOffset(b, offset);
  }

  /**
   * Get ASCII character at the given byte offset.
   *
   * Note that for performance reason this will simply return `?` if the byte at the given
   * position is not ASCII. This is done for a 25% performance boost while highlighting, with the
   * reasoning that the `charAt` method is only used by the `BreakIterator` implementations to find
   * OCR blocks. Every format supported by this plugin uses element names and attribute names that
   * are pure ASCII, so we're not missing out on anything relevant, as long as the user doesn't put
   * non-ASCII characters into attribute values.
   */
  @Override
  public char charAt(int offset) {
    int b = buf.get(offset) & 0xFF; // bytes are signed in Java....
    if (b < 0x80) {
      // Optimization: It's just ASCII, so simply cast to a char
      return (char) b;
    } else {
      // Dirty dirty dirty speed hack, see method docstring.
      return '?';
    }
  }

  @Override
  public CharSequence subSequence(int start, int end) {
    if (start < 0 || end < 0 || end > this.numBytes || end < start) {
      throw new IndexOutOfBoundsException();
    }
    if (charset == StandardCharsets.UTF_8) {
      start = adjustOffset(start);
      end = adjustOffset(end);
    }
    byte[] buf = new byte[end - start];
    this.buf.position(start);
    this.buf.get(buf);
    return new String(buf, charset);
  }

  public CharSequence subSequence(int start, int end, boolean forceAscii) {
    if (!forceAscii) {
      return subSequence(start, end);
    }
    if (start < 0 || end < 0 || end > this.numBytes || end < start) {
      throw new IndexOutOfBoundsException();
    }
    int copyLen = end - start;
    this.buf.position(start);
    this.buf.get(copyBuf, 0, end - start);

    // Faster pure-ASCII decoding, just treat everything as ASCII, a good chunk faster than
    // `new String(buf, StandardCharsets.US_ASCII)`, which has a few sanity checks.
    // Ignore the deprecation warning, the drawbacks of this constructor don't concern us
    // in this case, since we don't care about misinterpreted codepoints.
    // Bonus: With the String compaction available in JDK >= 9, this should be *significantly*
    //        faster than the constructor with an explicit charset.
    return new String(copyBuf, 0, 0, copyLen);
  }

  @Override
  public char first() {
    this.current = this.getBeginIndex();
    return this.current();
  }

  @Override
  public char last() {
    this.current = this.getEndIndex() - 1;
    return this.current();
  }

  @Override
  public char current() {
    if (this.current == this.numBytes) {
      return DONE;
    }
    return this.charAt(current);
  }

  @Override
  public char next() {
    char c = this.current();
    int inc = 1;
    if (Character.isHighSurrogate(c) || c > '\u07FF') {
      inc = 3;
    } else if (c > '\u007F') {
      inc = 2;
    }
    this.current = Math.min(this.current + inc, this.numBytes);
    if (this.current == this.numBytes) {
      return DONE;
    }
    return this.current();
  }

  @Override
  public char previous() {
    if (this.current > 0) {
      char c = this.current();
      int dec = 1;
      if (Character.isLowSurrogate(c) || c > '\u07FF') {
        dec = 3;
      } else if (c > '\u007F') {
        dec = 2;
      }
      this.current = Math.max(this.current - dec, 0);
      return this.current();
    } else {
      return DONE;
    }
  }

  @Override
  public char setIndex(int offset) {
    this.current = offset;
    try {
      return this.current();
    } catch (IndexOutOfBoundsException e) {
      throw new IllegalArgumentException(e);
    }
  }

  @Override
  public int getBeginIndex() {
    return 0;
  }

  @Override
  public int getEndIndex() {
    return numBytes;
  }

  @Override
  public int getIndex() {
    return current;
  }

  @Override
  public Object clone() {
    try {
      return new FileBytesCharIterator(this);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public String getIdentifier() {
    return this.filePath.toAbsolutePath().toString();
  }

  @Override
  public OffsetType getOffsetType() {
    return OffsetType.BYTES;
  }

  @Override
  public Charset getCharset() {
    return this.charset;
  }

  @Override
  public SourcePointer getPointer() {
    return ptr;
  }

  @Override
  public void close() throws IOException {
    chan.close();
  }
}