de.digitalcollections.solrocr.util.FileBytesCharIterator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-ocrhighlighting Show documentation
Show all versions of solr-ocrhighlighting Show documentation
Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR)
without having to store the OCR documents in the index.
package de.digitalcollections.solrocr.util;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/**
 * Read-only, memory-mapped view of a file that can be used as a {@link java.lang.CharSequence}
 * and as a {@link java.text.CharacterIterator}.
 *
 * <p>ATTENTION: This breaks the semantics of {@link java.text.CharacterIterator} and
 * {@link java.lang.CharSequence} since all indices are byte offsets into the underlying file, not
 * character indices. All methods that don't operate on indices should work as expected.
 *
 * <p>Please note that this means that this type will only work with
 * {@link java.text.BreakIterator} types that don't mess with the index themselves.
 *
 * <p>Content is read byte-per-char when the charset is US-ASCII and is decoded as UTF-8 for every
 * other charset. Instances keep a mutable iteration index and reposition the shared buffer while
 * reading.
 */
public class FileBytesCharIterator implements IterableCharSequence {
  private final Path filePath; // For copy-constructor
  private final MappedByteBuffer buf;
  private final int numBytes;
  private final Charset charset;
  // Cached charset checks, charAt() is called once per character and should not compare names.
  private final boolean utf8;
  private final boolean ascii;
  private int current;

  /**
   * Map the given file, treating its content as UTF-8.
   *
   * @param path file to map
   * @throws IOException if the file cannot be opened or mapped
   * @throws IllegalArgumentException if the first byte cannot start a UTF-8 sequence or the file
   *     is larger than {@link Integer#MAX_VALUE} bytes
   */
  public FileBytesCharIterator(Path path) throws IOException {
    this(path, StandardCharsets.UTF_8);
  }

  /**
   * Map the given file.
   *
   * @param path file to map
   * @param charset charset reported by {@link #getCharset()}; US-ASCII enables the byte-per-char
   *     fast path, UTF-8 enables the first-byte sanity check
   * @throws IOException if the file cannot be opened or mapped
   * @throws IllegalArgumentException if the charset is UTF-8 and the first byte cannot start a
   *     UTF-8 sequence, or if the file is larger than {@link Integer#MAX_VALUE} bytes
   */
  public FileBytesCharIterator(Path path, Charset charset) throws IOException {
    this.charset = charset;
    this.filePath = path;
    // equals() instead of ==, so that an equal Charset instance is recognised as well
    this.utf8 = StandardCharsets.UTF_8.equals(charset);
    this.ascii = StandardCharsets.US_ASCII.equals(charset);
    this.buf = mapReadOnly(path);
    this.numBytes = this.buf.capacity();

    // Spotty heuristic: only the very first byte is checked. A UTF-8 BOM needs no special
    // handling since it starts with 0xEF, which is itself a valid three-byte lead. Using an
    // absolute read keeps this working for files that are shorter than four bytes (or empty).
    if (this.utf8 && this.numBytes > 0 && utf8SequenceLength(this.buf.get(0) & 0xFF) == 0) {
      throw new IllegalArgumentException("File is not UTF-8 encoded");
    }
  }

  /**
   * Copy constructor, maps the same file again and starts at the same index as {@code other}.
   *
   * @param other iterator to copy
   * @throws IOException if the file cannot be opened or mapped
   */
  public FileBytesCharIterator(FileBytesCharIterator other) throws IOException {
    this(other.filePath, other.charset);
    this.current = other.current;
  }

  /**
   * Map the whole file read-only. The channel is closed before returning; per the
   * {@link FileChannel#map} contract the mapping does not depend on the channel staying open.
   */
  private static MappedByteBuffer mapReadOnly(Path path) throws IOException {
    try (FileChannel channel = FileChannel.open(path, StandardOpenOption.READ)) {
      long size = channel.size();
      if (size > Integer.MAX_VALUE) {
        throw new IllegalArgumentException(
            "File is too large to be mapped (" + size + " bytes): " + path);
      }
      return channel.map(MapMode.READ_ONLY, 0, size);
    }
  }

  /**
   * Length in bytes of the UTF-8 sequence introduced by the given (unsigned) lead byte.
   *
   * @return 1 to 4, or 0 if the byte cannot start a sequence
   */
  private static int utf8SequenceLength(int leadByte) {
    if ((leadByte >> 7) == 0) {
      return 1;
    }
    if ((leadByte >> 5) == 0b110) {
      return 2;
    }
    if ((leadByte >> 4) == 0b1110) {
      return 3;
    }
    if ((leadByte >> 3) == 0b11110) {
      return 4;
    }
    return 0;
  }

  /** Number of bytes in the file (NOT the number of characters). */
  @Override
  public int length() {
    return numBytes;
  }

  /**
   * Seek back from {@code offset} over UTF-8 continuation bytes ({@code 10xxxxxx}) to the byte that
   * starts the sequence. The end-of-file offset is returned unchanged, and the search never goes
   * below offset 0, even if the file starts with stray continuation bytes.
   */
  private int adjustOffset(int offset) {
    if (offset == numBytes) {
      return offset;
    }
    while (offset > 0 && ((this.buf.get(offset) & 0xFF) >> 6) == 0b10) {
      offset -= 1;
    }
    return offset;
  }

  /** Whether the sequence starting at {@code start} is four bytes long and decodes to a surrogate pair. */
  private boolean startsSurrogatePair(int start) {
    return utf8SequenceLength(this.buf.get(start) & 0xFF) == 4
        && Character.isHighSurrogate(charAt(start));
  }

  /**
   * Get character at the given byte offset.
   *
   * <p>Note that this will seek back if we're landing inside of a UTF-8 sequence. If the sequence
   * decodes to two UTF-16 chars (a surrogate pair), the first char is returned for the first to
   * third byte of the sequence and the second char for any later byte.
   *
   * @param offset byte offset into the file
   * @throws IndexOutOfBoundsException if {@code offset} is negative or not smaller than
   *     {@link #length()}
   * @throws IllegalArgumentException if the byte found after seeking back cannot start a UTF-8
   *     sequence
   */
  @Override
  public char charAt(int offset) {
    if (offset < 0 || offset >= this.numBytes) {
      throw new IndexOutOfBoundsException("Offset out of bounds: " + offset);
    }
    if (this.ascii) {
      // Mask to avoid sign extension, otherwise the byte 0xFF would turn into DONE ('\uFFFF').
      return (char) (this.buf.get(offset) & 0xFF);
    }

    int start = adjustOffset(offset);
    int sequenceLength = utf8SequenceLength(this.buf.get(start) & 0xFF); // bytes are signed in Java
    if (sequenceLength == 0) {
      throw new IllegalArgumentException("Invalid UTF8?");
    }

    // Never read past the end of the file, a truncated sequence decodes to a replacement char.
    byte[] bytes = new byte[Math.min(sequenceLength, this.numBytes - start)];
    this.buf.position(start);
    this.buf.get(bytes);
    String decoded = new String(bytes, StandardCharsets.UTF_8);

    boolean wantsFirstChar = decoded.length() == 1 || (offset - start) < 3;
    return wantsFirstChar ? decoded.charAt(0) : decoded.charAt(1);
  }

  /**
   * Decode the bytes between the two byte offsets as UTF-8.
   *
   * <p>For UTF-8 files, both offsets are moved back to the start of the sequence they point into.
   *
   * @param start byte offset of the first byte, inclusive
   * @param end byte offset of the last byte, exclusive
   * @throws IndexOutOfBoundsException if the offsets are negative, reversed or beyond the file
   */
  @Override
  public CharSequence subSequence(int start, int end) {
    if (start < 0 || end < 0 || end > this.numBytes || end < start) {
      throw new IndexOutOfBoundsException("Invalid range: " + start + " to " + end);
    }
    if (this.utf8) {
      start = adjustOffset(start);
      end = adjustOffset(end);
    }
    byte[] bytes = new byte[end - start];
    this.buf.position(start);
    this.buf.get(bytes);
    return new String(bytes, StandardCharsets.UTF_8);
  }

  /** Move to the first byte and return the character there, or {@code DONE} for an empty file. */
  @Override
  public char first() {
    this.current = this.getBeginIndex();
    return this.current();
  }

  /**
   * Move to the last byte and return the character it belongs to. For an empty file the index is
   * set to the end index and {@code DONE} is returned.
   */
  @Override
  public char last() {
    if (this.numBytes == 0) {
      this.current = this.getEndIndex();
      return DONE;
    }
    this.current = this.getEndIndex() - 1;
    return this.current();
  }

  /** Character at the current index, or {@code DONE} if the index is at the end of the file. */
  @Override
  public char current() {
    if (this.current == this.numBytes) {
      return DONE;
    }
    return this.charAt(current);
  }

  /**
   * Advance to the next UTF-16 char and return it, or {@code DONE} once the end has been reached.
   *
   * <p>The step is derived from the lead byte of the sequence the index is currently in: from the
   * first char of a surrogate pair the index moves to the fourth byte of the sequence (where
   * {@link #charAt(int)} yields the second char), in every other case it moves to the start of
   * the following sequence.
   */
  @Override
  public char next() {
    if (this.current >= this.numBytes) {
      this.current = this.numBytes;
      return DONE;
    }

    int target;
    if (this.ascii) {
      target = this.current + 1;
    } else {
      int start = adjustOffset(this.current);
      int sequenceLength = utf8SequenceLength(this.buf.get(start) & 0xFF);
      if (sequenceLength == 0) {
        throw new IllegalArgumentException("Invalid UTF8?");
      }
      boolean onHighSurrogate = (this.current - start) < 3 && startsSurrogatePair(start);
      target = onHighSurrogate ? start + 3 : start + sequenceLength;
      // Stray continuation bytes could otherwise keep us in place, always make progress.
      target = Math.max(target, this.current + 1);
    }

    this.current = Math.min(target, this.numBytes);
    return this.current();
  }

  /**
   * Step back to the previous UTF-16 char and return it, or {@code DONE} if there is none.
   *
   * <p>The step is derived from the sequence that ends right before the current character, not
   * from the current character itself. When that sequence is a surrogate pair, the index is
   * placed on its fourth byte so that the second char of the pair is returned first.
   */
  @Override
  public char previous() {
    if (this.current <= 0) {
      return DONE;
    }
    if (this.ascii) {
      this.current -= 1;
      return this.current();
    }

    int start = adjustOffset(this.current);
    if ((this.current - start) >= 3 && startsSurrogatePair(start)) {
      // We're on the second char of a surrogate pair, the previous char is its first half.
      this.current = start;
      return this.current();
    }
    if (start == 0) {
      // The index was inside of the very first character, nothing comes before it.
      this.current = 0;
      return DONE;
    }

    int previousStart = adjustOffset(start - 1);
    boolean endsWithLowSurrogate =
        (start - previousStart) >= 4 && startsSurrogatePair(previousStart);
    this.current = endsWithLowSurrogate ? previousStart + 3 : previousStart;
    return this.current();
  }

  /**
   * Move to the given byte offset and return the character there.
   *
   * @param offset byte offset between the begin index and the end index, both inclusive
   * @return the character at the offset, or {@code DONE} if the offset is the end index
   * @throws IllegalArgumentException if the offset is out of range; the index is left untouched
   *     in that case
   */
  @Override
  public char setIndex(int offset) {
    // Validate before assigning, so a rejected offset does not corrupt the iterator state.
    if (offset < 0 || offset > this.numBytes) {
      throw new IllegalArgumentException("Offset out of bounds: " + offset);
    }
    this.current = offset;
    return this.current();
  }

  @Override
  public int getBeginIndex() {
    return 0;
  }

  @Override
  public int getEndIndex() {
    return numBytes;
  }

  @Override
  public int getIndex() {
    return current;
  }

  /**
   * Create an independent copy via the copy constructor, which maps the file again.
   *
   * @throws RuntimeException wrapping the {@link IOException} if the file cannot be mapped again
   */
  @Override
  public Object clone() {
    try {
      return new FileBytesCharIterator(this);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Absolute path of the mapped file. */
  @Override
  public String getIdentifier() {
    return this.filePath.toAbsolutePath().toString();
  }

  /** All indices used by this type are byte offsets. */
  @Override
  public OffsetType getOffsetType() {
    return OffsetType.BYTES;
  }

  /** Charset that was passed at construction time. */
  @Override
  public Charset getCharset() {
    return this.charset;
  }
}