All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.digitalcollections.solrocr.util.MultiFileBytesCharIterator Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

There is a newer version: 0.7.0
Show newest version
package de.digitalcollections.solrocr.util;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;

public class MultiFileBytesCharIterator implements IterableCharSequence {
  private final List paths;
  private final TreeMap offsetMap;
  private final Map pathToOffset;
  private final Charset charset;
  private final int numBytes;
  private int current;

  private final LoadingCache subiters = CacheBuilder.newBuilder()
      .maximumSize(1000)
      .build(new CacheLoader() {
        @Override
        public FileBytesCharIterator load(Path p) throws Exception {
          return new FileBytesCharIterator(p, charset);
        }
      });

  public MultiFileBytesCharIterator(List filePaths, Charset charset) throws IOException {
    this.paths = filePaths;
    this.charset = charset;
    this.offsetMap = new TreeMap<>();
    this.pathToOffset = new HashMap<>();
    int offset = 0;
    for (Path path : filePaths) {
      offsetMap.put(offset, path);
      pathToOffset.put(path, offset);
      offset += Files.size(path);
    }
    this.numBytes = offset;
  }

  public MultiFileBytesCharIterator(MultiFileBytesCharIterator other) throws IOException {
    this(other.paths, other.charset);
    this.current = other.current;
  }

  private IterableCharSequence getCharSeq(int offset) {
    try {
      Path path = offsetMap.floorEntry(offset).getValue();
      return subiters.get(path);
    } catch (ExecutionException e) {
      throw new RuntimeException(e);
    }
  }

  private int adjustOffset(int offset) {
    return offset - offsetMap.floorKey(offset);
  }

  @Override
  public String getIdentifier() {
    return String.format("{%s}", this.paths.stream().map(Path::toString).collect(Collectors.joining(", ")));
  }

  @Override
  public OffsetType getOffsetType() {
    return OffsetType.BYTES;
  }

  @Override
  public Charset getCharset() {
    return charset;
  }

  @Override
  public int length() {
    return numBytes;
  }

  @Override
  public char charAt(int offset) {
    if (offset < 0 || offset >= this.numBytes) {
      throw new IndexOutOfBoundsException();
    }
    IterableCharSequence seq = getCharSeq(offset);
    int adjustedOffset = adjustOffset(offset);
    return seq.charAt(adjustedOffset);
  }

  @Override
  public CharSequence subSequence(int start, int end) {
    if (start < 0 || end < 0 || end > this.numBytes || end < start) {
      throw new IndexOutOfBoundsException();
    }
    if (offsetMap.floorKey(start).equals(offsetMap.floorKey(end))) {
      // Easy mode, start and end are in the same file
      IterableCharSequence seq = getCharSeq(start);
      return seq.subSequence(adjustOffset(start), adjustOffset(end));
    } else {
      IterableCharSequence seq = getCharSeq(start);
      int adjustedStart = adjustOffset(start);
      StringBuilder sb = new StringBuilder(seq.subSequence(adjustedStart, seq.length()));
      seq = getCharSeq(end);
      int adjustedEnd = adjustOffset(end);
      sb.append(seq.subSequence(0, adjustedEnd));
      return sb.toString();
    }
  }

  @Override
  public char first() {
    this.current = this.getBeginIndex();
    return this.current();
  }

  @Override
  public char last() {
    this.current = this.getEndIndex() - 1;
    return this.current();
  }

  @Override
  public char current() {
    if (this.current == this.numBytes) {
      return DONE;
    }
    return this.charAt(current);
  }

  @Override
  public char next() {
    int inc = Utf8.encodedLength(Character.toString(this.current()));
    this.current = Math.min(this.current + inc, this.numBytes);
    if (this.current == this.numBytes) {
      return DONE;
    }
    return this.current();
  }

  @Override
  public char previous() {
    if (this.current > 0) {
      int dec = Utf8.encodedLength(Character.toString(this.current()));
      this.current = Math.max(this.current - dec, 0);
      return this.current();
    } else {
      return DONE;
    }
  }

  @Override
  public char setIndex(int offset) {
    this.current = offset;
    try {
      return this.current();
    } catch (IndexOutOfBoundsException e) {
      throw new IllegalArgumentException(e);
    }
  }

  @Override
  public int getBeginIndex() {
    return 0;
  }

  @Override
  public int getEndIndex() {
    return numBytes;
  }

  @Override
  public int getIndex() {
    return current;
  }

  @Override
  public Object clone() {
    try {
      return new MultiFileBytesCharIterator(this);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy