All downloads are free. Search and download functionality uses the official Maven repository.

de.digitalcollections.solrocr.model.OcrBox Maven / Gradle / Ivy

Go to download

Solr plugin to add support for highlighting directly from various OCR formats (hOCR/ALTO/MiniOCR) without having to store the OCR documents in the index.

The newest version!
package de.digitalcollections.solrocr.model;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;

/**
 * A piece of OCR text together with the rectangle it occupies on an {@link OcrPage}.
 *
 * <p>The rectangle is described by the four values {@code ulx}, {@code uly}, {@code lrx} and
 * {@code lry}; width and height are derived as {@code lrx - ulx} and {@code lry - uly}. In
 * addition to the main text a box can carry alternative readings (optionally with offsets),
 * trailing characters, hyphenation info, a confidence value, the id of the highlight span it is
 * part of and the index of a parent region.
 *
 * <p>Unset coordinates and the unset text offset are represented by {@code -1}; the remaining
 * optional values are {@code null} until set.
 *
 * <p>The natural ordering is by page, then by {@code uly}, then by {@code ulx}. It looks at fewer
 * fields than {@link #equals(Object)}, so two boxes that compare as {@code 0} are not necessarily
 * equal.
 */
// Raw NamedList/SimpleOrderedMap are kept on purpose so the public return types stay unchanged.
@SuppressWarnings({"rawtypes", "unchecked"})
public class OcrBox implements Comparable<OcrBox> {
  /**
   * Ordering used by {@link #compareTo(OcrBox)}: page first, then {@code uly}, then {@code ulx}.
   * Stateless, hence shared by all instances instead of being allocated per box.
   */
  private static final Comparator<OcrBox> COMPARATOR =
      Comparator.comparing(OcrBox::getPage)
          .thenComparingDouble(OcrBox::getUly)
          .thenComparingDouble(OcrBox::getUlx);

  private String text;
  private int textOffset = -1;
  private final List<String> alternatives = new ArrayList<>();
  private final List<Integer> alternativeOffsets = new ArrayList<>();
  private String trailingChars = "";
  private OcrPage page;
  private float ulx = -1;
  private float uly = -1;
  private float lrx = -1;
  private float lry = -1;
  private UUID highlightSpan;
  private Integer parentRegionIdx;
  private String dehyphenatedForm;
  private Integer dehyphenatedOffset;
  private Boolean hyphenStart;
  private Double confidence;

  /** Creates an empty box; all coordinates and the text offset are {@code -1}. */
  public OcrBox() {}

  /**
   * Creates a box with text, page, coordinates and highlight span set.
   *
   * @param text the text contained in the box
   * @param page the page the box is located on
   * @param ulx first x coordinate
   * @param uly first y coordinate
   * @param lrx second x coordinate
   * @param lry second y coordinate
   * @param highlightSpan id of the highlight span the box is part of, or {@code null}
   */
  // FIXME: Is this really ulx/uly?
  public OcrBox(
      String text, OcrPage page, float ulx, float uly, float lrx, float lry, UUID highlightSpan) {
    this.text = text;
    this.page = page;
    this.ulx = ulx;
    this.uly = uly;
    this.lrx = lrx;
    this.lry = lry;
    this.highlightSpan = highlightSpan;
  }

  /**
   * Adds a coordinate to {@code map}, choosing the value type based on its magnitude.
   *
   * <p>Values {@code >= 1} and exactly {@code 0} are truncated and added as {@code int}; every
   * other value (fractions between 0 and 1, negative values, NaN) is added unchanged as
   * {@code float}.
   *
   * @param map target map
   * @param name key to add the value under
   * @param val coordinate value
   */
  private static void addDimension(SimpleOrderedMap map, String name, float val) {
    if (val >= 1 || val == 0) {
      map.add(name, (int) val);
    } else {
      map.add(name, val);
    }
  }

  /**
   * Converts this box into a Solr {@link NamedList}.
   *
   * @return a list with the entries {@code ulx}, {@code uly}, {@code lrx}, {@code lry} and, if
   *     they are set, {@code text} and {@code parentRegionIdx}
   */
  public NamedList toNamedList() {
    SimpleOrderedMap snipRegion = new SimpleOrderedMap();
    addDimension(snipRegion, "ulx", this.getUlx());
    addDimension(snipRegion, "uly", this.getUly());
    addDimension(snipRegion, "lrx", this.getLrx());
    addDimension(snipRegion, "lry", this.getLry());
    if (this.getText() != null) {
      snipRegion.add("text", this.getText());
    }
    if (this.getParentRegionIdx() != null) {
      snipRegion.add("parentRegionIdx", this.getParentRegionIdx());
    }
    return snipRegion;
  }

  /**
   * Like {@link #toNamedList()}, but additionally adds a {@code pageIdx} entry holding the
   * position of this box's page inside {@code pages}.
   *
   * <p>Pages are matched by their {@code id}. If this box has no page, or no page in
   * {@code pages} has a matching id, no {@code pageIdx} entry is added.
   *
   * @param pages pages to look up this box's page in
   * @return the list representation of this box
   */
  public NamedList toNamedList(List<OcrPage> pages) {
    SimpleOrderedMap snipRegion = (SimpleOrderedMap) this.toNamedList();
    if (this.page != null) {
      // toNamedList() of this class never adds "pageId", so this is purely defensive
      // (presumably for subclasses or older representations; verify before removing).
      snipRegion.remove("pageId");
      for (int i = 0; i < pages.size(); i++) {
        OcrPage p = pages.get(i);
        if (p.id.equals(this.page.id)) {
          snipRegion.add("pageIdx", i);
          // Stop at the first match: NamedList permits repeated names, so continuing would
          // add a second "pageIdx" entry if an id occurred more than once in pages.
          break;
        }
      }
    }
    return snipRegion;
  }

  /**
   * Builds a debugging representation that only includes values that are set (non-null and, for
   * coordinates and offsets, non-negative).
   */
  @Override
  public String toString() {
    final StringBuilder sb = new StringBuilder("OcrBox{");
    sb.append("text='").append(text).append('\'');
    if (textOffset >= 0) {
      sb.append('@').append(textOffset);
    }
    if (!this.alternatives.isEmpty()) {
      sb.append(", alternatives={");
      for (int i = 0; i < alternatives.size(); i++) {
        sb.append('\'').append(alternatives.get(i)).append('\'');
        // addAlternative() skips null offsets, so there may be fewer offsets than
        // alternatives; guard the index instead of only checking for emptiness.
        if (i < this.alternativeOffsets.size()) {
          sb.append('@').append(this.alternativeOffsets.get(i));
        }
        if (i != alternatives.size() - 1) {
          sb.append(',');
        }
      }
      sb.append('}');
    }
    if (this.trailingChars != null) {
      sb.append(", trailingChars='").append(trailingChars).append('\'');
    }
    if (this.page != null) {
      sb.append(", pageId='").append(page.id).append('\'');
    }
    if (this.ulx >= 0) {
      sb.append(", ulx=").append(ulx);
    }
    if (this.uly >= 0) {
      sb.append(", uly=").append(uly);
    }
    if (this.lrx >= 0) {
      sb.append(", lrx=").append(lrx);
    }
    if (this.lry >= 0) {
      sb.append(", lry=").append(lry);
    }
    if (this.highlightSpan != null) {
      sb.append(", highlightSpan=").append(highlightSpan);
    }
    if (this.dehyphenatedForm != null) {
      sb.append(", dehyphenatedForm='").append(dehyphenatedForm).append('\'');
      if (this.dehyphenatedOffset != null && this.dehyphenatedOffset >= 0) {
        sb.append('@').append(dehyphenatedOffset);
      }
    }
    if (this.hyphenStart != null) {
      sb.append(", hyphenStart=").append(hyphenStart);
    }
    if (this.confidence != null) {
      sb.append(", confidence=").append(confidence);
    }
    if (this.parentRegionIdx != null) {
      sb.append(", parentRegionIdx=").append(parentRegionIdx);
    }
    sb.append('}');
    return sb.toString();
  }

  /**
   * Orders boxes by page, then by {@code uly}, then by {@code ulx}.
   *
   * <p>The page comparison relies on the natural ordering of {@link OcrPage}; both boxes need a
   * non-null page.
   */
  @Override
  public int compareTo(OcrBox o) {
    return COMPARATOR.compare(this, o);
  }

  /** Returns the page this box is located on, or {@code null} if none is set. */
  public OcrPage getPage() {
    return page;
  }

  /** Returns the text of this box, or {@code null} if none is set. */
  public String getText() {
    return text;
  }

  /** Returns the live (mutable) list of alternative readings, in insertion order. */
  public List<String> getAlternatives() {
    return alternatives;
  }

  /**
   * Returns the live (mutable) list of offsets recorded for the alternatives.
   *
   * <p>May be shorter than {@link #getAlternatives()}, since alternatives added without an offset
   * do not contribute an entry here.
   */
  public List<Integer> getAlternativeOffsets() {
    return alternativeOffsets;
  }

  /** Returns the trailing characters; the empty string unless changed via the setter. */
  public String getTrailingChars() {
    return trailingChars;
  }

  /** Returns the offset of the text, or {@code -1} if none is set. */
  public int getTextOffset() {
    return textOffset;
  }

  /** Returns the {@code ulx} coordinate, or {@code -1} if none is set. */
  public float getUlx() {
    return ulx;
  }

  /** Returns the {@code uly} coordinate, or {@code -1} if none is set. */
  public float getUly() {
    return uly;
  }

  /** Returns the {@code lrx} coordinate, or {@code -1} if none is set. */
  public float getLrx() {
    return lrx;
  }

  /** Returns the {@code lry} coordinate, or {@code -1} if none is set. */
  public float getLry() {
    return lry;
  }

  /** Returns {@code lrx - ulx}. */
  public float getWidth() {
    return lrx - ulx;
  }

  /** Returns {@code lry - uly}. */
  public float getHeight() {
    return lry - uly;
  }

  /** Returns the id of the highlight span this box is part of, or {@code null}. */
  public UUID getHighlightSpan() {
    return highlightSpan;
  }

  /** Returns whether a highlight span is set for this box. */
  public boolean isInHighlight() {
    return highlightSpan != null;
  }

  /** Returns the offset of the dehyphenated form, or {@code null} if none is set. */
  public Integer getDehyphenatedOffset() {
    return dehyphenatedOffset;
  }

  /** Returns whether hyphenation info has been set, i.e. {@code hyphenStart} is non-null. */
  public boolean isHyphenated() {
    return hyphenStart != null;
  }

  /** Returns the dehyphenated form, or {@code null} if none is set. */
  public String getDehyphenatedForm() {
    return dehyphenatedForm;
  }

  /**
   * Returns {@code true} only if hyphenation info is set and marks this box as a hyphen start;
   * never returns {@code null}.
   */
  public Boolean isHyphenStart() {
    return isHyphenated() && hyphenStart;
  }

  /** Sets the text of this box. */
  public void setText(String text) {
    this.text = text;
  }

  /**
   * Adds an alternative reading.
   *
   * <p>The offset is only recorded if it is non-null, so mixing alternatives with and without
   * offsets leaves {@link #getAlternativeOffsets()} shorter than {@link #getAlternatives()}.
   *
   * @param alternative the alternative reading
   * @param offset offset of the alternative, or {@code null} if unknown
   */
  public void addAlternative(String alternative, Integer offset) {
    this.alternatives.add(alternative);
    if (offset != null) {
      this.alternativeOffsets.add(offset);
    }
  }

  /** Sets the trailing characters. */
  public void setTrailingChars(String trailingChars) {
    this.trailingChars = trailingChars;
  }

  /** Sets the offset of the text. */
  public void setTextOffset(int textOffset) {
    this.textOffset = textOffset;
  }

  /** Returns the confidence value, or {@code null} if none is set. */
  public Double getConfidence() {
    return confidence;
  }

  /** Sets the confidence value. */
  public void setConfidence(Double confidence) {
    this.confidence = confidence;
  }

  /** Sets the page this box is located on. */
  public void setPage(OcrPage page) {
    this.page = page;
  }

  /** Sets the {@code ulx} coordinate. */
  public void setUlx(float ulx) {
    this.ulx = ulx;
  }

  /** Sets the {@code uly} coordinate. */
  public void setUly(float uly) {
    this.uly = uly;
  }

  /** Sets the {@code lrx} coordinate. */
  public void setLrx(float lrx) {
    this.lrx = lrx;
  }

  /** Sets the {@code lry} coordinate. */
  public void setLry(float lry) {
    this.lry = lry;
  }

  /** Sets the id of the highlight span this box is part of; {@code null} clears it. */
  public void setHighlightSpan(UUID highlightId) {
    this.highlightSpan = highlightId;
  }

  /**
   * Sets the hyphenation info.
   *
   * @param hyphenStart hyphen-start flag; a non-null value makes {@link #isHyphenated()} return
   *     {@code true}
   * @param dehyphenated the dehyphenated form
   */
  public void setHyphenInfo(Boolean hyphenStart, String dehyphenated) {
    this.hyphenStart = hyphenStart;
    this.dehyphenatedForm = dehyphenated;
  }

  /** Sets the offset of the dehyphenated form. */
  public void setDehyphenatedOffset(Integer dehyphenatedOffset) {
    this.dehyphenatedOffset = dehyphenatedOffset;
  }

  /** Returns the index of the parent region, or {@code null} if none is set. */
  public Integer getParentRegionIdx() {
    return parentRegionIdx;
  }

  /** Sets the index of the parent region. */
  public void setParentRegionIdx(int parentRegionIdx) {
    this.parentRegionIdx = parentRegionIdx;
  }

  /**
   * Checks whether {@code other} lies within this box on the same page. Shared edges count as
   * contained.
   *
   * @param other the box to test
   * @return {@code true} if both boxes are on the same page and none of {@code other}'s
   *     coordinates lie outside of this box
   */
  public boolean contains(OcrBox other) {
    // Objects.equals keeps this consistent with equals(Object) below; it degrades to an identity
    // check if OcrPage does not override equals.
    return Objects.equals(other.page, this.page)
        && other.ulx >= this.ulx
        && other.uly >= this.uly
        && other.lrx <= this.lrx
        && other.lry <= this.lry;
  }

  /** Two boxes are equal if they are of the same class and all of their fields are equal. */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    OcrBox ocrBox = (OcrBox) o;
    return textOffset == ocrBox.textOffset
        && Float.compare(ocrBox.ulx, ulx) == 0
        && Float.compare(ocrBox.uly, uly) == 0
        && Float.compare(ocrBox.lrx, lrx) == 0
        && Float.compare(ocrBox.lry, lry) == 0
        && Objects.equals(text, ocrBox.text)
        && Objects.equals(alternatives, ocrBox.alternatives)
        && Objects.equals(alternativeOffsets, ocrBox.alternativeOffsets)
        && Objects.equals(trailingChars, ocrBox.trailingChars)
        && Objects.equals(page, ocrBox.page)
        && Objects.equals(highlightSpan, ocrBox.highlightSpan)
        && Objects.equals(parentRegionIdx, ocrBox.parentRegionIdx)
        && Objects.equals(dehyphenatedForm, ocrBox.dehyphenatedForm)
        && Objects.equals(dehyphenatedOffset, ocrBox.dehyphenatedOffset)
        && Objects.equals(hyphenStart, ocrBox.hyphenStart)
        && Objects.equals(confidence, ocrBox.confidence);
  }

  /**
   * Hashes the same fields that {@link #equals(Object)} compares. All of them are mutable, so
   * the hash code changes when the box is modified.
   */
  @Override
  public int hashCode() {
    return Objects.hash(
        text,
        textOffset,
        alternatives,
        alternativeOffsets,
        trailingChars,
        page,
        ulx,
        uly,
        lrx,
        lry,
        highlightSpan,
        parentRegionIdx,
        dehyphenatedForm,
        dehyphenatedOffset,
        hyphenStart,
        confidence);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy