All downloads are free. The search and download features use the official Maven repository.

com.mindee.parsing.common.ocr.OcrPage Maven / Gradle / Ivy

The newest version!
package com.mindee.parsing.common.ocr;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.mindee.geometry.PolygonUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import lombok.Getter;

/**
 * Represents a page.
 */
@Getter
@JsonIgnoreProperties(ignoreUnknown = true)
public class OcrPage {

  /**
   * All the words on the page, in semi-random order.
   */
  @JsonProperty("all_words")
  private List allWords = new ArrayList<>();
  private List> allLines = new ArrayList<>();

  @JsonCreator
  public OcrPage(@JsonProperty("all_words") List allWords) {
    this.allWords = allWords;

    // make sure words are sorted from top to bottom
    this.allWords.sort(
        (word1, word2) -> PolygonUtils.CompareOnY(word1.getPolygon(), word2.getPolygon())
    );
  }

  /**
   * Determine if two words are on the same line.
   */
  private boolean areWordsOnSameLine(Word currentWord, Word nextWord) {
    boolean currentInNext = PolygonUtils.isPointInPolygonY(
        currentWord.getPolygon().getCentroid(),
        nextWord.getPolygon()
    );
    boolean nextInCurrent = PolygonUtils.isPointInPolygonY(
        nextWord.getPolygon().getCentroid(),
        currentWord.getPolygon()
    );
    // We need to check both to eliminate any issues due to word order.
    return currentInNext || nextInCurrent;
  }

  /**
   * Order all the words on the page into lines.
   */
  protected List> toLines() {
    Word current = null;
    List indexes = new ArrayList<>();
    List> lines = new ArrayList<>();

    // go through each word ...
    for (Word ignored : this.allWords) {
      ArrayList line = new ArrayList<>();
      int idx = 0;
      // ... and compare it to all other words.
      for (Word word : this.allWords) {
        idx += 1;
        if (indexes.contains(idx)) {
          continue;
        }
        if (current == null) {
          current = word;
          indexes.add(idx);
          line = new ArrayList<>();
          line.add(word);
        } else if (this.areWordsOnSameLine(current, word)) {
          line.add(word);
          indexes.add(idx);
        }
      }
      current = null;
      if (!line.isEmpty()) {
        line.sort((word1, word2) -> PolygonUtils.CompareOnX(word1.getPolygon(), word2.getPolygon()));
        lines.add(line);
      }
    }
    return lines;
  }

  /**
   * All the words on the page, ordered in lines.
   */
  public List> getAllLines() {
    if (this.allLines.isEmpty()) {
      this.allLines = this.toLines();
    }
    return this.allLines;
  }

  public String toString() {
    return this.getAllLines().stream()
        .map(words -> words.stream().map(Word::getText).collect(Collectors.joining(" ")))
        .collect(Collectors.joining(String.format("%n")));
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy