// com.formkiq.vision.pdf.PdfTextJoiningVerticalTransformer (Maven / Gradle / Ivy)
/*
* Copyright (C) 2018 FormKiQ Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.formkiq.vision.pdf;
import static java.lang.Math.abs;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import com.formkiq.vision.comparator.DocumentBlockDistanceComparator;
import com.formkiq.vision.comparator.DocumentBlockRectangleComparator;
import com.formkiq.vision.crafter.DocumentRowLayout;
import com.formkiq.vision.document.DocumentText;
import com.formkiq.vision.pdf.predicate.DocumentRawRowPredicate;
import com.formkiq.vision.predicate.DocumentBlockLeftPredicate;
import com.formkiq.vision.predicate.DocumentBlockRightPredicate;
import com.formkiq.vision.predicate.DocumentBlockTopAndXContainsPredicate;
import com.formkiq.vision.predicate.TextEndsWithPredicate;
import com.formkiq.vision.predicate.TextStartsWithCapitalLetterPredicate;
/**
* {@link Function} for Join Vertically similiar {@link DocumentText}.
*
*/
public class PdfTextJoiningVerticalTransformer
implements Function> {
/** {@link DocumentRowLayout}. */
private Collection documentRawRows;
/** {@link List} {@link PdfText}. */
private List texts;
/** {@link List} {@link PdfField}. */
private List fields;
/**
* constructor.
* @param rows {@link Collection} {@link DocumentRowLayout}
* @param pdftexts {@link List} {@link PdfText}
* @param pdffields {@link List} {@link PdfField}
*/
public PdfTextJoiningVerticalTransformer(
final Collection rows,
final List pdftexts, final List pdffields) {
this.documentRawRows = rows;
this.fields = pdffields;
this.texts = pdftexts;
}
@Override
public List apply(final DocumentText text) {
Optional row0 = findDocumentRawRow(text);
List list = new ArrayList<>();
list.add(text);
Optional leftfield = getFieldLeftOfText(text);
Optional rightfield = getFieldRightOfText(text);
List similarTexts = getHorizontallySimilarText(text);
Collections.sort(similarTexts, new DocumentBlockRectangleComparator());
TextStartsWithCapitalLetterPredicate cp =
new TextStartsWithCapitalLetterPredicate();
TextEndsWithPredicate colonpred = new TextEndsWithPredicate(":", ".",
";");
for (Iterator itr = similarTexts.iterator(); itr.hasNext();) {
DocumentText p = itr.next();
if (cp.test(p) && colonpred.test(p)) {
itr.remove();
}
}
float currentY = text.getLowerLeftY();
for (DocumentText t : similarTexts) {
Optional row1 = findDocumentRawRow(t);
Optional lfield = getFieldLeftOfText(t);
if (!isEquals(row0, row1)) {
break;
}
// if has vertical stack of fields with matching text.
// IE: >field< >text<
// >field< >text<
if (hasMatchingField(leftfield, lfield)) {
break;
}
if (rightfield.isPresent()
&& new DocumentBlockTopAndXContainsPredicate(t).test(rightfield.get())) {
break;
}
float d = currentY - t.getUpperRightY();
if (d < t.getFontSize() && t.getFontSize() == text.getFontSize()) {
list.add(t);
} else {
break;
}
currentY = t.getLowerLeftY();
}
return list;
}
/**
* Find {@link DocumentRowLayout} for {@link DocumentText}.
* @param text {@link DocumentText}
* @return {@link Optional} {@link DocumentRowLayout}
*/
private Optional findDocumentRawRow(final DocumentText text) {
return this.documentRawRows.stream()
.filter(new DocumentRawRowPredicate(text)).findFirst();
}
/**
* Is {@link DocumentRowLayout} the same.
* @param r0 {@link DocumentRowLayout}
* @param r1 {@link DocumentRowLayout}
* @return boolean
*/
private boolean isEquals(final Optional r0,
final Optional r1) {
boolean eq = !r0.isPresent() && !r1.isPresent();
if (r0.isPresent() && r1.isPresent()) {
eq = r0.get().equals(r1.get());
}
return eq;
}
/**
* Whether is fields are vertically similar.
* @param f0 {@link Optional} {@link DocumentText}
* @param f1 {@link Optional} {@link DocumentText}
* @return boolean
*/
private boolean hasMatchingField(final Optional f0,
final Optional f1) {
boolean match = false;
if (f0.isPresent() && f1.isPresent()) {
PdfField ff0 = f0.get();
PdfField ff1 = f1.get();
match = new DocumentBlockTopAndXContainsPredicate(ff1).test(ff0);
}
return match;
}
/**
* Get Field Left of Text.
* @param text {@link DocumentText}
* @return {@link Optional} {@link PdfField}
*/
private Optional getFieldLeftOfText(final DocumentText text) {
final int maxDistance = 5;
DocumentBlockLeftPredicate lp = new DocumentBlockLeftPredicate(text);
DocumentBlockDistanceComparator dc =
new DocumentBlockDistanceComparator(text);
Optional field = this.fields.stream().filter(lp).min(dc);
float distance = field.isPresent()
? field.get().getUpperRightX() - text.getLowerLeftX()
: 0;
return field.isPresent() && distance < maxDistance ? field
: Optional.empty();
}
/**
* Get Field Right of Text.
* @param text {@link DocumentText}
* @return {@link Optional} {@link PdfField}
*/
private Optional getFieldRightOfText(final DocumentText text) {
final int maxDistance = 5;
DocumentBlockRightPredicate lp = new DocumentBlockRightPredicate(text);
DocumentBlockDistanceComparator dc =
new DocumentBlockDistanceComparator(text);
Optional field = this.fields.stream().filter(lp).min(dc);
float distance = field.isPresent()
? text.getUpperRightX() - field.get().getLowerLeftX()
: 0;
return field.isPresent() && distance < maxDistance ? field
: Optional.empty();
}
/**
* HorizontallySimilarText.
* @param text {@link DocumentText}
* @return {@link List} {@link DocumentText}
*/
private List getHorizontallySimilarText(final DocumentText text) {
final int delta = 3;
Predicate predicate = t -> text.getFontName().equals(t.getFontName())
&& text.getFontSize() == t.getFontSize()
&& abs(t.getLowerLeftX() - text.getLowerLeftX()) < delta
&& t.getUpperRightY() < text.getUpperRightY();
List lowerXList = this.texts.stream()
.filter(predicate)
.collect(Collectors.toList());
return lowerXList;
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy