All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.formkiq.vision.pdf.PdfTextJoiningVerticalTransformer Maven / Gradle / Ivy

/*
 * Copyright (C) 2018 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.vision.pdf;

import static java.lang.Math.abs;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import com.formkiq.vision.comparator.DocumentBlockDistanceComparator;
import com.formkiq.vision.comparator.DocumentBlockRectangleComparator;
import com.formkiq.vision.crafter.DocumentRowLayout;
import com.formkiq.vision.document.DocumentText;
import com.formkiq.vision.pdf.predicate.DocumentRawRowPredicate;
import com.formkiq.vision.predicate.DocumentBlockLeftPredicate;
import com.formkiq.vision.predicate.DocumentBlockRightPredicate;
import com.formkiq.vision.predicate.DocumentBlockTopAndXContainsPredicate;
import com.formkiq.vision.predicate.TextEndsWithPredicate;
import com.formkiq.vision.predicate.TextStartsWithCapitalLetterPredicate;

/**
 * {@link Function} for Join Vertically similiar {@link DocumentText}.
 *
 */
public class PdfTextJoiningVerticalTransformer
        implements Function> {

    /** {@link DocumentRowLayout}. */
    private Collection documentRawRows;
    /** {@link List} {@link PdfText}. */
	private List texts;
	/** {@link List} {@link PdfField}. */
	private List fields;

	/**
	 * constructor.
	 * @param rows {@link Collection} {@link DocumentRowLayout}
	 * @param pdftexts {@link List} {@link PdfText}
	 * @param pdffields {@link List} {@link PdfField}
	 */
    public PdfTextJoiningVerticalTransformer(
            final Collection rows,
            final List pdftexts, final List pdffields) {
        this.documentRawRows = rows;
        this.fields = pdffields;
        this.texts = pdftexts;
    }

	@Override
	public List apply(final DocumentText text) {

	    Optional row0 = findDocumentRawRow(text);

		List list = new ArrayList<>();
		list.add(text);

		Optional leftfield = getFieldLeftOfText(text);
		Optional rightfield = getFieldRightOfText(text);

		List similarTexts = getHorizontallySimilarText(text);
		Collections.sort(similarTexts, new DocumentBlockRectangleComparator());

        TextStartsWithCapitalLetterPredicate cp =
                new TextStartsWithCapitalLetterPredicate();
        TextEndsWithPredicate colonpred = new TextEndsWithPredicate(":", ".",
                ";");

        for (Iterator itr = similarTexts.iterator(); itr.hasNext();) {
        	DocumentText p = itr.next();
            if (cp.test(p) && colonpred.test(p)) {
                itr.remove();
            }
        }

		float currentY = text.getLowerLeftY();

		for (DocumentText t : similarTexts) {

		    Optional row1 = findDocumentRawRow(t);

			Optional lfield = getFieldLeftOfText(t);

			if (!isEquals(row0, row1)) {
			    break;
			}

			// if has vertical stack of fields with matching text.
			// IE: >field< >text<
			//     >field< >text<
			if (hasMatchingField(leftfield, lfield)) {
				break;
			}

			if (rightfield.isPresent()
					&& new DocumentBlockTopAndXContainsPredicate(t).test(rightfield.get())) {
				break;
			}

			float d = currentY - t.getUpperRightY();
            if (d < t.getFontSize() && t.getFontSize() == text.getFontSize()) {
				list.add(t);
			} else {
				break;
			}

			currentY = t.getLowerLeftY();
		}

		return list;
	}

	/**
	 * Find {@link DocumentRowLayout} for {@link DocumentText}.
	 * @param text {@link DocumentText}
	 * @return {@link Optional} {@link DocumentRowLayout}
	 */
    private Optional findDocumentRawRow(final DocumentText text) {
        return this.documentRawRows.stream()
                .filter(new DocumentRawRowPredicate(text)).findFirst();
    }

    /**
     * Is {@link DocumentRowLayout} the same.
     * @param r0 {@link DocumentRowLayout}
     * @param r1 {@link DocumentRowLayout}
     * @return boolean
     */
    private boolean isEquals(final Optional r0,
            final Optional r1) {
        boolean eq = !r0.isPresent() && !r1.isPresent();

        if (r0.isPresent() && r1.isPresent()) {
            eq = r0.get().equals(r1.get());
        }

        return eq;
    }

    /**
	 * Whether is fields are vertically similar.
	 * @param f0 {@link Optional} {@link DocumentText}
	 * @param f1 {@link Optional} {@link DocumentText}
	 * @return boolean
	 */
    private boolean hasMatchingField(final Optional f0,
            final Optional f1) {

        boolean match = false;

        if (f0.isPresent() && f1.isPresent()) {

        	PdfField ff0 = f0.get();
        	PdfField ff1 = f1.get();

            match = new DocumentBlockTopAndXContainsPredicate(ff1).test(ff0);
        }

        return match;
    }

	/**
	 * Get Field Left of Text.
	 * @param text {@link DocumentText}
	 * @return {@link Optional} {@link PdfField}
	 */
    private Optional getFieldLeftOfText(final DocumentText text) {
        final int maxDistance = 5;

        DocumentBlockLeftPredicate lp = new DocumentBlockLeftPredicate(text);

        DocumentBlockDistanceComparator dc =
                new DocumentBlockDistanceComparator(text);

        Optional field = this.fields.stream().filter(lp).min(dc);

        float distance = field.isPresent()
                ? field.get().getUpperRightX() - text.getLowerLeftX()
                : 0;

        return field.isPresent() && distance < maxDistance ? field
                : Optional.empty();
    }

    /**
     * Get Field Right of Text.
     * @param text {@link DocumentText}
     * @return {@link Optional} {@link PdfField}
     */
    private Optional getFieldRightOfText(final DocumentText text) {
        final int maxDistance = 5;

        DocumentBlockRightPredicate lp = new DocumentBlockRightPredicate(text);

        DocumentBlockDistanceComparator dc =
                new DocumentBlockDistanceComparator(text);

        Optional field = this.fields.stream().filter(lp).min(dc);

        float distance = field.isPresent()
                ? text.getUpperRightX() - field.get().getLowerLeftX()
                : 0;

        return field.isPresent() && distance < maxDistance ? field
                : Optional.empty();
    }

	/**
	 * HorizontallySimilarText.
	 * @param text {@link DocumentText}
	 * @return {@link List} {@link DocumentText}
	 */
    private List getHorizontallySimilarText(final DocumentText text) {
        final int delta = 3;
		Predicate predicate = t -> text.getFontName().equals(t.getFontName())
				&& text.getFontSize() == t.getFontSize()
				&& abs(t.getLowerLeftX() - text.getLowerLeftX()) < delta
				&& t.getUpperRightY() < text.getUpperRightY();

		List lowerXList = this.texts.stream()
				.filter(predicate)
				.collect(Collectors.toList());

        return lowerXList;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy