// All downloads are free. Search and download functionality uses the official Maven repository.

// Artifact listing: com.formkiq.vision.crafter.PageScratchPadBuilder (Maven / Gradle / Ivy)

/*
 * Copyright (C) 2018 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.vision.crafter;

import static com.formkiq.vision.predicate.DocumentBlockContainsPredicate.contains;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;

import com.formkiq.vision.document.DocumentBlockRectangle;
import com.formkiq.vision.document.DocumentSource;
import com.formkiq.vision.document.DocumentText;

/**
 * Converts a Document Page to a {@link PageScratchPad} for processing.
 *
 * <p>Given a page number, the builder creates a {@link PageScratchPad} for
 * that page of the configured {@link DocumentSource}, attaches the page's
 * text lines (as produced by {@link DocumentTextLineExtractor}) and the
 * page's {@link DocumentWall}s that contain at least one text element.
 *
 */
public class PageScratchPadBuilder
        implements Function<Integer, PageScratchPad> {

    /** {@link DocumentSource} every page is read from. */
    private final DocumentSource document;

    /**
     * constructor.
     * @param source {@link DocumentSource}
     */
    public PageScratchPadBuilder(final DocumentSource source) {
        this.document = source;
    }

    /**
     * Builds the {@link PageScratchPad} for a single page.
     *
     * @param pageNumber page to build the scratch pad for
     * @return {@link PageScratchPad} populated with page lines and page walls
     * @throws RuntimeException wrapping the {@link IOException} raised
     *         while reading the page's raw blocks
     */
    @Override
    public PageScratchPad apply(final Integer pageNumber) {

        PageScratchPad pad = new PageScratchPad(this.document, pageNumber);

        // The extractor's result is handed straight to the pad; its element
        // type is declared by DocumentTextLineExtractor, not by this class.
        pad.setPageLines(
                new DocumentTextLineExtractor(this.document).apply(pageNumber));

        // Walls are calculated after the lines are set because the row
        // layout builder receives the scratch pad itself.
        pad.setPageWalls(calculateDocumentWalls(pad));

        return pad;
    }

    /**
     * Calculate {@link DocumentWall} for Page.
     *
     * <p>Raw blocks of the page are turned into row layouts, the row layouts
     * into walls, and finally walls that contain no text are dropped.
     *
     * @param scratchPad {@link PageScratchPad}
     * @return {@link List} {@link DocumentWall}
     */
    private List<DocumentWall> calculateDocumentWalls(
            final PageScratchPad scratchPad) {

        Integer pageNumber = scratchPad.getDocumentPageNumber();
        List<DocumentBlockRectangle> rawblocks = getRawBlocks(pageNumber);

        // Row layouts are only an intermediate value, so they are piped
        // directly into the row-layout-to-wall conversion.
        List<DocumentWall> walls = new DocumentRowLayoutToDocumentWall()
                .apply(new DocumentRowLayoutBuilder(scratchPad)
                        .apply(rawblocks));

        return removeWallWithoutText(walls, pageNumber);
    }

    /**
     * Removes {@link DocumentWall} without Text.
     *
     * <p>The input list is not modified; a new list is returned that keeps
     * the original order of the retained walls.
     *
     * @param walls {@link List} {@link DocumentWall}
     * @param pageNumber {@link Integer}
     * @return {@link List} {@link DocumentWall}
     */
    private List<DocumentWall> removeWallWithoutText(
            final List<DocumentWall> walls, final Integer pageNumber) {

        List<DocumentWall> list = new ArrayList<>(walls.size());

        // Fetched once; the same texts are tested against every wall.
        List<DocumentText> texts =
                this.document.getTexts(pageNumber.intValue());

        for (DocumentWall w : walls) {
            // Keep the wall as soon as one text element lies inside it.
            if (texts.stream().anyMatch(t -> contains(w, t))) {
                list.add(w);
            }
        }

        return list;
    }

    /**
     * Get {@link DocumentBlockRectangle} for Page.
     *
     * @param pageNumber {@link Integer}
     * @return {@link List} {@link DocumentBlockRectangle}
     * @throws RuntimeException wrapping the {@link IOException} thrown by
     *         the {@link DocumentSource}
     */
    private List<DocumentBlockRectangle> getRawBlocks(
            final Integer pageNumber) {
        try {
            return this.document.getRawBlocks(pageNumber.intValue());
        } catch (IOException e) {
            // Function#apply cannot declare checked exceptions, so the
            // failure is rethrown unchecked with the cause preserved.
            throw new RuntimeException(e);
        }
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy