All downloads are free. Search and download functionality uses the official Maven repository.

com.formkiq.vision.crafter.BlockExtractorBuilder Maven / Gradle / Ivy

/*
 * Copyright (C) 2018 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.vision.crafter;

import static com.formkiq.vision.crafter.CollectionUtils.intersection;
import static com.formkiq.vision.predicate.DocumentBlockContainsPredicate.contains;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.commons.lang3.Range;

import com.formkiq.vision.comparator.RangeFloatComparator;
import com.formkiq.vision.crafter.comparator.BlockExtractorComparator;
import com.formkiq.vision.document.DocumentBlockRectangle;

/**
 * {@link Function} that Splits {@link TextLineExtractor} from
 * {@link PageScratchPad} into {@link BlockExtractor}.
 *
 */
public class BlockExtractorBuilder
        implements Function> {

    /**
     * constructor.
     *
     */
    public BlockExtractorBuilder() {
    }

    @Override
    public List apply(final PageScratchPad pad) {

        List list = new ArrayList<>();

        List images = new ImageToBlockExtractorFunction(
                pad.getDocument()).apply(pad.getDocumentPageNumber());
        list.addAll(images);

        List be = buildBlockExtractorFromWall(pad);
        List> splits = splitExtractors(be);

        for (List split : splits) {

            if (!isDocumentWallBlockExtractor(split)) {
                split = merge(split);
            }

            list.addAll(split);
        }

        Collections.sort(list, new BlockExtractorComparator());

        list = convertTables(pad, list);

        return list;
    }

    /**
     * Convert {@link MultiBlockExtractor} or {@link TextBlockExtractor} to a
     * table.
     *
     * @param pad {@link PageScratchPad}
     * @param blocks {@link List} {@link BlockExtractor}
     * @return {@link List} {@link BlockExtractor}
     */
    private List convertTables(
            final PageScratchPad pad, final List blocks) {

        List list = new ArrayList<>();

        for (BlockExtractor block : blocks) {

            list.add(block);

            BlockExtractor we = new BlockExtractorTableFinder(pad).apply(block);

            if (we != null) {
                list.remove(block);
                list.add(we);
            }
        }

        return list;
    }

    /**
     * Is {@link BlockExtractor} only have {@link DocumentWallBlockExtractor}.
     * @param list {@link List} {@link BlockExtractor}
     * @return boolean
     */
    private boolean isDocumentWallBlockExtractor(
            final List list) {
        return list.stream()
                .filter(l -> !(l instanceof DocumentWallBlockExtractor))
                .count() == 0;
    }

    /**
     * Split {@link BlockExtractor} where there is a
     * {@link DocumentWallBlockExtractor}.
     *
     * @param blocks {@link List} {@link BlockExtractor}
     * @return {@link List} {@link BlockExtractor}
     */
    private List> splitExtractors(
            final List blocks) {

        List reorderblocks = reorderBlocks(blocks);
        List> list = splitExtractorsByType(reorderblocks);

        list.removeIf(l -> l.isEmpty());

        return list;
    }

    /**
     * Reorder {@link TextBlockExtractor} that are close together but have a
     * {@link DocumentWallBlockExtractor} inbetween.
     *
     * @param blocks {@link List} {@link BlockExtractor}
     * @return {@link List} {@link BlockExtractor}
     */
    private List reorderBlocks(
            final List blocks) {

        List list = new ArrayList<>(blocks);

        for (int i = 0; i < list.size(); i++) {

            BlockExtractor block = list.get(i);

            if (block instanceof DocumentWallBlockExtractor
                    && isTextExtractor(list, i - 1)
                    && isTextExtractor(list, i + 1)
                    && isYClose(list.get(i - 1), list.get(i + 1))
                    && isXClose(list.get(i - 1), list.get(i + 1))
                    && !isXClose(list.get(i + 1), block)) {
                Collections.swap(list, i, i + 1);
            }
        }

        return list;
    }

    /**
     * Is {@link BlockExtractor} close on X.
     * @param b1 {@link BlockExtractor}
     * @param b2 {@link BlockExtractor}
     * @return boolean
     */
    private boolean isXClose(final BlockExtractor b1,
            final BlockExtractor b2) {
        return b1.getX().isOverlappedBy(b2.getX());
    }

    /**
     * Is {@link BlockExtractor} a {@link TextBlockExtractor}.
     * @param list {@link List} {@link BlockExtractor}
     * @param i int
     * @return boolean
     */
    private boolean isTextExtractor(final List list,
            final int i) {
        boolean match = false;

        if (i > -1 && i < list.size()) {
            match = list.get(i) instanceof TextBlockExtractor;
        }

        return match;
    }

    /**
     * Split {@link BlockExtractor} where there is a
     * {@link DocumentWallBlockExtractor}.
     *
     * @param blocks {@link List} {@link BlockExtractor}
     * @return {@link List} {@link BlockExtractor}
     */
    private List> splitExtractorsByType(
            final List blocks) {

        List current = null;
        List> list = new ArrayList<>();

        for (BlockExtractor e : blocks) {

            if (current == null || e instanceof DocumentWallBlockExtractor) {
                current = new ArrayList<>();
                list.add(current);
            }

            current.add(e);

            if (e instanceof DocumentWallBlockExtractor) {
                current = new ArrayList<>();
                list.add(current);
            }
        }

        return list;
    }

    /**
     * Merge {@link BlockExtractor} that are close in distance.
     *
     * @param extractors
     *            {@link List} {@link BlockExtractor}
     * @return {@link List} {@link BlockExtractor}
     */
    private List merge(final List extractors) {

        List list = new ArrayList<>();

        BlockExtractor last = null;

        for (BlockExtractor e : extractors) {

            if (isMergeable(last, e)) {

                if (last instanceof MultiBlockExtractor) {
                    MultiBlockExtractor mb = (MultiBlockExtractor) last;
                    mb.addExtractor(e);
                } else {
                    list.remove(last);

                    MultiBlockExtractor mb = new MultiBlockExtractor();
                    mb.addExtractor(last);
                    mb.addExtractor(e);
                    list.add(mb);

                    last = mb;
                }

            } else {
                list.add(e);
                last = e;
            }
        }

        return list;
    }

    /**
     * Is {@link BlockExtractor} mergeable.
     *
     * @param e0
     *            {@link BlockExtractor}
     * @param e1
     *            {@link BlockExtractor}
     * @return boolean
     */
    private boolean isMergeable(final BlockExtractor e0,
            final BlockExtractor e1) {

        if (e0 == null || e1 == null || e0 instanceof DocumentWallBlockExtractor
                || e1 instanceof DocumentWallBlockExtractor) {
            return false;
        }

        return isYClose(e0, e1);
    }

    /**
     * Checks if {@link BlockExtractor} are close together.
     * @param e0 {@link BlockExtractor}
     * @param e1 {@link BlockExtractor}
     * @return boolean
     */
    private boolean isYClose(final BlockExtractor e0, final BlockExtractor e1) {

        if (e0 == null || e1 == null) {
            return false;
        }

        final int maxdistance = 20;
        float diff = e0.getY().getMinimum().floatValue()
                - e1.getY().getMaximum().floatValue();
        return diff <= maxdistance;
    }

    /**
     * Are the {@link Range} similar so that they should be 'linked' together.
     * Looking for table structures.
     *
     * @param r0
     *            {@link Collection} {@link Range}
     * @param r1
     *            {@link Collection} {@link Range}
     * @return boolean
     */
    public static boolean isSimilar(final Collection> r0,
            final Collection> r1) {

        boolean found = false;
        if (r0 == null || r1 == null) {
            return found;
        }

        List> l0 = new ArrayList<>(r0);
        Collections.sort(l0, new RangeFloatComparator());

        for (Range l : l0) {

            List> list = r1.stream()
                    .filter(ll -> l.isOverlappedBy(ll))
                    .collect(Collectors.toList());

            // if overlap more than 1 range no match
            int matchsize = list.size();

            if (matchsize > 1) {
                found = false;
                break;
            } else if (matchsize == 1) {
                Range r = list.get(0);

                long count = r0.stream().filter(ll -> r.isOverlappedBy(ll))
                        .count();
                if (count > 1) {
                    found = false;
                    break;
                }
            }

            found = true;
        }

        return found;
    }

    /**
     * Build {@link BlockExtractor} from {@link DocumentWall}.
     *
     * @param pad {@link PageScratchPad}
     * @return {@link List} {@link BlockExtractor}
     */
    private List buildBlockExtractorFromWall(
            final PageScratchPad pad) {

        List wallBlocks = buildFromWall(pad);
        List pageLines = pad.getPageLines();

        List list = new ArrayList<>();

        for (DocumentWallBlockExtractor w : wallBlocks) {

            boolean found = false;
            Collection bks = w.getBlocks();

            for (TextLineExtractor te : pageLines) {

                List rect = te.getRectangles();
                Collection i = intersection(rect, bks);

                if (!i.isEmpty()) {
                    rect.removeAll(bks);
                    found = true;
                }
            }

            if (found) {
                list.add(w);
            }
        }

        for (TextLineExtractor te : pageLines) {

            if (!te.getRectangles().isEmpty()) {
                TextBlockExtractor e = new TextBlockExtractor(
                        pad.getDocument());

                e.addLine(te);
                list.add(e);
            }
        }

        Collections.sort(list, new BlockExtractorComparator());

        return list;
    }

    /**
     * Build {@link BlockExtractor} from {@link DocumentWall}.
     *
     * @param pad
     *            {@link PageScratchPad}
     * @return {@link List} {@link DocumentWallBlockExtractor}
     */
    private List buildFromWall(
            final PageScratchPad pad) {

        List pageLines = pad.getPageLines();

        List linerects = pageLines.stream()
                .flatMap(s -> s.getRectangles().stream())
                .collect(Collectors.toList());

        List pageWalls = pad.getPageWalls();

        List e = pageWalls.stream().map(w -> {

            Collection lineblocks = linerects.stream()
                    .filter(r -> contains(w, r, 2))
                    .collect(Collectors.toList());

            return new DocumentWallBlockExtractor(pad, w, lineblocks);

        }).collect(Collectors.toList());

        e.removeIf(w -> w.getLineblocks().isEmpty());

        return e.stream().collect(Collectors.toList());
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy