All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.formkiq.vision.crafter.BlockExtractorTableFinder Maven / Gradle / Ivy

/*
 * Copyright (C) 2018 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.vision.crafter;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.commons.lang3.Range;

import com.formkiq.vision.document.DocumentBlockRectangle;

/**
 * {@link Function} to convert {@link BlockExtractor} to
 * {@link DocumentWallBlockExtractor} or NULL if {@link BlockExtractor} is NOT a
 * table.
 *
 */
public class BlockExtractorTableFinder
        implements Function {

    /** {@link PageScratchPad}. */
    private PageScratchPad scratchPad;

    /**
     * constructor.
     * @param pad {@link PageScratchPad}
     */
    public BlockExtractorTableFinder(final PageScratchPad pad) {
        this.scratchPad = pad;
    }

    @Override
    public BlockExtractor apply(final BlockExtractor e) {

        boolean foundTable = false;
        MultiBlockExtractor mb = new MultiBlockExtractor();
        List alines = findTextLines(e);

		Map>> map = generateRanges(alines);

		List> split = splitByMergable(alines, map);

		for (List lines : split) {

		    List>> ranges = findRanges(lines, map);
		    List> mranges = RangeMerger.mergeRanges(ranges);

		    if (mranges != null) {

                List rects = lines.stream()
                        .flatMap(l -> l.getRectangles().stream())
                        .collect(Collectors.toList());

				DocumentWall w = new TextBlockExtractorToDocumentWall(this.scratchPad, mranges)
						.apply(rects);

                DocumentWallBlockExtractor de = new DocumentWallBlockExtractor(
                        this.scratchPad, w, rects);
                mb.addExtractor(de);

                foundTable = true;

		    } else {

                TextBlockExtractor tb = new TextBlockExtractor(
                        this.scratchPad.getDocument());

		        tb.addLines(lines);
		        mb.addExtractor(tb);
		    }
        }

        return foundTable ? mb : null;
    }

    /**
     * Get {@link List} of {@link Range} for a {@link List} of
     * {@link TextLineExtractor}.
     *
     * @param lines {@link List} {@link TextLineExtractor}
     * @param map {@link Map} {@link TextLineExtractor} {@link Range}
     * @return {@link List} {@link Range}
     */
    private List>> findRanges(
            final List lines,
            final Map>> map) {
        return lines.stream().map(l -> map.get(l)).collect(Collectors.toList());
    }

    /**
     * Split {@link TextLineExtractor} by which ones are mergable by Text Range.
     * @param lines {@link List} {@link TextLineExtractor}
     * @param map {@link Map} {@link TextLineExtractor} {@link Range}
     * @return {@link List} {@link TextLineExtractor}
     */
    private List> splitByMergable(
            final List lines,
            final Map>> map) {

    	List> lastrange = null;
    	List current = Collections.emptyList();
    	List> list = new ArrayList<>();

    	for (TextLineExtractor l : lines) {

    		List> range = map.get(l);

    		if (lastrange != null && RangeMerger.isMergeable(lastrange, range)) {
    			current.add(l);
    		} else {

    			current = new ArrayList<>();
    			current.add(l);
    			list.add(current);

    			lastrange = range;
    		}
		}

		return list;
	}

    /**
     * Generate {@link Map} of {@link TextLineExtractor} to {@link List}
     * {@link Range}.
     *
     * @param l
     *            {@link List} {@link TextLineExtractor}
     * @return {@link Map} {@link TextLineExtractor} {@link Range}
     */
    private Map>> generateRanges(
            final List l) {
        Map>> map = new HashMap<>();

        for (TextLineExtractor t : l) {
            List rects = t.getRectangles();
            List> ranges = RangeBuilder.buildXRange(rects);
            map.put(t, ranges);
        }

        return map;
    }

    /**
     * Find {@link TextLineExtractor} from {@link TextBlockExtractor} or
     * {@link MultiBlockExtractor}.
     *
     * @param block {@link BlockExtractor}
     * @return {@link List} {@link TextLineExtractor}
     */
    private List findTextLines(final BlockExtractor block) {
        List list = new ArrayList<>();

        if (block instanceof TextBlockExtractor) {

            list.addAll(((TextBlockExtractor) block).getLines());

        } else if (block instanceof MultiBlockExtractor) {

            MultiBlockExtractor be = (MultiBlockExtractor) block;

            List elist = be.getExtractors();
            List tlist = elist.stream()
                    .filter(e -> e instanceof TextBlockExtractor)
                    .map(e -> (TextBlockExtractor) e)
                    .collect(Collectors.toList());

            if (elist.size() == tlist.size()) {
                list = tlist.stream().flatMap(s -> s.getLines().stream())
                        .collect(Collectors.toList());
            }
        }

        return list;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy