// com.formkiq.vision.crafter.BlockExtractorTableFinder Maven / Gradle / Ivy
/*
* Copyright (C) 2018 FormKiQ Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.formkiq.vision.crafter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.lang3.Range;
import com.formkiq.vision.document.DocumentBlockRectangle;
/**
* {@link Function} to convert {@link BlockExtractor} to
* {@link DocumentWallBlockExtractor} or NULL if {@link BlockExtractor} is NOT a
* table.
*
*/
public class BlockExtractorTableFinder
implements Function {
/** {@link PageScratchPad}. */
private PageScratchPad scratchPad;
/**
* constructor.
* @param pad {@link PageScratchPad}
*/
public BlockExtractorTableFinder(final PageScratchPad pad) {
this.scratchPad = pad;
}
@Override
public BlockExtractor apply(final BlockExtractor e) {
boolean foundTable = false;
MultiBlockExtractor mb = new MultiBlockExtractor();
List alines = findTextLines(e);
Map>> map = generateRanges(alines);
List> split = splitByMergable(alines, map);
for (List lines : split) {
List>> ranges = findRanges(lines, map);
List> mranges = RangeMerger.mergeRanges(ranges);
if (mranges != null) {
List rects = lines.stream()
.flatMap(l -> l.getRectangles().stream())
.collect(Collectors.toList());
DocumentWall w = new TextBlockExtractorToDocumentWall(this.scratchPad, mranges)
.apply(rects);
DocumentWallBlockExtractor de = new DocumentWallBlockExtractor(
this.scratchPad, w, rects);
mb.addExtractor(de);
foundTable = true;
} else {
TextBlockExtractor tb = new TextBlockExtractor(
this.scratchPad.getDocument());
tb.addLines(lines);
mb.addExtractor(tb);
}
}
return foundTable ? mb : null;
}
/**
* Get {@link List} of {@link Range} for a {@link List} of
* {@link TextLineExtractor}.
*
* @param lines {@link List} {@link TextLineExtractor}
* @param map {@link Map} {@link TextLineExtractor} {@link Range}
* @return {@link List} {@link Range}
*/
private List>> findRanges(
final List lines,
final Map>> map) {
return lines.stream().map(l -> map.get(l)).collect(Collectors.toList());
}
/**
* Split {@link TextLineExtractor} by which ones are mergable by Text Range.
* @param lines {@link List} {@link TextLineExtractor}
* @param map {@link Map} {@link TextLineExtractor} {@link Range}
* @return {@link List} {@link TextLineExtractor}
*/
private List> splitByMergable(
final List lines,
final Map>> map) {
List> lastrange = null;
List current = Collections.emptyList();
List> list = new ArrayList<>();
for (TextLineExtractor l : lines) {
List> range = map.get(l);
if (lastrange != null && RangeMerger.isMergeable(lastrange, range)) {
current.add(l);
} else {
current = new ArrayList<>();
current.add(l);
list.add(current);
lastrange = range;
}
}
return list;
}
/**
* Generate {@link Map} of {@link TextLineExtractor} to {@link List}
* {@link Range}.
*
* @param l
* {@link List} {@link TextLineExtractor}
* @return {@link Map} {@link TextLineExtractor} {@link Range}
*/
private Map>> generateRanges(
final List l) {
Map>> map = new HashMap<>();
for (TextLineExtractor t : l) {
List rects = t.getRectangles();
List> ranges = RangeBuilder.buildXRange(rects);
map.put(t, ranges);
}
return map;
}
/**
* Find {@link TextLineExtractor} from {@link TextBlockExtractor} or
* {@link MultiBlockExtractor}.
*
* @param block {@link BlockExtractor}
* @return {@link List} {@link TextLineExtractor}
*/
private List findTextLines(final BlockExtractor block) {
List list = new ArrayList<>();
if (block instanceof TextBlockExtractor) {
list.addAll(((TextBlockExtractor) block).getLines());
} else if (block instanceof MultiBlockExtractor) {
MultiBlockExtractor be = (MultiBlockExtractor) block;
List elist = be.getExtractors();
List tlist = elist.stream()
.filter(e -> e instanceof TextBlockExtractor)
.map(e -> (TextBlockExtractor) e)
.collect(Collectors.toList());
if (elist.size() == tlist.size()) {
list = tlist.stream().flatMap(s -> s.getLines().stream())
.collect(Collectors.toList());
}
}
return list;
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy