// com.formkiq.vision.crafter.BlockExtractorBuilder Maven / Gradle / Ivy
/*
* Copyright (C) 2018 FormKiQ Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.formkiq.vision.crafter;
import static com.formkiq.vision.crafter.CollectionUtils.intersection;
import static com.formkiq.vision.predicate.DocumentBlockContainsPredicate.contains;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.lang3.Range;
import com.formkiq.vision.comparator.RangeFloatComparator;
import com.formkiq.vision.crafter.comparator.BlockExtractorComparator;
import com.formkiq.vision.document.DocumentBlockRectangle;
/**
* {@link Function} that Splits {@link TextLineExtractor} from
* {@link PageScratchPad} into {@link BlockExtractor}.
*
*/
public class BlockExtractorBuilder
implements Function> {
/**
* constructor.
*
*/
public BlockExtractorBuilder() {
}
@Override
public List apply(final PageScratchPad pad) {
List list = new ArrayList<>();
List images = new ImageToBlockExtractorFunction(
pad.getDocument()).apply(pad.getDocumentPageNumber());
list.addAll(images);
List be = buildBlockExtractorFromWall(pad);
List> splits = splitExtractors(be);
for (List split : splits) {
if (!isDocumentWallBlockExtractor(split)) {
split = merge(split);
}
list.addAll(split);
}
Collections.sort(list, new BlockExtractorComparator());
list = convertTables(pad, list);
return list;
}
/**
* Convert {@link MultiBlockExtractor} or {@link TextBlockExtractor} to a
* table.
*
* @param pad {@link PageScratchPad}
* @param blocks {@link List} {@link BlockExtractor}
* @return {@link List} {@link BlockExtractor}
*/
private List convertTables(
final PageScratchPad pad, final List blocks) {
List list = new ArrayList<>();
for (BlockExtractor block : blocks) {
list.add(block);
BlockExtractor we = new BlockExtractorTableFinder(pad).apply(block);
if (we != null) {
list.remove(block);
list.add(we);
}
}
return list;
}
/**
* Is {@link BlockExtractor} only have {@link DocumentWallBlockExtractor}.
* @param list {@link List} {@link BlockExtractor}
* @return boolean
*/
private boolean isDocumentWallBlockExtractor(
final List list) {
return list.stream()
.filter(l -> !(l instanceof DocumentWallBlockExtractor))
.count() == 0;
}
/**
* Split {@link BlockExtractor} where there is a
* {@link DocumentWallBlockExtractor}.
*
* @param blocks {@link List} {@link BlockExtractor}
* @return {@link List} {@link BlockExtractor}
*/
private List> splitExtractors(
final List blocks) {
List reorderblocks = reorderBlocks(blocks);
List> list = splitExtractorsByType(reorderblocks);
list.removeIf(l -> l.isEmpty());
return list;
}
/**
* Reorder {@link TextBlockExtractor} that are close together but have a
* {@link DocumentWallBlockExtractor} inbetween.
*
* @param blocks {@link List} {@link BlockExtractor}
* @return {@link List} {@link BlockExtractor}
*/
private List reorderBlocks(
final List blocks) {
List list = new ArrayList<>(blocks);
for (int i = 0; i < list.size(); i++) {
BlockExtractor block = list.get(i);
if (block instanceof DocumentWallBlockExtractor
&& isTextExtractor(list, i - 1)
&& isTextExtractor(list, i + 1)
&& isYClose(list.get(i - 1), list.get(i + 1))
&& isXClose(list.get(i - 1), list.get(i + 1))
&& !isXClose(list.get(i + 1), block)) {
Collections.swap(list, i, i + 1);
}
}
return list;
}
/**
* Is {@link BlockExtractor} close on X.
* @param b1 {@link BlockExtractor}
* @param b2 {@link BlockExtractor}
* @return boolean
*/
private boolean isXClose(final BlockExtractor b1,
final BlockExtractor b2) {
return b1.getX().isOverlappedBy(b2.getX());
}
/**
* Is {@link BlockExtractor} a {@link TextBlockExtractor}.
* @param list {@link List} {@link BlockExtractor}
* @param i int
* @return boolean
*/
private boolean isTextExtractor(final List list,
final int i) {
boolean match = false;
if (i > -1 && i < list.size()) {
match = list.get(i) instanceof TextBlockExtractor;
}
return match;
}
/**
* Split {@link BlockExtractor} where there is a
* {@link DocumentWallBlockExtractor}.
*
* @param blocks {@link List} {@link BlockExtractor}
* @return {@link List} {@link BlockExtractor}
*/
private List> splitExtractorsByType(
final List blocks) {
List current = null;
List> list = new ArrayList<>();
for (BlockExtractor e : blocks) {
if (current == null || e instanceof DocumentWallBlockExtractor) {
current = new ArrayList<>();
list.add(current);
}
current.add(e);
if (e instanceof DocumentWallBlockExtractor) {
current = new ArrayList<>();
list.add(current);
}
}
return list;
}
/**
* Merge {@link BlockExtractor} that are close in distance.
*
* @param extractors
* {@link List} {@link BlockExtractor}
* @return {@link List} {@link BlockExtractor}
*/
private List merge(final List extractors) {
List list = new ArrayList<>();
BlockExtractor last = null;
for (BlockExtractor e : extractors) {
if (isMergeable(last, e)) {
if (last instanceof MultiBlockExtractor) {
MultiBlockExtractor mb = (MultiBlockExtractor) last;
mb.addExtractor(e);
} else {
list.remove(last);
MultiBlockExtractor mb = new MultiBlockExtractor();
mb.addExtractor(last);
mb.addExtractor(e);
list.add(mb);
last = mb;
}
} else {
list.add(e);
last = e;
}
}
return list;
}
/**
* Is {@link BlockExtractor} mergeable.
*
* @param e0
* {@link BlockExtractor}
* @param e1
* {@link BlockExtractor}
* @return boolean
*/
private boolean isMergeable(final BlockExtractor e0,
final BlockExtractor e1) {
if (e0 == null || e1 == null || e0 instanceof DocumentWallBlockExtractor
|| e1 instanceof DocumentWallBlockExtractor) {
return false;
}
return isYClose(e0, e1);
}
/**
* Checks if {@link BlockExtractor} are close together.
* @param e0 {@link BlockExtractor}
* @param e1 {@link BlockExtractor}
* @return boolean
*/
private boolean isYClose(final BlockExtractor e0, final BlockExtractor e1) {
if (e0 == null || e1 == null) {
return false;
}
final int maxdistance = 20;
float diff = e0.getY().getMinimum().floatValue()
- e1.getY().getMaximum().floatValue();
return diff <= maxdistance;
}
/**
* Are the {@link Range} similar so that they should be 'linked' together.
* Looking for table structures.
*
* @param r0
* {@link Collection} {@link Range}
* @param r1
* {@link Collection} {@link Range}
* @return boolean
*/
public static boolean isSimilar(final Collection> r0,
final Collection> r1) {
boolean found = false;
if (r0 == null || r1 == null) {
return found;
}
List> l0 = new ArrayList<>(r0);
Collections.sort(l0, new RangeFloatComparator());
for (Range l : l0) {
List> list = r1.stream()
.filter(ll -> l.isOverlappedBy(ll))
.collect(Collectors.toList());
// if overlap more than 1 range no match
int matchsize = list.size();
if (matchsize > 1) {
found = false;
break;
} else if (matchsize == 1) {
Range r = list.get(0);
long count = r0.stream().filter(ll -> r.isOverlappedBy(ll))
.count();
if (count > 1) {
found = false;
break;
}
}
found = true;
}
return found;
}
/**
* Build {@link BlockExtractor} from {@link DocumentWall}.
*
* @param pad {@link PageScratchPad}
* @return {@link List} {@link BlockExtractor}
*/
private List buildBlockExtractorFromWall(
final PageScratchPad pad) {
List wallBlocks = buildFromWall(pad);
List pageLines = pad.getPageLines();
List list = new ArrayList<>();
for (DocumentWallBlockExtractor w : wallBlocks) {
boolean found = false;
Collection bks = w.getBlocks();
for (TextLineExtractor te : pageLines) {
List rect = te.getRectangles();
Collection i = intersection(rect, bks);
if (!i.isEmpty()) {
rect.removeAll(bks);
found = true;
}
}
if (found) {
list.add(w);
}
}
for (TextLineExtractor te : pageLines) {
if (!te.getRectangles().isEmpty()) {
TextBlockExtractor e = new TextBlockExtractor(
pad.getDocument());
e.addLine(te);
list.add(e);
}
}
Collections.sort(list, new BlockExtractorComparator());
return list;
}
/**
* Build {@link BlockExtractor} from {@link DocumentWall}.
*
* @param pad
* {@link PageScratchPad}
* @return {@link List} {@link DocumentWallBlockExtractor}
*/
private List buildFromWall(
final PageScratchPad pad) {
List pageLines = pad.getPageLines();
List linerects = pageLines.stream()
.flatMap(s -> s.getRectangles().stream())
.collect(Collectors.toList());
List pageWalls = pad.getPageWalls();
List e = pageWalls.stream().map(w -> {
Collection lineblocks = linerects.stream()
.filter(r -> contains(w, r, 2))
.collect(Collectors.toList());
return new DocumentWallBlockExtractor(pad, w, lineblocks);
}).collect(Collectors.toList());
e.removeIf(w -> w.getLineblocks().isEmpty());
return e.stream().collect(Collectors.toList());
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy