// com.formkiq.vision.crafter.PageScratchPadBuilder
/*
* Copyright (C) 2018 FormKiQ Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.formkiq.vision.crafter;
import static com.formkiq.vision.predicate.DocumentBlockContainsPredicate.contains;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import com.formkiq.vision.document.DocumentBlockRectangle;
import com.formkiq.vision.document.DocumentSource;
import com.formkiq.vision.document.DocumentText;
/**
 * Converts a Document Page to a {@link PageScratchPad} for processing.
 *
 * <p>For a given page number this builds a {@link PageScratchPad} bound to the
 * {@link DocumentSource} supplied at construction, then populates it with the
 * page's lines and with the page's {@link DocumentWall}s, keeping only those
 * walls for which at least one text on the page matches the
 * {@code DocumentBlockContainsPredicate.contains} check.
 */
public class PageScratchPadBuilder
    implements Function<Integer, PageScratchPad> {

  /** {@link DocumentSource}. */
  private final DocumentSource document;

  /**
   * constructor.
   * @param source {@link DocumentSource}
   */
  public PageScratchPadBuilder(final DocumentSource source) {
    this.document = source;
  }

  /**
   * Builds the {@link PageScratchPad} for a page.
   * @param pageNumber {@link Integer} page to build the scratch pad for
   * @return {@link PageScratchPad} with page lines and page walls set
   */
  @Override
  public PageScratchPad apply(final Integer pageNumber) {
    PageScratchPad pad = new PageScratchPad(this.document, pageNumber);

    // Lines are set before the walls are calculated because the wall
    // calculation receives the pad itself. The extractor result is passed
    // straight through so no raw List local is needed here.
    pad.setPageLines(
        new DocumentTextLineExtractor(this.document).apply(pageNumber));

    pad.setPageWalls(calculateDocumentWalls(pad));
    return pad;
  }

  /**
   * Calculate {@link DocumentWall} for Page.
   * @param scratchPad {@link PageScratchPad}
   * @return {@link List} {@link DocumentWall}
   */
  private List<DocumentWall> calculateDocumentWalls(
      final PageScratchPad scratchPad) {
    Integer pageNumber = scratchPad.getDocumentPageNumber();
    List<DocumentBlockRectangle> rawblocks = getRawBlocks(pageNumber);

    // raw blocks -> row layouts -> walls; the intermediate row layouts are
    // fed directly into the wall converter.
    List<DocumentWall> walls = new DocumentRowLayoutToDocumentWall()
        .apply(new DocumentRowLayoutBuilder(scratchPad).apply(rawblocks));

    return removeWallWithoutText(walls, pageNumber);
  }

  /**
   * Removes {@link DocumentWall} without Text.
   * @param walls {@link List} {@link DocumentWall}
   * @param pageNumber {@link Integer}
   * @return {@link List} {@link DocumentWall} new list holding, in the
   *         original order, only the walls matched by at least one text
   */
  private List<DocumentWall> removeWallWithoutText(
      final List<DocumentWall> walls, final Integer pageNumber) {
    List<DocumentWall> list = new ArrayList<>(walls.size());
    // Fetched once and reused for every wall.
    List<DocumentText> texts = this.document.getTexts(pageNumber.intValue());
    for (DocumentWall w : walls) {
      if (texts.stream().anyMatch(t -> contains(w, t))) {
        list.add(w);
      }
    }
    return list;
  }

  /**
   * Get {@link DocumentBlockRectangle} for Page.
   * @param pageNumber {@link Integer}
   * @return {@link List} {@link DocumentBlockRectangle}
   * @throws RuntimeException wrapping the {@link IOException} raised while
   *         reading the raw blocks from the document
   */
  private List<DocumentBlockRectangle> getRawBlocks(final Integer pageNumber) {
    try {
      return this.document.getRawBlocks(pageNumber.intValue());
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy