// com.formkiq.vision.crafter.DocumentSectionCrafter (Maven / Gradle / Ivy)
/*
* Copyright (C) 2018 FormKiQ Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.formkiq.vision.crafter;
import static com.formkiq.vision.crafter.CollectionUtils.isType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import com.formkiq.vision.document.DocumentSection;
import com.formkiq.vision.document.DocumentSectionContent;
import com.formkiq.vision.document.DocumentSource;
import com.formkiq.vision.document.DocumentTextTableContent;
/**
* {@link DocumentSection} Crafter.
*
*/
public class DocumentSectionCrafter
implements Function> {
/** {@link DocumentSource}. */
private DocumentSource document;
/**
* constructor.
* @param source {@link DocumentSource}
*/
public DocumentSectionCrafter(final DocumentSource source) {
this.document = source;
}
@Override
public List apply(final Integer pageNumber) {
PageScratchPad pagePad = new PageScratchPadBuilder(this.document)
.apply(pageNumber);
List blocks = new BlockExtractorBuilder()
.apply(pagePad);
List sections = blocks.stream()
.map(b -> b.toDocumentSection()).collect(Collectors.toList());
sections.forEach(s -> {
s.getContent().removeIf(c -> isEmpty(c));
});
sections.removeIf(s -> s.getContent().isEmpty());
sections = mergeTextTableSections(sections);
return sections;
}
/**
* Returns whether {@link DocumentSectionContent} is empty.
* @param content {@link DocumentSectionContent}
* @return boolean
*/
private boolean isEmpty(final DocumentSectionContent content) {
boolean empty = false;
if (content instanceof DocumentTextTableContent) {
empty = ((DocumentTextTableContent) content).getData().isEmpty();
}
return empty;
}
/**
* Merge similar {@link DocumentSection} together.
* @param sections {@link List} {@link DocumentSection}
* @return {@link List} {@link DocumentSection}
*/
private List mergeTextTableSections(
final List sections) {
List list = new ArrayList<>(sections.size());
DocumentSection last = null;
for (DocumentSection section : sections) {
if (last != null && isTextTableMergeable(last, section)) {
DocumentTextTableContent tt = (DocumentTextTableContent) last
.getContent().get(last.getContent().size() - 1);
List> dlist = section.getContent().stream()
.flatMap(t -> ((DocumentTextTableContent) t).getData()
.stream())
.collect(Collectors.toList());
tt.getData().addAll(dlist);
} else {
list.add(section);
last = section;
}
}
return list;
}
/**
* Is {@link DocumentSection} mergeable.
* @param s0 {@link DocumentSection}
* @param s1 {@link DocumentSection}
* @return boolean
*/
private boolean isTextTableMergeable(final DocumentSection s0,
final DocumentSection s1) {
return s0 != null && s1 != null
&& isType(s0.getContent(), DocumentTextTableContent.class)
&& isType(s1.getContent(), DocumentTextTableContent.class)
&& isDocumentTextTableContentLengthEqual(s0, s1);
}
/**
* Are {@link DocumentTextTableContent} the same length.
* @param s0 {@link DocumentSection}
* @param s1 {@link DocumentSection}
* @return boolean
*/
private boolean isDocumentTextTableContentLengthEqual(
final DocumentSection s0, final DocumentSection s1) {
Set length = new HashSet<>();
for (DocumentSection s : Arrays.asList(s0, s1)) {
for (DocumentSectionContent c : s.getContent()) {
DocumentTextTableContent t = (DocumentTextTableContent) c;
for (List ss : t.getData()) {
length.add(Integer.valueOf(ss.size()));
}
}
}
return length.size() == 1;
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy