// com.formkiq.vision.pdf.PDDocumentSource Maven / Gradle / Ivy
/*
* Copyright (C) 2018 FormKiQ Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.formkiq.vision.pdf;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.form.PDCheckBox;
import org.apache.pdfbox.pdmodel.interactive.form.PDChoice;
import org.apache.pdfbox.pdmodel.interactive.form.PDComboBox;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
import com.formkiq.vision.document.DocumentBlock;
import com.formkiq.vision.document.DocumentBlockRectangle;
import com.formkiq.vision.document.DocumentContentField;
import com.formkiq.vision.document.DocumentField;
import com.formkiq.vision.document.DocumentFieldOption;
import com.formkiq.vision.document.DocumentFieldRequiredType;
import com.formkiq.vision.document.DocumentFieldType;
import com.formkiq.vision.document.DocumentImage;
import com.formkiq.vision.document.DocumentSource;
import com.formkiq.vision.document.DocumentText;
import com.formkiq.vision.predicate.DocumentBlockContainsPredicate;
import com.formkiq.vision.predicate.TextHasLettersPredicate;
/**
* {@link DocumentSource} for {@link PDDocument}.
*
*/
public class PDDocumentSource implements DocumentSource {
/** {@link PDDocument}. */
private PDDocument doc;
/** {@link Map} {@link DocumentField}. */
private Map> fieldMap;
/** {@link Map} {@link PdfText}. */
private Map> textMap;
/** {@link Map} {@link PdfImage} by Page. */
private Map> imageMap;
/** {@link Map} {@link PdfDocumentObjects}. */
private Map pdfTokenResultMap;
/**
 * constructor.
 * Loads a {@link PDDocument} from the stream with
 * {@link PDDocument#load(InputStream)} and delegates to
 * {@link #PDDocumentSource(PDDocument)}.
 * NOTE(review): if the delegated constructor throws, the freshly loaded
 * document is not closed by this constructor.
 * @param is {@link InputStream} to read the PDF from
 * @throws IOException IOException
 */
public PDDocumentSource(final InputStream is) throws IOException {
this(PDDocument.load(is));
}
/**
 * constructor.
 * Stores the document, runs the field, text, token-result and image
 * transformers over it, and finally removes texts that are covered by
 * a field.
 * @param document {@link PDDocument}
 * @throws IOException IOException
 */
public PDDocumentSource(final PDDocument document)
throws IOException {
this.doc = document;
this.fieldMap = new PDDocumentToPDFieldListTransformer()
.transform(document);
this.textMap = new PDDocumentToTextTransformer().apply(document);
this.pdfTokenResultMap = new PDDocumentToPdfTokenResultTransformer()
.apply(document);
// The image transformer receives the token results, so they have to be
// computed before this line.
this.imageMap = new PDDocumentToImageTransformer(this.pdfTokenResultMap)
.apply(document);
// Reads fieldMap and textMap, so both must already be populated.
removeTextCoveredByField();
}
/**
 * Closes the wrapped {@link PDDocument}.
 * @throws IOException if closing the document fails
 */
@Override
public void close() throws IOException {
    final PDDocument document = this.doc;
    document.close();
}
/**
 * Builds a {@link DocumentContentField} from the {@link PDField} wrapped by
 * the given field.
 * @param field {@link DocumentField}; it is cast to {@link PdfField}
 * @return {@link DocumentContentField} carrying name, options, required
 *         flag, type, value(s) and multi-select flag of the PDF field
 */
@Override
public DocumentContentField convertField(final DocumentField field) {
    final PDField source = ((PdfField) field).getField();
    final DocumentContentField content = new DocumentContentField();

    content.setName(source.getFullyQualifiedName());
    content.setOptions(getOptions(source));

    if (source.isRequired()) {
        content.setRequired(DocumentFieldRequiredType.IMMEDIATE);
    } else {
        content.setRequired(DocumentFieldRequiredType.OPTIONAL);
    }

    // Signature fields are the only type told apart; everything else is
    // reported as a plain input.
    if (source instanceof PDSignatureField) {
        content.setType(DocumentFieldType.SIGNATURE);
    } else {
        content.setType(DocumentFieldType.INPUT);
    }

    content.setValue(getValue(source));
    content.setValues(getValues(source));
    content.setMultiplevalues(isMultiSelect(source));
    return content;
}
/**
 * Get the single value of a {@link PDField}.
 * <ul>
 * <li>Non-choice fields: {@link PDField#getValueAsString()}.</li>
 * <li>Single-select {@link PDChoice}: the first selected value, or
 * {@code ""} when nothing is selected.</li>
 * <li>Multi-select {@link PDChoice}: {@code null}.</li>
 * </ul>
 * @param field {@link PDField}
 * @return {@link String}, possibly {@code null}
 */
private String getValue(final PDField field) {
    if (!(field instanceof PDChoice)) {
        return field.getValueAsString();
    }

    final PDChoice choice = (PDChoice) field;
    if (choice.isMultiSelect()) {
        return null;
    }

    // Read the selection once instead of querying the field twice.
    final List<String> selected = choice.getValue();
    return selected.isEmpty() ? "" : selected.get(0);
}
/**
 * Get the {@link DocumentFieldOption}s of a field.
 * Combo boxes yield their options, check boxes their "on" values; every
 * other field type yields {@code null}.
 * NOTE(review): other {@link PDChoice} subtypes (e.g. list boxes) get no
 * options here; confirm that this is intended.
 * @param field {@link PDField}
 * @return {@link List} {@link DocumentFieldOption} or {@code null}
 */
private List<DocumentFieldOption> getOptions(final PDField field) {
    if (field instanceof PDComboBox) {
        return toOptions(((PDComboBox) field).getOptions());
    }
    if (field instanceof PDCheckBox) {
        return toOptions(((PDCheckBox) field).getOnValues());
    }
    return null;
}
/**
 * Convert {@link String}s to {@link DocumentFieldOption}s.
 * Each string is used as both the label and the value of its option.
 * @param options {@link Collection} of {@link String}
 * @return {@link List} {@link DocumentFieldOption}
 */
private List<DocumentFieldOption> toOptions(
        final Collection<String> options) {
    return options.stream().map(text -> {
        DocumentFieldOption option = new DocumentFieldOption();
        option.setLabel(text);
        option.setValue(text);
        return option;
    }).collect(Collectors.toList());
}
/**
 * Get the selected values of a multi-select {@link PDField}.
 * @param field {@link PDField}
 * @return {@link List} {@link String} for a multi-select {@link PDChoice},
 *         otherwise {@code null}
 */
private List<String> getValues(final PDField field) {
    if (!Boolean.TRUE.equals(isMultiSelect(field))) {
        return null;
    }
    if (!(field instanceof PDChoice)) {
        return null;
    }
    return ((PDChoice) field).getValue();
}
/**
 * Tells whether a {@link PDField} is a multi select.
 * @param field {@link PDField}
 * @return the multi-select flag for a {@link PDChoice}, {@code null} for
 *         any other field
 */
private Boolean isMultiSelect(final PDField field) {
    if (!(field instanceof PDChoice)) {
        return null;
    }
    final PDChoice choice = (PDChoice) field;
    return Boolean.valueOf(choice.isMultiSelect());
}
/**
 * Derives the document name by applying a
 * {@link PDDocumentToNameTransformer}, built from the texts stored under
 * page key 0 (or an empty list), to the wrapped document.
 * @return {@link String}
 */
@Override
public String getDocumentName() {
    final Integer firstPage = Integer.valueOf(0);
    return new PDDocumentToNameTransformer(
            this.textMap.getOrDefault(firstPage, Collections.emptyList()))
            .apply(this.doc);
}
/**
 * Get the fields found on a page.
 * @param pageNumber page number used as key into the field map
 * @return {@link List} {@link DocumentField}; empty when the page has no
 *         entry
 */
@Override
public List<DocumentField> getFields(final int pageNumber) {
    final Integer page = Integer.valueOf(pageNumber);
    return this.fieldMap.getOrDefault(page, Collections.emptyList());
}
/**
 * Get the images found on a page.
 * @param pageNumber page number used as key into the image map
 * @return {@link List} {@link DocumentImage}; empty (never {@code null})
 *         when the page has no entry, matching the field and text
 *         accessors
 */
@Override
public List<DocumentImage> getImages(final Integer pageNumber) {
    return this.imageMap.getOrDefault(pageNumber,
            Collections.emptyList());
}
/**
 * Get the number of pages reported by the document catalog's page tree.
 * @return page count
 */
@Override
public int getPageCount() {
    return this.doc.getDocumentCatalog().getPages().getCount();
}
/**
 * Get the height of a page's media box.
 * @param pageNumber page index passed to {@link PDDocument#getPage(int)}
 * @return media box height
 */
@Override
public float getPageHeight(final int pageNumber) {
    final PDRectangle mediaBox = this.doc.getPage(pageNumber).getMediaBox();
    return mediaBox.getHeight();
}
/**
 * Get the width of a page's media box.
 * @param pageNumber page index passed to {@link PDDocument#getPage(int)}
 * @return media box width
 */
@Override
public float getPageWidth(final int pageNumber) {
    final PDRectangle mediaBox = this.doc.getPage(pageNumber).getMediaBox();
    return mediaBox.getWidth();
}
/**
 * Get the raw rectangles of a page as {@link DocumentBlock}s.
 * Each {@link PDRectangle} of the page's token result is converted using
 * its lower-left and upper-right corners.
 * @param pageNumber page number used as key into the token result map
 * @return {@link List} of blocks; empty when the page has no token result
 * @throws IOException IOException
 */
@Override
public List<DocumentBlockRectangle> getRawBlocks(final int pageNumber)
        throws IOException {
    final Integer page = Integer.valueOf(pageNumber);

    // Unknown pages yield no blocks instead of a NullPointerException,
    // in line with getFields / getTexts.
    if (!this.pdfTokenResultMap.containsKey(page)) {
        return Collections.emptyList();
    }

    final List<PDRectangle> rects =
            this.pdfTokenResultMap.get(page).getRectangles();
    return rects.stream()
            .map(r -> new DocumentBlock(r.getLowerLeftX(),
                    r.getLowerLeftY(), r.getUpperRightX(),
                    r.getUpperRightY()))
            .collect(Collectors.toList());
}
/**
 * Get the texts found on a page.
 * @param pageNumber page number used as key into the text map
 * @return {@link List} {@link DocumentText}; empty when the page has no
 *         entry
 */
@Override
public List<DocumentText> getTexts(final int pageNumber) {
    final Integer page = Integer.valueOf(pageNumber);
    return this.textMap.getOrDefault(page, Collections.emptyList());
}
/**
* Remove {@link PdfText} that are covered by a {@link PdfField}.
*/
private void removeTextCoveredByField() {
TextHasLettersPredicate ht = new TextHasLettersPredicate();
for (Map.Entry> e : this.fieldMap.entrySet()) {
Integer pageNumber = e.getKey();
List texts = this.textMap.get(pageNumber);
List fields = e.getValue();
for (DocumentField field : fields) {
DocumentBlockContainsPredicate cp =
new DocumentBlockContainsPredicate(field);
List txt = texts.stream()
.filter(t -> cp.test(t) && ht.test(t))
.collect(Collectors.toList());
texts.removeAll(txt);
}
}
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy