All downloads are free. Search and download functionality uses the official Maven repository.

com.formkiq.vision.pdf.PDDocumentSource Maven / Gradle / Ivy

/*
 * Copyright (C) 2018 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.vision.pdf;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.form.PDCheckBox;
import org.apache.pdfbox.pdmodel.interactive.form.PDChoice;
import org.apache.pdfbox.pdmodel.interactive.form.PDComboBox;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;

import com.formkiq.vision.document.DocumentBlock;
import com.formkiq.vision.document.DocumentBlockRectangle;
import com.formkiq.vision.document.DocumentContentField;
import com.formkiq.vision.document.DocumentField;
import com.formkiq.vision.document.DocumentFieldOption;
import com.formkiq.vision.document.DocumentFieldRequiredType;
import com.formkiq.vision.document.DocumentFieldType;
import com.formkiq.vision.document.DocumentImage;
import com.formkiq.vision.document.DocumentSource;
import com.formkiq.vision.document.DocumentText;
import com.formkiq.vision.predicate.DocumentBlockContainsPredicate;
import com.formkiq.vision.predicate.TextHasLettersPredicate;

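/**
 * {@link DocumentSource} backed by a PDFBox {@link PDDocument}.
 *
 * <p>Fields, texts, token results and images are extracted from the document
 * when the source is constructed and are then served per page number.</p>
 */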
public class PDDocumentSource implements DocumentSource {

    /** {@link PDDocument}. */
    private PDDocument doc;
    /** {@link Map} {@link DocumentField}. */
    private Map> fieldMap;
    /** {@link Map} {@link PdfText}. */
	private Map> textMap;
	/** {@link Map} {@link PdfImage} by Page. */
	private Map> imageMap;
	/** {@link Map} {@link PdfDocumentObjects}. */
	private Map pdfTokenResultMap;

	/**
	 * constructor.
	 * @param is {@link InputStream}
	 * @throws IOException IOException  
	 */
	public PDDocumentSource(final InputStream is) throws IOException {
		this(PDDocument.load(is));
	}
	
    /**
     * constructor.
     * @param document {@link PDDocument}
     * @throws IOException IOException
     */
    public PDDocumentSource(final PDDocument document)
            throws IOException {

        this.doc = document;

        this.fieldMap = new PDDocumentToPDFieldListTransformer()
                .transform(document);

        this.textMap = new PDDocumentToTextTransformer().apply(document);

        this.pdfTokenResultMap = new PDDocumentToPdfTokenResultTransformer()
                .apply(document);

        this.imageMap = new PDDocumentToImageTransformer(this.pdfTokenResultMap)
                .apply(document);

        removeTextCoveredByField();
    }

	@Override
	public void close() throws IOException {
		this.doc.close();
	}

    @Override
    public DocumentContentField convertField(final DocumentField field) {

		PDField pdfield = ((PdfField) field).getField();
        DocumentContentField c = new DocumentContentField();

        c.setName(pdfield.getFullyQualifiedName());
        c.setOptions(getOptions(pdfield));
        c.setRequired(pdfield.isRequired() ? DocumentFieldRequiredType.IMMEDIATE
                : DocumentFieldRequiredType.OPTIONAL);
        c.setType(pdfield instanceof PDSignatureField
                ? DocumentFieldType.SIGNATURE
                : DocumentFieldType.INPUT);
        c.setValue(getValue(pdfield));
        c.setValues(getValues(pdfield));
        c.setMultiplevalues(isMultiSelect(pdfield));

        return c;
    }

    /**
     * Get {@link PDField} value.
     * @param field {@link PDField}
     * @return {@link String}
     */
    private String getValue(final PDField field) {

        String val = null;
        boolean isMultiSelect = Boolean.TRUE.equals(isMultiSelect(field));

        if (!isMultiSelect) {

            val = field.getValueAsString();

            if (field instanceof PDChoice) {
                PDChoice c = (PDChoice) field;
                val = !c.getValue().isEmpty() ? c.getValue().get(0) : "";
            }
        }

        return val;
    }

    /**
     * Get {@link DocumentFieldOption}.
     * @param field {@link PdfField}
     * @return {@link List} {@link DocumentFieldOption}
     */
    private List getOptions(final PDField field) {

        if (field instanceof PDComboBox) {
            PDComboBox box = (PDComboBox) field;
            return toOptions(box.getOptions());
        }

        if (field instanceof PDCheckBox) {
            PDCheckBox c = (PDCheckBox) field;
            return toOptions(c.getOnValues());
        }

        return null;
    }

    /**
     * Convert {@link String} to {@link DocumentFieldOption}.
     * @param options {@link String}
     * @return {@link List} {@link DocumentFieldOption}
     */
    private List toOptions(
            final Collection options) {
		return options.stream().map(o -> {
			DocumentFieldOption oo = new DocumentFieldOption();
			oo.setLabel(o);
			oo.setValue(o);
			return oo;
		}).collect(Collectors.toList());
	}

    /**
     * Get Values for {@link PDField}.
     * @param field {@link PDField}
     * @return {@link List} {@link String}
     */
	private List getValues(final PDField field) {

		if (Boolean.TRUE.equals(isMultiSelect(field))) {
	        if (field instanceof PDChoice) {
	            return ((PDChoice) field).getValue();
	        }
		}

        return null;
    }

	/**
	 * Is {@link PDField} a multi select.
	 * @param field {@link PDField}
	 * @return {@link Boolean}
	 */
    private Boolean isMultiSelect(final PDField field) {

    	Boolean result = null;
        if (field instanceof PDChoice) {
            result = Boolean.valueOf(((PDChoice) field).isMultiSelect());
        }

        return result;
    }

    @Override
	public String getDocumentName() {
        List texts = this.textMap.getOrDefault(Integer.valueOf(0),
                Collections.emptyList());

		String name = new PDDocumentToNameTransformer(texts).apply(this.doc);
		return name;
	}

	@Override
    public List getFields(final int pageNumber) {
        return this.fieldMap.getOrDefault(Integer.valueOf(pageNumber),
                Collections.emptyList());
    }

	@Override
    public List getImages(final Integer pageNumber) {
        return this.imageMap.get(pageNumber);
    }

	@Override
    public int getPageCount() {
        PDPageTree pages = this.doc.getDocumentCatalog().getPages();
        return pages.getCount();
    }

    @Override
    public float getPageHeight(final int pageNumber) {
        PDPage page = this.doc.getPage(pageNumber);
        return page.getMediaBox().getHeight();
    }

    @Override
    public float getPageWidth(final int pageNumber) {
        PDPage page = this.doc.getPage(pageNumber);
        return page.getMediaBox().getWidth();
    }

    @Override
    public List getRawBlocks(final int pageNumber)
            throws IOException {

        List rects = this.pdfTokenResultMap
                .get(Integer.valueOf(pageNumber)).getRectangles();

        return rects.stream()
                .map(r -> new DocumentBlock(r.getLowerLeftX(),
                        r.getLowerLeftY(), r.getUpperRightX(),
                        r.getUpperRightY()))
                .collect(Collectors.toList());
    }

    @Override
	public List getTexts(final int pageNumber) {
		return this.textMap.getOrDefault(Integer.valueOf(pageNumber),
                Collections.emptyList());
	}

    /**
     * Remove {@link PdfText} that are covered by a {@link PdfField}.
     */
    private void removeTextCoveredByField() {

    	TextHasLettersPredicate ht = new TextHasLettersPredicate();

    	for (Map.Entry> e : this.fieldMap.entrySet()) {
    		Integer pageNumber = e.getKey();

    		List texts = this.textMap.get(pageNumber);
    		List fields = e.getValue();

            for (DocumentField field : fields) {

                DocumentBlockContainsPredicate cp =
                        new DocumentBlockContainsPredicate(field);

                List txt = texts.stream()
                        .filter(t -> cp.test(t) && ht.test(t))
                        .collect(Collectors.toList());

                texts.removeAll(txt);
            }
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy