All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.formkiq.vision.pdf.PDDocumentToTextTransformer Maven / Gradle / Ivy

/*
 * Copyright (C) 2017 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.vision.pdf;

import static com.formkiq.vision.pdf.PDRectangleUtil.create;
import static java.lang.Math.round;
import static org.apache.commons.lang3.StringUtils.isEmpty;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import com.formkiq.vision.document.DocumentText;

/**
 * Create a map from a {@link PDDocument} to the Text by Page Number..
 *
 */
public class PDDocumentToTextTransformer extends PDFTextStripper
        implements Function>> {

    /** {@link List} of {@link DocumentText}. */
    private Map> textMap;

    /**
     * constructor.
     * @throws IOException IOException
     */
    public PDDocumentToTextTransformer() throws IOException {
        super();
        setSortByPosition(true);
        this.textMap = new HashMap<>();
    }

    @Override
    public Map> apply(final PDDocument doc) {

        try {
            getText(doc);
            mergeTextHorizontal();
            return this.textMap;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Merge Horizontally Similar Text.
     */
    private void mergeTextHorizontal() {

        for (Map.Entry> e : this.textMap.entrySet()) {

            List texts = e.getValue();
            PdfTextJoiningHorizontalTransformer ht =
                    new PdfTextJoiningHorizontalTransformer(texts);

            for (DocumentText text : new ArrayList<>(texts)) {

                List result = ht.apply(text);

                if (text.equals(result.get(0))) {

                    text.setText(result.stream().map(s -> s.getText())
                            .collect(Collectors.joining(" ")));

                    text.setUpperRightX(result.stream()
                            .map(r -> Float
                                    .valueOf(r.getUpperRightX()))
                            .max(Comparator.naturalOrder()).get().floatValue());

                    if (result.size() > 1) {
                    	texts.removeAll(result.subList(1, result.size()));
                    }
                    
                } else {
                    texts.remove(text);
                }
            }
        }
    }

	/**
    * Override the default functionality of PDFTextStripper.
    */
    @Override
    protected void writeString(final String o, final List list)
            throws IOException {

        Integer page = Integer.valueOf(getCurrentPageNo() - 1);

        if (!this.textMap.containsKey(page)) {
            this.textMap.put(page, new ArrayList<>());
        }

        List positions = removeNonPrintableAndExtraSpaces(list);

        List splitPoints = getSplitPoints(positions);

        List> splits = splitAtPoints(positions, splitPoints);

		for (List tps : splits) {

            String text = toString(tps);

			if (text.isEmpty()) {
				continue;
			}

			PDRectangle rect = calculateTextPosition(tps);

			PDFont font = tps.get(0).getFont();
            float fontSize = tps.stream()
                    .map(s -> Float.valueOf(s.getFontSizeInPt()))
                    .max(Float::compare).orElse(Float.valueOf(0)).floatValue();

			PdfText tf = new PdfText();
			tf.setText(text.replaceAll("\t", " "));
			tf.setRectangle(rect);
			tf.setFontSize(fontSize);
			tf.setFontName(font.getName());

			this.textMap.get(page).add(tf);
		}
    }

    /**
     * Conversions to {@link String}.
     * @param tps {@link List} {@link TextPosition}
     * @return {@link String}
     */
	private String toString(final List tps) {
		String text = tps.stream().map(s -> s.getUnicode())
		        .collect(Collectors.joining());
		return text;
	}

    /**
     * Split {@link TextPosition} by {@link List} {@link Integer}.
     * @param textPositions {@link List} {@link TextPosition}
     * @param splitPoints {@link List} {@link Integer}
     * @return {@link List} {@link TextPosition}
     */
    private List> splitAtPoints(
            final List textPositions,
            final List splitPoints) {

    	List> list = new ArrayList<>();

    	if (!splitPoints.isEmpty()) {
	    	int fromIndex = 0;
			for (int i = 0; i < splitPoints.size(); i++) {

				int toIndex = splitPoints.get(i).intValue();
                toIndex = toIndex > textPositions.size() ? textPositions.size()
                        : toIndex;

				list.add(textPositions.subList(fromIndex, toIndex));
				fromIndex = toIndex;
			}

    	} else {
    		list.add(textPositions);
    	}

		return list;
	}

    /**
     * Get Split Points for {@link List} {@link TextPosition}.
     * @param textList {@link List} {@link TextPosition}
     * @return {@link List} {@link Integer}
     */
	private List getSplitPoints(final List textList) {

    	String o = toString(textList);

    	List list = new ArrayList<>();
    	Pattern pattern = Pattern.compile("[\\s]*[_]+[\\s]*");

    	Matcher m = pattern.matcher(o);
    	while (m.find()) {
    		list.add(Integer.valueOf(m.start()));
    		list.add(Integer.valueOf(m.end()));
    	}

    	final int fudgewidth = 5;
    	int size = textList.size();
    	for (int i = 1; i < size; i++) {

            TextPosition p = textList.get(i - 1);
            TextPosition c = textList.get(i);

            float cpos = c.getTextMatrix().getTranslateX();
            float ppos = p.getTextMatrix().getTranslateX() + p.getWidth()
                    + fudgewidth;

            if (cpos > ppos) {
            	list.add(Integer.valueOf(i));
            } else if (ppos - cpos > fudgewidth * 2) {
                list.add(Integer.valueOf(i));
            }
        }

    	if (!list.isEmpty()) {
    		list.add(Integer.valueOf(size));
    	}

    	Collections.sort(list);

		return list;
	}

    /**
     * Calculate {@link PDRectangle} for {@link List} {@link TextPosition}.
     * @param list {@link List} {@link TextPosition}
     * @return {@link PDRectangle}
     */
    private PDRectangle calculateTextPosition(final List list) {

        final int yrotation = 90;

        float xmin = round(minimum(
                list.stream().map(s -> Float.valueOf(s.getXDirAdj()))));

        float xmax = round(maximum(
                list.stream().map(s -> Float.valueOf(s.getXDirAdj()))));

        float ymin = round(minimum(list.stream()
                .map(s -> Float.valueOf(s.getPageHeight() - s.getYDirAdj()))));

        float height = getHeight(list);

        if (list.get(0).getDir() == yrotation) {
            ymin = list.get(0).getYDirAdj() - height;
            float ymax = list.get(0).getYDirAdj();
            return create(ymin, xmin, ymax, xmax);
        }

        return create(xmin, ymin, xmax, ymin + height);
    }

    /**
     * Remove Non Printable Characters and extra spaces.
     * @param textPositions {@link List} of {@link TextPosition}
     * @return {@link List} of {@link TextPosition}
     */
    private List removeNonPrintableAndExtraSpaces(
            final List textPositions) {

        List list = textPositions.stream().filter(
                s -> cleanTextContent(s.getUnicode()).equals(s.getUnicode()))
                .collect(Collectors.toList());

        int c = 0;
        Iterator itr = list.iterator();
        while (itr.hasNext()) {

            TextPosition tp = itr.next();
            if (isEmpty(tp.getUnicode().trim())) {
                c++;
                if (c > 2) {
                    itr.remove();
                }
            } else {
                c = 0;
            }
        }

        return list;
    }

    /**
     * Remove Non Prinable Characters.
     *
     * @param s {@link String}
     * @return {@link String}
     */
    private String cleanTextContent(final String s) {

        if (s.getBytes(StandardCharsets.UTF_8).length > 1) {
            final int maxUnicodeCharacter = 8300;
            char c = s.charAt(0);
            if (Integer.toUnsignedLong(c) > maxUnicodeCharacter) {
                return "";
            }
        }

        return s;
    }

    /**
     * Get Minimum position of {@link TextPosition} {@link List}.
     * @param stream {@link Stream}
     * @return float
     */
    private float minimum(final Stream stream) {
        return stream.min(Float::compare).get().floatValue();
    }

    /**
     * Get Maximum position of {@link TextPosition} {@link List}.
     * @param stream {@link Stream}
     * @return float
     */
    private float maximum(final Stream stream) {
        return stream.max(Float::compare).get().floatValue();
    }

    /**
     * Get Height of {@link TextPosition} list.
     * @param textPositions {@link List}
     * @return float
     */
    private float getHeight(final List textPositions) {
        return textPositions.stream()
                .map(s -> Float.valueOf(s.getHeight()))
                .max(Float::compare).get().floatValue();
    }

    /**
     * @return {@link Map} {@link List} {@link DocumentText}
     */
    public Map> getTextLocations() {
        return this.textMap;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy