// All downloads are free. Search and download functionality uses the official Maven repository.

// com.formkiq.vision.pdf.PdfTextJoiningHorizontalTransformer Maven / Gradle / Ivy

/*
 * Copyright (C) 2018 FormKiQ Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.formkiq.vision.pdf;

import java.awt.Font;
import java.awt.font.FontRenderContext;
import java.awt.geom.AffineTransform;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import com.formkiq.vision.comparator.DocumentBlockRectangleComparator;
import com.formkiq.vision.document.DocumentText;
import com.formkiq.vision.predicate.TextIsUnderScorePredicate;

/**
 * {@link Function} to Join close {@link PdfText} together.
 *
 */
public class PdfTextJoiningHorizontalTransformer
        implements Function> {

    /** {@link Map} {@link List} {@link DocumentText}. */
    private Map> textGrouping;

    /**
     * constructor.
     * @param texts {@link List} {@link DocumentText}
     */
	public PdfTextJoiningHorizontalTransformer(final List texts) {
		this.textGrouping = texts.stream().sorted(new DocumentBlockRectangleComparator())
				.collect(Collectors
						.groupingBy(s -> Integer.valueOf((int) s.getUpperRightY())));
	}

    @Override
    public List apply(final DocumentText t) {

        Integer upperRightY = Integer
                .valueOf((int) t.getUpperRightY());

        List textList = new ArrayList<>();
        textList.add(t);

        List texts = this.textGrouping.getOrDefault(upperRightY,
                Collections.emptyList());
        texts.removeIf(new TextIsUnderScorePredicate());

        int index = texts.indexOf(t);
        int startIndex = index;
        int endIndex = index + 1;

        while (endIndex < texts.size() && endIndex > 0) {

        	DocumentText t0 = texts.get(endIndex - 1);
        	DocumentText t1 = texts.get(endIndex);

            if (t0.getFontSize() != t1.getFontSize() || !isTextClose(textList, t1)) {
                endIndex--;
                break;
            }

            textList.add(t1);

            endIndex++;
        }

        while (startIndex > 0) {

        	DocumentText t0 = texts.get(startIndex);
        	DocumentText t1 = texts.get(startIndex - 1);

            if (t0.getFontSize() != t1.getFontSize() || !isTextClose(Arrays.asList(t1), t0)) {
                break;
            }

            textList.add(0, t1);
            startIndex--;
        }

        return textList;
    }

    /**
     * Calculate the average font space size.
     * @param texts {@link Collection} {@link DocumentText}
     * @return float
     */
	private float calculateAverageFontSpace(final Collection texts) {
    	
		DocumentText ptxt = texts.iterator().next();
    	AffineTransform affinetransform = new AffineTransform();     
		FontRenderContext frc = new FontRenderContext(affinetransform, true, true);     
    	Font font = new Font(ptxt.getFontName(), Font.PLAIN, (int) ptxt.getFontSize());
		float textwidth = (float) (font.getStringBounds(" ", frc).getWidth());
    	
		float upperX = texts.stream().map(t -> t.getUpperRightX()).max(Float::compareTo).get()
				.floatValue();
		float lowerX = texts.stream().map(t -> t.getLowerLeftX()).min(Float::compareTo).get()
				.floatValue();
		
		String txt = texts.stream().map(t -> t.getText()).collect(Collectors.joining(""));
		float avgFontDistance = (upperX - lowerX) / txt.length();
		int avgFontDistanceInt = (int) Math.ceil(avgFontDistance);
		
		final int minFontSpace = 4;
		return Math.max(Math.max(textwidth, avgFontDistanceInt), minFontSpace);
    }
    
    /**
     * Is {@link PdfText} close together.
     * @param list {@link DocumentText}
     * @param t1 {@link DocumentText}
     * @return boolean
     */
    private boolean isTextClose(final List list, final DocumentText t1) {

    	DocumentText t0 = list.get(list.size() - 1);
    	
    	float avgFontDistance0 = calculateAverageFontSpace(list);
    	float avgFontDistance1 = calculateAverageFontSpace(Arrays.asList(t1));
        
        int maxDistance = (int) Math.ceil(Math.max(avgFontDistance0, avgFontDistance1));
        
		int t1Count = t1.getText().length();
		String ltrim = t1.getText().replaceAll("^\\s+", "");
		int beginWhitespace = t1Count - ltrim.length();

		int t0Count = t0.getText().length();
		String rtrim = t0.getText().replaceAll("\\s+$", "");
		int endingWhitespace = t0Count - rtrim.length();

		float totalWhitespace = beginWhitespace * maxDistance + endingWhitespace * maxDistance;

		float diff = t0.getUpperRightX() > t1.getLowerLeftX()
				? t0.getLowerLeftX() - t1.getUpperRightX()
				: t1.getLowerLeftX() - t0.getUpperRightX();

        return diff + totalWhitespace <= maxDistance * 2;
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy