// com.formkiq.vision.pdf.PdfTextJoiningHorizontalTransformer Maven / Gradle / Ivy
/*
* Copyright (C) 2018 FormKiQ Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.formkiq.vision.pdf;
import java.awt.Font;
import java.awt.font.FontRenderContext;
import java.awt.geom.AffineTransform;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import com.formkiq.vision.comparator.DocumentBlockRectangleComparator;
import com.formkiq.vision.document.DocumentText;
import com.formkiq.vision.predicate.TextIsUnderScorePredicate;
/**
* {@link Function} to Join close {@link PdfText} together.
*
*/
public class PdfTextJoiningHorizontalTransformer
implements Function> {
/** {@link Map} {@link List} {@link DocumentText}. */
private Map> textGrouping;
/**
* constructor.
* @param texts {@link List} {@link DocumentText}
*/
public PdfTextJoiningHorizontalTransformer(final List texts) {
this.textGrouping = texts.stream().sorted(new DocumentBlockRectangleComparator())
.collect(Collectors
.groupingBy(s -> Integer.valueOf((int) s.getUpperRightY())));
}
@Override
public List apply(final DocumentText t) {
Integer upperRightY = Integer
.valueOf((int) t.getUpperRightY());
List textList = new ArrayList<>();
textList.add(t);
List texts = this.textGrouping.getOrDefault(upperRightY,
Collections.emptyList());
texts.removeIf(new TextIsUnderScorePredicate());
int index = texts.indexOf(t);
int startIndex = index;
int endIndex = index + 1;
while (endIndex < texts.size() && endIndex > 0) {
DocumentText t0 = texts.get(endIndex - 1);
DocumentText t1 = texts.get(endIndex);
if (t0.getFontSize() != t1.getFontSize() || !isTextClose(textList, t1)) {
endIndex--;
break;
}
textList.add(t1);
endIndex++;
}
while (startIndex > 0) {
DocumentText t0 = texts.get(startIndex);
DocumentText t1 = texts.get(startIndex - 1);
if (t0.getFontSize() != t1.getFontSize() || !isTextClose(Arrays.asList(t1), t0)) {
break;
}
textList.add(0, t1);
startIndex--;
}
return textList;
}
/**
* Calculate the average font space size.
* @param texts {@link Collection} {@link DocumentText}
* @return float
*/
private float calculateAverageFontSpace(final Collection texts) {
DocumentText ptxt = texts.iterator().next();
AffineTransform affinetransform = new AffineTransform();
FontRenderContext frc = new FontRenderContext(affinetransform, true, true);
Font font = new Font(ptxt.getFontName(), Font.PLAIN, (int) ptxt.getFontSize());
float textwidth = (float) (font.getStringBounds(" ", frc).getWidth());
float upperX = texts.stream().map(t -> t.getUpperRightX()).max(Float::compareTo).get()
.floatValue();
float lowerX = texts.stream().map(t -> t.getLowerLeftX()).min(Float::compareTo).get()
.floatValue();
String txt = texts.stream().map(t -> t.getText()).collect(Collectors.joining(""));
float avgFontDistance = (upperX - lowerX) / txt.length();
int avgFontDistanceInt = (int) Math.ceil(avgFontDistance);
final int minFontSpace = 4;
return Math.max(Math.max(textwidth, avgFontDistanceInt), minFontSpace);
}
/**
* Is {@link PdfText} close together.
* @param list {@link DocumentText}
* @param t1 {@link DocumentText}
* @return boolean
*/
private boolean isTextClose(final List list, final DocumentText t1) {
DocumentText t0 = list.get(list.size() - 1);
float avgFontDistance0 = calculateAverageFontSpace(list);
float avgFontDistance1 = calculateAverageFontSpace(Arrays.asList(t1));
int maxDistance = (int) Math.ceil(Math.max(avgFontDistance0, avgFontDistance1));
int t1Count = t1.getText().length();
String ltrim = t1.getText().replaceAll("^\\s+", "");
int beginWhitespace = t1Count - ltrim.length();
int t0Count = t0.getText().length();
String rtrim = t0.getText().replaceAll("\\s+$", "");
int endingWhitespace = t0Count - rtrim.length();
float totalWhitespace = beginWhitespace * maxDistance + endingWhitespace * maxDistance;
float diff = t0.getUpperRightX() > t1.getLowerLeftX()
? t0.getLowerLeftX() - t1.getUpperRightX()
: t1.getLowerLeftX() - t0.getUpperRightX();
return diff + totalWhitespace <= maxDistance * 2;
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy