// com.formkiq.vision.pdf.PDDocumentToTextTransformer (Maven / Gradle / Ivy source listing)
/*
* Copyright (C) 2017 FormKiQ Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.formkiq.vision.pdf;
import static com.formkiq.vision.pdf.PDRectangleUtil.create;
import static java.lang.Math.round;
import static org.apache.commons.lang3.StringUtils.isEmpty;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import com.formkiq.vision.document.DocumentText;
/**
* Create a map from a {@link PDDocument} to the Text by Page Number..
*
*/
public class PDDocumentToTextTransformer extends PDFTextStripper
implements Function>> {
/** {@link List} of {@link DocumentText}. */
private Map> textMap;
/**
* constructor.
* @throws IOException IOException
*/
public PDDocumentToTextTransformer() throws IOException {
super();
setSortByPosition(true);
this.textMap = new HashMap<>();
}
@Override
public Map> apply(final PDDocument doc) {
try {
getText(doc);
mergeTextHorizontal();
return this.textMap;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Merge Horizontally Similar Text.
*/
private void mergeTextHorizontal() {
for (Map.Entry> e : this.textMap.entrySet()) {
List texts = e.getValue();
PdfTextJoiningHorizontalTransformer ht =
new PdfTextJoiningHorizontalTransformer(texts);
for (DocumentText text : new ArrayList<>(texts)) {
List result = ht.apply(text);
if (text.equals(result.get(0))) {
text.setText(result.stream().map(s -> s.getText())
.collect(Collectors.joining(" ")));
text.setUpperRightX(result.stream()
.map(r -> Float
.valueOf(r.getUpperRightX()))
.max(Comparator.naturalOrder()).get().floatValue());
if (result.size() > 1) {
texts.removeAll(result.subList(1, result.size()));
}
} else {
texts.remove(text);
}
}
}
}
/**
* Override the default functionality of PDFTextStripper.
*/
@Override
protected void writeString(final String o, final List list)
throws IOException {
Integer page = Integer.valueOf(getCurrentPageNo() - 1);
if (!this.textMap.containsKey(page)) {
this.textMap.put(page, new ArrayList<>());
}
List positions = removeNonPrintableAndExtraSpaces(list);
List splitPoints = getSplitPoints(positions);
List> splits = splitAtPoints(positions, splitPoints);
for (List tps : splits) {
String text = toString(tps);
if (text.isEmpty()) {
continue;
}
PDRectangle rect = calculateTextPosition(tps);
PDFont font = tps.get(0).getFont();
float fontSize = tps.stream()
.map(s -> Float.valueOf(s.getFontSizeInPt()))
.max(Float::compare).orElse(Float.valueOf(0)).floatValue();
PdfText tf = new PdfText();
tf.setText(text.replaceAll("\t", " "));
tf.setRectangle(rect);
tf.setFontSize(fontSize);
tf.setFontName(font.getName());
this.textMap.get(page).add(tf);
}
}
/**
* Conversions to {@link String}.
* @param tps {@link List} {@link TextPosition}
* @return {@link String}
*/
private String toString(final List tps) {
String text = tps.stream().map(s -> s.getUnicode())
.collect(Collectors.joining());
return text;
}
/**
* Split {@link TextPosition} by {@link List} {@link Integer}.
* @param textPositions {@link List} {@link TextPosition}
* @param splitPoints {@link List} {@link Integer}
* @return {@link List} {@link TextPosition}
*/
private List> splitAtPoints(
final List textPositions,
final List splitPoints) {
List> list = new ArrayList<>();
if (!splitPoints.isEmpty()) {
int fromIndex = 0;
for (int i = 0; i < splitPoints.size(); i++) {
int toIndex = splitPoints.get(i).intValue();
toIndex = toIndex > textPositions.size() ? textPositions.size()
: toIndex;
list.add(textPositions.subList(fromIndex, toIndex));
fromIndex = toIndex;
}
} else {
list.add(textPositions);
}
return list;
}
/**
* Get Split Points for {@link List} {@link TextPosition}.
* @param textList {@link List} {@link TextPosition}
* @return {@link List} {@link Integer}
*/
private List getSplitPoints(final List textList) {
String o = toString(textList);
List list = new ArrayList<>();
Pattern pattern = Pattern.compile("[\\s]*[_]+[\\s]*");
Matcher m = pattern.matcher(o);
while (m.find()) {
list.add(Integer.valueOf(m.start()));
list.add(Integer.valueOf(m.end()));
}
final int fudgewidth = 5;
int size = textList.size();
for (int i = 1; i < size; i++) {
TextPosition p = textList.get(i - 1);
TextPosition c = textList.get(i);
float cpos = c.getTextMatrix().getTranslateX();
float ppos = p.getTextMatrix().getTranslateX() + p.getWidth()
+ fudgewidth;
if (cpos > ppos) {
list.add(Integer.valueOf(i));
} else if (ppos - cpos > fudgewidth * 2) {
list.add(Integer.valueOf(i));
}
}
if (!list.isEmpty()) {
list.add(Integer.valueOf(size));
}
Collections.sort(list);
return list;
}
/**
* Calculate {@link PDRectangle} for {@link List} {@link TextPosition}.
* @param list {@link List} {@link TextPosition}
* @return {@link PDRectangle}
*/
private PDRectangle calculateTextPosition(final List list) {
final int yrotation = 90;
float xmin = round(minimum(
list.stream().map(s -> Float.valueOf(s.getXDirAdj()))));
float xmax = round(maximum(
list.stream().map(s -> Float.valueOf(s.getXDirAdj()))));
float ymin = round(minimum(list.stream()
.map(s -> Float.valueOf(s.getPageHeight() - s.getYDirAdj()))));
float height = getHeight(list);
if (list.get(0).getDir() == yrotation) {
ymin = list.get(0).getYDirAdj() - height;
float ymax = list.get(0).getYDirAdj();
return create(ymin, xmin, ymax, xmax);
}
return create(xmin, ymin, xmax, ymin + height);
}
/**
* Remove Non Printable Characters and extra spaces.
* @param textPositions {@link List} of {@link TextPosition}
* @return {@link List} of {@link TextPosition}
*/
private List removeNonPrintableAndExtraSpaces(
final List textPositions) {
List list = textPositions.stream().filter(
s -> cleanTextContent(s.getUnicode()).equals(s.getUnicode()))
.collect(Collectors.toList());
int c = 0;
Iterator itr = list.iterator();
while (itr.hasNext()) {
TextPosition tp = itr.next();
if (isEmpty(tp.getUnicode().trim())) {
c++;
if (c > 2) {
itr.remove();
}
} else {
c = 0;
}
}
return list;
}
/**
* Remove Non Prinable Characters.
*
* @param s {@link String}
* @return {@link String}
*/
private String cleanTextContent(final String s) {
if (s.getBytes(StandardCharsets.UTF_8).length > 1) {
final int maxUnicodeCharacter = 8300;
char c = s.charAt(0);
if (Integer.toUnsignedLong(c) > maxUnicodeCharacter) {
return "";
}
}
return s;
}
/**
* Get Minimum position of {@link TextPosition} {@link List}.
* @param stream {@link Stream}
* @return float
*/
private float minimum(final Stream stream) {
return stream.min(Float::compare).get().floatValue();
}
/**
* Get Maximum position of {@link TextPosition} {@link List}.
* @param stream {@link Stream}
* @return float
*/
private float maximum(final Stream stream) {
return stream.max(Float::compare).get().floatValue();
}
/**
* Get Height of {@link TextPosition} list.
* @param textPositions {@link List}
* @return float
*/
private float getHeight(final List textPositions) {
return textPositions.stream()
.map(s -> Float.valueOf(s.getHeight()))
.max(Float::compare).get().floatValue();
}
/**
* @return {@link Map} {@link List} {@link DocumentText}
*/
public Map> getTextLocations() {
return this.textMap;
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy