// net.wicp.tams.commons.report.pdf.parse.PRTextStripper (Maven / Gradle / Ivy)
package net.wicp.tams.commons.report.pdf.parse;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.regex.Pattern;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.PositionWrapper;
import org.apache.pdfbox.util.QuickSort;
import org.apache.pdfbox.util.ResourceLoader;
import org.apache.pdfbox.util.TextNormalize;
import org.apache.pdfbox.util.TextPosition;
/**
* This class will take a pdf document and strip out all of the text and ignore
* the formatting and such. Please note; it is up to clients of this class to
* verify that a specific user has the correct permissions to extract text from
* the PDF document.
*
* The basic flow of this process is that we get a document and use a series of
* processXXX() functions that work on smaller and smaller chunks of the page.
* Eventually, we fully process each page and then print it.
*
* @author Ben Litchfield
*
*/
public class PRTextStripper extends PDFStreamEngine {
// Lower-cased simple class name, used as the prefix of the tuning system
// properties. An explicit locale is used so the prefix does not change with
// the JVM default locale (e.g. Turkish dotless-i lower-casing).
private static final String thisClassName = PRTextStripper.class.getSimpleName()
        .toLowerCase(java.util.Locale.ENGLISH);
private static float DEFAULT_INDENT_THRESHOLD = 2.0f;
private static float DEFAULT_DROP_THRESHOLD = 2.5f;
private static final boolean useCustomQuicksort;
// enable the ability to set the default indent/drop thresholds
// with -D system properties (prefix is the lower-cased class name):
// prtextstripper.indent
// prtextstripper.drop
static {
    String sdrop = null;
    String sindent = null;
    try {
        sindent = System.getProperty(thisClassName + ".indent");
        sdrop = System.getProperty(thisClassName + ".drop");
    } catch (SecurityException e) {
        // PDFBOX-1946 when run in an applet
        // ignore and use default
    }
    if (sindent != null && sindent.length() > 0) {
        try {
            DEFAULT_INDENT_THRESHOLD = Float.parseFloat(sindent);
        } catch (NumberFormatException nfe) {
            // ignore and use default
        }
    }
    if (sdrop != null && sdrop.length() > 0) {
        try {
            DEFAULT_DROP_THRESHOLD = Float.parseFloat(sdrop);
        } catch (NumberFormatException nfe) {
            // ignore and use default
        }
    }
    // check if we need to use the custom quicksort algorithm as a
    // workaround to the transitivity issue of TextPositionComparator:
    // https://issues.apache.org/jira/browse/PDFBOX-1512
    boolean is16orLess = false;
    try {
        String javaVersion = System.getProperty("java.version");
        if (javaVersion != null) {
            String[] versionComponents = javaVersion.split("\\.");
            // Only the legacy "1.x[.y]" scheme can describe Java 6 or older.
            // Newer runtimes report strings such as "9", "11.0.2", "17" or
            // "9-ea", which may have a single component or a non-numeric
            // suffix; they must not be indexed or parsed blindly, otherwise
            // class initialization would fail.
            if (versionComponents.length >= 2 && "1".equals(versionComponents[0])) {
                is16orLess = Integer.parseInt(versionComponents[1]) <= 6;
            }
        }
    } catch (SecurityException e) {
        // when run in an applet ignore and use default
        // assume 1.7 or higher so that quicksort is used
    } catch (NumberFormatException e) {
        // unparseable minor version: assume 1.7 or higher so that
        // quicksort is used
    }
    useCustomQuicksort = !is16orLess;
}
/**
* The platforms line separator.
*/
protected final String systemLineSeparator = System.getProperty("line.separator");
private String lineSeparator = systemLineSeparator;
private String pageSeparator = systemLineSeparator;
private String wordSeparator = " ";
private String paragraphStart = "";
private String paragraphEnd = "";
private String pageStart = "";
private String pageEnd = pageSeparator;
private String articleStart = "";
private String articleEnd = "";
private int currentPageNo = 0;
private int startPage = 1;
private int endPage = Integer.MAX_VALUE;
private PDOutlineItem startBookmark = null;
private int startBookmarkPageNumber = -1;
private PDOutlineItem endBookmark = null;
private int endBookmarkPageNumber = -1;
private boolean suppressDuplicateOverlappingText = true;
private boolean shouldSeparateByBeads = true;
private boolean sortByPosition = false;
private boolean addMoreFormatting = false;
private float indentThreshold = DEFAULT_INDENT_THRESHOLD;
private float dropThreshold = DEFAULT_DROP_THRESHOLD;
// We will need to estimate where to add spaces.
// These are used to help guess.
private float spacingTolerance = .5f;
private float averageCharTolerance = .3f;
private List pageArticles = null;
/**
* The charactersByArticle is used to extract text by article divisions. For
* example a PDF that has two columns like a newspaper, we want to extract
* the first column and then the second column. In this example the PDF
* would have 2 beads(or articles), one for each column. The size of the
* charactersByArticle would be 5, because not all text on the screen will
* fall into one of the articles. The five divisions are shown below
*
* Text before first article first article text text between first article
* and second article second article text text after second article
*
* Most PDFs won't have any beads, so charactersByArticle will contain a
* single entry.
*/
protected Vector> charactersByArticle = new Vector>();
private Map>> characterListMapping = new HashMap>>();
/**
* encoding that text will be written in (or null).
*/
protected String outputEncoding;
/**
* The document to read.
*/
protected PDDocument document;
/**
* The stream to write the output to.
*/
protected Writer output;
/**
* The normalizer is used to remove text ligatures/presentation forms and to
* correct the direction of right to left text, such as Arabic and Hebrew.
*/
private TextNormalize normalize = null;
/**
* True if we started a paragraph but haven't ended it yet.
*/
private boolean inParagraph;
/**
 * Creates a stripper that uses the operator mappings bundled in
 * {@code org/apache/pdfbox/resources/PDFTextStripper.properties} and does not
 * apply any encoding-specific conversion to the extracted text.
 *
 * @throws IOException
 *             If there is an error loading the properties.
 */
public PRTextStripper() throws IOException {
    super(ResourceLoader.loadProperties("org/apache/pdfbox/resources/PDFTextStripper.properties", true));
    // no target encoding: the normalizer is created with a null encoding
    outputEncoding = null;
    this.normalize = new TextNormalize(outputEncoding);
}
/**
 * Creates a stripper whose operator mappings are taken from the supplied
 * properties. No encoding-specific conversion is applied to the extracted
 * text.
 *
 * @param props
 *            The properties containing the mapping of operators to
 *            PDFOperator classes.
 *
 * @throws IOException
 *             If there is an error reading the properties.
 */
public PRTextStripper(Properties props) throws IOException {
    super(props);
    // no target encoding: the normalizer is created with a null encoding
    outputEncoding = null;
    this.normalize = new TextNormalize(outputEncoding);
}
/**
 * Creates a stripper that uses the operator mappings bundled in
 * {@code org/apache/pdfbox/resources/PDFTextStripper.properties} and hands
 * the given encoding to the text normalizer.
 *
 * @param encoding
 *            The encoding that the output will be written in.
 * @throws IOException
 *             If there is an error reading the properties.
 */
public PRTextStripper(String encoding) throws IOException {
    super(ResourceLoader.loadProperties("org/apache/pdfbox/resources/PDFTextStripper.properties", true));
    outputEncoding = encoding;
    this.normalize = new TextNormalize(outputEncoding);
}
/**
 * Extracts the text of a document into a string by running
 * {@link #writeText(PDDocument, Writer)} against an in-memory writer.
 * NOTE: The document must not be encrypted when coming into this method.
 *
 * @param doc
 *            The document to get the text from.
 * @return The text of the PDF document.
 * @throws IOException
 *             if the doc state is invalid or it is encrypted.
 */
public String getText(PDDocument doc) throws IOException {
    final StringWriter buffer = new StringWriter();
    writeText(doc, buffer);
    final String extracted = buffer.toString();
    return extracted;
}
/**
 * Extracts the text of a low-level COS document by wrapping it in a new
 * {@link PDDocument} and delegating to {@link #getText(PDDocument)}.
 *
 * @deprecated use {@link #getText(PDDocument)} instead.
 * @param doc
 *            The document to extract the text from.
 * @return The document text.
 * @throws IOException
 *             If there is an error extracting the text.
 */
@Deprecated
public String getText(COSDocument doc) throws IOException {
    return getText(new PDDocument(doc));
}
/**
 * Writes the text of a low-level COS document by wrapping it in a new
 * {@link PDDocument} and delegating to
 * {@link #writeText(PDDocument, Writer)}.
 *
 * @deprecated use {@link #writeText(PDDocument, Writer)} instead.
 * @param doc
 *            The document to extract the text.
 * @param outputStream
 *            The stream to write the text to.
 * @throws IOException
 *             If there is an error extracting the text.
 */
@Deprecated
public void writeText(COSDocument doc, Writer outputStream) throws IOException {
    writeText(new PDDocument(doc), outputStream);
}
/**
 * Resets the engine and, in addition, this stripper's per-document state:
 * the page counter, the document reference, and the collected characters.
 */
public void resetEngine() {
    super.resetEngine();
    this.currentPageNo = 0;
    this.document = null;
    // the collections are reused between runs, so empty them instead of
    // replacing them
    if (null != charactersByArticle) {
        charactersByArticle.clear();
    }
    if (null != characterListMapping) {
        characterListMapping.clear();
    }
}
/**
 * Extracts the text of the given document and writes it to the given writer.
 * The engine is reset first; when additional formatting is enabled the
 * paragraph end, page start and article start/end markers are all set to
 * the line separator before processing begins.
 *
 * @param doc
 *            The document to get the data from.
 * @param outputStream
 *            The location to put the text.
 *
 * @throws IOException
 *             If the doc is in an invalid state.
 */
@SuppressWarnings("unchecked")
public void writeText(PDDocument doc, Writer outputStream) throws IOException {
    resetEngine();
    this.document = doc;
    this.output = outputStream;
    if (getAddMoreFormatting()) {
        final String separator = lineSeparator;
        paragraphEnd = separator;
        pageStart = separator;
        articleStart = separator;
        articleEnd = separator;
    }
    startDocument(document);
    List allPages = document.getDocumentCatalog().getAllPages();
    processPages(allPages);
    endDocument(document);
}
/**
 * Walks over every page of the document, resolving the start/end bookmarks
 * to page numbers first, and hands each page that has a content stream to
 * {@link #processPage(PDPage, COSStream)}. The current page number is
 * advanced for every page, whether or not it has content.
 *
 * @param pages
 *            The pages object in the document.
 *
 * @throws IOException
 *             If there is an error parsing the text.
 */
protected void processPages(List pages) throws IOException {
    if (startBookmark != null) {
        startBookmarkPageNumber = getPageNumber(startBookmark, pages);
    }
    if (endBookmark != null) {
        endBookmarkPageNumber = getPageNumber(endBookmark, pages);
    }
    boolean startUnresolved = startBookmarkPageNumber == -1 && startBookmark != null;
    boolean endUnresolved = endBookmarkPageNumber == -1 && endBookmark != null;
    if (startUnresolved && endUnresolved && startBookmark.getCOSObject() == endBookmark.getCOSObject()) {
        // Special case: start and end bookmark are the same object but
        // point to nothing. Page numbers start at 1, so a 0..0 range
        // matches no page and no text is extracted.
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }
    for (Object element : pages) {
        PDPage nextPage = (PDPage) element;
        PDStream contentStream = nextPage.getContents();
        currentPageNo++;
        if (contentStream == null) {
            continue;
        }
        processPage(nextPage, contentStream.getStream());
    }
}
/**
 * Resolves a bookmark to the one-based number of its destination page.
 *
 * @param bookmark
 *            The bookmark to resolve.
 * @param allPages
 *            The pages of the document, in order.
 * @return The one-based page number, or -1 if the bookmark has no
 *         destination page.
 * @throws IOException
 *             If the destination page cannot be determined.
 */
private int getPageNumber(PDOutlineItem bookmark, List allPages) throws IOException {
    PDPage destination = bookmark.findDestinationPage(document);
    if (destination == null) {
        return -1;
    }
    // use one based indexing
    return allPages.indexOf(destination) + 1;
}
/**
 * Hook for subclasses, invoked by {@link #writeText(PDDocument, Writer)}
 * before any page is processed.
 *
 * @param pdf
 *            The PDF document that is being processed.
 * @throws IOException
 *             If an IO error occurs.
 */
protected void startDocument(PDDocument pdf) throws IOException {
    // intentionally empty; subclasses may override
}
/**
 * Hook for subclasses, invoked by {@link #writeText(PDDocument, Writer)}
 * after all pages have been processed.
 *
 * @param pdf
 *            The PDF document that is being processed.
 * @throws IOException
 *             If an IO error occurs.
 */
protected void endDocument(PDDocument pdf) throws IOException {
    // intentionally empty; subclasses may override
}
/**
 * Processes one page if the current page number lies inside both the
 * configured page range and the bookmark range: prepares one character list
 * per article division, runs the content stream through the engine, and
 * writes the collected text. Pages outside the ranges are ignored.
 *
 * @param page
 *            The page to process.
 * @param content
 *            The contents of the page.
 *
 * @throws IOException
 *             If there is an error processing the page.
 */
protected void processPage(PDPage page, COSStream content) throws IOException {
    boolean insidePageRange = currentPageNo >= startPage && currentPageNo <= endPage;
    boolean notBeforeStartBookmark = startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber;
    boolean notAfterEndBookmark = endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber;
    if (!(insidePageRange && notBeforeStartBookmark && notAfterEndBookmark)) {
        return;
    }
    startPage(page);
    pageArticles = page.getThreadBeads();
    // one division per bead plus the gaps before, between and after them
    int divisionCount = 1 + pageArticles.size() * 2;
    if (!shouldSeparateByBeads) {
        divisionCount = 1;
    }
    final int previousCount = charactersByArticle.size();
    charactersByArticle.setSize(divisionCount);
    // when the vector shrank, every remaining slot already holds a list
    // that can be reused; otherwise fresh lists are installed
    final boolean reuseExistingLists = divisionCount < previousCount;
    for (int slot = 0; slot < divisionCount; slot++) {
        if (reuseExistingLists) {
            ((List) charactersByArticle.get(slot)).clear();
        } else {
            charactersByArticle.set(slot, new ArrayList());
        }
    }
    characterListMapping.clear();
    processStream(page, page.findResources(), content);
    writePage();
    endPage(page);
}
/**
 * Starts a new article, which is typically defined as a column on a single
 * page (also referred to as a bead), assuming that the primary direction of
 * text is left to right. Delegates to {@link #startArticle(boolean)} with
 * {@code true}.
 *
 * @throws IOException
 *             If there is any error writing to the stream.
 */
protected void startArticle() throws IOException {
    final boolean leftToRight = true;
    startArticle(leftToRight);
}
/**
 * Starts a new article, which is typically defined as a column on a single
 * page (also referred to as a bead). This implementation writes the article
 * start marker to the output and does not use the direction flag;
 * subclasses may provide additional information.
 *
 * @param isltr
 *            true if primary direction of text is left to right.
 * @throws IOException
 *             If there is any error writing to the stream.
 */
protected void startArticle(boolean isltr) throws IOException {
    final String marker = getArticleStart();
    output.write(marker);
}
/**
 * Ends an article. This implementation writes the article end marker to the
 * output; subclasses may provide additional information.
 *
 * @throws IOException
 *             If there is any error writing to the stream.
 */
protected void endArticle() throws IOException {
    final String marker = getArticleEnd();
    output.write(marker);
}
/**
 * Hook invoked by {@link #processPage(PDPage, COSStream)} before a page's
 * content stream is processed. This implementation does nothing; subclasses
 * may provide additional information.
 *
 * @param page
 *            The page we are about to process.
 *
 * @throws IOException
 *             If there is any error writing to the stream.
 */
protected void startPage(PDPage page) throws IOException {
    // intentionally empty; subclasses may override
}
/**
 * Hook invoked by {@link #processPage(PDPage, COSStream)} after a page's
 * text has been written. This implementation does nothing; subclasses may
 * provide additional information.
 *
 * @param page
 *            The page that has just been processed.
 *
 * @throws IOException
 *             If there is any error writing to the stream.
 */
protected void endPage(PDPage page) throws IOException {
    // intentionally empty; subclasses may override
}
private static final float ENDOFLASTTEXTX_RESET_VALUE = -1;
private static final float MAXYFORLINE_RESET_VALUE = -Float.MAX_VALUE;
private static final float EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE = -Float.MAX_VALUE;
private static final float MAXHEIGHTFORLINE_RESET_VALUE = -1;
private static final float MINYTOPFORLINE_RESET_VALUE = Float.MAX_VALUE;
private static final float LASTWORDSPACING_RESET_VALUE = -1;
/**
* This will print the text of the processed page to "output". It will
* estimate, based on the coordinates of the text, where newlines and word
* spacings should be placed. The text will be sorted only if that feature
* was enabled.
*
* For every article division of the page this method: optionally sorts the
* collected text positions, counts left-to-right versus right-to-left
* characters to pick the dominant direction, then walks the positions once
* more, queueing them into a line buffer. The buffer is flushed through
* writeLine whenever a position no longer vertically overlaps the current
* line, and a word separator is inserted whenever a position starts further
* right than the expected start of the next word.
*
* @throws IOException
* If there is an error writing the text.
*/
protected void writePage() throws IOException {
// Per-line tracking state; each value is put back to its *_RESET_VALUE
// when a new line is started.
float maxYForLine = MAXYFORLINE_RESET_VALUE;
float minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
float endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
float lastWordSpacing = LASTWORDSPACING_RESET_VALUE;
float maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
PositionWrapper lastPosition = null;
PositionWrapper lastLineStartPosition = null;
boolean startOfPage = true;// flag to indicate start of page
boolean startOfArticle = true;
if (charactersByArticle.size() > 0) {
writePageStart();
}
for (int i = 0; i < charactersByArticle.size(); i++) {
List textList = charactersByArticle.get(i);
if (getSortByPosition()) {
TextPositionComparatorY comparator = new TextPositionComparatorY();
// because the TextPositionComparator is not transitive, but
// JDK7+ enforces transitivity on comparators, we need to use
// a custom quicksort implementation (which is slower,
// unfortunately).
if (useCustomQuicksort) {
QuickSort.sort(textList, comparator);
} else {
Collections.sort(textList, comparator);
}
}
Iterator textIter = textList.iterator();
/*
* Before we can display the text, we need to do some normalizing.
* Arabic and Hebrew text is right to left and is typically stored
* in its logical format, which means that the rightmost character
* is stored first, followed by the second character from the right
* etc. However, PDF stores the text in presentation form, which is
* left to right. We need to do some normalization to convert the
* PDF data to the proper logical output format.
*
* Note that if we did not sort the text, then the output of
* reversing the text is undefined and can sometimes produce worse
* output than not trying to reverse the order. Sorting should be
* done for these languages.
*/
/*
* First step is to determine if we have any right to left text, and
* if so, is it dominant.
*/
int ltrCnt = 0;
int rtlCnt = 0;
while (textIter.hasNext()) {
TextPosition position = (TextPosition) textIter.next();
String stringValue = position.getCharacter();
for (int a = 0; a < stringValue.length(); a++) {
byte dir = Character.getDirectionality(stringValue.charAt(a));
if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT)
|| (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING)
|| (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)) {
ltrCnt++;
} else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
rtlCnt++;
}
}
}
// choose the dominant direction
boolean isRtlDominant = rtlCnt > ltrCnt;
startArticle(!isRtlDominant);
startOfArticle = true;
// we will later use this to skip reordering
boolean hasRtl = rtlCnt > 0;
/*
* Now cycle through to print the text. We queue up a line at a time
* before we print so that we can convert the line from presentation
* form to logical form (if needed).
*/
List line = new ArrayList();
textIter = textList.iterator(); // start from the beginning again
/*
* PDF files don't always store spaces. We will need to guess where
* we should add spaces based on the distances between
* TextPositions. Historically, this was done based on the size of
* the space character provided by the font. In general, this worked
* but there were cases where it did not work. Calculating the
* average character width and using that as a metric works better
* in some cases but fails in some cases where the spacing worked.
* So we use both. NOTE: Adobe reader also fails on some of these
* examples.
*/
// Keeps track of the previous average character width
float previousAveCharWidth = -1;
while (textIter.hasNext()) {
TextPosition position = (TextPosition) textIter.next();
PositionWrapper current = new PositionWrapper(position);
String characterValue = position.getCharacter();
// Resets the average character width when we see a change in
// font or a change in the font size
if (lastPosition != null && ((position.getFont() != lastPosition.getTextPosition().getFont())
|| (position.getFontSize() != lastPosition.getTextPosition().getFontSize()))) {
previousAveCharWidth = -1;
}
float positionX;
float positionY;
float positionWidth;
float positionHeight;
/*
* If we are sorting, then we need to use the text direction
* adjusted coordinates, because they were used in the sorting.
*/
if (getSortByPosition()) {
positionX = position.getXDirAdj();
positionY = position.getYDirAdj();
positionWidth = position.getWidthDirAdj();
positionHeight = position.getHeightDir();
} else {
positionX = position.getX();
positionY = position.getY();
positionWidth = position.getWidth();
positionHeight = position.getHeight();
}
// The current amount of characters in a word
int wordCharCount = position.getIndividualWidths().length;
/*
* Estimate the expected width of the space based on the space
* character with some margin.
*/
float wordSpacing = position.getWidthOfSpace();
float deltaSpace = 0;
if (wordSpacing == 0 || Float.isNaN(wordSpacing)) {
// no usable space width: make this estimate lose the
// "smaller value wins" comparison below
deltaSpace = Float.MAX_VALUE;
} else {
if (lastWordSpacing < 0) {
deltaSpace = (wordSpacing * getSpacingTolerance());
} else {
deltaSpace = (((wordSpacing + lastWordSpacing) / 2f) * getSpacingTolerance());
}
}
/*
* Estimate the expected width of the space based on the average
* character width with some margin. This calculation does not
* make a true average (average of averages) but we found that
* it gave the best results after numerous experiments. Based on
* experiments we also found that .3 worked well.
*/
float averageCharWidth = -1;
if (previousAveCharWidth < 0) {
averageCharWidth = (positionWidth / wordCharCount);
} else {
averageCharWidth = (previousAveCharWidth + (positionWidth / wordCharCount)) / 2f;
}
float deltaCharWidth = (averageCharWidth * getAverageCharTolerance());
// Compares the values obtained by the average method and the
// wordSpacing method and picks the smaller number.
float expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
if (endOfLastTextX != ENDOFLASTTEXTX_RESET_VALUE) {
if (deltaCharWidth > deltaSpace) {
expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
} else {
expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
}
}
if (lastPosition != null) {
if (startOfArticle) {
lastPosition.setArticleStart();
startOfArticle = false;
}
// RDD - Here we determine whether this text object is on
// the current line. We use the lastBaselineFontSize to
// handle the superscript case, and the size of the current
// font to handle the subscript case.
// Text must overlap with the last rendered baseline text by
// at least a small amount in order to be considered as
// being on the same line.
/*
* XXX BC: In theory, this check should really check if the
* next char is in full range seen in this line. This is
* what I tried to do with minYTopForLine, but this caused a
* lot of regression test failures. So, I'm leaving it be
* for now.
*/
if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) {
// the position starts a new line: flush the queued line
// and reset all per-line state
writeLine(normalize(line, isRtlDominant, hasRtl), isRtlDominant);
line.clear();
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition,
maxHeightForLine);
endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
maxYForLine = MAXYFORLINE_RESET_VALUE;
maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
}
// Test if our TextPosition starts after a new word would be
// expected to start.
if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE
&& expectedStartOfNextWordX < positionX &&
// only bother adding a space if the last character
// was not a space
lastPosition.getTextPosition().getCharacter() != null
&& !lastPosition.getTextPosition().getCharacter().endsWith(" ")) {
line.add(WordSeparator.getSeparator());
}
}
if (positionY >= maxYForLine) {
maxYForLine = positionY;
}
// RDD - endX is what PDF considers to be the x coordinate of
// the end position of the text. We use it in computing our
// metrics below.
endOfLastTextX = positionX + positionWidth;
// add it to the list
if (characterValue != null) {
if (startOfPage && lastPosition == null) {
writeParagraphStart();// not sure this is correct for
// RTL?
}
line.add(position);
}
maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
// minYTopForLine is maintained here but not read by any decision in
// this method (see the XXX note above)
minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
lastPosition = current;
if (startOfPage) {
lastPosition.setParagraphStart();
lastPosition.setLineStart();
lastLineStartPosition = lastPosition;
startOfPage = false;
}
lastWordSpacing = wordSpacing;
previousAveCharWidth = averageCharWidth;
}
// print the final line
if (line.size() > 0) {
writeLine(normalize(line, isRtlDominant, hasRtl), isRtlDominant);
writeParagraphEnd();
}
endArticle();
}
writePageEnd();
}
/**
 * Tells whether two text runs, each given by a y coordinate and a height,
 * share vertical space: either the y values are within 0.1 of each other,
 * or one y value lies inside the span [y - height, y] of the other run.
 *
 * @param y1
 *            The y coordinate of the first run.
 * @param height1
 *            The height of the first run.
 * @param y2
 *            The y coordinate of the second run.
 * @param height2
 *            The height of the second run.
 * @return true if the two runs are considered to overlap vertically.
 */
private boolean overlap(float y1, float height1, float y2, float height2) {
    if (within(y1, y2, .1f)) {
        return true;
    }
    boolean secondInsideFirst = y2 <= y1 && y2 >= y1 - height1;
    if (secondInsideFirst) {
        return true;
    }
    return y1 <= y2 && y1 >= y2 - height2;
}
/**
 * Writes the page separator value to the output stream and flushes it.
 *
 * @throws IOException
 *             If there is a problem writing out the pageseparator to the
 *             document.
 */
protected void writePageSeperator() throws IOException {
    // RDD - a newline at the end of the page is required so that the top
    // of the next page starts on its own line.
    final String separator = getPageSeparator();
    output.write(separator);
    output.flush();
}
/**
 * Writes the line separator value to the output stream.
 *
 * @throws IOException
 *             If there is a problem writing out the lineseparator to the
 *             document.
 */
protected void writeLineSeparator() throws IOException {
    final String separator = getLineSeparator();
    output.write(separator);
}
/**
 * Writes the word separator value to the output stream.
 *
 * @throws IOException
 *             If there is a problem writing out the wordseparator to the
 *             document.
 */
protected void writeWordSeparator() throws IOException {
    final String separator = getWordSeparator();
    output.write(separator);
}
/**
 * Writes the character string carried by a TextPosition to the output
 * stream.
 *
 * @param text
 *            The text to write to the stream.
 * @throws IOException
 *             If there is an error when writing the text.
 */
protected void writeCharacters(TextPosition text) throws IOException {
    final String characters = text.getCharacter();
    output.write(characters);
}
/**
 * Writes a Java string to the output stream. This implementation does not
 * use the text positions; it simply forwards the string to
 * {@link #writeString(String)}.
 *
 * @param text
 *            The text to write to the stream.
 * @param textPositions
 *            The TextPositions belonging to the text.
 * @throws IOException
 *             If there is an error when writing the text.
 */
protected void writeString(String text, List textPositions) throws IOException {
    // the positions are only of interest to subclasses
    this.writeString(text);
}
/**
 * Writes a Java string to the output stream as-is.
 *
 * @param text
 *            The text to write to the stream.
 * @throws IOException
 *             If there is an error when writing the text.
 */
protected void writeString(String text) throws IOException {
    this.output.write(text);
}
/**
 * Determines whether two floating point numbers differ by strictly less
 * than a given variance.
 *
 * @param first
 *            The first number to compare to.
 * @param second
 *            The second number to compare to.
 * @param variance
 *            The allowed variance.
 * @return true if second lies strictly between first - variance and
 *         first + variance.
 */
private boolean within(float first, float second, float variance) {
    final float upperBound = first + variance;
    final float lowerBound = first - variance;
    return second < upperBound && second > lowerBound;
}
/**
* This will process a TextPosition object and add the text to the list of
* characters on a page. It takes care of overlapping text.
*
* @param text
* The text to process.
*/
protected void processTextPosition(TextPosition text) {
boolean showCharacter = true;
if (suppressDuplicateOverlappingText) {
showCharacter = false;
String textCharacter = text.getCharacter();
float textX = text.getX();
float textY = text.getY();
TreeMap> sameTextCharacters = characterListMapping.get(textCharacter);
if (sameTextCharacters == null) {
sameTextCharacters = new TreeMap>();
characterListMapping.put(textCharacter, sameTextCharacters);
}
// RDD - Here we compute the value that represents the end of the
// rendered
// text. This value is used to determine whether subsequent text
// rendered
// on the same line overwrites the current text.
//
// We subtract any positive padding to handle cases where extreme
// amounts
// of padding are applied, then backed off (not sure why this is
// done, but there
// are cases where the padding is on the order of 10x the character
// width, and
// the TJ just backs up to compensate after each character). Also,
// we subtract
// an amount to allow for kerning (a percentage of the width of the
// last
// character).
//
boolean suppressCharacter = false;
float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f;
SortedMap> xMatches = sameTextCharacters.subMap(textX - tolerance, textX + tolerance);
for (TreeSet xMatch : xMatches.values()) {
SortedSet yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
if (!yMatches.isEmpty()) {
suppressCharacter = true;
break;
}
}
if (!suppressCharacter) {
TreeSet ySet = sameTextCharacters.get(textX);
if (ySet == null) {
ySet = new TreeSet();
sameTextCharacters.put(textX, ySet);
}
ySet.add(textY);
showCharacter = true;
}
}
if (showCharacter) {
// if we are showing the character then we need to determine which
// article it belongs to.
int foundArticleDivisionIndex = -1;
int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
int notFoundButFirstLeftArticleDivisionIndex = -1;
int notFoundButFirstAboveArticleDivisionIndex = -1;
float x = text.getX();
float y = text.getY();
if (shouldSeparateByBeads) {
for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) {
PDThreadBead bead = (PDThreadBead) pageArticles.get(i);
if (bead != null) {
PDRectangle rect = bead.getRectangle();
if (rect.contains(x, y)) {
foundArticleDivisionIndex = i * 2 + 1;
} else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
&& notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) {
notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
} else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) {
notFoundButFirstLeftArticleDivisionIndex = i * 2;
} else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) {
notFoundButFirstAboveArticleDivisionIndex = i * 2;
}
} else {
foundArticleDivisionIndex = 0;
}
}
} else {
foundArticleDivisionIndex = 0;
}
int articleDivisionIndex = -1;
if (foundArticleDivisionIndex != -1) {
articleDivisionIndex = foundArticleDivisionIndex;
} else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) {
articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
} else if (notFoundButFirstLeftArticleDivisionIndex != -1) {
articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
} else if (notFoundButFirstAboveArticleDivisionIndex != -1) {
articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
} else {
articleDivisionIndex = charactersByArticle.size() - 1;
}
List textList = (List) charactersByArticle.get(articleDivisionIndex);
/*
* In the wild, some PDF encoded documents put diacritics (accents
* on top of characters) into a separate Tj element. When displaying
* them graphically, the two chunks get overlayed. With text output
* though, we need to do the overlay. This code recombines the
* diacritic with its associated character if the two are
* consecutive.
*/
if (textList.isEmpty()) {
textList.add(text);
} else {
/*
* test if we overlap the previous entry. Note that we are
* making an assumption that we need to only look back one
* TextPosition to find what we are overlapping. This may not
* always be true.
*/
TextPosition previousTextPosition = (TextPosition) textList.get(textList.size() - 1);
if (text.isDiacritic() && previousTextPosition.contains(text)) {
previousTextPosition.mergeDiacritic(text, normalize);
}
/*
* If the previous TextPosition was the diacritic, merge it into
* this one and remove it from the list.
*/
else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition)) {
text.mergeDiacritic(previousTextPosition, normalize);
textList.remove(textList.size() - 1);
textList.add(text);
} else {
textList.add(text);
}
}
}
}
/**
 * Returns the page on which text extraction starts. Pages are numbered from
 * 1: in a 5 page document a start page of 1 extracts every page, while a
 * start page of 4 extracts pages 4 and 5. The default value is 1.
 *
 * @return Value of property startPage.
 */
public int getStartPage() {
    return this.startPage;
}
/**
 * Sets the first page to be extracted by this class.
 *
 * @param startPageValue
 *            New value of 1-based startPage property.
 */
public void setStartPage(int startPageValue) {
    this.startPage = startPageValue;
}
/**
 * Returns the 1-based number of the last page that will be extracted,
 * inclusive.
 *
 * In a 5 page document an end page of 5 extracts the whole document and an
 * end page of 2 extracts pages 1 and 2. The value defaults to
 * Integer.MAX_VALUE so that every page is extracted.
 *
 * @return the current value of the endPage property.
 */
public int getEndPage() {
    return this.endPage;
}
/**
 * Chooses the last page to be extracted by this class.
 *
 * @param endPageValue
 *            the new, 1-based, value of the endPage property.
 */
public void setEndPage(int endPageValue) {
    this.endPage = endPageValue;
}
/**
 * Chooses the line separator written to the output text. When no separator
 * is set explicitly through this method, the line.separator system property
 * is used.
 *
 * @param separator
 *            the line separator string to use.
 */
public void setLineSeparator(String separator) {
    this.lineSeparator = separator;
}
/**
 * Returns the line separator currently in use.
 *
 * @return the line separator string.
 */
public String getLineSeparator() {
    return this.lineSeparator;
}
/**
 * Chooses the page separator written to the output text. When no separator
 * is set explicitly through this method, the line.separator system property
 * is used.
 *
 * @param separator
 *            the page separator string to use.
 *
 * @deprecated no replacement is named here; presumably superseded by
 *             {@link #setPageStart(String)} and {@link #setPageEnd(String)},
 *             mirroring the advice given on {@link #getPageSeparator()}.
 */
public void setPageSeparator(String separator) {
    this.pageSeparator = separator;
}
/**
 * Returns the word separator currently in use.
 *
 * @return the word separator string.
 */
public String getWordSeparator() {
    return this.wordSeparator;
}
/**
 * Chooses the word separator written to the output text. The text
 * extraction algorithm emits this string when there is enough space between
 * two words; by default it is a single space character. If you need an
 * accurate count of the characters found in a PDF document you may want to
 * set the word separator to the empty string.
 *
 * @param separator
 *            the word separator string to use.
 */
public void setWordSeparator(String separator) {
    this.wordSeparator = separator;
}
/**
 * Returns the page separator currently in use.
 *
 * @return the page separator string.
 *
 * @deprecated use {@link #getPageStart()} and {@link #getPageEnd()} instead
 */
public String getPageSeparator() {
    return this.pageSeparator;
}
/**
 * Tells whether duplicate overlapping text is currently being suppressed.
 *
 * @return the value of the suppressDuplicateOverlappingText property.
 */
public boolean getSuppressDuplicateOverlappingText() {
    return this.suppressDuplicateOverlappingText;
}
/**
 * Returns the number of the page that is currently being processed.
 *
 * @return a 1-based number identifying the current page.
 */
protected int getCurrentPageNo() {
    return this.currentPageNo;
}
/**
 * Returns the writer that receives the extracted text.
 *
 * @return the writer that output is being written to.
 */
protected Writer getOutput() {
    return this.output;
}
/**
* Character strings are grouped by articles. It is quite common that there
* will only be a single article. This returns a List that contains List
* objects, the inner lists will contain TextPosition objects.
*
* @return A double List of TextPositions for all text strings on the page.
*/
protected Vector> getCharactersByArticle() {
return charactersByArticle;
}
/**
 * Switches the removal of overlapping duplicate text on or off. By default
 * the text stripper attempts to remove text that overlaps; Word, for
 * instance, paints the same character several times to make it look bold.
 * Setting this to false extracts all text, so certain sections will be
 * duplicated, but better performance will be noticed.
 *
 * @param suppressDuplicateOverlappingTextValue
 *            the new value of the suppressDuplicateOverlappingText
 *            property.
 */
public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) {
    this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
}
/**
 * Tells whether the text stripper separates its output by beads.
 *
 * @return true if the text will be grouped by beads.
 */
public boolean getSeparateByBeads() {
    return this.shouldSeparateByBeads;
}
/**
 * Chooses whether the text stripper groups its output by a list of beads.
 * The default value is true.
 *
 * @param aShouldSeparateByBeads
 *            true to group the text by beads.
 */
public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) {
    this.shouldSeparateByBeads = aShouldSeparateByBeads;
}
/**
 * Returns the bookmark at which text extraction ends, inclusive. The
 * default is null.
 *
 * @return the ending bookmark.
 */
public PDOutlineItem getEndBookmark() {
    return this.endBookmark;
}
/**
 * Chooses the bookmark at which text extraction should stop.
 *
 * @param aEndBookmark
 *            the ending bookmark.
 */
public void setEndBookmark(PDOutlineItem aEndBookmark) {
    this.endBookmark = aEndBookmark;
}
/**
 * Returns the bookmark at which text extraction starts, inclusive. The
 * default is null.
 *
 * @return the starting bookmark.
 */
public PDOutlineItem getStartBookmark() {
    return this.startBookmark;
}
/**
 * Chooses the bookmark at which text extraction should start, inclusive.
 *
 * @param aStartBookmark
 *            the starting bookmark.
 */
public void setStartBookmark(PDOutlineItem aStartBookmark) {
    this.startBookmark = aStartBookmark;
}
/**
 * Tells whether the text stripper adds some more text formatting.
 *
 * @return true if additional text formatting will be added.
 */
public boolean getAddMoreFormatting() {
    return this.addMoreFormatting;
}
/**
 * Chooses whether some additional text formatting is added to the output.
 * The default is false.
 *
 * @param newAddMoreFormatting
 *            true to request the additional text formatting.
 */
public void setAddMoreFormatting(boolean newAddMoreFormatting) {
    this.addMoreFormatting = newAddMoreFormatting;
}
/**
 * Tells whether the text tokens are sorted before they are written to the
 * stream.
 *
 * @return true if the text tokens will be sorted before being written.
 */
public boolean getSortByPosition() {
    return this.sortByPosition;
}
/**
 * Chooses whether the text tokens are sorted by position before they are
 * processed.
 *
 * The order of the text tokens in a PDF file may differ from the order in
 * which they appear visually on the screen. A PDF writer may, for example,
 * write out all text by font (all bold or larger text first, then the
 * normal text in a second pass), or may even write each character in a
 * different order.
 *
 * The default is not to sort by position, for performance reasons.
 *
 * @param newSortByPosition
 *            true to sort the text positions.
 */
public void setSortByPosition(boolean newSortByPosition) {
    this.sortByPosition = newSortByPosition;
}
/**
 * Returns the space-width based tolerance that is used to estimate where
 * spaces should be added to the text. The default value has been
 * determined by trial and error.
 *
 * @return the current tolerance / scaling factor.
 */
public float getSpacingTolerance() {
    return this.spacingTolerance;
}
/**
 * Chooses the space-width based tolerance that is used to estimate where
 * spaces should be added to the text. The default value has been
 * determined by trial and error. A larger value reduces the number of
 * spaces added.
 *
 * @param spacingToleranceValue
 *            the tolerance / scaling factor to use.
 */
public void setSpacingTolerance(float spacingToleranceValue) {
    this.spacingTolerance = spacingToleranceValue;
}
/**
 * Returns the character-width based tolerance that is used to estimate
 * where spaces should be added to the text. The default value has been
 * determined by trial and error.
 *
 * @return the current tolerance / scaling factor.
 */
public float getAverageCharTolerance() {
    return this.averageCharTolerance;
}
/**
 * Chooses the character-width based tolerance that is used to estimate
 * where spaces should be added to the text. The default value has been
 * determined by trial and error. A larger value reduces the number of
 * spaces added.
 *
 * @param averageCharToleranceValue
 *            the average tolerance / scaling factor to use.
 */
public void setAverageCharTolerance(float averageCharToleranceValue) {
    this.averageCharTolerance = averageCharToleranceValue;
}
/**
 * Returns the indent threshold: the multiple of the whitespace character
 * width by which a line start may be indented relative to the previous line
 * start before it is considered to be a paragraph start.
 *
 * @return the number of whitespace character widths to use when detecting
 *         paragraph indents.
 */
public float getIndentThreshold() {
    return this.indentThreshold;
}
/**
 * Chooses the indent threshold: the multiple of the whitespace character
 * width by which a line start may be indented relative to the previous line
 * start before it is considered to be a paragraph start. The default value
 * is 2.0.
 *
 * @param indentThresholdValue
 *            the number of whitespace character widths to use when
 *            detecting paragraph indents.
 */
public void setIndentThreshold(float indentThresholdValue) {
    this.indentThreshold = indentThresholdValue;
}
/**
 * Returns the drop threshold: the minimum whitespace, expressed as a
 * multiple of the maximum height of the current characters, beyond which a
 * line start is considered to be a paragraph start.
 *
 * @return the character height multiple for the maximum whitespace allowed
 *         between lines of the same paragraph.
 */
public float getDropThreshold() {
    return this.dropThreshold;
}
/**
 * Chooses the drop threshold: the minimum whitespace, expressed as a
 * multiple of the maximum height of the current characters, beyond which a
 * line start is considered to be a paragraph start. The default value is
 * 2.5.
 *
 * @param dropThresholdValue
 *            the character height multiple for the maximum whitespace
 *            allowed between lines of the same paragraph.
 */
public void setDropThreshold(float dropThresholdValue) {
    this.dropThreshold = dropThresholdValue;
}
/**
 * Returns the string that is written at the beginning of a paragraph.
 *
 * @return the paragraph start string.
 */
public String getParagraphStart() {
    return this.paragraphStart;
}
/**
 * Chooses the string that is written at the beginning of a paragraph.
 *
 * @param s
 *            the paragraph start string.
 */
public void setParagraphStart(String s) {
    this.paragraphStart = s;
}
/**
 * Returns the string that is written at the end of a paragraph.
 *
 * @return the paragraph end string.
 */
public String getParagraphEnd() {
    return this.paragraphEnd;
}
/**
 * Chooses the string that is written at the end of a paragraph.
 *
 * @param s
 *            the paragraph end string.
 */
public void setParagraphEnd(String s) {
    this.paragraphEnd = s;
}
/**
 * Returns the string that is written at the beginning of a page.
 *
 * @return the page start string.
 */
public String getPageStart() {
    return this.pageStart;
}
/**
 * Chooses the string that is written at the beginning of a page.
 *
 * @param pageStartValue
 *            the page start string.
 */
public void setPageStart(String pageStartValue) {
    this.pageStart = pageStartValue;
}
/**
 * Returns the string that is written at the end of a page.
 *
 * @return the page end string.
 */
public String getPageEnd() {
    return this.pageEnd;
}
/**
 * Chooses the string that is written at the end of a page.
 *
 * @param pageEndValue
 *            the page end string.
 */
public void setPageEnd(String pageEndValue) {
    this.pageEnd = pageEndValue;
}
/**
 * Returns the string that is written at the beginning of an article.
 *
 * @return the article start string.
 */
public String getArticleStart() {
    return this.articleStart;
}
/**
 * Chooses the string that is written at the beginning of an article.
 *
 * @param articleStartValue
 *            the article start string.
 */
public void setArticleStart(String articleStartValue) {
    this.articleStart = articleStartValue;
}
/**
 * Returns the string that is written at the end of an article.
 *
 * @return the article end string.
 */
public String getArticleEnd() {
    return this.articleEnd;
}
/**
 * Chooses the string that is written at the end of an article.
 *
 * @param articleEndValue
 *            the article end string.
 */
public void setArticleEnd(String articleEndValue) {
    this.articleEnd = articleEndValue;
}
/**
 * Reverses the characters of a compound Arabic glyph.
 *
 * The string is only touched when sorting by position is enabled, it holds
 * at least two characters and every one of them has the directionality
 * RIGHT_TO_LEFT_ARABIC. In that case the characters are flipped from
 * logical to visual order, so that the bidirectional algorithm that runs
 * later converts them back to logical order. In every other case the
 * argument is handed back unchanged (including null).
 *
 * @param str
 *            a string obtained from font.encoding()
 *
 * @return the reversed string, or str itself when no reversal applies
 */
public String inspectFontEncoding(String str) {
    final boolean worthInspecting = sortByPosition && str != null && str.length() >= 2;
    if (!worthInspecting) {
        return str;
    }
    final char[] logicalOrder = str.toCharArray();
    // a single non-Arabic character disqualifies the whole string
    for (char c : logicalOrder) {
        if (Character.getDirectionality(c) != Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) {
            return str;
        }
    }
    // copy back to front, one char at a time
    final char[] visualOrder = new char[logicalOrder.length];
    int target = 0;
    for (int source = logicalOrder.length - 1; source >= 0; --source) {
        visualOrder[target++] = logicalOrder[source];
    }
    return new String(visualOrder);
}
/**
 * Emits the separator(s) needed when a new line begins at the given
 * position.
 *
 * The current position is flagged as a line start and classified through
 * {@link #isParagraphSeparation(PositionWrapper, PositionWrapper, PositionWrapper, float)}.
 * If it turns out to start a paragraph directly after an article start,
 * only the paragraph start is written; if it starts a paragraph elsewhere,
 * a line separator followed by a paragraph separator is written; otherwise
 * just a line separator is written.
 *
 * @param current
 *            the current text position
 * @param lastPosition
 *            the previous text position
 * @param lastLineStartPosition
 *            the last text position that followed a line separator.
 * @param maxHeightForLine
 *            max height for positions since lastLineStartPosition
 * @return the start position of the line just begun, i.e. current
 * @throws IOException
 *             if something went wrong
 */
protected PositionWrapper handleLineSeparation(PositionWrapper current, PositionWrapper lastPosition,
        PositionWrapper lastLineStartPosition, float maxHeightForLine) throws IOException {
    current.setLineStart();
    isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);

    final boolean startsParagraph = current.isParagraphStart();
    if (startsParagraph && lastPosition.isArticleStart()) {
        // first paragraph of an article: no line break in front of it
        writeParagraphStart();
        return current;
    }

    writeLineSeparator();
    if (startsParagraph) {
        writeParagraphSeparator();
    }
    return current;
}
/**
 * Decides whether the gap between the previous and the current text
 * position represents a paragraph separation, and records the outcome on
 * the current position. It should only be called for consecutive text
 * positions that have already passed the line separation test.
 *
 * The current position is flagged as a paragraph start when
 * <ul>
 * <li>lastLineStartPosition is null, or</li>
 * <li>the vertical distance to the previous position exceeds
 * {@link #getDropThreshold()} times maxHeightForLine, or</li>
 * <li>it is indented against the last line start by more than
 * {@link #getIndentThreshold()} times the width of a space and the last
 * line start was not itself a paragraph start (if it was, the current
 * position is flagged as a hanging indent instead), or</li>
 * <li>it lies more than one space width to the left of the last line start
 * and the last line start was not a paragraph start, or</li>
 * <li>it lines up with the last line start (within a quarter of its own
 * width), the last line start was a paragraph start but not a hanging
 * indent, and both positions match the very same list item pattern.</li>
 * </ul>
 * A position that lines up with a hanging indent is flagged as a hanging
 * indent as well.
 *
 * @param position
 *            the current text position. This may have its isParagraphStart
 *            or isHangingIndent flags set upon return.
 * @param lastPosition
 *            the previous text position (should not be null).
 * @param lastLineStartPosition
 *            the last text position that followed a line separator. May be
 *            null.
 * @param maxHeightForLine
 *            max height for text positions since lastLineStartPosition.
 */
protected void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition,
        PositionWrapper lastLineStartPosition, float maxHeightForLine) {
    if (lastLineStartPosition == null) {
        // nothing to compare against: treat as the start of a paragraph
        position.setParagraphStart();
        return;
    }

    final float verticalGap = Math
            .abs(position.getTextPosition().getYDirAdj() - lastPosition.getTextPosition().getYDirAdj());
    final float dropLimit = multiplyFloat(getDropThreshold(), maxHeightForLine);
    // do we need to flip this for rtl?
    final float horizontalGap = position.getTextPosition().getXDirAdj()
            - lastLineStartPosition.getTextPosition().getXDirAdj();
    final float indentLimit = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace());
    final float alignmentSlack = multiplyFloat(0.25f, position.getTextPosition().getWidth());

    boolean startsParagraph = false;
    if (verticalGap > dropLimit) {
        startsParagraph = true;
    } else if (horizontalGap > indentLimit) {
        // indented text; an indent right below a paragraph start is taken
        // to be a hanging indent rather than a new paragraph
        if (lastLineStartPosition.isParagraphStart()) {
            position.setHangingIndent();
        } else {
            startsParagraph = true;
        }
    } else if (horizontalGap < -position.getTextPosition().getWidthOfSpace()) {
        // text is left of the previous line start; unless that line opened
        // a paragraph, this one does
        startsParagraph = !lastLineStartPosition.isParagraphStart();
    } else if (Math.abs(horizontalGap) < alignmentSlack) {
        // within a quarter of a character of the last line start: treat
        // the two as lined up
        if (lastLineStartPosition.isHangingIndent()) {
            position.setHangingIndent();
        } else if (lastLineStartPosition.isParagraphStart()) {
            // consecutive lines that look like items of the same list
            // format each start their own paragraph; the patterns are
            // compared by identity
            final Pattern previousItem = matchListItemPattern(lastLineStartPosition);
            startsParagraph = previousItem != null && previousItem == matchListItemPattern(position);
        }
    }

    if (startsParagraph) {
        position.setParagraphStart();
    }
}
/**
 * Multiplies two floats and rounds the product to three decimal places, to
 * avoid wrong results when the product is compared with another float.
 *
 * @param value1
 *            first factor
 * @param value2
 *            second factor
 * @return the product, rounded to the nearest thousandth
 */
private float multiplyFloat(float value1, float value2) {
    final float scaledProduct = value1 * value2 * 1000;
    return Math.round(scaledProduct) / 1000f;
}
/**
 * Writes the paragraph separator to the output, which consists of the
 * paragraph end followed by the paragraph start.
 *
 * @throws IOException
 *             if something went wrong
 */
protected void writeParagraphSeparator() throws IOException {
    this.writeParagraphEnd();
    this.writeParagraphStart();
}
/**
 * Writes the paragraph start string to the output and marks a paragraph as
 * open. A paragraph that is still open is ended first.
 *
 * @throws IOException
 *             if something went wrong
 */
protected void writeParagraphStart() throws IOException {
    if (this.inParagraph) {
        // close the paragraph that is still open before starting a new one
        writeParagraphEnd();
        this.inParagraph = false;
    }
    final Writer sink = this.output;
    sink.write(getParagraphStart());
    this.inParagraph = true;
}
/**
 * Writes the paragraph end string to the output and marks the paragraph as
 * closed. If no paragraph is open, one is started first so that every end
 * has a matching start.
 *
 * @throws IOException
 *             if something went wrong
 */
protected void writeParagraphEnd() throws IOException {
    final boolean needsOpening = !this.inParagraph;
    if (needsOpening) {
        writeParagraphStart();
    }
    final Writer sink = this.output;
    sink.write(getParagraphEnd());
    this.inParagraph = false;
}
/**
 * Writes the page start string (see {@link #getPageStart()}) to the output.
 *
 * @throws IOException
 *             if something went wrong
 */
protected void writePageStart() throws IOException {
    final Writer sink = this.output;
    sink.write(getPageStart());
}
/**
 * Writes the page end string (see {@link #getPageEnd()}) to the output.
 *
 * @throws IOException
 *             if something went wrong
 */
protected void writePageEnd() throws IOException {
    final Writer sink = this.output;
    sink.write(getPageEnd());
}
/**
 * Looks up the list item Pattern that matches the text at the given
 * position. The candidates are those returned by
 * {@link #getListItemPatterns()}; to extend them, override that method
 * (when sub-classing) or supply your own list through
 * {@link #setListItemPatterns(List)}.
 *
 * @param pw
 *            position
 * @return the matching pattern, or null if the text matches none of them
 */
protected Pattern matchListItemPattern(PositionWrapper pw) {
    return matchPattern(pw.getTextPosition().getCharacter(), getListItemPatterns());
}
/**
 * Regular expressions that match commonly used list item formats, i.e.
 * bullets, numbers, letters, Roman numerals, etc. Not meant to be
 * comprehensive. Each expression has to match the whole text it is tested
 * against.
 */
private static final String[] LIST_ITEM_EXPRESSIONS = {
    "\\.",          // a single period
    "\\d+\\.",      // 1.  2.  ...
    "\\[\\d+\\]",   // [1] [2] ...
    "\\d+\\)",      // 1)  2)  ...
    "[A-Z]\\.",     // A.  B.  ...
    "[a-z]\\.",     // a.  b.  ...
    "[A-Z]\\)",     // A)  B)  ...
    "[a-z]\\)",     // a)  b)  ...
    "[IVXL]+\\.",   // upper-case Roman numerals followed by a period
    "[ivxl]+\\.",   // lower-case Roman numerals followed by a period
};
/**
 * The compiled list item patterns; null until they are either built from
 * LIST_ITEM_EXPRESSIONS by getListItemPatterns() or supplied through
 * setListItemPatterns(List).
 */
private List<Pattern> listOfPatterns = null;
/**
 * Supplies a different set of regular expression patterns for matching list
 * item starts. The list is stored as given (no copy is made) and replaces
 * whatever {@link #getListItemPatterns()} returned before.
 *
 * @param patterns
 *            list of patterns
 */
protected void setListItemPatterns(List<Pattern> patterns) {
    listOfPatterns = patterns;
}
/**
 * Returns a list of regular expression Patterns representing different
 * common list item formats. For example numbered items of the form
 *
 * <pre>
 * 1. some text
 * 2. more text
 * </pre>
 *
 * or
 *
 * <pre>
 * [1] some text
 * [2] more text
 * </pre>
 *
 * all begin with some character pattern: "\\d+\\." matches "1.", "2.", ...
 * and "\\[\\d+\\]" matches "[1]", "[2]", ...
 *
 * On first use the list is compiled from LIST_ITEM_EXPRESSIONS and cached;
 * a list supplied through {@link #setListItemPatterns(List)} takes
 * precedence. The cached list itself is returned, not a copy.
 *
 * @return a list of Pattern objects.
 */
protected List<Pattern> getListItemPatterns() {
    if (listOfPatterns == null) {
        // build the list completely before publishing it in the field
        List<Pattern> compiled = new ArrayList<Pattern>(LIST_ITEM_EXPRESSIONS.length);
        for (String expression : LIST_ITEM_EXPRESSIONS) {
            compiled.add(Pattern.compile(expression));
        }
        listOfPatterns = compiled;
    }
    return listOfPatterns;
}
/**
 * Iterates over the given list of Patterns until it finds one that matches
 * the whole of the given string, then returns that Pattern.
 *
 * The order of the supplied patterns is important, as the first match wins;
 * the most common patterns should come first. Patterns should be strict in
 * general, and all are used with case sensitivity on.
 *
 * @param string
 *            the string to be searched; null matches nothing
 * @param patterns
 *            list of patterns; null is treated like an empty list
 * @return the first matching pattern, or null if there is none
 */
protected static final Pattern matchPattern(String string, List<Pattern> patterns) {
    // Pattern.matcher(null) would throw; an absent text simply has no match
    if (string == null || patterns == null) {
        return null;
    }
    for (Pattern p : patterns) {
        if (p.matcher(string).matches()) {
            return p;
        }
    }
    return null;
}
/**
 * Writes the words of one line of the document, placing a word separator
 * between consecutive words (but not after the last one).
 *
 * @param line
 *            a list with the words of the given line
 * @param isRtlDominant
 *            determines if rtl or ltr is dominant; not consulted by this
 *            implementation
 * @throws IOException
 *             if something went wrong
 */
private void writeLine(List<WordWithTextPositions> line, boolean isRtlDominant) throws IOException {
    int numberOfStrings = line.size();
    for (int i = 0; i < numberOfStrings; i++) {
        WordWithTextPositions word = line.get(i);
        writeString(word.getText(), word.getTextPositions());
        if (i < numberOfStrings - 1) {
            writeWordSeparator();
        }
    }
}
/**
 * Normalizes the given list of TextPositions into words.
 *
 * The positions are fed one by one to
 * {@link #normalizeAdd(LinkedList, StringBuilder, List, TextPosition)}, in
 * reverse order when right-to-left text is dominant. Text that is still
 * pending after the last position becomes the final word.
 *
 * @param line
 *            list of TextPositions
 * @param isRtlDominant
 *            determines if rtl or ltr is dominant
 * @param hasRtl
 *            determines if the line contains rtl formatted text (parts);
 *            not consulted by this implementation
 * @return a list of words, each with the text positions it was built from
 */
private List<WordWithTextPositions> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl) {
    LinkedList<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>();
    StringBuilder lineBuilder = new StringBuilder();
    List<TextPosition> wordPositions = new ArrayList<TextPosition>();
    // concatenate the pieces of text in opposite order if RTL is dominant
    if (isRtlDominant) {
        int numberOfPositions = line.size();
        for (int i = numberOfPositions - 1; i >= 0; i--) {
            lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, line.get(i));
        }
    } else {
        for (TextPosition text : line) {
            lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, text);
        }
    }
    // flush the word that was still being collected when the line ended
    if (lineBuilder.length() > 0) {
        normalized.add(createWord(lineBuilder.toString(), wordPositions));
    }
    return normalized;
}
/**
 * Used within {@link #normalize(List, boolean, boolean)} to create a single
 * {@link WordWithTextPositions} entry. The word text is passed through the
 * normalizePres method of the normalize field before it is stored; the list
 * of positions is stored as given.
 *
 * @param word
 *            the raw text of the word
 * @param wordPositions
 *            the text positions the word was assembled from
 * @return the new entry
 */
private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions) {
    return new WordWithTextPositions(normalize.normalizePres(word), wordPositions);
}
/**
 * Used within {@link #normalize(List, boolean, boolean)} to handle a
 * {@link TextPosition}.
 *
 * A {@link WordSeparator} closes the current word: the text collected so
 * far is appended to the result as a word (without checking whether it is
 * empty), together with a copy of the collected positions, and collection
 * starts afresh. Any other position contributes its character to the
 * current word.
 *
 * @param normalized
 *            the words completed so far; receives the closed word
 * @param lineBuilder
 *            the text of the word currently being collected
 * @param wordPositions
 *            the positions of the word currently being collected; cleared
 *            when the word is closed
 * @param text
 *            the position to handle
 * @return The StringBuilder that must be used when calling this method
 *         again.
 */
private StringBuilder normalizeAdd(LinkedList<WordWithTextPositions> normalized, StringBuilder lineBuilder,
        List<TextPosition> wordPositions, TextPosition text) {
    if (text instanceof WordSeparator) {
        // copy the positions: wordPositions is reused for the next word
        normalized.add(createWord(lineBuilder.toString(), new ArrayList<TextPosition>(wordPositions)));
        lineBuilder = new StringBuilder();
        wordPositions.clear();
    } else {
        lineBuilder.append(text.getCharacter());
        wordPositions.add(text);
    }
    return lineBuilder;
}
/**
 * Internal marker class, used as a placeholder between words in a line of
 * TextPositions. A single shared instance exists and is obtained through
 * {@link #getSeparator()}.
 *
 * @author ME21969
 *
 */
private static final class WordSeparator extends TextPosition {
    private static final WordSeparator separator = new WordSeparator();

    private WordSeparator() {
        // nothing to initialise; instances are only created by this class
    }

    public static WordSeparator getSeparator() {
        return WordSeparator.separator;
    }
}
/**
* Internal class that maps strings to lists of {@link TextPosition} arrays.
* Note that the number of entries in that list may differ from the number
* of characters in the string due to normalization.
*
* @author Axel D�rfler
*/
private static final class WordWithTextPositions {
protected String text;
protected List textPositions;
public WordWithTextPositions(String word, List positions) {
text = word;
textPositions = positions;
}
public String getText() {
return text;
}
public List getTextPositions() {
return textPositions;
}
}
}