org.apache.pdfbox.text.PDFTextStripper Maven / Gradle / Ivy
Show all versions of pdfbox Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.text;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.StringWriter;
import java.io.Writer;
import java.text.Bidi;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.util.QuickSort;
/**
* This class will take a pdf document and strip out all of the text and ignore the formatting and such. Please note; it
* is up to clients of this class to verify that a specific user has the correct permissions to extract text from the
* PDF document.
*
* The basic flow of this process is that we get a document and use a series of processXXX() functions that work on
* smaller and smaller chunks of the page. Eventually, we fully process each page and then print it.
*
* @author Ben Litchfield
*/
public class PDFTextStripper extends PDFTextStreamEngine
{
// Default indent/drop thresholds. A static initializer of this class may replace them with
// values read from system properties; new instances copy them into their own fields.
private static float defaultIndentThreshold = 2.0f;
private static float defaultDropThreshold = 2.5f;
// Chooses between QuickSort.sort and Collections.sort in writePage(); assigned exactly once
// by a static initializer of this class.
private static final boolean useCustomQuickSort;
private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
// The default indent/drop thresholds can be overridden with -D system properties:
// pdftextstripper.indent
// pdftextstripper.drop
static
{
    String indentValue = null;
    String dropValue = null;
    try
    {
        String prefix = PDFTextStripper.class.getSimpleName().toLowerCase();
        indentValue = System.getProperty(prefix + ".indent");
        dropValue = System.getProperty(prefix + ".drop");
    }
    catch (SecurityException e)
    {
        // PDFBOX-1946: property access can be denied (e.g. in an applet);
        // whatever was read so far is used, the rest keeps its default
    }
    if (indentValue != null && indentValue.length() != 0)
    {
        try
        {
            defaultIndentThreshold = Float.parseFloat(indentValue);
        }
        catch (NumberFormatException nfe)
        {
            // not a number: keep the built-in default
        }
    }
    if (dropValue != null && dropValue.length() != 0)
    {
        try
        {
            defaultDropThreshold = Float.parseFloat(dropValue);
        }
        catch (NumberFormatException nfe)
        {
            // not a number: keep the built-in default
        }
    }
}
static
{
    // Decide whether the custom quicksort is needed as a workaround for the
    // PDFBOX-1512 transitivity issue of TextPositionComparator. It is used
    // unless the runtime reports a specification version of 1.6 or lower.
    boolean legacyRuntime = false;
    try
    {
        StringTokenizer parts = new StringTokenizer(
                System.getProperty("java.specification.version"), ".");
        int major = Integer.parseInt(parts.nextToken());
        int minor = parts.hasMoreTokens() ? Integer.parseInt(parts.nextToken()) : 0;
        legacyRuntime = major == 1 && minor <= 6;
    }
    catch (SecurityException x)
    {
        // property access denied (e.g. applet): assume 1.7 or higher so quicksort is used
    }
    catch (NumberFormatException nfe)
    {
        // unexpected version format: assume 1.7 or higher so quicksort is used
    }
    useCustomQuickSort = !legacyRuntime;
}
/**
 * The platform's line separator.
 */
protected final String LINE_SEPARATOR = System.getProperty("line.separator");
// Separator and marker strings. writeText() overwrites paragraphEnd, pageStart, articleStart
// and articleEnd with lineSeparator when addMoreFormatting is enabled.
private String lineSeparator = LINE_SEPARATOR;
private String wordSeparator = " ";
private String paragraphStart = "";
private String paragraphEnd = "";
private String pageStart = "";
private String pageEnd = LINE_SEPARATOR;
private String articleStart = "";
private String articleEnd = "";
// Incremented once per page by processPages(); reset to 0 by resetEngine().
private int currentPageNo = 0;
// Inclusive page range that processPage() compares currentPageNo against.
private int startPage = 1;
private int endPage = Integer.MAX_VALUE;
private PDOutlineItem startBookmark = null;
// 1-based bookmark pages; recomputed by processPages(), where -1 means undefined
private int startBookmarkPageNumber = -1;
private int endBookmarkPageNumber = -1;
private PDOutlineItem endBookmark = null;
// When true, processTextPosition() skips a glyph if the same string was already recorded
// at (nearly) the same x/y position on the page.
private boolean suppressDuplicateOverlappingText = true;
// When true, processPage() builds the bead rectangles and sizes charactersByArticle from them.
private boolean shouldSeparateByBeads = true;
// NOTE(review): presumably the value behind getSortByPosition(), which writePage() consults
// before sorting; the getter is not visible here, confirm.
private boolean sortByPosition = false;
// Read through getAddMoreFormatting() in writeText().
private boolean addMoreFormatting = false;
private float indentThreshold = defaultIndentThreshold;
private float dropThreshold = defaultDropThreshold;
// we will need to estimate where to add spaces, these are used to help guess
private float spacingTolerance = .5f;
private float averageCharTolerance = .3f;
private List beadRectangles = null;
/**
* The charactersByArticle is used to extract text by article divisions. For example a PDF that has two columns like
* a newspaper, we want to extract the first column and then the second column. In this example the PDF would have 2
* beads(or articles), one for each column. The size of the charactersByArticle would be 5, because not all text on
* the screen will fall into one of the articles. The five divisions are shown below
*
* Text before first article
* first article text
* text between first article and second article
* second article text
* text after second article
*
* Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
*/
protected ArrayList> charactersByArticle = new ArrayList>();
private Map>> characterListMapping = new HashMap>>();
protected PDDocument document;
protected Writer output;
/**
* True if we started a paragraph but haven't ended it yet.
*/
private boolean inParagraph;
/**
 * Creates a text stripper that uses the default settings of this class.
 *
 * @throws IOException declared by the signature; the constructor body itself performs no work.
 */
public PDFTextStripper() throws IOException
{
    // nothing to initialise beyond the field defaults
}
/**
 * Extracts the text of a document into a string by delegating to
 * {@link #writeText(PDDocument, Writer)} with an in-memory writer.
 * NOTE: The document must not be encrypted when coming into this method.
 *
 * @param doc The document to get the text from.
 * @return The text of the PDF document.
 * @throws IOException if the doc state is invalid or it is encrypted.
 */
public String getText(PDDocument doc) throws IOException
{
    StringWriter buffer = new StringWriter();
    writeText(doc, buffer);
    String extracted = buffer.toString();
    return extracted;
}
// Returns the stripper to its pre-extraction state: forgets the document, resets the page
// counter and empties the per-page text collections (when they exist).
private void resetEngine()
{
    document = null;
    currentPageNo = 0;
    if (null != charactersByArticle)
    {
        charactersByArticle.clear();
    }
    if (null != characterListMapping)
    {
        characterListMapping.clear();
    }
}
/**
 * Extracts the text of a document and writes it to the given writer. The engine state is
 * reset first; then {@link #startDocument(PDDocument)}, {@link #processPages(PDPageTree)} and
 * {@link #endDocument(PDDocument)} are called in that order.
 *
 * @param doc The document to get the data from.
 * @param outputStream The location to put the text.
 *
 * @throws IOException If the doc is in an invalid state.
 */
public void writeText(PDDocument doc, Writer outputStream) throws IOException
{
    resetEngine();
    output = outputStream;
    document = doc;
    if (getAddMoreFormatting())
    {
        // extra formatting: paragraphs, pages and articles are set off by line separators
        final String separator = lineSeparator;
        paragraphEnd = separator;
        pageStart = separator;
        articleStart = separator;
        articleEnd = separator;
    }
    startDocument(document);
    PDPageTree allPages = document.getPages();
    processPages(allPages);
    endDocument(document);
}
/**
 * Resolves the start/end bookmarks to 1-based page numbers and then walks all pages,
 * handing every page that has contents to {@link #processPage(PDPage)}. The page counter is
 * advanced for every page, whether or not it has contents.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text.
 */
protected void processPages(PDPageTree pages) throws IOException
{
    PDPage startTarget = null;
    if (startBookmark != null)
    {
        startTarget = startBookmark.findDestinationPage(document);
    }
    // -1 = undefined
    startBookmarkPageNumber = startTarget == null ? -1 : pages.indexOf(startTarget) + 1;

    PDPage endTarget = null;
    if (endBookmark != null)
    {
        endTarget = endBookmark.findDestinationPage(document);
    }
    // -1 = undefined
    endBookmarkPageNumber = endTarget == null ? -1 : pages.indexOf(endTarget) + 1;

    boolean bothUnresolved = startBookmarkPageNumber == -1 && startBookmark != null
            && endBookmarkPageNumber == -1 && endBookmark != null;
    if (bothUnresolved && startBookmark.getCOSObject() == endBookmark.getCOSObject())
    {
        // special case: start and end bookmark are the same object but point to
        // nothing, so no text at all is extracted
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }

    for (PDPage page : pages)
    {
        currentPageNo++;
        if (!page.hasContents())
        {
            continue;
        }
        processPage(page);
    }
}
/**
 * Hook for subclasses; {@link #writeText(PDDocument, Writer)} calls it before any page is
 * processed. This implementation is empty.
 *
 * @param document The PDF document that is being processed.
 * @throws IOException If an IO error occurs.
 */
protected void startDocument(PDDocument document) throws IOException
{
    // intentionally empty: extension point for subclasses
}
/**
 * Hook for subclasses; {@link #writeText(PDDocument, Writer)} calls it after all pages have
 * been processed. This implementation is empty.
 *
 * @param document The PDF document that is being processed.
 * @throws IOException If an IO error occurs.
 */
protected void endDocument(PDDocument document) throws IOException
{
    // intentionally empty: extension point for subclasses
}
/**
 * This will process the contents of a page. The page is only processed when the current page
 * number lies inside the configured page range and inside the range given by the start/end
 * bookmarks (a bookmark page number of -1 means "no restriction").
 *
 * Before the content stream is parsed, {@code charactersByArticle} is resized to exactly one
 * list per article section of this page (1 + 2 * number of beads): lists left over from a
 * previous page are reused after being cleared, surplus lists are removed and missing ones
 * are added.
 *
 * @param page The page to process.
 *
 * @throws IOException If there is an error processing the page.
 */
@Override
public void processPage(PDPage page) throws IOException
{
    if (currentPageNo >= startPage && currentPageNo <= endPage
            && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber)
            && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber))
    {
        startPage(page);
        int numberOfArticleSections = 1;
        if (shouldSeparateByBeads)
        {
            fillBeadRectangles(page);
            numberOfArticleSections += beadRectangles.size() * 2;
        }
        int originalSize = charactersByArticle.size();
        charactersByArticle.ensureCapacity(numberOfArticleSections);
        // Drop lists this page does not need. The previous single loop never reached its
        // remove branch, so a page with fewer sections kept the stale (empty) lists of the
        // page before it. Removing from the end avoids index shifting.
        for (int i = originalSize - 1; i >= numberOfArticleSections; i--)
        {
            charactersByArticle.remove(i);
        }
        // Reuse the lists that remain.
        int retained = Math.min(originalSize, numberOfArticleSections);
        for (int i = 0; i < retained; i++)
        {
            charactersByArticle.get(i).clear();
        }
        // Add lists for sections that did not exist before.
        for (int i = originalSize; i < numberOfArticleSections; i++)
        {
            charactersByArticle.add(new ArrayList<TextPosition>());
        }
        characterListMapping.clear();
        super.processPage(page);
        writePage();
        endPage(page);
    }
}
// Rebuilds beadRectangles for the given page: one entry per thread bead, in bead order.
// Each rectangle is converted from PDF coordinates to the coordinates used for glyphs and
// shifted by the crop box origin. A null bead, or a bead that has no rectangle, produces a
// null entry so that the indices still line up with the beads.
private void fillBeadRectangles(PDPage page)
{
    beadRectangles = new ArrayList<PDRectangle>();
    for (PDThreadBead bead : page.getThreadBeads())
    {
        PDRectangle rect = bead == null ? null : bead.getRectangle();
        if (rect == null)
        {
            // can't skip, because of null entry handling in processTextPosition();
            // a bead without a rectangle previously caused a NullPointerException below
            beadRectangles.add(null);
            continue;
        }
        // bead rectangle is in PDF coordinates (y=0 is bottom),
        // glyphs are in image coordinates (y=0 is top),
        // so we must flip
        PDRectangle mediaBox = page.getMediaBox();
        float upperRightY = mediaBox.getUpperRightY() - rect.getLowerLeftY();
        float lowerLeftY = mediaBox.getUpperRightY() - rect.getUpperRightY();
        rect.setLowerLeftY(lowerLeftY);
        rect.setUpperRightY(upperRightY);
        // adjust for cropbox
        PDRectangle cropBox = page.getCropBox();
        if (cropBox.getLowerLeftX() != 0 || cropBox.getLowerLeftY() != 0)
        {
            rect.setLowerLeftX(rect.getLowerLeftX() - cropBox.getLowerLeftX());
            rect.setLowerLeftY(rect.getLowerLeftY() - cropBox.getLowerLeftY());
            rect.setUpperRightX(rect.getUpperRightX() - cropBox.getLowerLeftX());
            rect.setUpperRightY(rect.getUpperRightY() - cropBox.getLowerLeftY());
        }
        beadRectangles.add(rect);
    }
}
/**
 * Start a new article, which is typically defined as a column on a single page (also referred to as a bead). This
 * assumes that the primary direction of text is left to right and simply forwards to
 * {@link #startArticle(boolean)} with {@code true}.
 *
 * @throws IOException If there is any error writing to the stream.
 */
protected void startArticle() throws IOException
{
    final boolean leftToRight = true;
    startArticle(leftToRight);
}
/**
 * Start a new article, which is typically defined as a column on a single page (also referred to as a bead).
 * This implementation writes the article start marker to the output and does not use the
 * direction flag. Subclasses may provide additional information.
 *
 * @param isLTR true if primary direction of text is left to right.
 * @throws IOException If there is any error writing to the stream.
 */
protected void startArticle(boolean isLTR) throws IOException
{
    String marker = getArticleStart();
    output.write(marker);
}
/**
 * End an article. This implementation writes the article end marker to the output.
 * Subclasses may provide additional information.
 *
 * @throws IOException If there is any error writing to the stream.
 */
protected void endArticle() throws IOException
{
    String marker = getArticleEnd();
    output.write(marker);
}
/**
 * Hook called by {@link #processPage(PDPage)} before a page inside the extraction range is
 * processed. This implementation is empty; subclasses may provide additional information.
 *
 * @param page The page we are about to process.
 *
 * @throws IOException If there is any error writing to the stream.
 */
protected void startPage(PDPage page) throws IOException
{
    // intentionally empty: extension point for subclasses
}
/**
 * Hook called by {@link #processPage(PDPage)} after the text of a page has been written.
 * This implementation is empty; subclasses may provide additional information.
 *
 * @param page The page that has just been processed.
 *
 * @throws IOException If there is any error writing to the stream.
 */
protected void endPage(PDPage page) throws IOException
{
    // intentionally empty: extension point for subclasses
}
// Sentinel "not set yet" values for the running line metrics kept by writePage(). The
// metrics start at these values, and writePage() resets some of them when a new line begins.
private static final float END_OF_LAST_TEXT_X_RESET_VALUE = -1;
private static final float MAX_Y_FOR_LINE_RESET_VALUE = -Float.MAX_VALUE;
private static final float EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE = -Float.MAX_VALUE;
private static final float MAX_HEIGHT_FOR_LINE_RESET_VALUE = -1;
private static final float MIN_Y_TOP_FOR_LINE_RESET_VALUE = Float.MAX_VALUE;
private static final float LAST_WORD_SPACING_RESET_VALUE = -1;
/**
 * This will print the text of the processed page to "output". It will estimate, based on the coordinates of the
 * text, where newlines and word spacings should be placed. The text will be sorted only if that feature was
 * enabled.
 *
 * Each article list in {@code charactersByArticle} is written in turn, wrapped in
 * {@link #startArticle()} / {@link #endArticle()}. Text is queued one line at a time and
 * passed through {@code normalize} before being handed to {@code writeLine}.
 *
 * @throws IOException If there is an error writing the text.
 */
protected void writePage() throws IOException
{
    float maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
    float minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
    float endOfLastTextX = END_OF_LAST_TEXT_X_RESET_VALUE;
    float lastWordSpacing = LAST_WORD_SPACING_RESET_VALUE;
    float maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
    PositionWrapper lastPosition = null;
    PositionWrapper lastLineStartPosition = null;
    boolean startOfPage = true; // flag to indicate start of page
    boolean startOfArticle;
    if (charactersByArticle.size() > 0)
    {
        writePageStart();
    }
    for (List<TextPosition> textList : charactersByArticle)
    {
        if (getSortByPosition())
        {
            TextPositionComparator comparator = new TextPositionComparator();
            // because the TextPositionComparator is not transitive, but
            // JDK7+ enforces transitivity on comparators, we need to use
            // a custom quicksort implementation (which is slower, unfortunately).
            if (useCustomQuickSort)
            {
                QuickSort.sort(textList, comparator);
            }
            else
            {
                Collections.sort(textList, comparator);
            }
        }
        startArticle();
        startOfArticle = true;
        // Now cycle through to print the text.
        // We queue up a line at a time before we print so that we can convert
        // the line from presentation form to logical form (if needed).
        List<LineItem> line = new ArrayList<LineItem>();
        // iterate the (possibly sorted) list from the beginning
        Iterator<TextPosition> textIter = textList.iterator();
        // PDF files don't always store spaces. We will need to guess where we should add
        // spaces based on the distances between TextPositions. Historically, this was done
        // based on the size of the space character provided by the font. In general, this
        // worked but there were cases where it did not work. Calculating the average character
        // width and using that as a metric works better in some cases but fails in some cases
        // where the spacing worked. So we use both. NOTE: Adobe reader also fails on some of
        // these examples.
        // Keeps track of the previous average character width
        float previousAveCharWidth = -1;
        while (textIter.hasNext())
        {
            TextPosition position = textIter.next();
            PositionWrapper current = new PositionWrapper(position);
            String characterValue = position.getUnicode();
            // Resets the average character width when we see a change in font
            // or a change in the font size
            if (lastPosition != null && (position.getFont() != lastPosition.getTextPosition()
                    .getFont()
                    || position.getFontSize() != lastPosition.getTextPosition().getFontSize()))
            {
                previousAveCharWidth = -1;
            }
            float positionX;
            float positionY;
            float positionWidth;
            float positionHeight;
            // If we are sorting, then we need to use the text direction
            // adjusted coordinates, because they were used in the sorting.
            if (getSortByPosition())
            {
                positionX = position.getXDirAdj();
                positionY = position.getYDirAdj();
                positionWidth = position.getWidthDirAdj();
                positionHeight = position.getHeightDir();
            }
            else
            {
                positionX = position.getX();
                positionY = position.getY();
                positionWidth = position.getWidth();
                positionHeight = position.getHeight();
            }
            // The current amount of characters in a word
            int wordCharCount = position.getIndividualWidths().length;
            // Estimate the expected width of the space based on the
            // space character with some margin.
            float wordSpacing = position.getWidthOfSpace();
            float deltaSpace;
            if (wordSpacing == 0 || Float.isNaN(wordSpacing))
            {
                // no usable space width: this method never wins the comparison below
                deltaSpace = Float.MAX_VALUE;
            }
            else
            {
                if (lastWordSpacing < 0)
                {
                    deltaSpace = wordSpacing * getSpacingTolerance();
                }
                else
                {
                    deltaSpace = (wordSpacing + lastWordSpacing) / 2f * getSpacingTolerance();
                }
            }
            // Estimate the expected width of the space based on the average character width
            // with some margin. This calculation does not make a true average (average of
            // averages) but we found that it gave the best results after numerous experiments.
            // Based on experiments we also found that .3 worked well.
            float averageCharWidth;
            if (previousAveCharWidth < 0)
            {
                averageCharWidth = positionWidth / wordCharCount;
            }
            else
            {
                averageCharWidth = (previousAveCharWidth + positionWidth / wordCharCount) / 2f;
            }
            float deltaCharWidth = averageCharWidth * getAverageCharTolerance();
            // Compares the values obtained by the average method and the wordSpacing method
            // and picks the smaller number.
            float expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
            if (endOfLastTextX != END_OF_LAST_TEXT_X_RESET_VALUE)
            {
                if (deltaCharWidth > deltaSpace)
                {
                    expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
                }
                else
                {
                    expectedStartOfNextWordX = endOfLastTextX + deltaCharWidth;
                }
            }
            if (lastPosition != null)
            {
                if (startOfArticle)
                {
                    lastPosition.setArticleStart();
                    startOfArticle = false;
                }
                // RDD - Here we determine whether this text object is on the current
                // line. We use the lastBaselineFontSize to handle the superscript
                // case, and the size of the current font to handle the subscript case.
                // Text must overlap with the last rendered baseline text by at least
                // a small amount in order to be considered as being on the same line.
                // XXX BC: In theory, this check should really check if the next char is in
                // full range seen in this line. This is what I tried to do with minYTopForLine,
                // but this caused a lot of regression test failures. So, I'm leaving it be for
                // now
                if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
                {
                    writeLine(normalize(line));
                    line.clear();
                    lastLineStartPosition = handleLineSeparation(current, lastPosition,
                            lastLineStartPosition, maxHeightForLine);
                    expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
                    maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
                    maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
                    minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
                }
                // test if our TextPosition starts after a new word would be expected to start
                if (expectedStartOfNextWordX != EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE
                        && expectedStartOfNextWordX < positionX &&
                        // only bother adding a space if the last character was not a space
                        lastPosition.getTextPosition().getUnicode() != null
                        && !lastPosition.getTextPosition().getUnicode().endsWith(" "))
                {
                    line.add(LineItem.getWordSeparator());
                }
            }
            if (positionY >= maxYForLine)
            {
                maxYForLine = positionY;
            }
            // RDD - endX is what PDF considers to be the x coordinate of the
            // end position of the text. We use it in computing our metrics below.
            endOfLastTextX = positionX + positionWidth;
            // add it to the list
            if (characterValue != null)
            {
                if (startOfPage && lastPosition == null)
                {
                    writeParagraphStart();// not sure this is correct for RTL?
                }
                line.add(new LineItem(position));
            }
            maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
            minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
            lastPosition = current;
            if (startOfPage)
            {
                lastPosition.setParagraphStart();
                lastPosition.setLineStart();
                lastLineStartPosition = lastPosition;
                startOfPage = false;
            }
            lastWordSpacing = wordSpacing;
            previousAveCharWidth = averageCharWidth;
        }
        // print the final line
        if (line.size() > 0)
        {
            writeLine(normalize(line));
            writeParagraphEnd();
        }
        endArticle();
    }
    writePageEnd();
}
// True when two glyph boxes count as vertically overlapping: their y values are within 0.1
// of each other, or one y value lies inside the span [y - height, y] of the other.
private boolean overlap(float y1, float height1, float y2, float height2)
{
    if (within(y1, y2, .1f))
    {
        return true;
    }
    boolean secondInsideFirst = y2 <= y1 && y2 >= y1 - height1;
    if (secondInsideFirst)
    {
        return true;
    }
    return y1 <= y2 && y1 >= y2 - height2;
}
/**
 * Writes the configured line separator to the output.
 *
 * @throws IOException If there is a problem writing out the line separator to the document.
 */
protected void writeLineSeparator() throws IOException
{
    String separator = getLineSeparator();
    output.write(separator);
}
/**
 * Writes the configured word separator to the output.
 *
 * @throws IOException If there is a problem writing out the word separator to the document.
 */
protected void writeWordSeparator() throws IOException
{
    String separator = getWordSeparator();
    output.write(separator);
}
/**
 * Writes the Unicode string of a TextPosition to the output.
 *
 * @param text The text to write to the stream.
 * @throws IOException If there is an error when writing the text.
 */
protected void writeCharacters(TextPosition text) throws IOException
{
    String unicode = text.getUnicode();
    output.write(unicode);
}
/**
 * Write a Java string to the output stream. The default implementation will ignore the textPositions
 * and just calls {@link #writeString(String)}.
 *
 * @param text The text to write to the stream.
 * @param textPositions The TextPositions belonging to the text.
 * @throws IOException If there is an error when writing the text.
 */
protected void writeString(String text, List<TextPosition> textPositions) throws IOException
{
    // the positions are only of interest to subclasses that override this method
    writeString(text);
}
/**
 * Writes the given string to the output unchanged.
 *
 * @param text The text to write to the stream.
 * @throws IOException If there is an error when writing the text.
 */
protected void writeString(String text) throws IOException
{
    final Writer target = output;
    target.write(text);
}
/**
 * Determines whether two floating point numbers are within a specified variance of each
 * other. Both bounds are exclusive.
 *
 * @param first The first number to compare to.
 * @param second The second number to compare to.
 * @param variance The allowed variance.
 * @return true if second lies strictly between first - variance and first + variance.
 */
private boolean within(float first, float second, float variance)
{
    float upperBound = first + variance;
    float lowerBound = first - variance;
    return second < upperBound && second > lowerBound;
}
/**
* This will process a TextPosition object and add the text to the list of characters on a page. It takes care of
* overlapping text.
*
* @param text The text to process.
*/
@Override
protected void processTextPosition(TextPosition text)
{
boolean showCharacter = true;
if (suppressDuplicateOverlappingText)
{
showCharacter = false;
String textCharacter = text.getUnicode();
float textX = text.getX();
float textY = text.getY();
TreeMap> sameTextCharacters = characterListMapping
.get(textCharacter);
if (sameTextCharacters == null)
{
sameTextCharacters = new TreeMap>();
characterListMapping.put(textCharacter, sameTextCharacters);
}
// RDD - Here we compute the value that represents the end of the rendered
// text. This value is used to determine whether subsequent text rendered
// on the same line overwrites the current text.
//
// We subtract any positive padding to handle cases where extreme amounts
// of padding are applied, then backed off (not sure why this is done, but there
// are cases where the padding is on the order of 10x the character width, and
// the TJ just backs up to compensate after each character). Also, we subtract
// an amount to allow for kerning (a percentage of the width of the last
// character).
boolean suppressCharacter = false;
float tolerance = text.getWidth() / textCharacter.length() / 3.0f;
SortedMap> xMatches = sameTextCharacters.subMap(textX - tolerance,
textX + tolerance);
for (TreeSet xMatch : xMatches.values())
{
SortedSet yMatches = xMatch.subSet(textY - tolerance, textY + tolerance);
if (!yMatches.isEmpty())
{
suppressCharacter = true;
break;
}
}
if (!suppressCharacter)
{
TreeSet ySet = sameTextCharacters.get(textX);
if (ySet == null)
{
ySet = new TreeSet();
sameTextCharacters.put(textX, ySet);
}
ySet.add(textY);
showCharacter = true;
}
}
if (showCharacter)
{
// if we are showing the character then we need to determine which article it belongs to
int foundArticleDivisionIndex = -1;
int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
int notFoundButFirstLeftArticleDivisionIndex = -1;
int notFoundButFirstAboveArticleDivisionIndex = -1;
float x = text.getX();
float y = text.getY();
if (shouldSeparateByBeads)
{
for (int i = 0; i < beadRectangles.size() && foundArticleDivisionIndex == -1; i++)
{
PDRectangle rect = beadRectangles.get(i);
if (rect != null)
{
if (rect.contains(x, y))
{
foundArticleDivisionIndex = i * 2 + 1;
}
else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY())
&& notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
{
notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2;
}
else if (x < rect.getLowerLeftX()
&& notFoundButFirstLeftArticleDivisionIndex == -1)
{
notFoundButFirstLeftArticleDivisionIndex = i * 2;
}
else if (y < rect.getUpperRightY()
&& notFoundButFirstAboveArticleDivisionIndex == -1)
{
notFoundButFirstAboveArticleDivisionIndex = i * 2;
}
}
else
{
foundArticleDivisionIndex = 0;
}
}
}
else
{
foundArticleDivisionIndex = 0;
}
int articleDivisionIndex;
if (foundArticleDivisionIndex != -1)
{
articleDivisionIndex = foundArticleDivisionIndex;
}
else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1)
{
articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
}
else if (notFoundButFirstLeftArticleDivisionIndex != -1)
{
articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
}
else if (notFoundButFirstAboveArticleDivisionIndex != -1)
{
articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
}
else
{
articleDivisionIndex = charactersByArticle.size() - 1;
}
List textList = charactersByArticle.get(articleDivisionIndex);
// In the wild, some PDF encoded documents put diacritics (accents on
// top of characters) into a separate Tj element. When displaying them
// graphically, the two chunks get overlayed. With text output though,
// we need to do the overlay. This code recombines the diacritic with
// its associated character if the two are consecutive.
if (textList.isEmpty())
{
textList.add(text);
}
else
{
// test if we overlap the previous entry.
// Note that we are making an assumption that we need to only look back
// one TextPosition to find what we are overlapping.
// This may not always be true. */
TextPosition previousTextPosition = textList.get(textList.size() - 1);
if (text.isDiacritic() && previousTextPosition.contains(text))
{
previousTextPosition.mergeDiacritic(text);
}
// If the previous TextPosition was the diacritic, merge it into this
// one and remove it from the list.
else if (previousTextPosition.isDiacritic() && text.contains(previousTextPosition))
{
text.mergeDiacritic(previousTextPosition);
textList.remove(textList.size() - 1);
textList.add(text);
}
else
{
textList.add(text);
}
}
}
}
/**
 * Returns the first page to be extracted. Pages are counted from 1: in a 5 page document a start page of 1
 * extracts all pages, a start page of 4 extracts pages 4 and 5. The default value is 1.
 *
 * @return Value of property startPage.
 */
public int getStartPage()
{
    return this.startPage;
}
/**
 * Sets the first page to be extracted by this class.
 *
 * @param startPageValue New value of 1-based startPage property.
 */
public void setStartPage(int startPageValue)
{
    this.startPage = startPageValue;
}
/**
 * Returns the last page to be extracted, inclusive. In a 5 page document an end page of 5 extracts the entire
 * document and an end page of 2 extracts pages 1 and 2. The default is Integer.MAX_VALUE, so that all pages of
 * the pdf will be extracted.
 *
 * @return Value of property endPage.
 */
public int getEndPage()
{
    return this.endPage;
}
/**
 * Sets the last page to be extracted by this class.
 *
 * @param endPageValue New value of 1-based endPage property.
 */
public void setEndPage(int endPageValue)
{
    this.endPage = endPageValue;
}
/**
 * Sets the line separator used for the output text. If this method is never called, the value of the
 * line.separator system property is used.
 *
 * @param separator The desired line separator string.
 */
public void setLineSeparator(String separator)
{
    this.lineSeparator = separator;
}
/**
 * Returns the line separator used for the output text.
 *
 * @return The desired line separator string.
 */
public String getLineSeparator()
{
    return this.lineSeparator;
}
/**
 * Returns the word separator used for the output text.
 *
 * @return The desired word separator string.
 */
public String getWordSeparator()
{
    return this.wordSeparator;
}
/**
 * Sets the word separator used for the output text. The PDFBox text extraction algorithm will output a space
 * character if there is enough space between two words. By default a space character is used. If you need an
 * accurate count of characters that are found in a PDF document then you might want to set the word separator to
 * the empty string.
 *
 * @param separator The desired word separator string.
 */
public void setWordSeparator(String separator)
{
    this.wordSeparator = separator;
}
/**
 * Tells whether duplicate overlapping text is suppressed during extraction.
 *
 * @return Returns the suppressDuplicateOverlappingText.
 */
public boolean getSuppressDuplicateOverlappingText()
{
    return this.suppressDuplicateOverlappingText;
}
/**
 * Returns the number of the page that is currently being processed.
 *
 * @return A 1 based number representing the current page.
 */
protected int getCurrentPageNo()
{
    return this.currentPageNo;
}
/**
 * Returns the writer that receives the extracted text.
 *
 * @return The stream that output is being written to.
 */
protected Writer getOutput()
{
    return this.output;
}
/**
* Character strings are grouped by articles. It is quite common that there will only be a single article. This
* returns a List that contains List objects, the inner lists will contain TextPosition objects.
*
* @return A double List of TextPositions for all text strings on the page.
*/
protected List> getCharactersByArticle()
{
return charactersByArticle;
}
/**
 * By default the text stripper will attempt to remove text that overlaps each other. Word paints the same
 * character several times in order to make it look bold. By setting this to false all text will be extracted, which
 * means that certain sections will be duplicated, but better performance will be noticed.
 *
 * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
 */
public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue)
{
    this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
}
/**
 * Tells whether the text stripper separates its output by beads.
 *
 * @return If the text will be grouped by beads.
 */
public boolean getSeparateByBeads()
{
    return this.shouldSeparateByBeads;
}
/**
 * Set if the text stripper should group the text output by a list of beads. The default value is true.
 *
 * @param aShouldSeparateByBeads The new grouping of beads.
 */
public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
{
    this.shouldSeparateByBeads = aShouldSeparateByBeads;
}
/**
 * Returns the bookmark where text extraction should end, inclusive. Default is null.
 *
 * @return The ending bookmark.
 */
public PDOutlineItem getEndBookmark()
{
    return this.endBookmark;
}
/**
 * Sets the bookmark where the text extraction should stop.
 *
 * @param aEndBookmark The ending bookmark.
 */
public void setEndBookmark(PDOutlineItem aEndBookmark)
{
    this.endBookmark = aEndBookmark;
}
/**
 * Returns the bookmark where text extraction should start, inclusive. Default is null.
 *
 * @return The starting bookmark.
 */
public PDOutlineItem getStartBookmark()
{
    return this.startBookmark;
}
/**
 * Sets the bookmark where text extraction should start, inclusive.
 *
 * @param aStartBookmark The starting bookmark.
 */
public void setStartBookmark(PDOutlineItem aStartBookmark)
{
    this.startBookmark = aStartBookmark;
}
/**
 * Tells whether the text stripper adds some more text formatting.
 *
 * @return true if some more text formatting will be added
 */
public boolean getAddMoreFormatting()
{
    return this.addMoreFormatting;
}
/**
 * Some additional text formatting will be added if addMoreFormatting is set to true. Default is false.
 *
 * @param newAddMoreFormatting Tell PDFBox to add some more text formatting
 */
public void setAddMoreFormatting(boolean newAddMoreFormatting)
{
    this.addMoreFormatting = newAddMoreFormatting;
}
/**
 * Tells whether the text stripper sorts the text tokens before writing to the stream.
 *
 * @return true If the text tokens will be sorted before being written.
 */
public boolean getSortByPosition()
{
    return this.sortByPosition;
}
/**
 * The order of the text tokens in a PDF file may not be the same as the order in which they appear visually on
 * the screen. For example, a PDF writer may write out all text by font, so all bold or larger text, then make a
 * second pass and write out the normal text. A PDF writer could also choose to write each character in a
 * different order.
 *
 * The default is to not sort by position: PDFBox does not sort the text tokens before processing them due to
 * performance reasons.
 *
 * @param newSortByPosition Tell PDFBox to sort the text positions.
 */
public void setSortByPosition(boolean newSortByPosition)
{
    this.sortByPosition = newSortByPosition;
}
/**
 * Returns the current space width-based tolerance value that is being used to estimate where spaces in text
 * should be added. Note that the default value for this has been determined from trial and error.
 *
 * @return The current tolerance / scaling factor
 */
public float getSpacingTolerance()
{
    return this.spacingTolerance;
}
/**
 * Sets the space width-based tolerance value that is used to estimate where spaces in text should be added.
 * Note that the default value for this has been determined from trial and error. Setting this value larger will
 * reduce the number of spaces added.
 *
 * @param spacingToleranceValue tolerance / scaling factor to use
 */
public void setSpacingTolerance(float spacingToleranceValue)
{
    this.spacingTolerance = spacingToleranceValue;
}
/**
 * Returns the current character width-based tolerance value that is being used to estimate where spaces in text
 * should be added. Note that the default value for this has been determined from trial and error.
 *
 * @return The current tolerance / scaling factor
 */
public float getAverageCharTolerance()
{
    return this.averageCharTolerance;
}
/**
 * Sets the character width-based tolerance value that is used to estimate where spaces in text should be added.
 * Note that the default value for this has been determined from trial and error. Setting this value larger will
 * reduce the number of spaces added.
 *
 * @param averageCharToleranceValue average tolerance / scaling factor to use
 */
public void setAverageCharTolerance(float averageCharToleranceValue)
{
    this.averageCharTolerance = averageCharToleranceValue;
}
/**
 * Returns the multiple of whitespace character widths for the current text which the current line start can be
 * indented from the previous line start beyond which the current line start is considered to be a paragraph
 * start.
 *
 * @return the number of whitespace character widths to use when detecting paragraph indents.
 */
public float getIndentThreshold()
{
    return this.indentThreshold;
}
/**
 * Sets the multiple of whitespace character widths for the current text which the current line start can be
 * indented from the previous line start beyond which the current line start is considered to be a paragraph
 * start. The default value is 2.0.
 *
 * @param indentThresholdValue the number of whitespace character widths to use when detecting paragraph indents.
 */
public void setIndentThreshold(float indentThresholdValue)
{
    this.indentThreshold = indentThresholdValue;
}
/**
 * Returns the minimum whitespace, as a multiple of the max height of the current characters, beyond which the
 * current line start is considered to be a paragraph start.
 *
 * @return the character height multiple for max allowed whitespace between lines in the same paragraph.
 */
public float getDropThreshold()
{
    return this.dropThreshold;
}
/**
 * Sets the minimum whitespace, as a multiple of the max height of the current characters, beyond which the
 * current line start is considered to be a paragraph start. The default value is 2.5.
 *
 * @param dropThresholdValue the character height multiple for max allowed whitespace between lines in the same
 * paragraph.
 */
public void setDropThreshold(float dropThresholdValue)
{
    this.dropThreshold = dropThresholdValue;
}
/**
 * Returns the string which will be used at the beginning of a paragraph.
 *
 * @return the paragraph start string
 */
public String getParagraphStart()
{
    return this.paragraphStart;
}
/**
 * Sets the string which will be used at the beginning of a paragraph.
 *
 * @param s the paragraph start string
 */
public void setParagraphStart(String s)
{
    this.paragraphStart = s;
}
/**
 * Returns the string which will be used at the end of a paragraph.
 *
 * @return the paragraph end string
 */
public String getParagraphEnd()
{
    return this.paragraphEnd;
}
/**
 * Sets the string which will be used at the end of a paragraph.
 *
 * @param s the paragraph end string
 */
public void setParagraphEnd(String s)
{
    this.paragraphEnd = s;
}
/**
 * Returns the string which will be used at the beginning of a page.
 *
 * @return the page start string
 */
public String getPageStart()
{
    return this.pageStart;
}
/**
 * Sets the string which will be used at the beginning of a page.
 *
 * @param pageStartValue the page start string
 */
public void setPageStart(String pageStartValue)
{
    this.pageStart = pageStartValue;
}
/**
 * Returns the string which will be used at the end of a page.
 *
 * @return the page end string
 */
public String getPageEnd()
{
    return this.pageEnd;
}
/**
 * Sets the string which will be used at the end of a page.
 *
 * @param pageEndValue the page end string
 */
public void setPageEnd(String pageEndValue)
{
    this.pageEnd = pageEndValue;
}
/**
 * Returns the string which will be used at the beginning of an article.
 *
 * @return the article start string
 */
public String getArticleStart()
{
    return this.articleStart;
}
/**
 * Sets the string which will be used at the beginning of an article.
 *
 * @param articleStartValue the article start string
 */
public void setArticleStart(String articleStartValue)
{
    this.articleStart = articleStartValue;
}
/**
 * Returns the string which will be used at the end of an article.
 *
 * @return the article end string
 */
public String getArticleEnd()
{
    return this.articleEnd;
}
/**
 * Sets the string which will be used at the end of an article.
 *
 * @param articleEndValue the article end string
 */
public void setArticleEnd(String articleEndValue)
{
    this.articleEnd = articleEndValue;
}
/**
 * Handles the line separator for a new line given the specified current and previous TextPositions. The
 * current position is flagged as a line start, tested for being a paragraph start, and the matching line and
 * paragraph separators are written to the output.
 *
 * @param current the current text position
 * @param lastPosition the previous text position
 * @param lastLineStartPosition the last text position that followed a line separator.
 * @param maxHeightForLine max height for positions since lastLineStartPosition
 * @return the new line start position, which is always {@code current}
 * @throws IOException if something went wrong
 */
private PositionWrapper handleLineSeparation(PositionWrapper current,
        PositionWrapper lastPosition, PositionWrapper lastLineStartPosition,
        float maxHeightForLine) throws IOException
{
    current.setLineStart();
    isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);

    if (!current.isParagraphStart())
    {
        // same paragraph, just a new line
        writeLineSeparator();
        return current;
    }
    if (!lastPosition.isArticleStart())
    {
        // new paragraph inside an article: close the old one and open the next
        writeLineSeparator();
        writeParagraphSeparator();
        return current;
    }
    // first paragraph directly after an article start
    if (lastPosition.isLineStart())
    {
        writeLineSeparator();
    }
    writeParagraphStart();
    return current;
}
/**
 * Tests the relationship between the last text position, the current text position and the last text position
 * that followed a line separator to decide if the gap represents a paragraph separation. This should only be
 * called for consecutive text positions that first pass the line separation test.
 *
 * This base implementation treats the current position as a paragraph start if lastLineStartPosition is null
 * OR if the current vertical position differs from the last text vertical position by more than
 * {@link #getDropThreshold()} times maxHeightForLine OR if the current horizontal position is indented by more
 * than {@link #getIndentThreshold()} times the current width of a space character.
 *
 * This also attempts to identify text that is indented under a hanging indent, and lines that are aligned with
 * the previous line start (within a quarter of the current character width) and begin with the same list item
 * pattern.
 *
 * This method sets the isParagraphStart and isHangingIndent flags on the current position object.
 *
 * @param position the current text position. This may have its isParagraphStart or isHangingIndent flags set
 * upon return.
 * @param lastPosition the previous text position (should not be null).
 * @param lastLineStartPosition the last text position that followed a line separator, or null.
 * @param maxHeightForLine max height for text positions since lastLineStartPosition.
 */
private void isParagraphSeparation(PositionWrapper position, PositionWrapper lastPosition,
        PositionWrapper lastLineStartPosition, float maxHeightForLine)
{
    if (lastLineStartPosition == null)
    {
        // no previous line start to compare against
        position.setParagraphStart();
        return;
    }

    float verticalGap = Math.abs(position.getTextPosition().getYDirAdj()
            - lastPosition.getTextPosition().getYDirAdj());
    float dropLimit = multiplyFloat(getDropThreshold(), maxHeightForLine);
    // do we need to flip this for rtl?
    float horizontalGap = position.getTextPosition().getXDirAdj()
            - lastLineStartPosition.getTextPosition().getXDirAdj();
    float indentLimit = multiplyFloat(getIndentThreshold(),
            position.getTextPosition().getWidthOfSpace());
    float alignmentTolerance = multiplyFloat(0.25f, position.getTextPosition().getWidth());

    boolean paragraphStart = false;
    if (verticalGap > dropLimit)
    {
        paragraphStart = true;
    }
    else if (horizontalGap > indentLimit)
    {
        // text is indented, but try to screen for hanging indent
        if (lastLineStartPosition.isParagraphStart())
        {
            position.setHangingIndent();
        }
        else
        {
            paragraphStart = true;
        }
    }
    else if (horizontalGap < -position.getTextPosition().getWidthOfSpace())
    {
        // text is left of previous line. Was it a hanging indent?
        paragraphStart = !lastLineStartPosition.isParagraphStart();
    }
    else if (Math.abs(horizontalGap) < alignmentTolerance)
    {
        // current horizontal position is within 1/4 a char of the last
        // linestart. We'll treat them as lined up.
        if (lastLineStartPosition.isHangingIndent())
        {
            position.setHangingIndent();
        }
        else if (lastLineStartPosition.isParagraphStart())
        {
            // check to see if the previous line looks like any of a number of
            // standard list item formats and the current line uses the same one
            Pattern previousPattern = matchListItemPattern(lastLineStartPosition);
            paragraphStart = previousPattern != null
                    && previousPattern == matchListItemPattern(position);
        }
    }

    if (paragraphStart)
    {
        position.setParagraphStart();
    }
}
private float multiplyFloat(float value1, float value2)
{
    // Multiply 2 floats and round the resulting value to 3 decimal places
    // to avoid wrong results when comparing with another float.
    // The product is still computed in float arithmetic; it is only widened to
    // double so that Math.round(double) is selected. That overload returns a
    // long, whereas Math.round(float) returns an int and would clamp the result
    // to Integer.MAX_VALUE for products above roughly 2.1 million.
    return Math.round((double) (value1 * value2 * 1000)) / 1000f;
}
/**
 * Writes the paragraph separator to the output, i.e. the end of the current paragraph immediately followed by
 * the start of the next one.
 *
 * @throws IOException if something went wrong
 */
protected void writeParagraphSeparator() throws IOException
{
    this.writeParagraphEnd();
    this.writeParagraphStart();
}
/**
 * Write something (if defined) at the start of a paragraph. If a paragraph is still open it is ended first.
 *
 * @throws IOException if something went wrong
 */
protected void writeParagraphStart() throws IOException
{
    if (this.inParagraph)
    {
        // close the paragraph that is still open before starting a new one
        this.writeParagraphEnd();
        this.inParagraph = false;
    }
    this.output.write(this.getParagraphStart());
    this.inParagraph = true;
}
/**
 * Write something (if defined) at the end of a paragraph. If no paragraph is open one is started first.
 *
 * @throws IOException if something went wrong
 */
protected void writeParagraphEnd() throws IOException
{
    if (!this.inParagraph)
    {
        // open a paragraph first so that start and end strings stay balanced
        this.writeParagraphStart();
    }
    this.output.write(this.getParagraphEnd());
    this.inParagraph = false;
}
/**
 * Write something (if defined) at the start of a page. The text written is the value of
 * {@link #getPageStart()}.
 *
 * @throws IOException if something went wrong
 */
protected void writePageStart() throws IOException
{
    this.output.write(this.getPageStart());
}
/**
 * Write something (if defined) at the end of a page. The text written is the value of {@link #getPageEnd()}.
 *
 * @throws IOException if something went wrong
 */
protected void writePageEnd() throws IOException
{
    this.output.write(this.getPageEnd());
}
/**
 * Returns the list item Pattern object that matches the text at the specified PositionWrapper or null if the
 * text does not match such a pattern. The list of Patterns tested against is given by the
 * {@link #getListItemPatterns()} method. To add to the list, simply override that method (if sub-classing) or
 * explicitly supply your own list using {@link #setListItemPatterns(List)}.
 *
 * @param pw position
 * @return the matching pattern, or null if none matches
 */
private Pattern matchListItemPattern(PositionWrapper pw)
{
    return matchPattern(pw.getTextPosition().getUnicode(), getListItemPatterns());
}
/**
 * A list of regular expressions that match commonly used list item formats: a single period, numbers followed
 * by a period or a closing parenthesis, numbers in square brackets, single upper or lower case letters followed
 * by a period or a closing parenthesis, and Roman numerals (built from I, V, X and L) followed by a period.
 * Not meant to be comprehensive.
 */
private static final String[] LIST_ITEM_EXPRESSIONS = { "\\.", "\\d+\\.", "\\[\\d+\\]",
"\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)", "[a-z]\\)", "[IVXL]+\\.",
"[ivxl]+\\.", };
// list item patterns in use; null until supplied via setListItemPatterns or lazily built by getListItemPatterns
private List<Pattern> listOfPatterns = null;
/**
 * Use to supply a different set of regular expression patterns for matching list item starts.
 *
 * @param patterns list of patterns
 */
protected void setListItemPatterns(List<Pattern> patterns)
{
    listOfPatterns = patterns;
}
/**
 * Returns a list of regular expression Patterns representing different common list item formats. For example
 * numbered items of form:
 * <ol>
 * <li>some text</li>
 * <li>more text</li>
 * </ol>
 * or
 * <ul>
 * <li>some text</li>
 * <li>more text</li>
 * </ul>
 * etc., all begin with some character pattern, for example {@code \d+\.} (matches "1.", "2.", ...) or
 * {@code \[\d+\]} (matches "[1]", "[2]", ...).
 *
 * This method returns a list of such regular expression Patterns. Unless a list has been supplied through
 * {@link #setListItemPatterns(List)}, the list is compiled from the built-in expressions on the first call and
 * reused afterwards.
 *
 * @return a list of Pattern objects.
 */
protected List<Pattern> getListItemPatterns()
{
    if (listOfPatterns == null)
    {
        // build the complete list first and publish it only once it is fully populated
        List<Pattern> defaults = new ArrayList<Pattern>(LIST_ITEM_EXPRESSIONS.length);
        for (String expression : LIST_ITEM_EXPRESSIONS)
        {
            defaults.add(Pattern.compile(expression));
        }
        listOfPatterns = defaults;
    }
    return listOfPatterns;
}
/**
 * Iterates over the specified list of Patterns until it finds one that matches the specified string in its
 * entirety. Then returns the Pattern.
 *
 * Order of the supplied list of patterns is important as most common patterns should come first. Patterns
 * should be strict in general, and all will be used with case sensitivity on.
 *
 * @param string the string to be searched
 * @param patterns list of patterns
 * @return the first matching pattern, or null if no pattern matches
 */
protected static Pattern matchPattern(String string, List<Pattern> patterns)
{
    for (Pattern p : patterns)
    {
        if (p.matcher(string).matches())
        {
            return p;
        }
    }
    return null;
}
/**
 * Writes a whole line of a document: every word of the list is written via writeString, with a word separator
 * between two consecutive words (but not after the last one).
 *
 * @param line a list with the words of the given line
 * @throws IOException if something went wrong
 */
private void writeLine(List<WordWithTextPositions> line)
        throws IOException
{
    // iterate instead of calling get(i): indexed access is linear on a LinkedList
    Iterator<WordWithTextPositions> words = line.iterator();
    while (words.hasNext())
    {
        WordWithTextPositions word = words.next();
        writeString(word.getText(), word.getTextPositions());
        if (words.hasNext())
        {
            writeWordSeparator();
        }
    }
}
/**
 * Normalize the given list of line items: the text positions between two word separators are collected into
 * one word.
 *
 * @param line list of line items (text positions and word separators)
 * @return a list of words, one {@link WordWithTextPositions} for every word
 */
private List<WordWithTextPositions> normalize(List<LineItem> line)
{
    List<WordWithTextPositions> normalized = new ArrayList<WordWithTextPositions>();
    StringBuilder lineBuilder = new StringBuilder();
    List<TextPosition> wordPositions = new ArrayList<TextPosition>();
    for (LineItem item : line)
    {
        lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item);
    }
    // flush the trailing word that is not followed by a word separator
    if (lineBuilder.length() > 0)
    {
        normalized.add(createWord(lineBuilder.toString(), wordPositions));
    }
    return normalized;
}
/**
 * Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the
 * given word. If the word is a full line, the results will be the best. If the word consists of single words or
 * characters, the order of the characters in a word or words in a line may be wrong, due to RTL and LTR marks
 * and characters!
 *
 * Runs with an odd bidi level are written in reverse character order, replacing mirrored characters by their
 * counterpart from the mirroring map where one is known. Surrogate pairs are kept together so that reversing
 * a run never produces malformed UTF-16.
 *
 * Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx
 *
 * @param word The word that shall be processed
 * @return new word with the correct direction of the containing characters
 */
private String handleDirection(String word)
{
    Bidi bidi = new Bidi(word, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT);
    // if there is pure LTR text no need to process further
    if (!bidi.isMixed() && bidi.getBaseLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT)
    {
        return word;
    }
    // collect individual bidi information
    int runCount = bidi.getRunCount();
    byte[] levels = new byte[runCount];
    Integer[] runs = new Integer[runCount];
    for (int i = 0; i < runCount; i++)
    {
        levels[i] = (byte) bidi.getRunLevel(i);
        runs[i] = i;
    }
    // reorder individual parts based on their levels
    Bidi.reorderVisually(levels, 0, runs, 0, runCount);
    // collect the parts based on the direction within the run
    StringBuilder result = new StringBuilder(word.length());
    for (int i = 0; i < runCount; i++)
    {
        int index = runs[i];
        int start = bidi.getRunStart(index);
        int end = bidi.getRunLimit(index);
        if ((levels[index] & 1) == 0)
        {
            // even level: left-to-right run, copy unchanged
            result.append(word, start, end);
            continue;
        }
        // odd level: right-to-left run, copy in reverse order
        for (int pos = end - 1; pos >= start; pos--)
        {
            char character = word.charAt(pos);
            if (Character.isLowSurrogate(character) && pos > start
                    && Character.isHighSurrogate(word.charAt(pos - 1)))
            {
                // supplementary character: emit high and low surrogate in their
                // original order, otherwise the reversed pair would be invalid
                result.append(word.charAt(pos - 1)).append(character);
                pos--;
            }
            else if (Character.isMirrored(character)
                    && MIRRORING_CHAR_MAP.containsKey(character))
            {
                result.append(MIRRORING_CHAR_MAP.get(character));
            }
            else
            {
                result.append(character);
            }
        }
    }
    return result.toString();
}
// maps a character to its mirrored counterpart; filled once by the static initializer from BidiMirroring.txt
private static final Map<Character, Character> MIRRORING_CHAR_MAP = new HashMap<Character, Character>();
static
{
    // Load the mirroring table from the classpath. A missing or unreadable resource only
    // leaves the map empty; it must not make class initialization fail.
    String path = "org/apache/pdfbox/resources/text/BidiMirroring.txt";
    InputStream input = PDFTextStripper.class.getClassLoader().getResourceAsStream(path);
    if (input == null)
    {
        // getResourceAsStream returns null when the resource cannot be found
        LOG.warn("Could not find '" + path + "', mirroring char map will be empty");
    }
    else
    {
        try
        {
            parseBidiFile(input);
        }
        catch (IOException e)
        {
            LOG.warn("Could not parse BidiMirroring.txt, mirroring char map will be empty: "
                    + e.getMessage(), e);
        }
        finally
        {
            try
            {
                input.close();
            }
            catch (IOException e)
            {
                LOG.error("Could not close BidiMirroring.txt ", e);
            }
        }
    }
}
/**
 * This method parses the bidi file provided as inputstream and stores every mapping it contains in the
 * mirroring char map.
 *
 * Everything from a '#' to the end of a line is treated as a comment. The remaining text is split at ';'; lines
 * that do not consist of exactly two fields are skipped. Both fields are read as hexadecimal character codes.
 * The stream is decoded as UTF-8 and is not closed by this method.
 *
 * @param inputStream - The bidi file as inputstream
 * @throws IOException if any line could not be read by the LineNumberReader
 * @throws NumberFormatException if one of the two fields of a line is not a hexadecimal number
 */
private static void parseBidiFile(InputStream inputStream) throws IOException
{
    // name the charset explicitly instead of relying on the platform default
    LineNumberReader rd = new LineNumberReader(new InputStreamReader(inputStream, "UTF-8"));
    String s;
    while ((s = rd.readLine()) != null)
    {
        int comment = s.indexOf('#'); // ignore comments
        if (comment != -1)
        {
            s = s.substring(0, comment);
        }
        // drop surrounding whitespace so that blank lines are skipped instead of parsed
        s = s.trim();
        if (s.length() < 2)
        {
            continue;
        }
        StringTokenizer st = new StringTokenizer(s, ";");
        if (st.countTokens() != 2)
        {
            // only "source; mirrored" pairs are of interest
            continue;
        }
        char source = (char) Integer.parseInt(st.nextToken().trim(), 16);
        char mirrored = (char) Integer.parseInt(st.nextToken().trim(), 16);
        MIRRORING_CHAR_MAP.put(source, mirrored);
    }
}
/**
 * Used within {@link #normalize(List)} to create a single {@link WordWithTextPositions} entry. The text is run
 * through {@link #normalizeWord(String)} first; the positions are stored as passed in.
 *
 * @param word the raw text of the word
 * @param wordPositions the text positions the word was built from
 * @return the new entry
 */
private WordWithTextPositions createWord(String word, List<TextPosition> wordPositions)
{
    return new WordWithTextPositions(normalizeWord(word), wordPositions);
}
/**
 * Normalize certain Unicode characters. For example, convert the single "fi" ligature to "f" and "i". Also
 * normalises Arabic and Hebrew presentation forms. The result is finally passed through
 * {@link #handleDirection(String)}.
 *
 * @param word Word to normalize
 * @return Normalized word
 */
private String normalizeWord(String word)
{
    final int length = word.length();
    StringBuilder normalized = null;
    // start of the stretch of unchanged characters that has not been copied yet
    int pending = 0;
    for (int index = 0; index < length; index++)
    {
        // We only normalize if the codepoint is in a given range.
        // Otherwise, NFKC converts too many things that would cause
        // confusion. For example, it converts the micro symbol in
        // extended Latin to the value in the Greek script. We normalize
        // the Unicode Alphabetic and Arabic A&B Presentation forms.
        char current = word.charAt(index);
        boolean presentationForm = (current >= 0xFB00 && current <= 0xFDFF)
                || (current >= 0xFE70 && current <= 0xFEFF);
        if (!presentationForm)
        {
            continue;
        }
        if (normalized == null)
        {
            normalized = new StringBuilder(length * 2);
        }
        normalized.append(word, pending, index);
        // Some fonts map U+FDF2 differently than the Unicode spec.
        // They add an extra U+0627 character to compensate.
        // This removes the extra character for those fonts.
        boolean precededByAlef = index > 0
                && (word.charAt(index - 1) == 0x0627 || word.charAt(index - 1) == 0xFE8D);
        if (current == 0xFDF2 && precededByAlef)
        {
            normalized.append("\u0644\u0644\u0647");
        }
        else
        {
            // Trim because some decompositions have an extra space, such as U+FC5E
            String single = word.substring(index, index + 1);
            normalized.append(Normalizer.normalize(single, Normalizer.Form.NFKC).trim());
        }
        pending = index + 1;
    }
    if (normalized == null)
    {
        // nothing had to be normalized
        return handleDirection(word);
    }
    normalized.append(word, pending, length);
    return handleDirection(normalized.toString());
}
/**
 * Used within {@link #normalize(List)} to handle a single {@link LineItem}. A word separator finishes the
 * current word: it is added to {@code normalized} together with a copy of the collected positions, and the
 * collection starts over. Any other item contributes its text and its position to the current word.
 *
 * @param normalized the list of finished words, appended to when a word separator is met
 * @param lineBuilder the text collected for the current word
 * @param wordPositions the positions collected for the current word; cleared when a word is finished
 * @param item the line item to process
 * @return The StringBuilder that must be used when calling this method the next time.
 */
private StringBuilder normalizeAdd(List<WordWithTextPositions> normalized,
        StringBuilder lineBuilder, List<TextPosition> wordPositions, LineItem item)
{
    if (item.isWordSeparator())
    {
        // hand over a copy, wordPositions is cleared and reused for the next word
        normalized.add(
                createWord(lineBuilder.toString(), new ArrayList<TextPosition>(wordPositions)));
        lineBuilder = new StringBuilder();
        wordPositions.clear();
    }
    else
    {
        TextPosition text = item.getTextPosition();
        lineBuilder.append(text.getUnicode());
        wordPositions.add(text);
    }
    return lineBuilder;
}
/**
 * Internal marker class. Used as a place holder in a line of TextPositions. An item either wraps a
 * TextPosition or, when it wraps none, stands for a word separator.
 */
private static final class LineItem
{
    /** The shared word separator item; it carries no text position. */
    public static final LineItem WORD_SEPARATOR = new LineItem();

    /**
     * Returns the shared word separator item.
     *
     * @return the word separator item
     */
    public static LineItem getWordSeparator()
    {
        return WORD_SEPARATOR;
    }

    private final TextPosition textPosition;

    /** Creates the word separator item. */
    private LineItem()
    {
        textPosition = null;
    }

    /**
     * Creates an item for the given text position.
     *
     * @param textPosition the wrapped text position
     */
    LineItem(TextPosition textPosition)
    {
        this.textPosition = textPosition;
    }

    /**
     * Returns the wrapped text position.
     *
     * @return the text position, or null for a word separator
     */
    public TextPosition getTextPosition()
    {
        return textPosition;
    }

    /**
     * Tells whether this item is a word separator, i.e. has no text position.
     *
     * @return true if this item has no text position
     */
    public boolean isWordSeparator()
    {
        return textPosition == null;
    }
}
/**
* Internal class that maps strings to lists of {@link TextPosition} arrays. Note that the number of entries in that
* list may differ from the number of characters in the string due to normalization.
*
* @author Axel Dörfler
*/
private static final class WordWithTextPositions
{
String text;
List textPositions;
WordWithTextPositions(String word, List positions)
{
text = word;
textPositions = positions;
}
public String getText()
{
return text;
}
public List getTextPositions()
{
return textPositions;
}
}
/**
 * Wrapper of TextPosition that adds flags to track status as linestart and paragraph start positions.
 *
 * This is implemented as a wrapper since the TextPosition class doesn't provide complete access to its state
 * fields to subclasses. Also, conceptually TextPosition is immutable while these flags need to be set
 * post-creation so it makes sense to put these flags in this separate class.
 *
 * All flags start out false and can only be switched to true.
 *
 * @author [email protected]
 */
private static final class PositionWrapper
{
    private final TextPosition position;
    private boolean lineStart;
    private boolean paragraphStart;
    private boolean pageBreak;
    private boolean hangingIndent;
    private boolean articleStart;

    /**
     * Constructs a PositionWrapper around the specified TextPosition object.
     *
     * @param position the text position.
     */
    PositionWrapper(TextPosition position)
    {
        this.position = position;
    }

    /**
     * Returns the underlying TextPosition object.
     *
     * @return the text position
     */
    public TextPosition getTextPosition()
    {
        return position;
    }

    /**
     * @return true if the line start flag has been set.
     */
    public boolean isLineStart()
    {
        return lineStart;
    }

    /**
     * Sets the isLineStart() flag to true.
     */
    public void setLineStart()
    {
        lineStart = true;
    }

    /**
     * @return true if the paragraph start flag has been set.
     */
    public boolean isParagraphStart()
    {
        return paragraphStart;
    }

    /**
     * Sets the isParagraphStart() flag to true.
     */
    public void setParagraphStart()
    {
        paragraphStart = true;
    }

    /**
     * @return true if the article start flag has been set.
     */
    public boolean isArticleStart()
    {
        return articleStart;
    }

    /**
     * Sets the isArticleStart() flag to true.
     */
    public void setArticleStart()
    {
        articleStart = true;
    }

    /**
     * @return true if the page break flag has been set.
     */
    public boolean isPageBreak()
    {
        return pageBreak;
    }

    /**
     * Sets the isPageBreak() flag to true.
     */
    public void setPageBreak()
    {
        pageBreak = true;
    }

    /**
     * @return true if the hanging indent flag has been set.
     */
    public boolean isHangingIndent()
    {
        return hangingIndent;
    }

    /**
     * Sets the isHangingIndent() flag to true.
     */
    public void setHangingIndent()
    {
        hangingIndent = true;
    }
}
}