All downloads are free. Search and download functionality uses the official Maven repository.

com.github.bingoohuang.pdf.PDFLayoutTextStripper Maven / Gradle / Ivy

package com.github.bingoohuang.pdf;

import lombok.Getter;
import lombok.Setter;
import lombok.val;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.text.TextPositionComparator;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// from https://github.com/JonathanLink/PDFLayoutTextStripper
/**
 * A {@link PDFTextStripper} that lays the extracted characters of each page out
 * on fixed-length {@link TextLine}s, so that the column of a character in the
 * output is derived from its X position on the page.
 *
 * <p>One output column stands for {@link #OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT}
 * units of page width. The previous text position and the collected lines are
 * per-page state and are reset after every call to {@link #processPage(PDPage)}.
 */
public class PDFLayoutTextStripper extends PDFTextStripper {
    /** Page-width units (points, going by the name) covered by one output column. */
    static final int OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT = 4;

    /** Minimum rounded Y distance to the previous character that counts as a line break. */
    private static final double NEW_LINE_Y_THRESHOLD = 5.5;

    @Setter private double currentPageWidth;
    @Setter @Getter private TextPosition previousTextPosition;
    @Getter private List<TextLine> textLineList;

    /**
     * Creates a stripper with no previous text position and no collected lines.
     *
     * @throws IOException if the {@link PDFTextStripper} constructor fails
     */
    PDFLayoutTextStripper() throws IOException {
        super();
        this.previousTextPosition = null;
        this.textLineList = new ArrayList<>();
    }

    /**
     * Processes one page using the width of its media box as the line width.
     * Pages without a media box are skipped.
     *
     * @param page the page to extract text from
     * @throws IOException if the underlying stripper fails
     */
    @Override
    public void processPage(PDPage page) throws IOException {
        val pageRectangle = page.getMediaBox();
        if (pageRectangle == null) {
            return;
        }
        this.setCurrentPageWidth(pageRectangle.getWidth());
        try {
            super.processPage(page);
        } finally {
            // Reset per-page state even when processing fails, so lines of a
            // broken page are never emitted together with the next page.
            this.previousTextPosition = null;
            this.textLineList = new ArrayList<>();
        }
    }

    /**
     * Sorts the characters of every article, distributes them over text lines
     * and writes the collected lines to the output.
     *
     * @throws IOException if writing to the output fails
     */
    @Override
    protected void writePage() throws IOException {
        val charactersByArticle = super.getCharactersByArticle();
        for (val textList : charactersByArticle) {
            try {
                this.sortTextPositionList(textList);
            } catch (IllegalArgumentException e) {
                // Best effort: List.sort may reject a comparator that breaks its
                // contract; in that case continue with the list as it is.
                e.printStackTrace();
            }
            this.iterateThroughTextList(textList.iterator());
        }
        this.writeToOutputStream(this.getTextLineList());
    }

    /**
     * Writes every line followed by {@code '\n'} and flushes once at the end.
     *
     * @param textLineList the lines to write, in output order
     * @throws IOException if the writer fails
     */
    private void writeToOutputStream(final List<TextLine> textLineList) throws IOException {
        val output = super.getOutput();
        for (val textLine : textLineList) {
            output.write(textLine.getLine());
            output.write('\n');
        }
        output.flush();
    }

    /*
     * In order to get rid of the warning:
     * TextPositionComparator class should implement Comparator<TextPosition> instead of Comparator
     */
    @SuppressWarnings("unchecked")
    private void sortTextPositionList(final List<TextPosition> textList) {
        textList.sort(new TextPositionComparator());
    }

    /**
     * Appends a new line and places every given character on it. An empty list
     * therefore produces a blank line.
     *
     * @param textPositionList the characters belonging to one output line
     */
    private void writeLine(final List<TextPosition> textPositionList) {
        val textLine = this.addNewLine();
        boolean firstCharacterOfLineFound = false;
        for (val textPosition : textPositionList) {
            val characterFactory = new CharacterFactory(firstCharacterOfLineFound);
            val character = characterFactory.createCharacterFromTextPosition(textPosition, this.getPreviousTextPosition());
            textLine.writeCharacterAtIndex(character);
            this.setPreviousTextPosition(textPosition);
            firstCharacterOfLineFound = true;
        }
    }

    /**
     * Groups consecutive characters into lines: whenever a character starts a
     * new line, the characters gathered so far are written out, the extra blank
     * lines are added, and gathering restarts with that character.
     *
     * @param textIterator the characters of one article
     */
    private void iterateThroughTextList(Iterator<TextPosition> textIterator) {
        final List<TextPosition> pendingLine = new ArrayList<>();

        while (textIterator.hasNext()) {
            val textPosition = textIterator.next();
            int numberOfNewLines = this.getNumberOfNewLinesFromPreviousTextPosition(textPosition);
            if (numberOfNewLines != 0) {
                this.writeTextPositionList(pendingLine);
                this.createNewEmptyNewLines(numberOfNewLines);
            }
            pendingLine.add(textPosition);
            // Must happen after the pending line was written: writeLine also
            // updates the previous text position while it walks the line.
            this.setPreviousTextPosition(textPosition);
        }
        if (!pendingLine.isEmpty()) {
            this.writeTextPositionList(pendingLine);
        }
    }

    /** Writes the given characters as one line and empties the list for reuse. */
    private void writeTextPositionList(final List<TextPosition> textPositionList) {
        this.writeLine(textPositionList);
        textPositionList.clear();
    }

    /** Adds {@code numberOfNewLines - 1} blank lines. */
    private void createNewEmptyNewLines(int numberOfNewLines) {
        for (int i = 0; i < numberOfNewLines - 1; ++i) {
            this.addNewLine();
        }
    }

    /**
     * Determines how many line breaks separate the given character from the
     * previously seen one.
     *
     * @param textPosition the character being placed
     * @return {@code 1} when there is no previous character; {@code 0} when the
     *         character is not far enough below the previous one; otherwise at
     *         least {@code 1}, derived from the Y distance and the character height
     */
    private int getNumberOfNewLinesFromPreviousTextPosition(final TextPosition textPosition) {
        val previousTextPosition = this.getPreviousTextPosition();
        if (previousTextPosition == null) return 1;

        val textYPosition = Math.round(textPosition.getY());
        val previousTextYPosition = Math.round(previousTextPosition.getY());

        if (textYPosition > previousTextYPosition
                && (textYPosition - previousTextYPosition > NEW_LINE_Y_THRESHOLD)) {
            val height = textPosition.getHeight();
            if (!(height > 0)) {
                // A zero height would turn the division below into Infinity,
                // i.e. Integer.MAX_VALUE blank lines; treat it as a single break.
                return 1;
            }
            int numberOfLines = (int) (Math.floor(textYPosition - previousTextYPosition) / height);
            return Math.max(1, numberOfLines - 1);
        }

        return 0;
    }

    /** Creates a blank line as wide as the current page and appends it to the line list. */
    private TextLine addNewLine() {
        val textLine = new TextLine(this.getCurrentPageWidth());
        textLineList.add(textLine);
        return textLine;
    }

    /** Returns the current page width rounded to the nearest integer. */
    private int getCurrentPageWidth() {
        return (int) Math.round(this.currentPageWidth);
    }
}

/**
 * One output line: a fixed number of character cells that start out as spaces
 * and are filled in by {@link #writeCharacterAtIndex(Character)}.
 *
 * <p>The number of cells is the given width divided by
 * {@link PDFLayoutTextStripper#OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT}.
 */
class TextLine {
    private static final char SPACE_CHARACTER = ' ';
    @Getter private int lineLength;
    @Getter private String line;
    /** Index handed out by the most recent placement; later characters go to its right. */
    @Getter @Setter private int lastIndex;

    /**
     * @param lineLength width of the line in page units (not in cells)
     */
    TextLine(int lineLength) {
        this.lineLength = lineLength / PDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
        this.line = StringUtils.repeat(SPACE_CHARACTER, this.lineLength);
    }

    /**
     * Computes the final cell for the character, stores it on the character and
     * writes the character there if the cell is inside the line and still blank.
     *
     * @param character the character to place; its index is updated
     */
    void writeCharacterAtIndex(final Character character) {
        character.setIndex(this.computeIndexForCharacter(character));
        int index = character.getIndex();
        if (this.indexIsInBounds(index) && this.line.charAt(index) == SPACE_CHARACTER) {
            this.line = this.line.substring(0, index)
                    + character.getCharacterValue()
                    + this.line.substring(index + 1);
        }
    }

    /**
     * Adjusts the character's raw index depending on how it relates to the
     * previous word.
     *
     * @return the cell to write to, or {@code -1} if the raw index is outside the line
     */
    private int computeIndexForCharacter(final Character character) {
        int index = character.getIndex();
        if (!this.indexIsInBounds(index)) return -1;

        val partOfPreviousWord = character.isCharacterPartOfPreviousWord();

        if (partOfPreviousWord && !character.isCharacterAtTheBeginningOfNewLine()) {
            // Pull the character left over any blank cells.
            index = this.findMinimumIndexWithSpaceCharacterFromIndex(index);
        } else if (character.isCharacterCloseToPreviousWord()) {
            if (this.line.charAt(index) != SPACE_CHARACTER) {
                index += 1;
            } else {
                // Pull left, then leave one blank cell as separator.
                index = this.findMinimumIndexWithSpaceCharacterFromIndex(index) + 1;
            }
        }
        return this.getNextValidIndex(index, partOfPreviousWord);
    }

    /** Returns whether the cell at {@code index} already holds a non-space character. */
    private boolean isNonSpaceCharacterAtIndex(int index) {
        return this.line.charAt(index) != SPACE_CHARACTER;
    }

    private boolean isNewIndexGreaterThanLastIndex(int index) {
        return index > this.getLastIndex();
    }

    /**
     * Makes sure the index lies to the right of the last handed-out index and,
     * for a character that starts a new word directly after an occupied cell,
     * shifts it one further to keep a gap. The result becomes the new last index.
     */
    private int getNextValidIndex(int index, boolean isCharacterPartOfPreviousWord) {
        int nextValidIndex = index;
        if (!this.isNewIndexGreaterThanLastIndex(index)) {
            nextValidIndex = this.getLastIndex() + 1;
        }
        // index > 0: there is no cell to the left of column 0, and charAt(-1) would throw.
        if (!isCharacterPartOfPreviousWord && index > 0 && this.isNonSpaceCharacterAtIndex(index - 1)) {
            nextValidIndex += 1;
        }
        this.setLastIndex(nextValidIndex);
        return nextValidIndex;
    }

    /**
     * Walks left from {@code index} across blank cells.
     *
     * @return the leftmost index of the run of blank cells ending at {@code index},
     *         or {@code index + 1} if the cell at {@code index} is not blank
     */
    private int findMinimumIndexWithSpaceCharacterFromIndex(int index) {
        int newIndex = index;
        while (newIndex >= 0 && this.line.charAt(newIndex) == SPACE_CHARACTER) {
            newIndex -= 1;
        }
        return newIndex + 1;
    }

    private boolean indexIsInBounds(int index) {
        return index >= 0 && index < this.lineLength;
    }
}

/**
 * A single character to be placed on a {@link TextLine}, together with the
 * flags that influence which cell it ends up in.
 *
 * <p>Note: inside this package the simple name {@code Character} refers to this
 * class, not to {@code java.lang.Character}.
 */
@Getter
class Character {
    /** The character to write. */
    private char characterValue;

    /** Target cell on the line; the only property that can be changed after construction. */
    @Setter private int index;

    private boolean characterPartOfPreviousWord;
    private boolean isFirstCharacterOfAWord;
    private boolean characterAtTheBeginningOfNewLine;
    private boolean characterCloseToPreviousWord;

    /**
     * @param characterValue                      the character to write
     * @param index                               initial cell index
     * @param isCharacterPartOfPreviousWord       stored as {@code characterPartOfPreviousWord}
     * @param isFirstCharacterOfAWord             stored as {@code isFirstCharacterOfAWord}
     * @param isCharacterAtTheBeginningOfNewLine  stored as {@code characterAtTheBeginningOfNewLine}
     * @param isCharacterPartOfASentence          stored as {@code characterCloseToPreviousWord}
     */
    Character(char characterValue, int index, boolean isCharacterPartOfPreviousWord,
              boolean isFirstCharacterOfAWord, boolean isCharacterAtTheBeginningOfNewLine, boolean isCharacterPartOfASentence) {
        this.characterValue = characterValue;
        this.index = index;

        this.characterPartOfPreviousWord = isCharacterPartOfPreviousWord;
        this.isFirstCharacterOfAWord = isFirstCharacterOfAWord;
        this.characterAtTheBeginningOfNewLine = isCharacterAtTheBeginningOfNewLine;
        this.characterCloseToPreviousWord = isCharacterPartOfASentence;
    }
}


/**
 * Builds {@link Character}s from PDFBox {@link TextPosition}s by comparing each
 * text position with the one that preceded it.
 */
class CharacterFactory {
    /** Fallback written when a text position carries no unicode text. */
    private static final char MISSING_CHARACTER = ' ';

    @Setter @Getter private TextPosition previousTextPosition;
    private final boolean firstCharacterOfLineFound;

    /**
     * @param firstCharacterOfLineFound {@code false} while the character to be
     *                                  created is the first one of its line
     */
    CharacterFactory(boolean firstCharacterOfLineFound) {
        this.firstCharacterOfLineFound = firstCharacterOfLineFound;
    }

    /**
     * Creates the character for a text position.
     *
     * <p>Only the first char of the position's unicode text is used. The initial
     * index is the truncated X position divided by
     * {@link PDFLayoutTextStripper#OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT}.
     *
     * @param textPosition         the position to convert
     * @param previousTextPosition the position seen just before it
     * @return the character with its layout flags set
     */
    Character createCharacterFromTextPosition(final TextPosition textPosition, final TextPosition previousTextPosition) {
        this.setPreviousTextPosition(previousTextPosition);
        val isCharacterPartOfPreviousWord = this.isCharacterPartOfPreviousWord(textPosition);
        val isFirstCharacterOfAWord = this.isFirstCharacterOfAWord(textPosition);
        val isCharacterAtTheBeginningOfNewLine = this.isCharacterAtTheBeginningOfNewLine(textPosition);
        val isCharacterCloseToPreviousWord = this.isCharacterCloseToPreviousWord(textPosition);

        // charAt(0) throws on an empty string; fall back to a blank instead.
        val unicode = textPosition.getUnicode();
        char character = (unicode == null || unicode.isEmpty()) ? MISSING_CHARACTER : unicode.charAt(0);

        // The cast applies to getX() only, so this is an integer division.
        val index = (int) textPosition.getX() / PDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
        return new Character(character, index,
                isCharacterPartOfPreviousWord,
                isFirstCharacterOfAWord,
                isCharacterAtTheBeginningOfNewLine,
                isCharacterCloseToPreviousWord);
    }

    /** True for the first character of a line, or when the rounded Y is smaller than the previous one. */
    private boolean isCharacterAtTheBeginningOfNewLine(final TextPosition textPosition) {
        if (!firstCharacterOfLineFound) return true;

        val previousTextPosition = this.getPreviousTextPosition();
        val previousTextYPosition = previousTextPosition.getY();
        return (Math.round(textPosition.getY()) < Math.round(previousTextYPosition));
    }

    /** True for the first character of a line, for a gap larger than 1, or at the beginning of a new line. */
    private boolean isFirstCharacterOfAWord(final TextPosition textPosition) {
        if (!firstCharacterOfLineFound) return true;

        val numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
        return (numberOfSpaces > 1) || this.isCharacterAtTheBeginningOfNewLine(textPosition);
    }

    /** True when the gap is larger than 1 but no larger than the rounded-up width of a space. */
    private boolean isCharacterCloseToPreviousWord(final TextPosition textPosition) {
        if (!firstCharacterOfLineFound) return false;

        val numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
        val widthOfSpace = (int) Math.ceil(textPosition.getWidthOfSpace());
        return (numberOfSpaces > 1 && numberOfSpaces <= widthOfSpace);
    }

    /** True when the previous position exists, is not a space, and the gap to it is at most 1. */
    private boolean isCharacterPartOfPreviousWord(final TextPosition textPosition) {
        val previousTextPosition = this.getPreviousTextPosition();
        // Without a predecessor there is no word to belong to.
        if (previousTextPosition == null) return false;
        if (" ".equals(previousTextPosition.getUnicode())) return false;

        return this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition) <= 1;
    }

    /**
     * Despite the name this does not count spaces: it returns the absolute,
     * rounded horizontal distance between the end of the first position
     * ({@code x + width}) and the start of the second.
     */
    private double numberOfSpacesBetweenTwoCharacters(final TextPosition textPosition1, final TextPosition textPosition2) {
        val textEndXPosition1 = (textPosition1.getX() + textPosition1.getWidth());
        return (double) Math.abs(Math.round(textPosition2.getX() - textEndXPosition1));
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy