com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of kernel Show documentation
There is a newer version: 8.0.5
/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2023 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see .
 */
package com.itextpdf.kernel.pdf.canvas.parser;

import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy;
import com.itextpdf.kernel.pdf.PdfPage;

import java.util.HashMap;
import java.util.Map;

public final class PdfTextExtractor {

    private PdfTextExtractor() {
    }

    /**
     * Extract text from a specified page using an extraction strategy.
     * Also allows registration of custom IContentOperators that can influence
     * how (and whether or not) the PDF instructions will be parsed.
     * Extraction strategy must be passed as a new object for every single page.
     *
     * @param page     the page for the text to be extracted from
     * @param strategy the strategy to use for extracting text
     * @param additionalContentOperators an optional map of custom {@link IContentOperator}s for rendering instructions
     * @return the extracted text
     */
    public static String getTextFromPage(PdfPage page, ITextExtractionStrategy strategy, Map additionalContentOperators) {
        PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy, additionalContentOperators);
        parser.processPageContent(page);
        return strategy.getResultantText();
    }
    
    /**
     * Extract text from a specified page using an extraction strategy.
     * Extraction strategy must be passed as a new object for every single page.
     *
     * @param page     the page for the text to be extracted from
     * @param strategy the strategy to use for extracting text
     * @return the extracted text
     */
    public static String getTextFromPage(PdfPage page, ITextExtractionStrategy strategy) {
        return getTextFromPage(page, strategy, new HashMap());
    }

    /**
     * Extract text from a specified page using the default strategy.
     * Node: the default strategy is subject to change. If using a specific strategy
     * is important, please use {@link PdfTextExtractor#getTextFromPage(PdfPage, ITextExtractionStrategy)}.
     *
     * @param page the page for the text to be extracted from
     * @return the extracted text
     */
    public static String getTextFromPage(PdfPage page) {
        return getTextFromPage(page, new LocationTextExtractionStrategy());
    }
}