All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.icepdf.ri.util.TextExtractionTask Maven / Gradle / Ivy

/*
 * Copyright 2006-2019 ICEsoft Technologies Canada Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS
 * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.icepdf.ri.util;

import org.icepdf.core.pobjects.Document;
import org.icepdf.core.pobjects.Page;
import org.icepdf.core.pobjects.graphics.text.LineText;
import org.icepdf.core.pobjects.graphics.text.WordText;

import javax.swing.*;
import java.awt.*;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.ChoiceFormat;
import java.text.Format;
import java.text.MessageFormat;
import java.util.List;
import java.util.ResourceBundle;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Background task that writes the text of every page of a PDF document to a
 * UTF-8 encoded file and reports its progress through a
 * {@link ProgressMonitor}.
 * <p>
 * The extraction itself runs in {@link #doInBackground()} on a worker thread;
 * the progress monitor is only touched from {@link #process(List)} and
 * {@link #done()}, which {@link SwingWorker} invokes on the Event Dispatch
 * Thread.
 *
 * @since 1.1
 */
public class TextExtractionTask extends SwingWorker<Void, StringBuilder> {

    private static final Logger logger =
            Logger.getLogger(TextExtractionTask.class.toString());

    // total length of task (total page count), used for progress bar
    private final int lengthOfTask;

    // current progress (zero-based page index), used for the progress bar.
    // Written by the worker thread and read on the EDT, hence volatile.
    private volatile int current;

    // message displayed on progress bar
    private final MessageFormat messageDialogFormat;
    private final MessageFormat messageTextFormat;
    // Written by the worker thread and read on the EDT, hence volatile.
    private volatile String dialogMessage;

    // internationalization
    private final ResourceBundle messageBundle;

    // PDF document pointer
    private final Document document;

    // File used for text export
    private final File file;

    private final ProgressMonitor progressMonitor;

    // ChoiceFormat limits selecting the "more files" / "one file" / "more files"
    // wording for a page count of 0, 1 and 2-or-more respectively.
    private static final double[] fileLimits = {0, 1, 2};


    /**
     * Create a new instance of the TextExtraction object.
     *
     * @param document        document whose text will be extracted.
     * @param file            output file for extracted text.
     * @param progressMonitor progressMonitor to update with extraction progress
     * @param messageBundle   main message bundle for i18n
     */
    public TextExtractionTask(Document document, File file, ProgressMonitor progressMonitor, ResourceBundle messageBundle) {
        this.document = document;
        this.file = file;
        lengthOfTask = document.getNumberOfPages();
        this.progressMonitor = progressMonitor;
        this.messageBundle = messageBundle;
        // build out dialog messages
        messageDialogFormat = new MessageFormat(messageBundle.getString(
                "viewer.exportText.fileStamp.progress.msg"));
        String[] fileStrings = {
                messageBundle.getString("viewer.exportText.fileStamp.progress.moreFile.msg"),
                messageBundle.getString("viewer.exportText.fileStamp.progress.oneFile.msg"),
                messageBundle.getString("viewer.exportText.fileStamp.progress.moreFile.msg")};
        ChoiceFormat choiceForm = new ChoiceFormat(fileLimits, fileStrings);
        // only the second message argument (the page count) is pluralised
        Format[] formats = {null, choiceForm, null};
        messageDialogFormat.setFormats(formats);
        // build out text file messages
        messageTextFormat = new MessageFormat(messageBundle.getString("viewer.exportText.pageStamp.msg"));
    }

    /**
     * Closes the progress monitor and emits a system beep once the background
     * work has finished, whether it completed, failed or was cancelled.
     */
    @Override
    protected void done() {
        progressMonitor.close();
        Toolkit.getDefaultToolkit().beep();
    }

    /**
     * Pushes the most recent progress value and note to the progress monitor.
     * The published chunks themselves are not inspected; they only serve to
     * trigger this callback.
     *
     * @param chunks page text published by {@link #doInBackground()}, unused.
     */
    @Override
    protected void process(List<StringBuilder> chunks) {
        // Update progressMonitor progress
        progressMonitor.setProgress(current);
        // read the volatile field once so the null check and the use agree
        String note = dialogMessage;
        if (note != null) {
            progressMonitor.setNote(note);
        }
    }

    /**
     * Writes a file stamp followed by, for each page, a page stamp and the
     * page's text (one output line per extracted text line) to the target
     * file. The loop stops early when the task is cancelled. Any failure is
     * logged and ends the extraction; the writer is always closed.
     *
     * @return always {@code null}.
     */
    @Override
    protected Void doInBackground() {
        // try-with-resources guarantees the file handle is flushed and released
        // even when extraction fails part way through.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8))) {
            // Print document information
            writer.write(messageBundle.getString("viewer.exportText.fileStamp.msg"));
            writer.write('\n');

            for (int i = 0; i < document.getNumberOfPages(); i++) {
                // break if needed
                if (isCancelled()) {
                    break;
                }
                // Update task information
                current = i;
                String pageLabel = String.valueOf(i + 1);
                dialogMessage = messageDialogFormat.format(
                        new Object[]{pageLabel, lengthOfTask, lengthOfTask});

                writer.write(messageTextFormat.format(new Object[]{pageLabel}));
                writer.write('\n');

                Page page = document.getPageTree().getPage(i);
                // NOTE(review): assumes getPageLines() yields LineText elements and
                // getWords() yields WordText elements, as the loops below require.
                List<LineText> pageLines;
                if (page.isInitiated()) {
                    // get a page's already initialized text.
                    pageLines = document.getPageViewText(i).getPageLines();
                } else {
                    // grab the text the fastest way possible.
                    pageLines = document.getPageText(i).getPageLines();
                }
                // one buffer per page, so the published chunk is never null
                StringBuilder pageText = new StringBuilder();
                for (LineText lineText : pageLines) {
                    for (WordText wordText : lineText.getWords()) {
                        pageText.append(wordText.getText());
                    }
                    pageText.append('\n');
                }
                writer.write(pageText.toString());
                // triggers process(...) on the EDT to refresh the progress monitor
                publish(pageText);
            }
            current = 0;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // keep the interrupt visible to whoever owns this thread
                Thread.currentThread().interrupt();
            }
            logger.log(Level.WARNING, "Error extraction page text to file ", e);
        }
        return null;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy