All downloads are free. Search and download functionality uses the official Maven repository.

org.icepdf.ri.util.TextExtractionTask Maven / Gradle / Ivy

There is a newer version: 6.2.2
Show newest version
/*
 * Copyright 2006-2017 ICEsoft Technologies Canada Corp.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS
 * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.icepdf.ri.util;

import org.icepdf.core.pobjects.Document;
import org.icepdf.core.pobjects.Page;
import org.icepdf.core.pobjects.graphics.text.LineText;
import org.icepdf.core.pobjects.graphics.text.WordText;
import org.icepdf.ri.common.SwingWorker;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.text.ChoiceFormat;
import java.text.Format;
import java.text.MessageFormat;
import java.util.List;
import java.util.ResourceBundle;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * This class is a utility for extracting text from a PDF document.
 *
 * @since 1.1
 */
public class TextExtractionTask {

    private static final Logger logger =
            Logger.getLogger(TextExtractionTask.class.toString());

    // total length of task (total page count), used for progress bar
    private int lengthOfTask;

    // current progress, used for the progress bar
    private int current = 0;

    // message displayed on progress bar
    private String dialogMessage;

    // flags for threading
    private boolean done = false;
    private boolean canceled = false;

    // internationalization
    private ResourceBundle messageBundle;

    // PDF document pointer
    private Document document = null;

    // File used for text export
    private File file = null;

    /**
     * Create a new instance of the TextExtraction object.
     *
     * @param document document whose text will be extracted.
     * @param file     output file for extracted text.
     */
    public TextExtractionTask(Document document, File file, ResourceBundle messageBundle) {
        this.document = document;
        this.file = file;
        lengthOfTask = document.getNumberOfPages();
        this.messageBundle = messageBundle;
    }

    /**
     * Start the task,  created a new SwingWorker for the text extraction
     * process.
     */
    public void go() {
        final SwingWorker worker = new SwingWorker() {
            // reset all instance variables
            public Object construct() {
                current = 0;
                done = false;
                canceled = false;
                dialogMessage = null;
                return new ActualTask();
            }
        };
        worker.setThreadPriority(Thread.MIN_PRIORITY);
        worker.start();
    }

    /**
     * Find out how much work needs to be done.
     */
    public int getLengthOfTask() {
        return lengthOfTask;
    }

    /**
     * Find out how much has been done.
     */
    public int getCurrent() {
        return current;
    }

    /**
     * Stop the task.
     */
    public void stop() {
        canceled = true;
        dialogMessage = null;
    }

    /**
     * Find out if the task has completed.
     */
    public boolean isDone() {
        return done;
    }

    /**
     * Returns the most recent dialog message, or null
     * if there is no current dialog message.
     */
    public String getMessage() {
        return dialogMessage;
    }

    /**
     * The actual long running task.  This runs in a SwingWorker thread.
     */
    class ActualTask {
        ActualTask() {
            // Extraction of text from pdf procedure
            try {
                // create file output stream
                BufferedWriter fileOutputStream = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(file), "UTF8"));
                // Print document information
                String pageNumber =
                        messageBundle.getString("viewer.exportText.fileStamp.msg");

                fileOutputStream.write(pageNumber);
                fileOutputStream.write(10); // line break

                for (int i = 0; i < document.getNumberOfPages(); i++) {
                    // break if needed
                    if (canceled || done) {
                        break;
                    }

                    // Update task information
                    current = i;

                    // Build Internationalized plural phrase.
                    MessageFormat messageForm =
                            new MessageFormat(messageBundle.getString(
                                    "viewer.exportText.fileStamp.progress.msg"));
                    double[] fileLimits = {0, 1, 2};
                    String[] fileStrings = {
                            messageBundle.getString(
                                    "viewer.exportText.fileStamp.progress.moreFile.msg"),
                            messageBundle.getString(
                                    "viewer.exportText.fileStamp.progress.oneFile.msg"),
                            messageBundle.getString(
                                    "viewer.exportText.fileStamp.progress.moreFile.msg"),
                    };
                    ChoiceFormat choiceForm = new ChoiceFormat(fileLimits,
                            fileStrings);
                    Format[] formats = {null, choiceForm, null};
                    messageForm.setFormats(formats);
                    Object[] messageArguments = {String.valueOf((current + 1)),
                            lengthOfTask, lengthOfTask};

                    dialogMessage = messageForm.format(messageArguments);

                    messageForm =
                            new MessageFormat(messageBundle.getString(
                                    "viewer.exportText.pageStamp.msg"));
                    messageArguments = new Object[]{String.valueOf((current + 1))};

                    pageNumber = messageForm.format(messageArguments);

                    fileOutputStream.write(pageNumber);
                    fileOutputStream.write(10); // line break

                    Page page = document.getPageTree().getPage(i);
                    List pageLines;
                    if (page.isInitiated()) {
                        // get a pages already initialized text.
                        pageLines = document.getPageViewText(i).getPageLines();
                    } else {
                        // grap the text the fastest way possible.
                        pageLines = document.getPageText(i).getPageLines();
                    }
                    StringBuilder extractedText;
                    for (LineText lineText : pageLines) {
                        extractedText = new StringBuilder();
                        for (WordText wordText : lineText.getWords()) {
                            extractedText.append(wordText.getText());
                        }
                        extractedText.append('\n');
                        fileOutputStream.write(extractedText.toString());
                    }

                    Thread.yield();

                }

                done = true;
                current = 0;
                fileOutputStream.flush();
                fileOutputStream.close();
            } catch (Throwable e) {
                logger.log(Level.FINE, "Malformed URL Exception ", e);
            }
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy