All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jpedal.examples.viewer.utils.Exporter Maven / Gradle / Ivy

The newest version!
/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/java-pdf-library-support/
 *
 * (C) Copyright 1997-2013, IDRsolutions and Contributors.
 *
 * 	This file is part of JPedal
 *
     This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


 *
 * ---------------
 * Exporter.java
 * ---------------
 */
package org.jpedal.examples.viewer.utils;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.imageio.ImageIO;
import javax.swing.ProgressMonitor;

import org.jpedal.PdfDecoder;
import org.jpedal.examples.viewer.gui.SwingGUI;
import org.jpedal.examples.viewer.gui.popups.SaveImage;
import org.jpedal.examples.viewer.gui.popups.SaveText;
import org.jpedal.exception.PdfException;
import org.jpedal.exception.PdfSecurityException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.gui.GUIFactory;
import org.jpedal.io.ColorSpaceConvertor;
import org.jpedal.io.JAIHelper;
import org.jpedal.objects.PdfImageData;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.Messages;
import org.jpedal.utils.Strip;
import org.jpedal.utils.SwingWorker;

/** provide save functions for Viewer to write out text, images, etc */
public class Exporter {

	/** text export mode: extractTextOnPages dispatches this value to decodeTextRectangle */
	final public static int RECTANGLE = 1;
	/** text export mode: extractTextOnPages dispatches this value to decodeTextWordlist */
	final public static int WORDLIST = 2;
	/** text export mode: extractTextOnPages dispatches this value to decodeTextTable */
	final public static int TABLE = 3;

	/** file separator used */
	private final String separator = System.getProperty("file.separator");

	/**
	 * name of the selected file without directory or extension and with "%20" replaced by spaces (set in the constructor); used to build output
	 * directory and file names
	 */
	private String fileName = "";

	/** GUI used to show message/overwrite dialogs and to supply the parent frame of the progress monitors */
	private GUIFactory currentGUI;

	/** decoder handed to the constructor; not read anywhere in this file (each export method opens its own PdfDecoder) */
	private PdfDecoder dPDF;

	/** path of the PDF as passed to the constructor; each export method re-opens the file from this path */
	private String selectedFile;

	/**
	 * Creates an exporter bound to one PDF file.
	 * 
	 * The base name used for output folders and files is the file's name with the directory part and the extension removed and every "%20"
	 * turned into a space.
	 * 
	 * @param currentGUI GUI used for dialogs and as parent of progress monitors
	 * @param selectedFile path of the PDF file to export from
	 * @param decode_pdf decoder of the viewer (stored only)
	 */
	public Exporter(SwingGUI currentGUI, String selectedFile, PdfDecoder decode_pdf) {
		// file name without any directory part
		String baseName = new File(selectedFile).getName();

		// drop the extension, i.e. everything from the last dot onwards
		final int lastDot = baseName.lastIndexOf('.');
		if (lastDot != -1) {
			baseName = baseName.substring(0, lastDot);
		}

		// escaped spaces become real spaces
		this.fileName = baseName.replace("%20", " ");

		this.currentGUI = currentGUI;
		this.selectedFile = selectedFile;
		this.dPDF = decode_pdf;
	}

	/**
	 * save image - different versions have different bugs for file formats so we use best for each image type
	 * 
	 * TIFF output goes through JAI when JAIHelper reports JAI is in use; everything else goes through ImageIO. Failures are reported (stack
	 * trace / log) but never thrown to the caller.
	 * 
	 * @param image_to_save image to write
	 * @param fileName full path of the file to create
	 * @param prefix image format name (also handed to ImageIO as the format name); a value containing "tif" selects the JAI TIFF encoder
	 */
	private static void saveImage(BufferedImage image_to_save, String fileName, String prefix) {

		if (JAIHelper.isJAIused()) JAIHelper.confirmJAIOnClasspath();

		if (prefix.contains("tif") && JAIHelper.isJAIused()) {

			FileOutputStream os = null;
			try {
				os = new FileOutputStream(fileName);
				javax.media.jai.JAI.create("encode", image_to_save, os, "TIFF", null);
			}
			catch (FileNotFoundException e) {
				e.printStackTrace();
			}
			finally {
				// always release the file handle, otherwise every exported TIFF leaks an open stream
				if (os != null) {
					try {
						os.close();
					}
					catch (java.io.IOException e) {
						e.printStackTrace();
					}
				}
			}

		}
		else { // default
			try {

				// ImageIO.write returns false (without throwing) when no writer exists for the format
				if (!ImageIO.write(image_to_save, prefix, new File(fileName))) {
					LogWriter.writeLog("No ImageIO writer for format " + prefix + " - " + fileName + " not written");
				}

			}
			catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * routine to write out clipped PDFs
	 * 
	 * Opens a private PdfDecoder on the selected file, decodes every page from start to end (inclusive) and saves each stored "CLIP_" image as
	 * output_dir + page + separator + imageName + '.' + imageType, followed by a companion imageName + ".xml" file. The user is asked before an
	 * existing image file is overwritten and can cancel through the progress monitor. The decoder and the progress monitor are released on every
	 * exit path.
	 * 
	 * @param start first page to process
	 * @param end last page to process (inclusive)
	 * @param imageType image format and file extension; images are converted to RGB first when it starts with "jp"
	 * @param output_dir directory (ending with a separator) under which one folder per page is created
	 */
	private void decodeHires(int start, int end, String imageType, String output_dir) {

		PdfDecoder decode_pdf = null;
		ProgressMonitor status = null;

		String target = "";

		try {

			// PdfDecoder returns a PdfException if there is a problem
			try {
				decode_pdf = new PdfDecoder(false);
				decode_pdf.setExtractionMode(PdfDecoder.FINALIMAGES + PdfDecoder.CLIPPEDIMAGES, 1);

				/** open the file (and read metadata including pages in file) */
				decode_pdf.openPdfFile(this.selectedFile);
			}
			catch (Exception e) {
				e.printStackTrace();
				// nothing can be extracted from a file that failed to open
				return;
			}

			/**
			 * extract data from pdf (if allowed).
			 */
			if ((decode_pdf.isEncrypted() && (!decode_pdf.isPasswordSupplied())) && (!decode_pdf.isExtractionAllowed())) return;

			status = new ProgressMonitor(this.currentGUI.getFrame(), Messages.getMessage("PdfViewerMessage.ExtractImages"), "", start, end);

			int count = 0;
			boolean yesToAll = false;
			for (int page = start; page < end + 1; page++) { // read pages
				if (status.isCanceled()) {
					this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
							+ Messages.getMessage("PdfViewerError.ReportNumberOfImagesExported"));
					return;
				}
				// decode the page
				decode_pdf.decodePage(page);

				// get the PdfImages object which now holds the images.
				// binary data is stored in a temp directory and we hold the
				// image name and other info in this object
				PdfImageData pdf_images = decode_pdf.getPdfImageData();

				// image count (note image 1 is item 0, so any loop runs 0 to count-1)
				int image_count = pdf_images.getImageCount();

				if (image_count > 0) {
					target = output_dir + page + this.separator;
					File targetExists = new File(target);
					if (!targetExists.exists()) targetExists.mkdir();
				}

				// work through and save each image
				for (int i = 0; i < image_count; i++) {

					String image_name = pdf_images.getImageName(i);
					BufferedImage image_to_save;

					// NOTE(review): position and size are read but not used below; presumably meant for the
					// XML details file - confirm against the upstream source
					float x1 = pdf_images.getImageXCoord(i);
					float y1 = pdf_images.getImageYCoord(i);
					float w = pdf_images.getImageWidth(i);
					float h = pdf_images.getImageHeight(i);

					try {

						image_to_save = decode_pdf.getObjectStore().loadStoredImage("CLIP_" + image_name);

						// save image

						if (image_to_save != null) {

							// remove transparency on jpeg
							if (imageType.toLowerCase().startsWith("jp")) image_to_save = ColorSpaceConvertor.convertToRGB(image_to_save);

							File fileToSave = new File(target + image_name + '.' + imageType);
							if (fileToSave.exists() && !yesToAll) {
								int n = this.currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(), true);

								if (n == 1) {
									// clicked yes to all, so set flag
									yesToAll = true;
								}
								else if (n == 2) {
									// clicked no, so loop round again
									status.setProgress(page);
									continue;
								}
								else if (n != 0) {
									// anything other than yes (0) is treated as a request to stop
									this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
											+ Messages.getMessage("PdfViewerError.ReportNumberOfImagesExported"));

									status.close();
									return;
								}
							}

							saveImage(image_to_save, target + image_name + '.' + imageType, imageType);
							count++;
						}

						// save an xml file with details
						// NOTE(review): as written this emits only blank lines and the file name, so the result is
						// not well-formed XML - confirm against upstream whether markup was lost from these literals
						OutputStreamWriter output_stream = new OutputStreamWriter(new FileOutputStream(target + image_name + ".xml"), "UTF-8");
						try {
							output_stream.write("\n");
							output_stream.write("\n");
							output_stream.write("\n\n\n");
							output_stream.write("\n");
							output_stream.write("" + this.fileName + "\n");
							output_stream.write("\n");
						}
						finally {
							// close even when a write fails so the file handle is not leaked
							output_stream.close();
						}
					}
					catch (Exception ee) {
						ee.printStackTrace();
						LogWriter.writeLog("Exception " + ee + " in extracting images");
					}
				}

				// flush images in case we do more than 1 page so only contains
				// images from current page
				decode_pdf.flushObjectValues(true);

				status.setProgress(page + 1);

			}
			status.close();

			this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.ImagesSavedTo") + ' ' + output_dir);

		}
		catch (Exception e) {
			LogWriter.writeLog("Exception " + e.getMessage());
		}
		finally {
			// single clean-up point: covers normal completion, cancel/abort returns and failures
			if (status != null) status.close();

			/** close the pdf file */
			if (decode_pdf != null) decode_pdf.closePdfFile();
		}
	}

	/**
	 * Exports the images of a page range using the choices made in the save-image popup.
	 * 
	 * Images go to rootDir/fileName/images/ (created if missing). The extraction itself runs on a SwingWorker; which routine is used depends on
	 * the selected image type (clipped, raw or final). Nothing happens when either end of the page range is below 1.
	 * 
	 * @param current_selection popup holding page range, image type, format and root directory
	 */
	public void extractImagesOnPages(SaveImage current_selection) {
		final int startPage = current_selection.getStartPage();
		final int endPage = current_selection.getEndPage();

		// both ends of the range must be real page numbers
		if (!(startPage >= 1 && endPage >= 1)) {
			return;
		}

		final int type = current_selection.getImageType();
		// get user choice
		final String format = current_selection.getPrefix();

		final String exportRoot = current_selection.getRootDir() + this.separator + this.fileName;
		final String output_dir = exportRoot + this.separator + "images" + this.separator;

		final File imageFolder = new File(output_dir);
		if (!imageFolder.exists()) {
			imageFolder.mkdirs();
		}

		final SwingWorker worker = new SwingWorker() {
			@Override
			public Object construct() {
				// do the save with the routine matching the chosen image type
				if (type == PdfDecoder.CLIPPEDIMAGES) {
					decodeHires(startPage, endPage, format, output_dir);
				}
				else if (type == PdfDecoder.RAWIMAGES) {
					decodeImages(startPage, endPage, format, output_dir, false);
				}
				else if (type == PdfDecoder.FINALIMAGES) {
					decodeImages(startPage, endPage, format, output_dir, true);
				}
				else {
					System.out.println("Unknown setting");
				}

				return null;
			}
		};

		worker.start();
	}

	/**
	 * routine to write out images in PDFs
	 * 
	 * Opens a private PdfDecoder on the selected file, decodes every page from start to end (inclusive) and saves each stored image as
	 * output_dir + separator + ("downsampled" | "normal") + separator + page + separator + imageName + '.' + prefix. The user is asked before an
	 * existing file is overwritten and can cancel through the progress monitor. The decoder and the progress monitor are released on every exit
	 * path.
	 * 
	 * @param start first page to process
	 * @param end last page to process (inclusive)
	 * @param prefix image format and file extension; images are converted to RGB first when it starts with "jp"
	 * @param output_dir root directory for the exported images
	 * @param downsampled selects the "downsampled" sub-folder when true, "normal" otherwise
	 */
	private void decodeImages(int start, int end, String prefix, String output_dir, boolean downsampled) {

		PdfDecoder decode_pdf = null;
		ProgressMonitor status = null;

		try {

			// PdfDecoder returns a PdfException if there is a problem
			try {
				decode_pdf = new PdfDecoder(false);

				decode_pdf.setExtractionMode(PdfDecoder.RAWIMAGES + PdfDecoder.FINALIMAGES, 1);
				/** open the file (and read metadata including pages in file) */
				decode_pdf.openPdfFile(this.selectedFile);
			}
			catch (Exception e) {
				e.printStackTrace();
				// nothing can be extracted from a file that failed to open
				return;
			}

			/**
			 * extract data from pdf (if allowed).
			 */
			if ((decode_pdf.isEncrypted() && (!decode_pdf.isPasswordSupplied())) && (!decode_pdf.isExtractionAllowed())) return;

			status = new ProgressMonitor(this.currentGUI.getFrame(), Messages.getMessage("PdfViewerMessage.ExtractImages"), "", start, end);

			int count = 0;
			boolean yesToAll = false;
			for (int page = start; page < end + 1; page++) { // read pages
				if (status.isCanceled()) {
					// the ' ' separator was missing here only; every other "stopped" message has it
					this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
							+ Messages.getMessage("PdfViewerError.ReportNumberOfImagesExported"));
					return;
				}
				// decode the page
				decode_pdf.decodePage(page);

				// get the PdfImages object which now holds the images.
				// binary data is stored in a temp directory and we hold the
				// image name and other info in this object
				PdfImageData pdf_images = decode_pdf.getPdfImageData();

				// image count (note image 1 is item 0, so any loop runs 0 to count-1)
				int image_count = pdf_images.getImageCount();

				String target = output_dir + this.separator;
				if (downsampled) target = target + "downsampled" + this.separator + page + this.separator;
				else target = target + "normal" + this.separator + page + this.separator;

				if (image_count > 0) {

					// create a directory for page
					File page_path = new File(target);
					if (!page_path.exists()) page_path.mkdirs();

					// do it again as some OS struggle with creating nested dirs
					page_path = new File(target);
					if (!page_path.exists()) page_path.mkdirs();

				}

				// work through and save each image
				for (int i = 0; i < image_count; i++) {
					String image_name = pdf_images.getImageName(i);

					try {
						// NOTE(review): the original had separate "downsampled" and "raw" branches that ran identical
						// code; its comment mentioned an R prefix for raw images that was never applied - confirm
						// which stored image the raw export is meant to load
						BufferedImage image_to_save = decode_pdf.getObjectStore().loadStoredImage(image_name);

						// nothing stored under this name - skip it (same guard as decodeHires)
						if (image_to_save == null) {
							LogWriter.writeLog("No stored image found for " + image_name);
							continue;
						}

						// remove transparency on jpeg
						if (prefix.toLowerCase().startsWith("jp")) image_to_save = ColorSpaceConvertor.convertToRGB(image_to_save);

						File fileToSave = new File(target + image_name + '.' + prefix);
						if (fileToSave.exists() && !yesToAll) {
							int n = this.currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(), true);

							if (n == 1) {
								// clicked yes to all, so set flag
								yesToAll = true;
							}
							else if (n == 2) {
								// clicked no, so loop round again
								status.setProgress(page);
								continue;
							}
							else if (n != 0) {
								// anything other than yes (0) is treated as a request to stop
								this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
										+ Messages.getMessage("PdfViewerError.ReportNumberOfImagesExported"));

								status.close();
								return;
							}
						}

						// save image
						saveImage(image_to_save, target + image_name + '.' + prefix, prefix);
						count++;
					}
					catch (Exception ee) {
						System.err.println("Exception " + ee + " in extracting images");
					}
				}

				// flush images in case we do more than 1 page so only contains
				// images from current page
				decode_pdf.flushObjectValues(true);

				status.setProgress(page + 1);
			}

			// take the progress dialog down before reporting completion, as the sibling export methods do
			status.close();

			this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.ImagesSavedTo") + ' ' + output_dir);
		}
		catch (Exception e) {
			LogWriter.writeLog("Exception " + e.getMessage());
		}
		finally {
			// single clean-up point: covers normal completion, cancel/abort returns and failures
			if (status != null) status.close();

			/** close the pdf file */
			if (decode_pdf != null) decode_pdf.closePdfFile();
		}
	}

	/**
	 * Exports the text of a page range using the choices made in the save-text popup.
	 * 
	 * Text goes to rootDir/fileName/text/ (created if missing). The extraction itself runs on a SwingWorker; which routine is used depends on the
	 * selected text type (RECTANGLE, WORDLIST or TABLE). Nothing happens when either end of the page range is below 1.
	 * 
	 * @param current_selection popup holding page range, text type, XML flag and root directory
	 */
	public void extractTextOnPages(SaveText current_selection) {
		// get user choice
		final int startPage = current_selection.getStartPage();
		final int endPage = current_selection.getEndPage();

		// both ends of the range must be real page numbers
		if (!(startPage >= 1 && endPage >= 1)) {
			return;
		}

		final int type = current_selection.getTextType();
		final boolean useXMLExtraction = current_selection.isXMLExtaction();

		final String exportRoot = current_selection.getRootDir() + this.separator + this.fileName;
		final String output_dir = exportRoot + this.separator + "text" + this.separator;

		final File textFolder = new File(output_dir);
		if (!textFolder.exists()) {
			textFolder.mkdirs();
		}

		final SwingWorker worker = new SwingWorker() {
			@Override
			public Object construct() {
				// do the save with the routine matching the chosen text type
				if (type == Exporter.RECTANGLE) {
					decodeTextRectangle(startPage, endPage, output_dir, useXMLExtraction);
				}
				else if (type == Exporter.WORDLIST) {
					decodeTextWordlist(startPage, endPage, output_dir, useXMLExtraction);
				}
				else if (type == Exporter.TABLE) {
					decodeTextTable(startPage, endPage, output_dir, useXMLExtraction);
				}
				else {
					System.out.println("Unknown setting");
				}

				return null;
			}
		};

		worker.start();
	}

	/**
	 * Extracts the text of each page in the range as a table and writes one file per page.
	 * 
	 * Opens a private PdfDecoder on the selected file and, for every page from startPage to endPage (inclusive), runs table extraction over the
	 * whole media box and writes the result to output_dir + separator + "table" + separator + fileName + '_' + page + ending, where ending is
	 * "_xml.txt" for XML extraction and "_text.csv" otherwise. The user is asked before an existing file is overwritten and can cancel through
	 * the progress monitor. The decoder and the progress monitor are released on every exit path.
	 * 
	 * @param startPage first page to process
	 * @param endPage last page to process (inclusive)
	 * @param output_dir root directory for the exported text
	 * @param useXMLExtraction true to request XML output, false for CSV output
	 */
	private void decodeTextTable(int startPage, int endPage, String output_dir, boolean useXMLExtraction) {

		PdfDecoder decode_pdf = null;
		ProgressMonitor status = null;

		try {

			try {
				decode_pdf = new PdfDecoder(false);
				decode_pdf.setExtractionMode(PdfDecoder.TEXT); // extract just text

				PdfDecoder.init(true);

				/**
				 * open the file (and read metadata including pages in file)
				 */
				decode_pdf.openPdfFile(this.selectedFile);
			}
			catch (Exception e) {
				System.err.println("Exception " + e + " in pdf code");
				// nothing can be extracted from a file that failed to open
				return;
			}

			/**
			 * extract data from pdf (if allowed).
			 */
			if ((decode_pdf.isEncrypted() && (!decode_pdf.isPasswordSupplied())) && (!decode_pdf.isExtractionAllowed())) {
				System.out.println("Encrypted settings");
				System.out.println("Please look at Viewer for code sample to handle such files");
				return;
			}

			status = new ProgressMonitor(this.currentGUI.getFrame(), Messages.getMessage("PdfViewerMessage.ExtractText"), "", startPage, endPage);

			/**
			 * extract data from pdf
			 */
			try {
				int count = 0;
				boolean yesToAll = false;
				for (int page = startPage; page < endPage + 1; page++) { // read pages
					if (status.isCanceled()) {
						this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
								+ Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
						return;
					}
					// decode the page
					decode_pdf.decodePage(page);

					/** create a grouping object to apply grouping to data */
					PdfGroupingAlgorithms currentGrouping = decode_pdf.getGroupingObject();

					/** use whole page size for demo - get data from PageData object */
					PdfPageData currentPageData = decode_pdf.getPdfPageData();

					/** Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */
					int x1 = currentPageData.getMediaBoxX(page);
					int x2 = currentPageData.getMediaBoxWidth(page) + x1;

					int y2 = currentPageData.getMediaBoxY(page);
					int y1 = currentPageData.getMediaBoxHeight(page) + y2;

					// csv unless xml was requested
					String ending = "_text.csv";

					if (useXMLExtraction) ending = "_xml.txt";

					/** The call to extract the table */
					Map tableContent;
					String tableText = null;

					try {
						tableContent = currentGrouping.extractTextAsTable(x1, y1, x2, y2, page, !useXMLExtraction, false, false, false, 0);

						// get the text from the Map object
						tableText = (String) tableContent.get("content");

					}
					catch (PdfException e) {
						// the decoder is deliberately NOT closed here: the loop carries on with the
						// following pages and the finally block below closes the file exactly once
						System.err.println("Exception " + e.getMessage() + " with table extraction");
					}
					catch (Error e) {
						e.printStackTrace();
					}

					if (tableText == null) {
						System.out.println("No text found");
					}
					else {

						String target = output_dir + this.separator + "table" + this.separator;

						// create a directory if it doesn't exist
						File output_path = new File(target);
						if (!output_path.exists()) output_path.mkdirs();

						File fileToSave = new File(target + this.fileName + '_' + page + ending);
						if (fileToSave.exists() && !yesToAll) {
							// NOTE(review): a two-page range (difference of 1) takes the single-file dialog below,
							// where "no" aborts the whole export - confirm whether "> 0" was intended
							if ((endPage - startPage) > 1) {
								int n = this.currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(), true);

								if (n == 1) {
									// clicked yes to all, so set flag
									yesToAll = true;
								}
								else if (n == 2) {
									// clicked no, so loop round again
									status.setProgress(page);
									continue;
								}
								else if (n != 0) {
									// anything other than yes (0) is treated as a request to stop
									this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
											+ Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));

									status.close();
									return;
								}
							}
							else {
								int n = this.currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(), false);

								// anything but yes (0) means no, so exit
								if (n != 0) return;
							}
						}

						/**
						 * output the data - you may wish to alter the encoding to suit
						 */
						OutputStreamWriter output_stream = new OutputStreamWriter(new FileOutputStream(fileToSave), "UTF-8");
						try {
							// xml header
							if (useXMLExtraction) output_stream.write("\n\n");

							output_stream.write(tableText); // write actual data

							// xml footer
							if (useXMLExtraction) output_stream.write("\n");
						}
						finally {
							// close even when a write fails so the file handle is not leaked
							output_stream.close();
						}

					}
					count++;
					status.setProgress(page + 1);
					// remove data once written out
					decode_pdf.flushObjectValues(false);
				}
				status.close();
				this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.TextSavedTo") + ' ' + output_dir);
			}
			catch (Exception e) {
				System.err.println("Exception " + e.getMessage());
				e.printStackTrace();
			}
			catch (Error e) {
				e.printStackTrace();
			}

			decode_pdf.flushObjectValues(true); // flush any text data read

		}
		finally {
			// single clean-up point: covers normal completion, cancel/abort returns and failures
			if (status != null) status.close();

			/** close the pdf file */
			if (decode_pdf != null) decode_pdf.closePdfFile();
		}
	}

	/**
	 * Extracts the words of each page in the range and writes one "word,x1,y1,x2,y2" line per word.
	 * 
	 * Opens a private PdfDecoder on the selected file and, for every page from startPage to endPage (inclusive), extracts a word list over the
	 * whole media box and writes it to output_dir + separator + "wordlist" + separator + fileName + '_' + page + suffix. The suffix is "_xml.txt"
	 * (UTF-8) for XML extraction and "_text.txt" (platform file.encoding) otherwise. The user is asked before an existing file is overwritten
	 * and can cancel through the progress monitor. The decoder and the progress monitor are released on every exit path.
	 * 
	 * @param startPage first page to process
	 * @param endPage last page to process (inclusive)
	 * @param output_dir root directory for the exported text
	 * @param useXMLExtraction true to keep the XML formatting of each word, false to strip it to plain text
	 */
	private void decodeTextWordlist(int startPage, int endPage, String output_dir, boolean useXMLExtraction) {

		PdfDecoder decode_pdf = null;
		ProgressMonitor status = null;

		try {

			// PdfDecoder returns a PdfException if there is a problem
			try {
				decode_pdf = new PdfDecoder(false);

				decode_pdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
				PdfDecoder.init(true);

				// always reset to use unaltered co-ords - allow use of rotated or unrotated
				// co-ordinates on pages with rotation (used to be in PdfDecoder)
				PdfGroupingAlgorithms.useUnrotatedCoords = false;

				/**
				 * open the file (and read metadata including pages in file)
				 */
				decode_pdf.openPdfFile(this.selectedFile);

			}
			catch (PdfSecurityException e) {
				System.err.println("Exception " + e + " in pdf code for wordlist" + this.selectedFile);
				return;
			}
			catch (PdfException e) {
				System.err.println("Exception " + e + " in pdf code for wordlist" + this.selectedFile);
				return;
			}
			catch (Exception e) {
				System.err.println("Exception " + e + " in pdf code for wordlist" + this.selectedFile);
				e.printStackTrace();
				// nothing can be extracted from a file that failed to open
				return;
			}

			/**
			 * extract data from pdf (if allowed).
			 */
			if ((decode_pdf.isEncrypted() && (!decode_pdf.isPasswordSupplied())) && (!decode_pdf.isExtractionAllowed())) {
				System.out.println("Encrypted settings");
				System.out.println("Please look at Viewer for code sample to handle such files");
				return;
			}

			status = new ProgressMonitor(this.currentGUI.getFrame(), Messages.getMessage("PdfViewerMessage.ExtractText"), "", startPage, endPage);

			/**
			 * extract data from pdf
			 */
			try {
				int count = 0;
				boolean yesToAll = false;
				for (int page = startPage; page < endPage + 1; page++) { // read pages
					if (status.isCanceled()) {
						this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
								+ Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
						return;
					}
					// decode the page
					decode_pdf.decodePage(page);

					/** create a grouping object to apply grouping to data */
					PdfGroupingAlgorithms currentGrouping = decode_pdf.getGroupingObject();

					/** use whole page size for demo - get data from PageData object */
					PdfPageData currentPageData = decode_pdf.getPdfPageData();

					/** Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */
					int x1 = currentPageData.getMediaBoxX(page);
					int x2 = currentPageData.getMediaBoxWidth(page) + x1;

					// bottom edge is the media box Y origin and the top edge is origin + height, exactly as in
					// decodeTextTable and decodeTextRectangle (this method used getMediaBoxX and a subtraction,
					// which is only right when the media box starts at 0,0)
					int y2 = currentPageData.getMediaBoxY(page);
					int y1 = currentPageData.getMediaBoxHeight(page) + y2;

					/** The call to extract the list */
					List words = null;

					try {
						words = currentGrouping.extractTextAsWordlist(x1, y1, x2, y2, page, true, "&:=()!;.,\\/\"\"\'\'");
					}
					catch (PdfException e) {
						// the decoder is deliberately NOT closed here: the loop carries on with the
						// following pages and the finally block below closes the file exactly once
						System.err.println("Exception= " + e + " in " + this.selectedFile);
						e.printStackTrace();
					}
					catch (Error e) {
						e.printStackTrace();
					}

					if (words == null) {

						System.out.println("No text found");

					}
					else {

						String target = output_dir + this.separator + "wordlist" + this.separator;

						// create a directory if it doesn't exist
						File output_path = new File(target);
						if (!output_path.exists()) output_path.mkdirs();

						/**
						 * choose correct prefix
						 */
						String prefix = "_text.txt";
						String encoding = System.getProperty("file.encoding");

						if (useXMLExtraction) {
							prefix = "_xml.txt";
							encoding = "UTF-8";
						}

						File fileToSave = new File(target + this.fileName + '_' + page + prefix);
						if (fileToSave.exists() && !yesToAll) {
							// NOTE(review): a two-page range (difference of 1) takes the single-file dialog below,
							// where "no" aborts the whole export - confirm whether "> 0" was intended
							if ((endPage - startPage) > 1) {
								int n = this.currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(), true);

								if (n == 1) {
									// clicked yes to all, so set flag
									yesToAll = true;
								}
								else if (n == 2) {
									// clicked no, so loop round again
									status.setProgress(page);
									continue;
								}
								else if (n != 0) {
									// anything other than yes (0) is treated as a request to stop
									this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
											+ Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));

									status.close();
									return;
								}
							}
							else {
								int n = this.currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(), false);

								// anything but yes (0) means no, so exit
								if (n != 0) return;
							}
						}

						/**
						 * output the data
						 */
						OutputStreamWriter output_stream = new OutputStreamWriter(new FileOutputStream(fileToSave), encoding);
						try {
							/** each word is stored as 5 consecutive values (word,x1,y1,x2,y2) */
							Iterator wordIterator = words.iterator();
							while (wordIterator.hasNext()) {

								String currentWord = (String) wordIterator.next();

								/** remove the XML formatting if present - not needed for pure text */
								if (!useXMLExtraction) currentWord = Strip.convertToText(currentWord, true);

								int wx1 = (int) Float.parseFloat((String) wordIterator.next());
								int wy1 = (int) Float.parseFloat((String) wordIterator.next());
								int wx2 = (int) Float.parseFloat((String) wordIterator.next());
								int wy2 = (int) Float.parseFloat((String) wordIterator.next());

								/** this could be inserting into a database instead */
								output_stream.write(currentWord + ',' + wx1 + ',' + wy1 + ',' + wx2 + ',' + wy2 + '\n');

							}
						}
						finally {
							// close even when a write or a malformed list entry fails so the handle is not leaked
							output_stream.close();
						}

					}

					count++;
					status.setProgress(page + 1);

					// remove data once written out
					decode_pdf.flushObjectValues(false);

				}
				status.close();
				this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.TextSavedTo") + ' ' + output_dir);
			}
			catch (Exception e) {
				System.err.println("Exception " + e + " in " + this.selectedFile);
				e.printStackTrace();
			}
			catch (Error e) {
				e.printStackTrace();
			}
		}
		finally {
			// single clean-up point: covers normal completion, cancel/abort returns and failures
			if (status != null) status.close();

			/** close the pdf file */
			if (decode_pdf != null) decode_pdf.closePdfFile();
		}
	}

	private void decodeTextRectangle(int startPage, int endPage, String output_dir, boolean useXMLExtraction) {

		PdfDecoder decode_pdf = null;

		// PdfDecoder returns a PdfException if there is a problem
		try {
			decode_pdf = new PdfDecoder(false);

			if (!useXMLExtraction) decode_pdf.useTextExtraction();

			decode_pdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
			PdfDecoder.init(true);

			/**
			 * open the file (and read metadata including pages in file)
			 */
			decode_pdf.openPdfFile(this.selectedFile);

		}
		catch (PdfSecurityException se) {
			System.err.println("Security Exception " + se + " in pdf code for text extraction on file ");
			// e.printStackTrace();
		}
		catch (PdfException se) {
			System.err.println("Pdf Exception " + se + " in pdf code for text extraction on file ");
			// e.printStackTrace();
		}
		catch (Exception e) {
			System.err.println("Exception " + e + " in pdf code for text extraction on file ");
			e.printStackTrace();
		}

		/**
		 * extract data from pdf (if allowed).
		 */
		if ((decode_pdf.isEncrypted() && (!decode_pdf.isPasswordSupplied())) && (!decode_pdf.isExtractionAllowed())) {
			System.out.println("Encrypted settings");
			System.out.println("Please look at Viewer for code sample to handle such files");

		}
		else {

			ProgressMonitor status = new ProgressMonitor(this.currentGUI.getFrame(), Messages.getMessage("PdfViewerMessage.ExtractText"), "",
					startPage, endPage);

			/**
			 * extract data from pdf
			 */
			try {
				int count = 0;
				boolean yesToAll = false;
				for (int page = startPage; page < endPage + 1; page++) { // read pages
					if (status.isCanceled()) {
						this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
								+ Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));
						return;
					}
					// decode the page
					decode_pdf.decodePage(page);

					/** create a grouping object to apply grouping to data */
					PdfGroupingAlgorithms currentGrouping = decode_pdf.getGroupingObject();

					/** use whole page size for demo - get data from PageData object */
					PdfPageData currentPageData = decode_pdf.getPdfPageData();

					int x1 = currentPageData.getMediaBoxX(page);
					int x2 = currentPageData.getMediaBoxWidth(page) + x1;

					int y2 = currentPageData.getMediaBoxY(page);
					int y1 = currentPageData.getMediaBoxHeight(page) + y2;

					/** Co-ordinates are x1,y1 (top left hand corner), x2,y2(bottom right) */

					/** The call to extract the text */
					String text = null;

					try {
						text = currentGrouping.extractTextInRectangle(x1, y1, x2, y2, page, false, true);
					}
					catch (PdfException e) {
						decode_pdf.closePdfFile();
						System.err.println("Exception " + e.getMessage() + " in file " + decode_pdf.getObjectStore().fullFileName);
						e.printStackTrace();
					}

					// allow for no text
					if (text == null) continue;

					String target = output_dir + this.separator + "rectangle" + this.separator;

					// ensure a directory for data
					File page_path = new File(target);
					if (page_path.exists() == false) page_path.mkdirs();

					/**
					 * choose correct prefix
					 */
					String prefix = "_text.txt";
					String encoding = System.getProperty("file.encoding");

					if (useXMLExtraction) {
						prefix = "_xml.txt";
						encoding = "UTF-8";
					}

					File fileToSave = new File(target + this.fileName + '_' + page + prefix);
					if (fileToSave.exists() && !yesToAll) {
						if ((endPage - startPage) > 1) {
							int n = this.currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(), true);

							if (n == 0) {
								// clicked yes so just carry on for this once
							}
							else
								if (n == 1) {
									// clicked yes to all, so set flag
									yesToAll = true;
								}
								else
									if (n == 2) {
										// clicked no, so loop round again
										status.setProgress(page);
										continue;
									}
									else {

										this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerError.UserStoppedExport") + count + ' '
												+ Messages.getMessage("PdfViewerError.ReportNumberOfPagesExported"));

										status.close();
										return;
									}
						}
						else {
							int n = this.currentGUI.showOverwriteDialog(fileToSave.getAbsolutePath(), false);

							if (n == 0) {
								// clicked yes so just carry on
							}
							else {
								// clicked no, so exit
								return;
							}
						}
					}

					/**
					 * output the data
					 */
					OutputStreamWriter output_stream = new OutputStreamWriter(new FileOutputStream(target + this.fileName + '_' + page + prefix),
							encoding);

					if ((useXMLExtraction)) {
						output_stream.write("\n\n");
						output_stream.write("\n");
						output_stream.write("\n\n
\n"); output_stream.write("\n"); output_stream.write("\n\n\n"); // NOTE DATA IS TECHNICALLY UNICODE output_stream.write(text); // write actual data output_stream.write("\n\n\n"); output_stream.write("\n\n
\n"); } else output_stream.write(text); // write actual data count++; output_stream.close(); status.setProgress(page + 1); // remove data once written out decode_pdf.flushObjectValues(true); } status.close(); this.currentGUI.showMessageDialog(Messages.getMessage("PdfViewerMessage.TextSavedTo") + ' ' + output_dir); } catch (Exception e) { decode_pdf.closePdfFile(); System.err.println("Exception " + e.getMessage()); e.printStackTrace(); System.out.println(decode_pdf.getObjectStore().getCurrentFilename()); } } /** close the pdf file */ decode_pdf.closePdfFile(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy