com.jaeksoft.searchlib.parser.PdfParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2010-2015 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see .
**/
package com.jaeksoft.searchlib.parser;
import java.awt.Dimension;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.comparator.LastModifiedFileComparator;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.PDFMergerUtility;
import com.jaeksoft.searchlib.ClientCatalog;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.ocr.HocrDocument;
import com.jaeksoft.searchlib.ocr.HocrPdf;
import com.jaeksoft.searchlib.ocr.HocrPdf.HocrPage;
import com.jaeksoft.searchlib.ocr.OcrManager;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiter;
import com.jaeksoft.searchlib.util.ExecuteUtils.ExecutionException;
import com.jaeksoft.searchlib.util.GhostScript;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.ImageUtils;
import com.jaeksoft.searchlib.util.PdfCrack;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.ThreadUtils;
import com.jaeksoft.searchlib.util.pdfbox.PDFBoxUtils;
import com.jaeksoft.searchlib.util.pdfbox.PDFBoxUtils.TolerantPDFTextStripper;
public class PdfParser extends Parser {
public static final String[] DEFAULT_MIMETYPES = { "application/pdf" };
public static final String[] DEFAULT_EXTENSIONS = { "pdf" };
public static final Semaphore gsSemaphore = new Semaphore(Runtime.getRuntime().availableProcessors());
private static ParserFieldEnum[] fl = { ParserFieldEnum.parser_name, ParserFieldEnum.title, ParserFieldEnum.author,
ParserFieldEnum.subject, ParserFieldEnum.content, ParserFieldEnum.producer, ParserFieldEnum.keywords,
ParserFieldEnum.creation_date, ParserFieldEnum.modification_date, ParserFieldEnum.language,
ParserFieldEnum.number_of_pages, ParserFieldEnum.ocr_content, ParserFieldEnum.image_ocr_boxes,
ParserFieldEnum.pdfcrack_password };
public PdfParser() {
super(fl);
}
@Override
public void initProperties() throws SearchLibException {
super.initProperties();
addProperty(ClassPropertyEnum.SIZE_LIMIT, "0", null, 20, 1);
addProperty(ClassPropertyEnum.GHOSTSCRIPT_BINARYPATH, "", null, 50, 1);
addProperty(ClassPropertyEnum.PDFCRACK_COMMANDLINE, "", null, 50, 1);
}
private Calendar getCreationDate(PDDocumentInformation pdfInfo) {
try {
return pdfInfo.getCreationDate();
} catch (IOException e) {
Logging.warn(e);
return null;
}
}
private Calendar getModificationDate(PDDocumentInformation pdfInfo) {
try {
return pdfInfo.getCreationDate();
} catch (IOException e) {
Logging.warn(e);
return null;
}
}
private String getDate(Calendar cal) {
if (cal == null)
return null;
Date time = cal.getTime();
if (time == null)
return null;
return time.toString();
}
private void extractMetaData(ParserResultItem result, PDDocument pdf) throws IOException {
PDDocumentInformation info = pdf.getDocumentInformation();
if (info != null) {
result.addField(ParserFieldEnum.title, info.getTitle());
result.addField(ParserFieldEnum.subject, info.getSubject());
result.addField(ParserFieldEnum.author, info.getAuthor());
result.addField(ParserFieldEnum.producer, info.getProducer());
result.addField(ParserFieldEnum.keywords, info.getKeywords());
String d = getDate(getCreationDate(info));
if (d != null)
result.addField(ParserFieldEnum.creation_date, d);
d = getDate(getModificationDate(info));
if (d != null)
result.addField(ParserFieldEnum.modification_date, d);
}
int pages = pdf.getNumberOfPages();
result.addField(ParserFieldEnum.number_of_pages, pages);
PDDocumentCatalog catalog = pdf.getDocumentCatalog();
if (catalog != null) {
result.addField(ParserFieldEnum.language, catalog.getLanguage());
}
}
private int addLine(ParserResultItem result, String line) {
if (line == null)
return 0;
line = StringUtils.replaceConsecutiveSpaces(line, " ").trim();
int l = line.length();
if (l == 0)
return 0;
result.addField(ParserFieldEnum.content, line);
return line.length();
}
/**
* Extract text content using PDFBox
*
* @param result
* @param pdf
* @throws IOException
*/
private int extractTextContent(ParserResultItem result, PDDocument pdf) throws IOException {
TolerantPDFTextStripper stripper = new TolerantPDFTextStripper();
String text = stripper.getText(pdf);
if (StringUtils.isEmpty(text))
return 0;
String[] lines = StringUtils.splitLines(text);
int characterCount = 0;
for (String line : lines)
characterCount += addLine(result, line);
return characterCount;
}
/**
* Extract text content using Ghostscript
*
* @param result
* @param ghostScript
* @param pdfFile
* @param pdfPassword
* @throws IOException
* @throws InterruptedException
*/
private int extractTextContent(ParserResultItem result, PdfOcrContext context)
throws IOException, InterruptedException {
File textFile = null;
BufferedReader bufferedReader = null;
FileReader fileReader = null;
try {
textFile = File.createTempFile("oss_pdfparser", "txt");
context.ghostScript.extractText(context.pdfPassword, context.pdfFile, textFile);
fileReader = new FileReader(textFile);
bufferedReader = new BufferedReader(fileReader);
int characterCount = 0;
String line;
while ((line = bufferedReader.readLine()) != null)
characterCount += addLine(result, line);
return characterCount;
} catch (ExecutionException e) {
Logging.warn("Ghostscript returned: " + e.getReturnedText());
throw e;
} finally {
IOUtils.close(bufferedReader, fileReader);
if (textFile != null)
if (textFile.exists())
textFile.delete();
}
}
private String decrypt(PDDocument pdf, File pdfFile)
throws BadSecurityHandlerException, IOException, CryptographyException {
// Let's try first with an empty password
String password = StringUtils.EMPTY;
try {
pdf.openProtection(new StandardDecryptionMaterial(password));
} catch (CryptographyException e) {
// New attempt with PDFCrack
String pdfCrackCommandLine = getStringProperty(ClassPropertyEnum.PDFCRACK_COMMANDLINE);
if (StringUtils.isEmpty(pdfCrackCommandLine))
throw e;
password = PdfCrack.findPassword(pdfCrackCommandLine, pdfFile);
if (password == null) // No password found
throw new IOException("Encrypted PDF.");
// Password found, let's open
pdf.openProtection(new StandardDecryptionMaterial(password));
}
return password;
}
@Override
protected void parseContent(StreamLimiter streamLimiter, final LanguageEnum lang) throws IOException {
PdfOcrContext context = new PdfOcrContext();
context.lang = lang;
String fileName = null;
try {
String ghostScriptBinaryPath = getStringProperty(ClassPropertyEnum.GHOSTSCRIPT_BINARYPATH);
context.ghostScript = StringUtils.isEmpty(ghostScriptBinaryPath) ? null
: new GhostScript(ghostScriptBinaryPath);
fileName = streamLimiter.getFile().getName();
context.pdfFile = streamLimiter.getFile();
context.pdf = PDDocument.load(context.pdfFile, null);
try {
if (context.pdf.isEncrypted())
context.pdfPassword = decrypt(context.pdf, context.pdfFile);
} catch (Exception e) {
Logging.warn("PDFBox decryption failed " + fileName);
IOUtils.closeQuietly(context.pdf);
context.pdf = null;
}
ParserResultItem result = getNewParserResultItem();
result.addField(ParserFieldEnum.pdfcrack_password, context.pdfPassword);
if (context.pdf != null)
extractMetaData(result, context.pdf);
int charCount = 0;
if (context.ghostScript == null) {
if (context.pdf != null)
charCount = extractTextContent(result, context.pdf);
} else
charCount = extractTextContent(result, context);
if (charCount == 0 && context.pdf != null)
extractImagesForOCR(result, context);
result.langDetection(10000, ParserFieldEnum.content);
} catch (SearchLibException e) {
throw new IOException("Failed on " + fileName, e);
} catch (InterruptedException e) {
throw new IOException("Failed on " + fileName, e);
} catch (java.util.concurrent.ExecutionException e) {
throw new IOException("Failed on " + fileName, e);
} finally {
if (context.pdf != null)
context.pdf.close();
}
}
private HocrDocument doOcr(OcrManager ocr, LanguageEnum lang, BufferedImage image)
throws IOException, InterruptedException, SearchLibException {
File hocrFile = null;
try {
hocrFile = File.createTempFile("ossocr", "." + ocr.getHocrFileExtension());
ocr.ocerizeImage(image, hocrFile, lang, true);
if (hocrFile.length() == 0)
return null;
return new HocrDocument(hocrFile);
} finally {
if (hocrFile != null)
FileUtils.deleteQuietly(hocrFile);
}
}
private HocrDocument doOcr(OcrManager ocr, LanguageEnum lang, File imageFile)
throws IOException, InterruptedException, SearchLibException {
File hocrFile = null;
try {
hocrFile = File.createTempFile("ossocr", "." + ocr.getHocrFileExtension());
ocr.ocerize(imageFile, hocrFile, lang, true);
if (hocrFile.length() == 0)
return null;
return new HocrDocument(hocrFile);
} finally {
if (hocrFile != null)
FileUtils.deleteQuietly(hocrFile);
}
}
private void ocrImageGhostcript(PdfOcrContext context, int page)
throws IOException, InterruptedException, SearchLibException {
File imageFile = null;
try {
imageFile = File.createTempFile("oss_pdfparser", ".png");
gsSemaphore.acquire();
try {
context.ghostScript.generateImage(context.pdfPassword, page, context.pdfFile, 300, imageFile);
} finally {
gsSemaphore.release();
}
Dimension dimension = ImageUtils.getDimensions(imageFile);
HocrPage hocrPage = context.hocrPdf.createPage(page - 1, dimension.width, dimension.height);
hocrPage.addImage(doOcr(context.ocr, context.lang, imageFile));
} finally {
if (imageFile != null)
if (imageFile.exists())
imageFile.delete();
}
}
public class PdfOcrContext {
private PDDocument pdf = null;
private OcrManager ocr = null;
private LanguageEnum lang = null;
private GhostScript ghostScript = null;
private File pdfFile = null;
private String pdfPassword = null;
private HocrPdf hocrPdf = null;
}
public class ImageOcrCallable implements Callable {
private final PdfOcrContext context;
private final PDPage page;
private final int currentPage;
private final AtomicInteger emptyPageImages;
public ImageOcrCallable(PdfOcrContext context, PDPage page, int currentPage, AtomicInteger emptyPageImages) {
this.context = context;
this.page = page;
this.currentPage = currentPage;
this.emptyPageImages = emptyPageImages;
}
@Override
public Boolean call() throws IOException, InterruptedException, SearchLibException {
if (PDFBoxUtils.countCheckImage(page) == 0)
return false;
if (context.ghostScript == null) {
BufferedImage image = page.convertToImage(BufferedImage.TYPE_INT_BGR, 300);
if (ImageUtils.checkIfManyColors(image)) {
HocrPage hocrPage = context.hocrPdf.createPage(currentPage - 1, image.getWidth(),
image.getHeight());
hocrPage.addImage(doOcr(context.ocr, context.lang, image));
} else
emptyPageImages.incrementAndGet();
} else {
ocrImageGhostcript(context, currentPage);
}
return true;
}
}
private void extractImagesForOCR(ParserResultItem result, PdfOcrContext context)
throws SearchLibException, IOException, InterruptedException, java.util.concurrent.ExecutionException {
context.ocr = ClientCatalog.getOcrManager();
if (context.ocr == null || context.ocr.isDisabled())
return;
if (!getFieldMap().isMapped(ParserFieldEnum.ocr_content)
&& !getFieldMap().isMapped(ParserFieldEnum.image_ocr_boxes))
return;
context.hocrPdf = new HocrPdf();
List> pages = context.pdf.getDocumentCatalog().getAllPages();
Iterator> iter = pages.iterator();
int currentPage = 0;
AtomicInteger emptyPageImages = new AtomicInteger(0);
ExecutorService executorService = config.getThreadPool();
List> futures = new ArrayList>();
while (iter.hasNext()) {
PDPage page = (PDPage) iter.next();
ImageOcrCallable callable = new ImageOcrCallable(context, page, ++currentPage, emptyPageImages);
futures.add(executorService.submit(callable));
}
ThreadUtils. done(futures);
if (currentPage > 0 && emptyPageImages.get() == currentPage)
throw new SearchLibException("All pages are blank " + currentPage);
if (getFieldMap().isMapped(ParserFieldEnum.image_ocr_boxes))
context.hocrPdf.putHocrToParserField(result, ParserFieldEnum.image_ocr_boxes);
if (getFieldMap().isMapped(ParserFieldEnum.ocr_content))
context.hocrPdf.putTextToParserField(result, ParserFieldEnum.ocr_content);
}
@Override
public void mergeFiles(File fileDir, File destFile) throws SearchLibException {
PDFMergerUtility pdfMerger = new PDFMergerUtility();
File[] files = new LastModifiedFileComparator().sort(fileDir.listFiles());
for (File file : files) {
String ext = FilenameUtils.getExtension(file.getName());
if (!"pdf".equalsIgnoreCase(ext))
continue;
pdfMerger.addSource(file);
}
if (destFile.exists())
destFile.delete();
pdfMerger.setDestinationFileName(destFile.getAbsolutePath());
try {
pdfMerger.mergeDocuments();
} catch (COSVisitorException e) {
throw new SearchLibException(e);
} catch (IOException e) {
throw new SearchLibException(e);
}
}
}