
// com.jaeksoft.searchlib.parser.IcePdfParser Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2010-2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.parser;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.commons.exec.ExecuteException;
import org.apache.commons.io.FileUtils;
import org.icepdf.core.SecurityCallback;
import org.icepdf.core.exceptions.PDFException;
import org.icepdf.core.exceptions.PDFSecurityException;
import org.icepdf.core.pobjects.Document;
import org.icepdf.core.pobjects.PInfo;
import org.icepdf.core.pobjects.Page;
import org.icepdf.core.pobjects.graphics.text.LineText;
import org.icepdf.core.pobjects.graphics.text.PageText;
import org.icepdf.core.pobjects.graphics.text.WordText;
import org.icepdf.core.util.GraphicsRenderingHints;
import com.jaeksoft.searchlib.ClientCatalog;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassProperty;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.ocr.HocrDocument;
import com.jaeksoft.searchlib.ocr.HocrPdf;
import com.jaeksoft.searchlib.ocr.HocrPdf.HocrPage;
import com.jaeksoft.searchlib.ocr.OcrManager;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiter;
import com.jaeksoft.searchlib.util.ImageUtils;
import com.jaeksoft.searchlib.util.PdfCrack;
import com.jaeksoft.searchlib.util.StringUtils;
/**
 * PDF parser built on the ICEpdf library.
 * <p>
 * For each document it copies the information dictionary (title, author, ...)
 * and the text of every page line into the parser result. When an OCR manager
 * is available and one of the OCR fields is mapped, the pages that contain
 * images are rendered and submitted to the OCR. If the
 * {@code PDFCRACK_COMMANDLINE} property is set, that command line is used to
 * look up the password when ICEpdf requests one.
 */
public class IcePdfParser extends Parser {

    /** Fields this parser is able to populate. */
    private static final ParserFieldEnum[] fl = { ParserFieldEnum.parser_name,
            ParserFieldEnum.title, ParserFieldEnum.author,
            ParserFieldEnum.subject, ParserFieldEnum.creator,
            ParserFieldEnum.content, ParserFieldEnum.producer,
            ParserFieldEnum.keywords, ParserFieldEnum.creation_date,
            ParserFieldEnum.modification_date, ParserFieldEnum.language,
            ParserFieldEnum.number_of_pages, ParserFieldEnum.ocr_content,
            ParserFieldEnum.image_ocr_boxes, ParserFieldEnum.pdfcrack_password };

    public IcePdfParser() {
        super(fl);
    }

    /**
     * Registers the parser properties: the inherited ones plus SIZE_LIMIT
     * (default value "0").
     *
     * @throws SearchLibException
     *             if a property cannot be registered
     */
    @Override
    public void initProperties() throws SearchLibException {
        super.initProperties();
        addProperty(ClassPropertyEnum.SIZE_LIMIT, "0", null, 20, 1);
        // addProperty(ClassPropertyEnum.OCR_PDF_METHOD,
        // ClassPropertyEnum.OCR_PDF_METHODS[0],
        // ClassPropertyEnum.OCR_PDF_METHODS);
    }

    /**
     * Copies the document information and the textual content of every page
     * into the result, then runs the language detection on the content field.
     * <p>
     * Each non-empty text line becomes one value of the {@code content} field,
     * with consecutive spaces collapsed and surrounding whitespace trimmed.
     *
     * @param result
     *            the item receiving the extracted fields
     * @param pdf
     *            the opened ICEpdf document
     * @throws IOException
     *             if a field cannot be written
     * @throws InterruptedException
     *             if the page text extraction is interrupted
     */
    private void extractContent(ParserResultItem result, Document pdf)
            throws IOException, InterruptedException {
        PInfo info = pdf.getInfo();
        if (info != null) {
            result.addField(ParserFieldEnum.title, info.getTitle());
            result.addField(ParserFieldEnum.subject, info.getSubject());
            result.addField(ParserFieldEnum.author, info.getAuthor());
            result.addField(ParserFieldEnum.producer, info.getProducer());
            result.addField(ParserFieldEnum.keywords, info.getKeywords());
            result.addField(ParserFieldEnum.creator, info.getCreator());
            result.addField(ParserFieldEnum.creation_date,
                    info.getCreationDate());
            result.addField(ParserFieldEnum.modification_date,
                    info.getModDate());
        }

        int pages = pdf.getNumberOfPages();
        result.addField(ParserFieldEnum.number_of_pages, pages);

        for (int page = 0; page < pages; page++) {
            PageText pageText = pdf.getPageText(page);
            if (pageText == null)
                continue;
            List<LineText> lines = pageText.getPageLines();
            if (lines == null)
                continue;
            for (LineText lineText : lines) {
                List<WordText> words = lineText.getWords();
                if (words == null)
                    continue;
                StringBuilder sb = new StringBuilder();
                for (WordText word : words)
                    sb.append(word.getText());
                if (sb.length() > 0)
                    result.addField(
                            ParserFieldEnum.content,
                            StringUtils.replaceConsecutiveSpaces(
                                    sb.toString(), " ").trim());
            }
        }

        result.langDetection(10000, ParserFieldEnum.content);
    }

    /**
     * Security callback that delegates the password lookup to
     * {@link PdfCrack#findPassword} and remembers the last value found.
     * <p>
     * Declared static because it never touches the enclosing parser instance.
     */
    private static class PdfCrackCallback implements SecurityCallback {

        private final File pdfFile;

        private final String commandLine;

        /** Last password returned to ICEpdf, or null if none was found. */
        private String password = null;

        private PdfCrackCallback(String commandLine, File pdfFile) {
            this.commandLine = commandLine;
            this.pdfFile = pdfFile;
        }

        /**
         * Runs the password lookup. Failures are logged and reported to the
         * caller as a null password rather than propagated.
         */
        @Override
        public String requestPassword(Document doc) {
            password = null;
            try {
                password = PdfCrack.findPassword(commandLine, pdfFile);
            } catch (ExecuteException e) {
                Logging.error(e);
            } catch (IOException e) {
                Logging.error(e);
            }
            return password;
        }
    }

    /**
     * Opens the PDF held by the stream limiter, extracts its text content and
     * metadata, then runs the OCR extraction on its images.
     *
     * @param streamLimiter
     *            gives access to the file to parse
     * @param lang
     *            the language passed to the OCR
     * @throws IOException
     *             on any failure; ICEpdf, OCR and interruption errors are
     *             wrapped with the file name in the message
     */
    @Override
    protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang)
            throws IOException {
        Document pdf = null;
        String fileName = null;
        try {
            File pdfFile = streamLimiter.getFile();
            fileName = pdfFile.getName();
            pdf = new Document();

            PdfCrackCallback pdfCrackCallback = null;
            ClassProperty cp = getProperty(ClassPropertyEnum.PDFCRACK_COMMANDLINE);
            String pdfCrackCommandLine = cp == null ? null : cp.getValue();
            if (!StringUtils.isEmpty(pdfCrackCommandLine)) {
                pdfCrackCallback = new PdfCrackCallback(pdfCrackCommandLine,
                        pdfFile);
                pdf.setSecurityCallback(pdfCrackCallback);
            }

            // The security callback must be installed before the file is
            // opened, so the password field can be read right afterwards.
            pdf.setFile(pdfFile.getAbsolutePath());

            ParserResultItem result = getNewParserResultItem();
            if (pdfCrackCallback != null)
                result.addField(ParserFieldEnum.pdfcrack_password,
                        pdfCrackCallback.password);
            extractContent(result, pdf);
            extractImagesForOCR(result, pdf, lang);
        } catch (SearchLibException e) {
            throw new IOException("Failed on " + fileName, e);
        } catch (PDFException e) {
            throw new IOException("Failed on " + fileName, e);
        } catch (InterruptedException e) {
            // Restore the interrupt status: the exception is translated to an
            // IOException, so the flag is the only way callers can still see
            // that the thread was asked to stop.
            Thread.currentThread().interrupt();
            throw new IOException("Failed on " + fileName, e);
        } catch (PDFSecurityException e) {
            throw new IOException("Failed on " + fileName, e);
        } finally {
            if (pdf != null)
                pdf.dispose();
        }
    }

    /**
     * Runs the OCR on one image and returns the parsed hOCR output.
     * <p>
     * The OCR writes into a temporary file which is always deleted before
     * returning.
     *
     * @param image
     *            the image to analyse
     * @param rotation
     *            rotation to apply to the image before the OCR; a multiple of
     *            360 leaves the image untouched
     * @param lang
     *            the language passed to the OCR
     * @param ocr
     *            the OCR manager to use
     * @return the hOCR document read from the OCR output
     */
    private HocrDocument imageOcr(Image image, float rotation,
            LanguageEnum lang, OcrManager ocr) throws InterruptedException,
            IOException, SearchLibException {
        File hocrFile = null;
        try {
            // Callers pass "360 - pageRotation", which is 360 for unrotated
            // pages: skip full turns instead of rotating for nothing.
            // NOTE(review): assumes ImageUtils.rotate by 360 is a no-op.
            if (rotation % 360 != 0) {
                image = ImageUtils.toBufferedImage(image);
                image = ImageUtils.rotate((BufferedImage) image, rotation);
            }
            hocrFile = File.createTempFile("ossocr",
                    "." + ocr.getHocrFileExtension());
            ocr.ocerizeImage(image, hocrFile, lang, true);
            return new HocrDocument(hocrFile);
        } finally {
            if (hocrFile != null)
                FileUtils.deleteQuietly(hocrFile);
        }
    }

    /**
     * Renders every page that contains at least one image, submits it to the
     * OCR and stores the outcome in the {@code image_ocr_boxes} and
     * {@code ocr_content} fields (each only if it is mapped).
     * <p>
     * Does nothing when no OCR manager is available, when it is disabled, or
     * when neither OCR field is mapped. Rendered pages rejected by
     * {@link ImageUtils#checkIfManyColors} are counted as empty and not sent
     * to the OCR.
     *
     * @param result
     *            the item receiving the OCR fields
     * @param pdf
     *            the opened ICEpdf document
     * @param lang
     *            the language passed to the OCR
     * @throws SearchLibException
     *             if every page of the document was counted as empty
     */
    private void extractImagesForOCR(ParserResultItem result, Document pdf,
            LanguageEnum lang) throws IOException, SearchLibException,
            InterruptedException {
        OcrManager ocr = ClientCatalog.getOcrManager();
        if (ocr == null || ocr.isDisabled())
            return;
        boolean boxesMapped = getFieldMap().isMapped(
                ParserFieldEnum.image_ocr_boxes);
        boolean contentMapped = getFieldMap().isMapped(
                ParserFieldEnum.ocr_content);
        if (!contentMapped && !boxesMapped)
            return;

        HocrPdf hocrPdf = new HocrPdf();
        int pages = pdf.getNumberOfPages();
        int emptyPageImages = 0;
        for (int i = 0; i < pages; i++) {
            // Only the presence of images matters here, not their content.
            List<?> images = pdf.getPageImages(i);
            if (images == null || images.isEmpty())
                continue;
            float rotation = pdf.getPageTree().getPage(i).getTotalRotation(0);
            BufferedImage image = ImageUtils.toBufferedImage(pdf.getPageImage(
                    i, GraphicsRenderingHints.PRINT, Page.BOUNDARY_CROPBOX,
                    0.0f, 4.0F));
            if (ImageUtils.checkIfManyColors(image)) {
                HocrPage hocrPage = hocrPdf.createPage(i, image.getWidth(),
                        image.getHeight());
                hocrPage.addImage(imageOcr(image, 360 - rotation, lang, ocr));
            } else
                emptyPageImages++;
        }

        if (pages > 0 && emptyPageImages == pages)
            throw new SearchLibException("All pages are blank " + pages);

        if (boxesMapped)
            hocrPdf.putHocrToParserField(result,
                    ParserFieldEnum.image_ocr_boxes);
        if (contentMapped)
            hocrPdf.putTextToParserField(result, ParserFieldEnum.ocr_content);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy