com.opensearchserver.textextractor.parser.Pptx Maven / Gradle / Ivy
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2010-2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.opensearchserver.textextractor.parser;
import java.io.File;
import java.io.InputStream;
import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.poi.xslf.usermodel.DrawingTextBody;
import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
import org.apache.poi.xslf.usermodel.XSLFComments;
import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.XSLFNotes;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;
import com.opensearchserver.textextractor.ParserAbstract;
import com.opensearchserver.textextractor.ParserDocument;
import com.opensearchserver.textextractor.ParserField;
public class Pptx extends ParserAbstract {
final protected static ParserField TITLE = ParserField.newString("title",
"The title of the document");
final protected static ParserField CREATOR = ParserField.newString(
"creator", "The name of the creator");
final protected static ParserField DESCRIPTION = ParserField.newString(
"description", null);
final protected static ParserField KEYWORDS = ParserField.newString(
"keywords", null);
final protected static ParserField SUBJECT = ParserField.newString(
"subject", "The subject of the document");
final protected static ParserField CREATION_DATE = ParserField.newDate(
"creation_date", null);
final protected static ParserField MODIFICATION_DATE = ParserField.newDate(
"modification_date", null);
final protected static ParserField SLIDES = ParserField.newString("slides",
null);
final protected static ParserField MASTER = ParserField.newString("master",
null);
final protected static ParserField NOTES = ParserField.newString("notes",
null);
final protected static ParserField COMMENTS = ParserField.newString(
"comments", null);
final protected static ParserField LANG_DETECTION = ParserField.newString(
"lang_detection", "Detection of the language");
final protected static ParserField[] FIELDS = { TITLE, CREATOR,
DESCRIPTION, KEYWORDS, SUBJECT, CREATION_DATE, MODIFICATION_DATE,
SLIDES, MASTER, NOTES, COMMENTS, LANG_DETECTION };
public Pptx() {
}
@Override
protected ParserField[] getParameters() {
return null;
}
@Override
protected ParserField[] getFields() {
return FIELDS;
}
@Override
protected void parseContent(InputStream inputStream) throws Exception {
File tempFile = ParserAbstract.createTempFile(inputStream, "pptx");
try {
parseContent(tempFile);
} finally {
tempFile.delete();
}
}
@Override
protected void parseContent(File file) throws Exception {
XSLFSlideShow pptSlideShow = new XSLFSlideShow(file.getAbsolutePath());
XMLSlideShow slideshow = new XMLSlideShow(pptSlideShow.getPackage());
// Extract metadata
XSLFPowerPointExtractor poiExtractor = null;
try {
poiExtractor = new XSLFPowerPointExtractor(slideshow);
CoreProperties info = poiExtractor.getCoreProperties();
if (info != null) {
metas.add(TITLE, info.getTitle());
metas.add(CREATOR, info.getCreator());
metas.add(SUBJECT, info.getSubject());
metas.add(DESCRIPTION, info.getDescription());
metas.add(KEYWORDS, info.getKeywords());
metas.add(CREATION_DATE, info.getCreated());
metas.add(MODIFICATION_DATE, info.getModified());
}
} finally {
poiExtractor.close();
}
extractSides(slideshow);
}
/**
* Declined from XSLFPowerPointExtractor.java
*/
private String extractText(XSLFCommonSlideData data,
boolean skipPlaceholders) {
StringBuilder sb = new StringBuilder();
for (DrawingTextBody textBody : data.getDrawingText()) {
if (skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
DrawingTextPlaceholder ph = (DrawingTextPlaceholder) textBody;
if (!ph.isPlaceholderCustom()) {
// Skip non-customised placeholder text
continue;
}
}
for (DrawingParagraph p : textBody.getParagraphs()) {
sb.append(p.getText());
sb.append("\n");
}
}
return sb.toString();
}
/**
* Declined from XSLFPowerPointExtractor.java
*
* @param pptSlideShow
*/
private void extractSides(XMLSlideShow slideshow) {
XSLFSlide[] slides = (XSLFSlide[]) slideshow.getSlides();
XSLFCommentAuthors commentAuthors = slideshow.getCommentAuthors();
for (XSLFSlide slide : slides) {
// One document per slide
ParserDocument result = getNewParserDocument();
XSLFNotes notes = slide.getNotes();
XSLFComments comments = slide.getComments();
XSLFSlideLayout layout = slide.getSlideLayout();
XSLFSlideMaster master = layout.getSlideMaster();
// TODO Do the slide's name
// (Stored in docProps/app.xml)
// Do the slide's text
result.add(SLIDES, extractText(slide.getCommonSlideData(), false));
result.add(LANG_DETECTION, languageDetection(SLIDES, 10000));
// If requested, get text from the master and it's layout
if (layout != null) {
result.add(MASTER,
extractText(layout.getCommonSlideData(), true));
}
if (master != null) {
result.add(MASTER,
extractText(master.getCommonSlideData(), true));
}
// If the slide has comments, do those too
if (comments != null) {
for (CTComment comment : comments.getCTCommentsList()
.getCmList()) {
StringBuilder sbComment = new StringBuilder();
// Do the author if we can
if (commentAuthors != null) {
CTCommentAuthor author = commentAuthors
.getAuthorById(comment.getAuthorId());
if (author != null) {
sbComment.append(author.getName());
sbComment.append(": ");
}
}
// Then the comment text, with a new line afterwards
sbComment.append(comment.getText());
sbComment.append("\n");
if (sbComment.length() > 0)
result.add(COMMENTS, sbComment.toString());
}
}
// Do the notes if requested
if (notes != null) {
result.add(NOTES,
extractText(notes.getCommonSlideData(), false));
}
}
}
}