com.opensearchserver.textextractor.parser.Pptx Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of oss-text-extractor Show documentation
The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2010-2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.opensearchserver.textextractor.parser;

import java.io.File;
import java.io.InputStream;

import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.poi.xslf.usermodel.DrawingTextBody;
import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
import org.apache.poi.xslf.usermodel.XSLFComments;
import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.XSLFNotes;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;

import com.opensearchserver.textextractor.ParserAbstract;
import com.opensearchserver.textextractor.ParserDocument;
import com.opensearchserver.textextractor.ParserField;

/**
 * Parser for PowerPoint OOXML (.pptx) presentations, built on the Apache POI
 * XSLF API.
 * <p>
 * The core document properties (title, creator, dates...) are added to
 * {@code metas}; then one {@link ParserDocument} is produced per slide,
 * holding the slide text, the layout/master text, the comments and the notes.
 */
public class Pptx extends ParserAbstract {

	final protected static ParserField TITLE = ParserField.newString("title",
			"The title of the document");

	final protected static ParserField CREATOR = ParserField.newString(
			"creator", "The name of the creator");

	final protected static ParserField DESCRIPTION = ParserField.newString(
			"description", null);

	final protected static ParserField KEYWORDS = ParserField.newString(
			"keywords", null);

	final protected static ParserField SUBJECT = ParserField.newString(
			"subject", "The subject of the document");

	final protected static ParserField CREATION_DATE = ParserField.newDate(
			"creation_date", null);

	final protected static ParserField MODIFICATION_DATE = ParserField.newDate(
			"modification_date", null);

	final protected static ParserField SLIDES = ParserField.newString("slides",
			null);

	final protected static ParserField MASTER = ParserField.newString("master",
			null);

	final protected static ParserField NOTES = ParserField.newString("notes",
			null);

	final protected static ParserField COMMENTS = ParserField.newString(
			"comments", null);

	final protected static ParserField LANG_DETECTION = ParserField.newString(
			"lang_detection", "Detection of the language");

	/** Every field this parser may populate, as returned by {@link #getFields()}. */
	final protected static ParserField[] FIELDS = { TITLE, CREATOR,
			DESCRIPTION, KEYWORDS, SUBJECT, CREATION_DATE, MODIFICATION_DATE,
			SLIDES, MASTER, NOTES, COMMENTS, LANG_DETECTION };

	public Pptx() {
	}

	/**
	 * This parser declares no parameters.
	 * 
	 * @return always {@code null}
	 */
	@Override
	protected ParserField[] getParameters() {
		return null;
	}

	/**
	 * @return the fields this parser may populate
	 */
	@Override
	protected ParserField[] getFields() {
		return FIELDS;
	}

	/**
	 * Parses a presentation from a stream by first copying it to a temporary
	 * file, which is deleted once parsing has finished (successfully or not).
	 * 
	 * @param inputStream
	 *            the content of the .pptx document
	 * @throws Exception
	 *             if the temporary file cannot be created or parsing fails
	 */
	@Override
	protected void parseContent(InputStream inputStream) throws Exception {
		File tempFile = ParserAbstract.createTempFile(inputStream, "pptx");
		try {
			parseContent(tempFile);
		} finally {
			tempFile.delete();
		}
	}

	/**
	 * Parses a presentation file: extracts the core properties as metadata,
	 * then the content of each slide.
	 * 
	 * @param file
	 *            the .pptx file to parse
	 * @throws Exception
	 *             if the file cannot be opened or read by POI
	 */
	@Override
	protected void parseContent(File file) throws Exception {

		XSLFSlideShow pptSlideShow = new XSLFSlideShow(file.getAbsolutePath());

		XSLFPowerPointExtractor poiExtractor = null;
		try {
			XMLSlideShow slideshow = new XMLSlideShow(pptSlideShow.getPackage());

			// Extract metadata
			poiExtractor = new XSLFPowerPointExtractor(slideshow);
			CoreProperties info = poiExtractor.getCoreProperties();
			if (info != null) {
				metas.add(TITLE, info.getTitle());
				metas.add(CREATOR, info.getCreator());
				metas.add(SUBJECT, info.getSubject());
				metas.add(DESCRIPTION, info.getDescription());
				metas.add(KEYWORDS, info.getKeywords());
				metas.add(CREATION_DATE, info.getCreated());
				metas.add(MODIFICATION_DATE, info.getModified());
			}

			// Read the slides while the underlying resources are still open
			extractSlides(slideshow);
		} finally {
			if (poiExtractor != null) {
				poiExtractor.close();
			} else {
				// The extractor was never created, so nothing else will
				// release the package: close it without saving anything.
				pptSlideShow.getPackage().revert();
			}
		}
	}

	/**
	 * Derived from XSLFPowerPointExtractor.java
	 * <p>
	 * Concatenates the text of every paragraph found in the given slide data,
	 * one paragraph per line.
	 * 
	 * @param data
	 *            the common slide data of a slide, layout, master or notes
	 * @param skipPlaceholders
	 *            if {@code true}, placeholder text which has not been
	 *            customised is ignored
	 * @return the extracted text, each paragraph followed by a new line
	 */
	private String extractText(XSLFCommonSlideData data,
			boolean skipPlaceholders) {
		StringBuilder sb = new StringBuilder();
		for (DrawingTextBody textBody : data.getDrawingText()) {
			if (skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
				DrawingTextPlaceholder ph = (DrawingTextPlaceholder) textBody;
				if (!ph.isPlaceholderCustom()) {
					// Skip non-customised placeholder text
					continue;
				}
			}

			for (DrawingParagraph p : textBody.getParagraphs()) {
				sb.append(p.getText());
				sb.append("\n");
			}
		}
		return sb.toString();
	}

	/**
	 * Derived from XSLFPowerPointExtractor.java
	 * <p>
	 * Creates one {@link ParserDocument} per slide and fills it with the
	 * slide text, the layout and master text, the comments and the notes.
	 * 
	 * @param slideshow
	 *            the presentation to read the slides from
	 */
	private void extractSlides(XMLSlideShow slideshow) {

		XSLFSlide[] slides = (XSLFSlide[]) slideshow.getSlides();
		XSLFCommentAuthors commentAuthors = slideshow.getCommentAuthors();

		for (XSLFSlide slide : slides) {

			// One document per slide
			ParserDocument result = getNewParserDocument();

			XSLFNotes notes = slide.getNotes();
			XSLFComments comments = slide.getComments();
			XSLFSlideLayout layout = slide.getSlideLayout();
			// Only ask for the master when there is a layout to ask it from
			XSLFSlideMaster master = layout == null ? null : layout
					.getSlideMaster();

			// TODO Do the slide's name
			// (Stored in docProps/app.xml)

			// Do the slide's text
			result.add(SLIDES, extractText(slide.getCommonSlideData(), false));
			result.add(LANG_DETECTION, languageDetection(SLIDES, 10000));

			// Get text from the layout and from its master
			if (layout != null) {
				result.add(MASTER,
						extractText(layout.getCommonSlideData(), true));
			}
			if (master != null) {
				result.add(MASTER,
						extractText(master.getCommonSlideData(), true));
			}

			// If the slide has comments, do those too
			if (comments != null) {
				for (CTComment comment : comments.getCTCommentsList()
						.getCmList()) {
					String text = comment.getText();
					// Nothing to index; also avoids appending the literal
					// "null" to the builder
					if (text == null || text.length() == 0) {
						continue;
					}

					StringBuilder sbComment = new StringBuilder();
					// Do the author if we can
					if (commentAuthors != null) {
						CTCommentAuthor author = commentAuthors
								.getAuthorById(comment.getAuthorId());
						if (author != null) {
							sbComment.append(author.getName());
							sbComment.append(": ");
						}
					}

					// Then the comment text, with a new line afterwards
					sbComment.append(text);
					sbComment.append("\n");
					result.add(COMMENTS, sbComment.toString());
				}
			}

			// Do the notes, if any
			if (notes != null) {
				result.add(NOTES,
						extractText(notes.getCommonSlideData(), false));
			}
		}
	}
}