All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.poi.sl.extractor.SlideShowExtractor Maven / Gradle / Ivy

Go to download

The Apache Commons Codec package contains simple encoder and decoders for various formats such as Base64 and Hexadecimal. In addition to these widely used encoders and decoders, the codec package also maintains a collection of phonetic encoding utilities.

There is a newer version: 62
Show newest version
/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

package org.apache.poi.sl.extractor;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.LinkedList;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Predicate;

import com.zaxxer.sparsebits.SparseBitSet;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.sl.usermodel.MasterSheet;
import org.apache.poi.sl.usermodel.Notes;
import org.apache.poi.sl.usermodel.ObjectShape;
import org.apache.poi.sl.usermodel.Placeholder;
import org.apache.poi.sl.usermodel.PlaceholderDetails;
import org.apache.poi.sl.usermodel.Shape;
import org.apache.poi.sl.usermodel.ShapeContainer;
import org.apache.poi.sl.usermodel.Sheet;
import org.apache.poi.sl.usermodel.Slide;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.TableCell;
import org.apache.poi.sl.usermodel.TableShape;
import org.apache.poi.sl.usermodel.TextParagraph;
import org.apache.poi.sl.usermodel.TextRun;
import org.apache.poi.sl.usermodel.TextShape;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;

/**
 * Common SlideShow extractor
 *
 * @since POI 4.0.0
 */
public class SlideShowExtractor<
    S extends Shape,
    P extends TextParagraph
> implements POITextExtractor {
    private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class);

    // placeholder text for slide numbers
    private static final String SLIDE_NUMBER_PH = "‹#›";


    protected final SlideShow slideshow;

    private boolean slidesByDefault = true;
    private boolean notesByDefault;
    private boolean commentsByDefault;
    private boolean masterByDefault;

    private Predicate filter = o -> true;
    private boolean doCloseFilesystem = true;

    public SlideShowExtractor(final SlideShow slideshow) {
        this.slideshow = slideshow;
    }

    /**
     * Returns opened document
     *
     * @return the opened document
     */
    @Override
    public SlideShow getDocument() {
        return slideshow;
    }

    /**
     * Should a call to getText() return slide text? Default is yes
     */
    public void setSlidesByDefault(final boolean slidesByDefault) {
        this.slidesByDefault = slidesByDefault;
    }

    /**
     * Should a call to getText() return notes text? Default is no
     */
    public void setNotesByDefault(final boolean notesByDefault) {
        this.notesByDefault = notesByDefault;
    }

    /**
     * Should a call to getText() return comments text? Default is no
     */
    public void setCommentsByDefault(final boolean commentsByDefault) {
        this.commentsByDefault = commentsByDefault;
    }

    /**
     * Should a call to getText() return text from master? Default is no
     */
    public void setMasterByDefault(final boolean masterByDefault) {
        this.masterByDefault = masterByDefault;
    }

    @Override
    public POITextExtractor getMetadataTextExtractor() {
        return slideshow.getMetadataTextExtractor();
    }

    /**
     * Fetches all the slide text from the slideshow, but not the notes, unless
     * you've called setSlidesByDefault() and setNotesByDefault() to change this
     */
    @Override
    public String getText() {
        final StringBuilder sb = new StringBuilder();
        for (final Slide slide : slideshow.getSlides()) {
            getText(slide, sb::append);
        }

        return sb.toString();
    }

    public String getText(final Slide slide) {
        final StringBuilder sb = new StringBuilder();
        getText(slide, sb::append);
        return sb.toString();
    }


    private void getText(final Slide slide, final Consumer consumer) {
        if (slidesByDefault) {
            printShapeText(slide, consumer);
        }

        if (masterByDefault) {
            final MasterSheet ms = slide.getMasterSheet();
            printSlideMaster(ms, consumer);

            // only print slide layout, if it's a different instance
            final MasterSheet sl = slide.getSlideLayout();
            if (sl != ms) {
                printSlideMaster(sl, consumer);
            }
        }

        if (commentsByDefault) {
            printComments(slide, consumer);
        }

        if (notesByDefault) {
            printNotes(slide, consumer);
        }
    }

    private void printSlideMaster(final MasterSheet master, final Consumer consumer) {
        if (master == null) {
            return;
        }
        for (final Shape shape : master) {
            if (shape instanceof TextShape) {
                final TextShape ts = (TextShape)shape;
                final String text = ts.getText();
                if (text == null || text.isEmpty() || "*".equals(text)) {
                    continue;
                }

                if (ts.isPlaceholder()) {
                    // don't bother about boiler plate text on master sheets
                    LOG.log(POILogger.INFO, "Ignoring boiler plate (placeholder) text on slide master:", text);
                    continue;
                }

                printTextParagraphs(ts.getTextParagraphs(), consumer);
            }
        }
    }

    private void printTextParagraphs(final List

paras, final Consumer consumer) { printTextParagraphs(paras, consumer, "\n"); } private void printTextParagraphs(final List

paras, final Consumer consumer, String trailer) { printTextParagraphs(paras, consumer, trailer, SlideShowExtractor::replaceTextCap); } private void printTextParagraphs(final List

paras, final Consumer consumer, String trailer, final Function converter) { for (P p : paras) { for (TextRun r : p) { if (filter.test(r)) { consumer.accept(converter.apply(r)); } } if (!trailer.isEmpty() && filter.test(trailer)) { consumer.accept(trailer); } } } private void printHeaderFooter(final Sheet sheet, final Consumer consumer, final Consumer footerCon) { final Sheet m = (sheet instanceof Slide) ? sheet.getMasterSheet() : sheet; addSheetPlaceholderDatails(sheet, Placeholder.HEADER, consumer); addSheetPlaceholderDatails(sheet, Placeholder.FOOTER, footerCon); if (!masterByDefault) { return; } // write header texts and determine footer text for (Shape s : m) { if (!(s instanceof TextShape)) { continue; } final TextShape ts = (TextShape) s; final PlaceholderDetails pd = ts.getPlaceholderDetails(); if (pd == null || !pd.isVisible() || pd.getPlaceholder() == null) { continue; } switch (pd.getPlaceholder()) { case HEADER: printTextParagraphs(ts.getTextParagraphs(), consumer); break; case FOOTER: printTextParagraphs(ts.getTextParagraphs(), footerCon); break; case SLIDE_NUMBER: printTextParagraphs(ts.getTextParagraphs(), footerCon, "\n", SlideShowExtractor::replaceSlideNumber); break; case DATETIME: // currently not supported default: break; } } } private void addSheetPlaceholderDatails(final Sheet sheet, final Placeholder placeholder, final Consumer consumer) { final PlaceholderDetails headerPD = sheet.getPlaceholderDetails(placeholder); final String headerStr = (headerPD != null) ? headerPD.getText() : null; if (headerStr != null && filter.test(headerPD)) { consumer.accept(headerStr); } } private void printShapeText(final Sheet sheet, final Consumer consumer) { final List footer = new LinkedList<>(); printHeaderFooter(sheet, consumer, footer::add); printShapeText((ShapeContainer)sheet, consumer); footer.forEach(consumer); } @SuppressWarnings("unchecked") private void printShapeText(final ShapeContainer container, final Consumer consumer) { for (Shape shape : container) { if (shape instanceof TextShape) { printTextParagraphs(((TextShape)shape).getTextParagraphs(), consumer); } else if (shape instanceof TableShape) { printShapeText((TableShape)shape, consumer); } else if (shape instanceof ShapeContainer) { printShapeText((ShapeContainer)shape, consumer); } } } @SuppressWarnings("Duplicates") private void printShapeText(final TableShape shape, final Consumer consumer) { final int nrows = shape.getNumberOfRows(); final int ncols = shape.getNumberOfColumns(); for (int row = 0; row < nrows; row++) { String trailer = ""; for (int col = 0; col < ncols; col++){ TableCell cell = shape.getCell(row, col); //defensive null checks; don't know if they're necessary if (cell != null) { trailer = col < ncols-1 ? "\t" : "\n"; printTextParagraphs(cell.getTextParagraphs(), consumer, trailer); } } if (!trailer.equals("\n") && filter.test("\n")) { consumer.accept("\n"); } } } private void printComments(final Slide slide, final Consumer consumer) { slide.getComments().stream().filter(filter).map(c -> c.getAuthor()+" - "+c.getText()).forEach(consumer); } private void printNotes(final Slide slide, final Consumer consumer) { final Notes notes = slide.getNotes(); if (notes == null) { return; } List footer = new LinkedList<>(); printHeaderFooter(notes, consumer, footer::add); printShapeText(notes, consumer); footer.forEach(consumer); } public List> getOLEShapes() { final List> oleShapes = new ArrayList<>(); for (final Slide slide : slideshow.getSlides()) { addOLEShapes(oleShapes, slide); } return oleShapes; } @SuppressWarnings("unchecked") private void addOLEShapes(final List> oleShapes, ShapeContainer container) { for (Shape shape : container) { if (shape instanceof ShapeContainer) { addOLEShapes(oleShapes, (ShapeContainer)shape); } else if (shape instanceof ObjectShape) { oleShapes.add((ObjectShape)shape); } } } private static String replaceSlideNumber(TextRun tr) { String raw = tr.getRawText(); if (!raw.contains(SLIDE_NUMBER_PH)) { return raw; } TextParagraph tp = tr.getParagraph(); TextShape ps = (tp != null) ? tp.getParentShape() : null; Sheet sh = (ps != null) ? ps.getSheet() : null; String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide)sh).getSlideNumber() + 1) : ""; return raw.replace(SLIDE_NUMBER_PH, slideNr); } private static String replaceTextCap(TextRun tr) { final TextParagraph tp = tr.getParagraph(); final TextShape sh = (tp != null) ? tp.getParentShape() : null; final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null; // 0xB acts like cariage return in page titles and like blank in the others final char sep = ( ph == Placeholder.TITLE || ph == Placeholder.CENTERED_TITLE || ph == Placeholder.SUBTITLE ) ? '\n' : ' '; // PowerPoint seems to store files with \r as the line break // The messes things up on everything but a Mac, so translate them to \n String txt = tr.getRawText(); txt = txt.replace('\r', '\n'); txt = txt.replace((char) 0x0B, sep); switch (tr.getTextCap()) { case ALL: txt = txt.toUpperCase(LocaleUtil.getUserLocale()); break; case SMALL: txt = txt.toLowerCase(LocaleUtil.getUserLocale()); break; } return txt; } /** * Extract the used codepoints for font embedding / subsetting * @param typeface the typeface/font family of the textruns to examine * @param italic use {@code true} for italic TextRuns, {@code false} for non-italic ones and * {@code null} if it doesn't matter * @param bold use {@code true} for bold TextRuns, {@code false} for non-bold ones and * {@code null} if it doesn't matter * @return a bitset with the marked/used codepoints * @deprecated use {@link #getCodepointsInSparseBitSet(String, Boolean, Boolean)} */ @Deprecated public BitSet getCodepoints(String typeface, Boolean italic, Boolean bold) { final BitSet glyphs = new BitSet(); Predicate filterOld = filter; try { filter = o -> filterFonts(o, typeface, italic, bold); slideshow.getSlides().forEach(slide -> getText(slide, s -> s.codePoints().forEach(glyphs::set)) ); } finally { filter = filterOld; } return glyphs; } /** * Extract the used codepoints for font embedding / subsetting * @param typeface the typeface/font family of the textruns to examine * @param italic use {@code true} for italic TextRuns, {@code false} for non-italic ones and * {@code null} if it doesn't matter * @param bold use {@code true} for bold TextRuns, {@code false} for non-bold ones and * {@code null} if it doesn't matter * @return a bitset with the marked/used codepoints */ public SparseBitSet getCodepointsInSparseBitSet(String typeface, Boolean italic, Boolean bold) { final SparseBitSet glyphs = new SparseBitSet(); Predicate filterOld = filter; try { filter = o -> filterFonts(o, typeface, italic, bold); slideshow.getSlides().forEach(slide -> getText(slide, s -> s.codePoints().forEach(glyphs::set)) ); } finally { filter = filterOld; } return glyphs; } private static boolean filterFonts(Object o, String typeface, Boolean italic, Boolean bold) { if (!(o instanceof TextRun)) { return false; } TextRun tr = (TextRun)o; return typeface.equalsIgnoreCase(tr.getFontFamily()) && (italic == null || tr.isItalic() == italic) && (bold == null || tr.isBold() == bold); } @Override public void setCloseFilesystem(boolean doCloseFilesystem) { this.doCloseFilesystem = doCloseFilesystem; } @Override public boolean isCloseFilesystem() { return doCloseFilesystem; } @Override public SlideShow getFilesystem() { return getDocument(); } }