All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.sejda.impl.sambox.component.optimization.OptimizationRuler Maven / Gradle / Ivy

/*
 * Created on 03 feb 2016
 * Copyright 2015 by Andrea Vacondio ([email protected]).
 * This file is part of Sejda.
 *
 * Sejda is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Sejda is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with Sejda.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.sejda.impl.sambox.component.optimization;

import static org.sejda.commons.util.RequireUtils.requireNotNullArg;

import java.util.List;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.sejda.model.optimization.OptimizationPolicy;
import org.sejda.sambox.cos.COSBase;
import org.sejda.sambox.cos.COSDictionary;
import org.sejda.sambox.cos.COSName;
import org.sejda.sambox.pdmodel.PDDocument;
import org.sejda.sambox.pdmodel.PDPage;
import org.sejda.sambox.pdmodel.PDPageTree;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Component in charge of deciding whether a document will likely generate split/extract results needing optimization.
 * <p>
 * The decision is driven by the {@link OptimizationPolicy} supplied at construction time: {@code YES} always
 * optimizes, {@code AUTO} inspects the document resources looking for fonts/images that are shared among pages or
 * inherited from intermediate page tree nodes, any other policy never optimizes.
 * 
 * @author Andrea Vacondio
 *
 */
public class OptimizationRuler implements Function<PDDocument, Boolean> {

    private static final Logger LOG = LoggerFactory.getLogger(OptimizationRuler.class);

    private final OptimizationPolicy policy;

    /**
     * @param policy
     *            the policy driving the decision, validated through {@code requireNotNullArg}
     */
    public OptimizationRuler(OptimizationPolicy policy) {
        requireNotNullArg(policy, "Optimization policy cannot be null");
        this.policy = policy;
    }

    /**
     * @param document
     *            the document to inspect, it's accessed only when the policy is {@code AUTO}
     * @return true if the policy is {@code YES}, the outcome of the document inspection if the policy is
     *         {@code AUTO}, false otherwise.
     */
    @Override
    public Boolean apply(PDDocument document) {
        if (policy == OptimizationPolicy.YES) {
            return true;
        }
        if (policy == OptimizationPolicy.AUTO) {
            return willNeedOptimization(document);
        }
        return false;
    }

    /**
     * @param document
     * @return true if any of the heuristics (shared XObject dictionaries, shared font dictionaries, inherited
     *         resources) matches. Checks are short-circuited so the later ones run only if the earlier ones fail.
     */
    private boolean willNeedOptimization(PDDocument document) {
        return hasSharedXObjectDictionaries(document) || hasSharedFontDictionaries(document)
                || hasInheritedResources(document);
    }

    /**
     * @param document
     * @return true if the document page tree has non leaf nodes with fonts or images resources, inherited by page leaves.
     */
    private boolean hasInheritedResources(PDDocument document) {
        // we take all the resource dictionaries in non-leaf nodes (i.e. inherited by pages) and count fonts and the xobjects of subtype Image, so basically we try to determine if
        // pages are going to inherit images or fonts, potentially unused in which case we want to optimize
        List<COSDictionary> resources = document.getPages().streamNodes().filter(PDPageTree::isPageTreeNode)
                .map(d -> d.getDictionaryObject(COSName.RESOURCES, COSDictionary.class)).filter(Objects::nonNull)
                .distinct().collect(Collectors.toList());

        // xobjects entries that resolve to a dictionary whose subtype is Image
        long inheritedImage = resources.stream().map(d -> d.getDictionaryObject(COSName.XOBJECT, COSDictionary.class))
                .filter(Objects::nonNull).flatMap(d -> d.getValues().stream()).map(COSBase::getCOSObject)
                .filter(d -> d instanceof COSDictionary).map(d -> (COSDictionary) d)
                .map(d -> d.getNameAsString(COSName.SUBTYPE)).filter(Objects::nonNull)
                .filter(COSName.IMAGE.getName()::equals).count();
        // font entries that resolve to a dictionary
        long inheritedFonts = resources.stream().map(d -> d.getDictionaryObject(COSName.FONT, COSDictionary.class))
                .filter(Objects::nonNull).flatMap(d -> d.getValues().stream()).map(COSBase::getCOSObject)
                .filter(d -> d instanceof COSDictionary).count();
        LOG.debug("Found {} inherited images and {} inherited fonts potentially unused", inheritedImage,
                inheritedFonts);
        return (inheritedImage + inheritedFonts) > 0;
    }

    /**
     * @param document
     * @return true if two or more pages have equal, non empty, XObject name dictionaries in their own resource dictionary.
     */
    private boolean hasSharedXObjectDictionaries(PDDocument document) {
        // we get from all the pages resource dictionaries, all the non empty xobject name dictionaries
        List<COSDictionary> xobjectsDictionaries = document.getPages().stream().map(PDPage::getCOSObject)
                .filter(Objects::nonNull).map(d -> d.getDictionaryObject(COSName.RESOURCES, COSDictionary.class))
                .filter(Objects::nonNull).map(d -> d.getDictionaryObject(COSName.XOBJECT, COSDictionary.class))
                .filter(Objects::nonNull).filter(x -> x.size() > 0).collect(Collectors.toList());
        long distinctXobjectsDictionaries = xobjectsDictionaries.stream().distinct().count();
        if (xobjectsDictionaries.size() > distinctXobjectsDictionaries) {
            // if the distinct count is different it means one or more xobject name dictionary is shared among some pages so it likely contains images used by multiple pages so we
            // optimize
            LOG.debug("Found shared XObject dictionary containing image resouces");
            return true;
        }
        return false;
    }

    /**
     * @param document
     * @return true if two or more pages have equal font name dictionaries, containing at least one entry resolving to a dictionary, in their own resource dictionary.
     */
    private boolean hasSharedFontDictionaries(PDDocument document) {
        // we get from all the pages resource dictionaries, all the font name dictionaries having at least one value that resolves to a dictionary
        List<COSDictionary> fontDictionaries = document.getPages().stream().map(PDPage::getCOSObject)
                .filter(Objects::nonNull).map(d -> d.getDictionaryObject(COSName.RESOURCES, COSDictionary.class))
                .filter(Objects::nonNull).map(d -> d.getDictionaryObject(COSName.FONT, COSDictionary.class))
                .filter(Objects::nonNull)
                .filter(x -> x.getValues().stream().map(COSBase::getCOSObject)
                        .anyMatch(v -> v instanceof COSDictionary))
                .collect(Collectors.toList());
        long distinctFontDictionaries = fontDictionaries.stream().distinct().count();
        if (fontDictionaries.size() > distinctFontDictionaries) {
            // if the distinct count is different it means one or more font name dictionaries is shared among pages so it likely contains fonts used by multiple pages so we
            // optimize
            LOG.debug("Found shared font dictionaries");
            return true;
        }
        return false;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy