All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.sejda.impl.sambox.component.optimization.OptimizationRuler Maven / Gradle / Ivy

There is a newer version: 5.1.6
Show newest version
/*
 * Created on 03 feb 2016
 * Copyright 2015 by Andrea Vacondio ([email protected]).
 * This file is part of Sejda.
 *
 * Sejda is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Sejda is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with Sejda.  If not, see .
 */
package org.sejda.impl.sambox.component.optimization;

import static org.sejda.commons.util.RequireUtils.requireNotNullArg;

import java.util.List;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.sejda.model.optimization.OptimizationPolicy;
import org.sejda.sambox.cos.COSBase;
import org.sejda.sambox.cos.COSDictionary;
import org.sejda.sambox.cos.COSName;
import org.sejda.sambox.pdmodel.PDDocument;
import org.sejda.sambox.pdmodel.PDPage;
import org.sejda.sambox.pdmodel.PDPageTree;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Component in charge to decide if a document will likely generate split/extract results needing optimization
 * 
 * @author Andrea Vacondio
 *
 */
public class OptimizationRuler implements Function {

    private static final Logger LOG = LoggerFactory.getLogger(OptimizationRuler.class);

    private OptimizationPolicy policy;

    public OptimizationRuler(OptimizationPolicy policy) {
        requireNotNullArg(policy, "Optimization policy cannot be null");
        this.policy = policy;
    }

    @Override
    public Boolean apply(PDDocument document) {
        if (policy == OptimizationPolicy.YES) {
            return true;
        }
        if (policy == OptimizationPolicy.AUTO) {
            return willNeedOptimization(document);
        }
        return false;
    }

    private boolean willNeedOptimization(PDDocument document) {
        return hasSharedNameDictionaries(document) || hasInheritedResources(document);
    }

    /**
     * @param document
     * @return true if the document page tree has non leaf nodes with fonts or images resources, inherited by page leaves.
     */
    private boolean hasInheritedResources(PDDocument document) {
        // we take all the resource dictionaries in non-leaf nodes (i.e. inherited by pages) and count fonts and the xobjects of subtype Image, so basically we try to determine if
        // pages are going to inherit images or fonts, potentially unused in which case we want to optimize
        List resources = document.getPages().streamNodes().filter(PDPageTree::isPageTreeNode)
                .map(d -> d.getDictionaryObject(COSName.RESOURCES, COSDictionary.class)).filter(Objects::nonNull)
                .distinct().toList();

        long inheritedImage = resources.stream().map(d -> d.getDictionaryObject(COSName.XOBJECT, COSDictionary.class))
                .filter(Objects::nonNull).flatMap(d -> d.getValues().stream()).map(COSBase::getCOSObject)
                .filter(d -> d instanceof COSDictionary).map(d -> (COSDictionary) d)
                .map(d -> d.getNameAsString(COSName.SUBTYPE)).filter(Objects::nonNull)
                .filter(COSName.IMAGE.getName()::equals).count();
        long inheritedFonts = resources.stream().map(d -> d.getDictionaryObject(COSName.FONT, COSDictionary.class))
                .filter(Objects::nonNull).flatMap(d -> d.getValues().stream()).map(COSBase::getCOSObject)
                .filter(d -> d instanceof COSDictionary).count();
        long inheritedExtGState = resources.stream()
                .map(d -> d.getDictionaryObject(COSName.EXT_G_STATE, COSDictionary.class)).filter(Objects::nonNull)
                .flatMap(d -> d.getValues().stream()).map(COSBase::getCOSObject).filter(d -> d instanceof COSDictionary)
                .count();
        LOG.debug("Found {} inherited images, {} inherited fonts and {} inherited graphic states potentially unused",
                inheritedImage, inheritedFonts, inheritedExtGState);
        return (inheritedImage + inheritedFonts + inheritedExtGState) > 0;
    }

    /**
     * 
     * @param document
     * @return true if we detect optimizable name resource dictionaries that are shared among pages. Being shared they likely contains resources used by multiple pages so we need
     *         to optimize in case of split/extract to avoid carrying unused resources to the extracted pages
     */
    private boolean hasSharedNameDictionaries(PDDocument document) {
        // we get from all the pages resource dictionaries, all the optimizable resources name dictionaries (fonts, xobject, extgstate)
        List optimizableDictionaries = document.getPages().stream().map(PDPage::getCOSObject)
                .filter(Objects::nonNull).map(d -> d.getDictionaryObject(COSName.RESOURCES, COSDictionary.class))
                .filter(Objects::nonNull).flatMap(
                        d -> Stream.of(d.getDictionaryObject(COSName.EXT_G_STATE, COSDictionary.class),
                                d.getDictionaryObject(COSName.XOBJECT, COSDictionary.class),
                                d.getDictionaryObject(COSName.FONT, COSDictionary.class))).filter(Objects::nonNull)
                .filter(x -> x.size() > 0).toList();
        long distinctFontDictionaries = optimizableDictionaries.stream().distinct().count();
        if (optimizableDictionaries.size() > distinctFontDictionaries) {
            // if the distinct count is different it means one or more name dictionaries is shared among pages
            LOG.debug("Found shared named dictionaries");
            return true;
        }
        return false;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy