All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.itextpdf.kernel.pdf.canvas.parser.util.InlineImageParsingUtils Maven / Gradle / Ivy

There is a newer version: 9.0.0
Show newest version
/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2024 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.kernel.pdf.canvas.parser.util;

import com.itextpdf.io.source.PdfTokenizer;
import com.itextpdf.kernel.exceptions.PdfException;
import com.itextpdf.kernel.exceptions.KernelExceptionMessageConstant;
import com.itextpdf.kernel.pdf.PdfArray;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfNumber;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfStream;
import com.itextpdf.kernel.pdf.filters.DoNothingFilter;
import com.itextpdf.kernel.pdf.filters.FilterHandlers;
import com.itextpdf.kernel.pdf.filters.IFilterHandler;
import com.itextpdf.kernel.pdf.filters.FlateDecodeStrictFilter;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Utility methods to help with processing of inline images
 */
public final class InlineImageParsingUtils {

    /**
     * The two ASCII bytes of the {@code EI} operator. While scanning filtered image data,
     * a prefix of this array is written back to the output whenever a tentative match of
     * the operator turns out to be part of the image data.
     */
    private static final byte[] EI = {'E', 'I'};

    /**
     * Private constructor: this class only exposes static helpers.
     */
    private InlineImageParsingUtils() {
    }

    /**
     * Exception type dedicated to failures that occur while processing inline images,
     * so that callers can tell them apart from other {@link PdfException}s.
     */
    public static class InlineImageParseException extends PdfException {

        /**
         * Creates the exception with the given message.
         *
         * @param message the detail message, handed unchanged to the {@link PdfException} constructor
         */
        public InlineImageParseException(String message) {
            super(message);
        }
    }

    /**
     * Map between key abbreviations allowed in dictionary of inline images and their
     * equivalent image dictionary keys. Full key names are mapped to themselves.
     */
    private static final Map<PdfName, PdfName> inlineImageEntryAbbreviationMap;
    /**
     * Map between value abbreviations allowed in dictionary of inline images for COLORSPACE
     * and the corresponding full color space names.
     */
    private static final Map<PdfName, PdfName> inlineImageColorSpaceAbbreviationMap;
    /**
     * Map between value abbreviations allowed in dictionary of inline images for FILTER
     * and the corresponding full filter names.
     */
    private static final Map<PdfName, PdfName> inlineImageFilterAbbreviationMap;

    // Populates the three abbreviation lookup tables once, when the class is initialized.
    static {
        // Map between key abbreviations allowed in dictionary of inline images and their
        // equivalent image dictionary keys
        inlineImageEntryAbbreviationMap = new HashMap<>();

        // allowed entries - just pass these through
        inlineImageEntryAbbreviationMap.put(PdfName.BitsPerComponent, PdfName.BitsPerComponent);
        inlineImageEntryAbbreviationMap.put(PdfName.ColorSpace, PdfName.ColorSpace);
        inlineImageEntryAbbreviationMap.put(PdfName.Decode, PdfName.Decode);
        inlineImageEntryAbbreviationMap.put(PdfName.DecodeParms, PdfName.DecodeParms);
        inlineImageEntryAbbreviationMap.put(PdfName.Filter, PdfName.Filter);
        inlineImageEntryAbbreviationMap.put(PdfName.Height, PdfName.Height);
        inlineImageEntryAbbreviationMap.put(PdfName.ImageMask, PdfName.ImageMask);
        inlineImageEntryAbbreviationMap.put(PdfName.Intent, PdfName.Intent);
        inlineImageEntryAbbreviationMap.put(PdfName.Interpolate, PdfName.Interpolate);
        inlineImageEntryAbbreviationMap.put(PdfName.Width, PdfName.Width);

        // abbreviations - transform these to corresponding correct values
        inlineImageEntryAbbreviationMap.put(new PdfName("BPC"), PdfName.BitsPerComponent);
        inlineImageEntryAbbreviationMap.put(new PdfName("CS"), PdfName.ColorSpace);
        inlineImageEntryAbbreviationMap.put(new PdfName("D"), PdfName.Decode);
        inlineImageEntryAbbreviationMap.put(new PdfName("DP"), PdfName.DecodeParms);
        inlineImageEntryAbbreviationMap.put(new PdfName("F"), PdfName.Filter);
        inlineImageEntryAbbreviationMap.put(new PdfName("H"), PdfName.Height);
        inlineImageEntryAbbreviationMap.put(new PdfName("IM"), PdfName.ImageMask);
        inlineImageEntryAbbreviationMap.put(new PdfName("I"), PdfName.Interpolate);
        inlineImageEntryAbbreviationMap.put(new PdfName("W"), PdfName.Width);

        // Map between value abbreviations allowed in dictionary of inline images for COLORSPACE
        inlineImageColorSpaceAbbreviationMap = new HashMap<>();

        inlineImageColorSpaceAbbreviationMap.put(new PdfName("G"), PdfName.DeviceGray);
        inlineImageColorSpaceAbbreviationMap.put(new PdfName("RGB"), PdfName.DeviceRGB);
        inlineImageColorSpaceAbbreviationMap.put(new PdfName("CMYK"), PdfName.DeviceCMYK);
        inlineImageColorSpaceAbbreviationMap.put(new PdfName("I"), PdfName.Indexed);

        // Map between value abbreviations allowed in dictionary of inline images for FILTER
        // (diamond instead of a raw HashMap, consistent with the two maps above)
        inlineImageFilterAbbreviationMap = new HashMap<>();

        inlineImageFilterAbbreviationMap.put(new PdfName("AHx"), PdfName.ASCIIHexDecode);
        inlineImageFilterAbbreviationMap.put(new PdfName("A85"), PdfName.ASCII85Decode);
        inlineImageFilterAbbreviationMap.put(new PdfName("LZW"), PdfName.LZWDecode);
        inlineImageFilterAbbreviationMap.put(new PdfName("Fl"), PdfName.FlateDecode);
        inlineImageFilterAbbreviationMap.put(new PdfName("RL"), PdfName.RunLengthDecode);
        inlineImageFilterAbbreviationMap.put(new PdfName("CCF"), PdfName.CCITTFaxDecode);
        inlineImageFilterAbbreviationMap.put(new PdfName("DCT"), PdfName.DCTDecode);
    }

    /**
     * Parses an inline image from the provided content parser. The parser must be positioned immediately
     * following the BI operator in the content stream, and is left positioned immediately following the
     * EI operator that terminates the inline image.
     * <p>
     * The image dictionary is read first, then the image samples; the result is a stream object holding
     * the samples together with all entries of the (abbreviation-expanded) dictionary.
     *
     * @param ps            the content parser to use for reading the image.
     * @param colorSpaceDic a color space dictionary
     * @return the parsed image
     * @throws IOException               if anything goes wrong with the parsing
     * @throws InlineImageParseException if parsing of the inline image failed due to issues specific to inline image processing
     */
    public static PdfStream parse(PdfCanvasParser ps, PdfDictionary colorSpaceDic) throws IOException {
        PdfDictionary imageDictionary = parseDictionary(ps);
        PdfStream imageStream = new PdfStream(parseSamples(imageDictionary, colorSpaceDic, ps));
        imageStream.putAll(imageDictionary);
        return imageStream;
    }

    /**
     * Determines the number of color components per pixel for a color space.
     * <p>
     * The device color spaces are answered directly. Any other name is looked up in
     * {@code colorSpaceDic}: a name entry is resolved recursively, an array entry is accepted
     * when its first element is {@code Indexed} (1 component) or {@code ICCBased} (the {@code N}
     * entry of the stream at index 1).
     *
     * @param colorSpaceName the name of the color space. If null, a bi-tonal (black and white) color space is assumed.
     * @param colorSpaceDic  the dictionary in which non-device color space names are looked up; may be null
     * @return the components per pixel for the specified color space
     * @throws InlineImageParseException if the color space cannot be resolved, including an ICCBased
     *                                   array that lacks its stream or the stream's {@code N} entry
     */
    static int getComponentsPerPixel(PdfName colorSpaceName, PdfDictionary colorSpaceDic) {
        if (colorSpaceName == null) {
            return 1;
        }
        if (colorSpaceName.equals(PdfName.DeviceGray)) {
            return 1;
        }
        if (colorSpaceName.equals(PdfName.DeviceRGB)) {
            return 3;
        }
        if (colorSpaceName.equals(PdfName.DeviceCMYK)) {
            return 4;
        }

        if (colorSpaceDic != null) {
            PdfArray colorSpace = colorSpaceDic.getAsArray(colorSpaceName);
            if (colorSpace == null) {
                PdfName tempName = colorSpaceDic.getAsName(colorSpaceName);
                if (tempName != null) {
                    return getComponentsPerPixel(tempName, colorSpaceDic);
                }
            } else {
                // only touch array positions that exist, so a truncated array ends in the
                // parse exception below rather than an index error
                PdfName family = colorSpace.size() > 0 ? colorSpace.getAsName(0) : null;
                if (PdfName.Indexed.equals(family)) {
                    return 1;
                }
                if (PdfName.ICCBased.equals(family)) {
                    PdfStream profile = colorSpace.size() > 1 ? colorSpace.getAsStream(1) : null;
                    PdfNumber componentCount = profile != null ? profile.getAsNumber(PdfName.N) : null;
                    if (componentCount != null) {
                        return componentCount.intValue();
                    }
                    // malformed ICCBased definition: fall through to the parse exception
                }
            }
        }

        throw new InlineImageParseException(KernelExceptionMessageConstant.UNEXPECTED_COLOR_SPACE).setMessageParams(colorSpaceName);
    }

    /**
     * Reads the inline image dictionary from the parser. The parser must be positioned immediately following
     * the BI operator. Objects are read in key/value pairs until the ID operator (or the end of input) is hit;
     * afterwards one more character is read and must be whitespace.
     * The parser is left positioned immediately following that whitespace character.
     *
     * @param ps the parser to extract the embedded image information from
     * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
     * @throws IOException               if the parse fails
     * @throws InlineImageParseException if the character read after the key/value pairs is not whitespace
     */
    private static PdfDictionary parseDictionary(PdfCanvasParser ps) throws IOException {
        // the BI operator has already been consumed by the time this method is called
        PdfDictionary imageDictionary = new PdfDictionary();

        PdfObject rawKey = ps.readObject();
        while (rawKey != null && !"ID".equals(rawKey.toString())) {
            PdfObject rawValue = ps.readObject();
            PdfName givenKey = (PdfName) rawKey;
            PdfName fullKey = inlineImageEntryAbbreviationMap.get(givenKey);
            if (fullKey == null) {
                // not a known key or abbreviation: keep it as it is
                fullKey = givenKey;
            }
            imageDictionary.put(fullKey, getAlternateValue(fullKey, rawValue));
            rawKey = ps.readObject();
        }

        int afterId = ps.getTokeniser().read();
        if (PdfTokenizer.isWhitespace(afterId)) {
            return imageDictionary;
        }
        throw new InlineImageParseException(
                KernelExceptionMessageConstant.UNEXPECTED_CHARACTER_FOUND_AFTER_ID_IN_INLINE_IMAGE
        ).setMessageParams(afterId);
    }

    /**
     * Transforms value abbreviations into their corresponding real value.
     * <p>
     * Only the {@code Filter} and {@code ColorSpace} keys have value abbreviations. A {@code Filter}
     * value may be a single name or an array; for an array a new array is built with every element
     * expanded individually.
     *
     * @param key   the key that the value is for
     * @param value the value that might be an abbreviation
     * @return if value is an allowed abbreviation for the key, the expanded value for that abbreviation.  Otherwise, value is returned without modification
     */
    private static PdfObject getAlternateValue(PdfName key, PdfObject value) {
        // compare by value: an identity check would silently skip the expansion for any
        // PdfName instance that is equal to, but not the same object as, the constant
        if (PdfName.Filter.equals(key)) {
            if (value instanceof PdfName) {
                PdfName altValue = inlineImageFilterAbbreviationMap.get((PdfName) value);
                if (altValue != null) {
                    return altValue;
                }
            } else if (value instanceof PdfArray) {
                PdfArray array = ((PdfArray) value);
                PdfArray altArray = new PdfArray();
                int count = array.size();
                for (int i = 0; i < count; i++) {
                    altArray.add(getAlternateValue(key, array.get(i)));
                }
                return altArray;
            }
        } else if (PdfName.ColorSpace.equals(key) && value instanceof PdfName) {
            PdfName altValue = inlineImageColorSpaceAbbreviationMap.get((PdfName) value);
            if (altValue != null) {
                return altValue;
            }
        }
        return value;
    }

    /**
     * Computes the number of unfiltered bytes that each row of the image will contain.
     * If the number of bytes results in a partial terminating byte, this number is rounded up
     * per the PDF specification.
     * <p>
     * A missing {@code BitsPerComponent} entry is treated as 1. The row size is computed with
     * overflow-checked arithmetic because all three factors come from the document being parsed.
     *
     * @param imageDictionary the dictionary of the inline image
     * @param colorSpaceDic   the color space dictionary used to resolve the image's color space name
     * @return the number of bytes per row of the image
     * @throws InlineImageParseException if the color space cannot be resolved, if the dictionary has no
     *                                   numeric {@code Width} entry, or if the row size does not fit in an int
     */
    private static int computeBytesPerRow(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic) {
        PdfNumber wObj = imageDictionary.getAsNumber(PdfName.Width);
        PdfNumber bpcObj = imageDictionary.getAsNumber(PdfName.BitsPerComponent);
        int cpp = getComponentsPerPixel(imageDictionary.getAsName(PdfName.ColorSpace), colorSpaceDic);

        if (wObj == null) {
            throw new InlineImageParseException("Inline image dictionary does not contain a numeric Width entry");
        }

        int w = wObj.intValue();
        int bpc = bpcObj != null ? bpcObj.intValue() : 1;

        try {
            long bitsPerRow = Math.multiplyExact(Math.multiplyExact((long) w, (long) bpc), (long) cpp);
            // + 7 rounds a partial trailing byte up
            return Math.toIntExact(Math.addExact(bitsPerRow, 7L) / 8);
        } catch (ArithmeticException ignored) {
            // the exception only signals the overflow; the offending values are reported in the message
            throw new InlineImageParseException("Inline image row size is too large: Width=" + w
                    + ", BitsPerComponent=" + bpc + ", components per pixel=" + cpp);
        }
    }

    /**
     * Parses the samples of the image from the underlying content parser, ignoring all filters.
     * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
     * The parser will be left positioned immediately following the EI operator.
     * This is primarily useful if no filters have been applied.
     * <p>
     * The number of bytes to read is the row size from {@code computeBytesPerRow} multiplied by the
     * {@code Height} entry of the dictionary.
     *
     * @param imageDictionary the dictionary of the inline image
     * @param colorSpaceDic   the color space dictionary used to resolve the image's color space name
     * @param ps              the content parser
     * @return the samples of the image
     * @throws IOException               if anything bad happens during parsing
     * @throws IllegalArgumentException  if the dictionary contains a {@code Filter} entry
     * @throws InlineImageParseException if the dictionary has no numeric {@code Height} entry, the computed data
     *                                   size is negative or does not fit in an int, the content ends before all
     *                                   image bytes were read, or no EI operator follows the image data
     */
    private static byte[] parseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser ps) throws IOException {
        // special case:  when no filter is specified, we just read the number of bits
        // per component, multiplied by the width and height.
        if (imageDictionary.containsKey(PdfName.Filter)) {
            throw new IllegalArgumentException("Dictionary contains filters");
        }

        PdfNumber h = imageDictionary.getAsNumber(PdfName.Height);
        int bytesPerRow = computeBytesPerRow(imageDictionary, colorSpaceDic);
        if (h == null) {
            throw new InlineImageParseException("Inline image dictionary does not contain a numeric Height entry");
        }

        // both factors come from the document, so multiply as long and validate before allocating
        long totalBytes = (long) bytesPerRow * h.intValue();
        if (totalBytes < 0 || totalBytes > Integer.MAX_VALUE) {
            throw new InlineImageParseException("Invalid inline image data size: " + totalBytes);
        }
        int bytesToRead = (int) totalBytes;
        byte[] bytes = new byte[bytesToRead];
        PdfTokenizer tokeniser = ps.getTokeniser();

        // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
        // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
        int startIndex = 0;
        if (bytesToRead > 0) {
            // only look at the next character when there is image data to read; with an empty
            // image there is no slot to store it in and the EI lookup below skips whitespace itself
            int shouldBeWhiteSpace = tokeniser.read();
            if (shouldBeWhiteSpace == -1) {
                throw new InlineImageParseException(
                        KernelExceptionMessageConstant.END_OF_CONTENT_STREAM_REACHED_BEFORE_END_OF_IMAGE_DATA);
            }
            if (!PdfTokenizer.isWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0) {
                // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't
                bytes[0] = (byte) shouldBeWhiteSpace;
                startIndex++;
            }
        }
        for (int i = startIndex; i < bytesToRead; i++) {
            int ch = tokeniser.read();
            if (ch == -1) {
                throw new InlineImageParseException(
                        KernelExceptionMessageConstant.END_OF_CONTENT_STREAM_REACHED_BEFORE_END_OF_IMAGE_DATA);
            }

            bytes[i] = (byte) ch;
        }
        if (!isEndImageOperator(ps.readObject())) {
            // Some PDF producers seem to add another non-whitespace character after the image data.
            // Let's try to handle that case here.
            if (!isEndImageOperator(ps.readObject())) {
                throw new InlineImageParseException(
                        KernelExceptionMessageConstant.OPERATOR_EI_NOT_FOUND_AFTER_END_OF_IMAGE_DATA);
            }
        }
        return bytes;
    }

    /**
     * Tells whether a parsed object is the EI operator; a null object (nothing left to read) is not.
     */
    private static boolean isEndImageOperator(PdfObject token) {
        return token != null && "EI".equals(token.toString());
    }

    /**
     * Parses the samples of the image from the underlying content parser, accounting for filters.
     * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
     * The parser will be left positioned immediately following the EI operator.
     * <p>
     * Images without a {@code Filter} entry and with a known color space are read by length.
     * Otherwise bytes are collected until an {@code EI} followed by whitespace is found and the bytes
     * collected so far pass the completeness check; the bytes are returned still encoded.
     * Note: This implementation does not actually apply the filters at this time.
     *
     * @param imageDictionary the dictionary of the inline image
     * @param colorSpaceDic   the color space dictionary used to resolve the image's color space name
     * @param ps              the content parser
     * @return the samples of the image
     * @throws IOException               if anything bad happens during parsing
     * @throws InlineImageParseException if the input ends before a terminating EI was accepted
     */
    private static byte[] parseSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser ps) throws IOException {
        // the ID operator has already been consumed by the time this method is called
        boolean unfiltered = !imageDictionary.containsKey(PdfName.Filter);
        if (unfiltered && imageColorSpaceIsKnown(imageDictionary, colorSpaceDic)) {
            return parseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
        }

        // collect everything up to an EI operator followed by whitespace, then try to decode
        // the collected bytes to make sure they really are the whole image
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        PdfTokenizer tokeniser = ps.getTokeniser();
        // number of leading bytes of "EI" that were seen but held back from the output
        int pending = 0;
        for (int b = tokeniser.read(); b != -1; b = tokeniser.read()) {
            if (b == 'E') {
                // whatever was held back is image data after all; start holding this 'E'
                collected.write(EI, 0, pending);
                pending = 1;
                continue;
            }
            if (pending == 1 && b == 'I') {
                // hold back the full "EI"
                pending = 2;
                continue;
            }
            if (pending == 2 && PdfTokenizer.isWhitespace(b)) {
                byte[] candidate = collected.toByteArray();
                if (inlineImageStreamBytesAreComplete(candidate, imageDictionary)) {
                    return candidate;
                }
            }
            // no terminator here: flush the held-back bytes followed by the current one
            collected.write(EI, 0, pending);
            collected.write(b);
            pending = 0;
        }
        throw new InlineImageParseException(KernelExceptionMessageConstant.CANNOT_FIND_IMAGE_DATA_OR_EI);
    }

    /**
     * Tells whether the color space of the image can be resolved: either the dictionary names no
     * color space, names one of the device color spaces, or names an entry of the color space dictionary.
     *
     * @param imageDictionary the dictionary of the inline image
     * @param colorSpaceDic   the color space dictionary to look the name up in; may be null
     * @return true if the color space is absent, a device color space, or a key of {@code colorSpaceDic}
     */
    private static boolean imageColorSpaceIsKnown(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic) {
        PdfName colorSpaceName = imageDictionary.getAsName(PdfName.ColorSpace);
        if (colorSpaceName == null) {
            return true;
        }
        boolean deviceColorSpace = colorSpaceName.equals(PdfName.DeviceGray)
                || colorSpaceName.equals(PdfName.DeviceRGB)
                || colorSpaceName.equals(PdfName.DeviceCMYK);
        if (deviceColorSpace) {
            return true;
        }
        if (colorSpaceDic == null) {
            return false;
        }
        return colorSpaceDic.containsKey(colorSpaceName);
    }

    /**
     * This method acts like a check that bytes that were parsed are really all image bytes. If it's true,
     * then decoding will succeed, but if not all image bytes were read and {@code "<ws>EI<ws>"} bytes were
     * just a part of the image, then decoding should fail.
     * Not the best solution, but probably there is no better and more reliable way to check this.
     * <p>
     * The bytes are decoded with the default filter handlers, except that {@code JBIG2Decode} is handled by
     * a {@link DoNothingFilter} and {@code FlateDecode} by a {@link FlateDecodeStrictFilter}.
     * <p>
     * Drawbacks: slow; images with DCTDecode, JBIG2Decode and JPXDecode filters couldn't be checked as iText doesn't
     * support these filters; what if decoding will succeed even though it's not all bytes?; also I'm not sure that all
     * filters throw an exception in case data is corrupted (For example, FlateDecodeFilter seems not to throw an exception).
     *
     * @param samples         the image bytes collected so far
     * @param imageDictionary the dictionary of the inline image, passed to the decoder along with the bytes
     * @return true if decoding finished without throwing, false if any exception was thrown
     */
    private static boolean inlineImageStreamBytesAreComplete(byte[] samples, PdfDictionary imageDictionary) {
        try {
            Map<PdfName, IFilterHandler> filters = new HashMap<>(FilterHandlers.getDefaultFilterHandlers());
            filters.put(PdfName.JBIG2Decode, new DoNothingFilter());
            filters.put(PdfName.FlateDecode, new FlateDecodeStrictFilter());
            PdfReader.decodeBytes(samples, imageDictionary, filters);
        } catch (Exception ex) {
            // deliberately broad: any failure to decode means the collected bytes are not a complete image
            return false;
        }
        return true;
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy