All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pdfbox.pdmodel.font.PDCIDFontType2 Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdmodel.font;

import java.awt.geom.GeneralPath;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.fontbox.cff.Type2CharString;
import org.apache.fontbox.cmap.CMap;
import org.apache.fontbox.ttf.CmapLookup;
import org.apache.fontbox.ttf.GlyphData;
import org.apache.fontbox.ttf.OTFParser;
import org.apache.fontbox.ttf.OpenTypeFont;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.Matrix;

/**
 * Type 2 CIDFont (TrueType).
 * 
 * @author Ben Litchfield
 */
public class PDCIDFontType2 extends PDCIDFont
{
    private static final Log LOG = LogFactory.getLog(PDCIDFontType2.class);

    private final TrueTypeFont ttf;
    private final int[] cid2gid;
    private final boolean isEmbedded;
    private final boolean isDamaged;
    private final CmapLookup cmap; // may be null
    private Matrix fontMatrix;
    private BoundingBox fontBBox;

    /**
     * Constructor.
     * 
     * @param fontDictionary The font dictionary according to the PDF specification.
     * @param parent The parent font.
     * @throws IOException
     */
    public PDCIDFontType2(COSDictionary fontDictionary, PDType0Font parent) throws IOException
    {
        this(fontDictionary, parent, null);
    }
    
    /**
     * Constructor.
     * 
     * @param fontDictionary The font dictionary according to the PDF specification.
     * @param parent The parent font.
     * @param trueTypeFont The true type font used to create the parent font
     * @throws IOException
     */
    public PDCIDFontType2(COSDictionary fontDictionary, PDType0Font parent, TrueTypeFont trueTypeFont) throws IOException
    {
        super(fontDictionary, parent);

        PDFontDescriptor fd = getFontDescriptor();
        if (trueTypeFont != null)
        {
            ttf = trueTypeFont;
            isEmbedded = true;
            isDamaged = false;
        }
        else
        {
            boolean fontIsDamaged = false;
            TrueTypeFont ttfFont = null;

            PDStream stream = null;
            if (fd != null)
            {
                stream = fd.getFontFile2();
                if (stream == null)
                {
                    stream = fd.getFontFile3();
                }
                if (stream == null)
                {
                    // Acrobat looks in FontFile too, even though it is not in the spec, see PDFBOX-2599
                    stream = fd.getFontFile();
                }
            }
            if (stream != null)
            {
                try
                {
                    // embedded OTF or TTF
                    OTFParser otfParser = new OTFParser(true);
                    OpenTypeFont otf = otfParser.parse(stream.createInputStream());
                    ttfFont = otf;
    
                    if (otf.isPostScript())
                    {
                        // PDFBOX-3344 contains PostScript outlines instead of TrueType
                        fontIsDamaged = true;
                        LOG.warn("Found CFF/OTF but expected embedded TTF font " + fd.getFontName());
                    }
    
                    if (otf.hasLayoutTables())
                    {
                        LOG.info("OpenType Layout tables used in font " + getBaseFont() +
                                  " are not implemented in PDFBox and will be ignored");
                    }
                }
                catch (NullPointerException e) // TTF parser is buggy
                {
                    fontIsDamaged = true;
                    LOG.warn("Could not read embedded OTF for font " + getBaseFont(), e);
                }
                catch (IOException e)
                {
                    fontIsDamaged = true;
                    LOG.warn("Could not read embedded OTF for font " + getBaseFont(), e);
                }
            }
            isEmbedded = ttfFont != null;
            isDamaged = fontIsDamaged;
    
            if (ttfFont == null)
            {
                ttfFont = findFontOrSubstitute();
            }
            ttf = ttfFont;
        }
        cmap = ttf.getUnicodeCmapLookup(false);
        cid2gid = readCIDToGIDMap();
    }

    private TrueTypeFont findFontOrSubstitute() throws IOException
    {
        TrueTypeFont ttfFont;

        CIDFontMapping mapping = FontMappers.instance()
                .getCIDFont(getBaseFont(), getFontDescriptor(),
                        getCIDSystemInfo());
        if (mapping.isCIDFont())
        {
            ttfFont = mapping.getFont();
        }
        else
        {
            ttfFont = (TrueTypeFont)mapping.getTrueTypeFont();
        }
        if (mapping.isFallback())
        {
            LOG.warn("Using fallback font " + ttfFont.getName() + " for CID-keyed TrueType font " + getBaseFont());
        }
        return ttfFont;
    }

    @Override
    public Matrix getFontMatrix()
    {
        if (fontMatrix == null)
        {
            // 1000 upem, this is not strictly true
            fontMatrix = new Matrix(0.001f, 0, 0, 0.001f, 0, 0);
        }
        return fontMatrix;
    }

    @Override
    public BoundingBox getBoundingBox() throws IOException
    {
        if (fontBBox == null)
        {
            fontBBox = generateBoundingBox();
        }
        return fontBBox;
    }

    private BoundingBox generateBoundingBox() throws IOException
    {
        if (getFontDescriptor() != null)
        {
            PDRectangle bbox = getFontDescriptor().getFontBoundingBox();
            if (bbox != null &&
                    (Float.compare(bbox.getLowerLeftX(), 0) != 0 || 
                     Float.compare(bbox.getLowerLeftY(), 0) != 0 ||
                     Float.compare(bbox.getUpperRightX(), 0) != 0 ||
                     Float.compare(bbox.getUpperRightY(), 0) != 0))
            {
                return new BoundingBox(bbox.getLowerLeftX(), bbox.getLowerLeftY(),
                                       bbox.getUpperRightX(), bbox.getUpperRightY());
            }
        }
        return ttf.getFontBBox();
    }

    @Override
    public int codeToCID(int code)
    {
        CMap cMap = parent.getCMap();

        // Acrobat allows bad PDFs to use Unicode CMaps here instead of CID CMaps, see PDFBOX-1283
        if (!cMap.hasCIDMappings() && cMap.hasUnicodeMappings())
        {
            return cMap.toUnicode(code).codePointAt(0); // actually: code -> CID
        }

        return cMap.toCID(code);
    }

    /**
     * Returns the GID for the given character code.
     *
     * @param code character code
     * @return GID
     * @throws IOException
     */
    @Override
    public int codeToGID(int code) throws IOException
    {
        if (!isEmbedded)
        {
            // The conforming reader shall select glyphs by translating characters from the
            // encoding specified by the predefined CMap to one of the encodings in the TrueType
            // font's 'cmap' table. The means by which this is accomplished are implementation-
            // dependent.
            // omit the CID2GID mapping if the embedded font is replaced by an external font
            if (cid2gid != null && !isDamaged)
            {
                // Acrobat allows non-embedded GIDs - todo: can we find a test PDF for this?
                LOG.warn("Using non-embedded GIDs in font " + getName());
                int cid = codeToCID(code);
                return cid2gid[cid];
            }
            else
            {
                // fallback to the ToUnicode CMap, test with PDFBOX-1422 and PDFBOX-2560
                String unicode = parent.toUnicode(code);
                if (unicode == null)
                {
                    LOG.warn("Failed to find a character mapping for " + code + " in " + getName());
                    // Acrobat is willing to use the CID as a GID, even when the font isn't embedded
                    // see PDFBOX-2599
                    return codeToCID(code);
                }
                else if (unicode.length() > 1)
                {
                    LOG.warn("Trying to map multi-byte character using 'cmap', result will be poor");
                }
                
                // a non-embedded font always has a cmap (otherwise FontMapper won't load it)
                return cmap.getGlyphId(unicode.codePointAt(0));
            }
        }
        else
        {
            // If the TrueType font program is embedded, the Type 2 CIDFont dictionary shall contain
            // a CIDToGIDMap entry that maps CIDs to the glyph indices for the appropriate glyph
            // descriptions in that font program.

            int cid = codeToCID(code);
            if (cid2gid != null)
            {
                // use CIDToGIDMap
                if (cid < cid2gid.length)
                {
                    return cid2gid[cid];
                }
                else
                {
                    return 0;
                }
            }
            else
            {
                // "Identity" is the default CIDToGIDMap
                if (cid < ttf.getNumberOfGlyphs())
                {
                    return cid;
                }
                else
                {
                    // out of range CIDs map to GID 0
                    return 0;
                }
            }
        }
    }

    @Override
    public float getHeight(int code) throws IOException
    {
        // todo: really we want the BBox, (for text extraction:)
        return (ttf.getHorizontalHeader().getAscender() + -ttf.getHorizontalHeader().getDescender())
                / ttf.getUnitsPerEm(); // todo: shouldn't this be the yMax/yMin?
    }

    @Override
    public float getWidthFromFont(int code) throws IOException
    {
        int gid = codeToGID(code);
        int width = ttf.getAdvanceWidth(gid);
        int unitsPerEM = ttf.getUnitsPerEm();
        if (unitsPerEM != 1000)
        {
            width *= 1000f / unitsPerEM;
        }
        return width;
    }

    @Override
    public byte[] encode(int unicode)
    {
        int cid = -1;
        if (isEmbedded)
        {
            // embedded fonts always use CIDToGIDMap, with Identity as the default
            if (parent.getCMap().getName().startsWith("Identity-"))
            {
                if (cmap != null)
                {
                    cid = cmap.getGlyphId(unicode);
                }
            }
            else
            {
                // if the CMap is predefined then there will be a UCS-2 CMap
                if (parent.getCMapUCS2() != null)
                {
                    cid = parent.getCMapUCS2().toCID(unicode);
                }
            }

            // otherwise we require an explicit ToUnicode CMap
            if (cid == -1)
            {
                // todo: invert the ToUnicode CMap?
                cid = 0;
            }
        }
        else
        {
            // a non-embedded font always has a cmap (otherwise it we wouldn't load it)
            cid = cmap.getGlyphId(unicode);
        }

        if (cid == 0)
        {
            throw new IllegalArgumentException(
                    String.format("No glyph for U+%04X in font %s", unicode, getName()));
        }

        // CID is always 2-bytes (16-bit) for TrueType
        return new byte[] { (byte)(cid >> 8 & 0xff), (byte)(cid & 0xff) };
    }

    @Override
    public boolean isEmbedded()
    {
        return isEmbedded;
    }

    @Override
    public boolean isDamaged()
    {
        return isDamaged;
    }

    /**
     * Returns the embedded or substituted TrueType font. May be an OpenType font if the font is
     * not embedded.
     */
    public TrueTypeFont getTrueTypeFont()
    {
        return ttf;
    }

    @Override
    public GeneralPath getPath(int code) throws IOException
    {
        if (ttf instanceof OpenTypeFont && ((OpenTypeFont)ttf).isPostScript())
        {
            // we're not supposed to have CFF fonts inside PDCIDFontType2, but if we do,
            // then we treat their CIDs as GIDs, see PDFBOX-3344
            int cid = codeToGID(code);
            Type2CharString charstring = ((OpenTypeFont)ttf).getCFF().getFont().getType2CharString(cid);
            return charstring.getPath();
        }
        else
        {
            int gid = codeToGID(code);
            GlyphData glyph = ttf.getGlyph().getGlyph(gid);
            if (glyph != null)
            {
                return glyph.getPath();
            }
            return new GeneralPath();
        }
    }

    @Override
    public boolean hasGlyph(int code) throws IOException
    {
        return codeToGID(code) != 0;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy