All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.sejda.sambox.pdmodel.font.PDFont Maven / Gradle / Ivy

Go to download

An Apache PDFBox fork intended to be used as PDF processor for Sejda and PDFsam related projects

There is a newer version: 3.0.21
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sejda.sambox.pdmodel.font;

import static java.util.Objects.nonNull;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.fontbox.afm.FontMetrics;
import org.apache.fontbox.cmap.CMap;
import org.sejda.commons.FastByteArrayOutputStream;
import org.sejda.commons.util.IOUtils;
import org.sejda.sambox.cos.COSArray;
import org.sejda.sambox.cos.COSArrayList;
import org.sejda.sambox.cos.COSBase;
import org.sejda.sambox.cos.COSDictionary;
import org.sejda.sambox.cos.COSName;
import org.sejda.sambox.cos.COSNumber;
import org.sejda.sambox.cos.COSObjectable;
import org.sejda.sambox.cos.COSStream;
import org.sejda.sambox.pdmodel.font.encoding.GlyphList;
import org.sejda.sambox.util.Matrix;
import org.sejda.sambox.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This is the base class for all PDF fonts.
 *
 * @author Ben Litchfield
 */
public abstract class PDFont implements COSObjectable, PDFontLike, Subsettable
{
    private static final Logger LOG = LoggerFactory.getLogger(PDFont.class);
    protected static final Matrix DEFAULT_FONT_MATRIX = new Matrix(0.001f, 0, 0, 0.001f, 0, 0);

    protected final COSDictionary dict;
    private final CMap toUnicodeCMap;
    private final FontMetrics afmStandard14; // AFM for standard 14 fonts

    private PDFontDescriptor fontDescriptor;
    private List widths;
    private float avgFontWidth;
    private float fontWidthOfSpace = -1f;
    private final Map codeToWidthMap;
    // sambox specific, transient metadata, does not get serialized to the document
    private final Map transientMetadata = new HashMap<>();

    PDFont()
    {
        dict = new COSDictionary();
        dict.setItem(COSName.TYPE, COSName.FONT);
        toUnicodeCMap = null;
        fontDescriptor = null;
        afmStandard14 = null;
        codeToWidthMap = new HashMap<>();
    }

    /**
     * Constructor for Standard 14.
     */
    PDFont(String baseFont)
    {
        dict = new COSDictionary();
        dict.setItem(COSName.TYPE, COSName.FONT);
        toUnicodeCMap = null;
        afmStandard14 = Standard14Fonts.getAFM(baseFont);
        if (afmStandard14 == null)
        {
            throw new IllegalArgumentException("No AFM for font " + baseFont);
        }
        fontDescriptor = PDType1FontEmbedder.buildFontDescriptor(afmStandard14);
        // standard 14 fonts may be accessed concurrently, as they are singletons
        codeToWidthMap = new ConcurrentHashMap<>();
    }

    /**
     * Constructor.
     *
     * @param fontDictionary Font dictionary.
     */
    protected PDFont(COSDictionary fontDictionary) throws IOException
    {
        dict = fontDictionary;
        codeToWidthMap = new HashMap<>();

        // standard 14 fonts use an AFM
        afmStandard14 = Standard14Fonts.getAFM(getName()); // may be null (it usually is)
        fontDescriptor = loadFontDescriptor();
        toUnicodeCMap = loadUnicodeCmap();
    }

    private PDFontDescriptor loadFontDescriptor()
    {
        COSDictionary fd = dict.getDictionaryObject(COSName.FONT_DESC, COSDictionary.class);
        if (nonNull(fd))
        {
            return new PDFontDescriptor(fd);
        }
        if (afmStandard14 != null)
        {
            // build font descriptor from the AFM
            return PDType1FontEmbedder.buildFontDescriptor(afmStandard14);
        }
        return null;
    }

    private CMap loadUnicodeCmap()
    {
        COSBase toUnicode = dict.getDictionaryObject(COSName.TO_UNICODE);
        if (toUnicode == null)
        {
            return null;
        }

        CMap cmap = null;
        try
        {
            cmap = readCMap(toUnicode);
            if (cmap != null && !cmap.hasUnicodeMappings())
            {
                LOG.warn("Invalid ToUnicode CMap in font " + getName());
                String cmapName = cmap.getName() != null ? cmap.getName() : "";
                String ordering = cmap.getOrdering() != null ? cmap.getOrdering() : "";
                COSBase encoding = dict.getDictionaryObject(COSName.ENCODING);
                if (cmapName.contains("Identity") //
                        || ordering.contains("Identity") //
                        || COSName.IDENTITY_H.equals(encoding) //
                        || COSName.IDENTITY_V.equals(encoding))
                {
                    COSDictionary encodingDict = dict.getDictionaryObject(COSName.ENCODING, COSDictionary.class);
                    if (encodingDict == null || !encodingDict.containsKey(COSName.DIFFERENCES))
                    {
                        // assume that if encoding is identity, then the reverse is also true
                        cmap = CMapManager.getPredefinedCMap(COSName.IDENTITY_H.getName());
                        LOG.warn("Using predefined identity CMap instead");
                    }
                }
            }
        }
        catch (IOException ex)
        {
            LOG.error("Could not read ToUnicode CMap in font " + getName(), ex);
        }
        return cmap;
    }

    /**
     * Returns the AFM if this is a Standard 14 font.
     */
    protected final FontMetrics getStandard14AFM()
    {
        return afmStandard14;
    }

    @Override
    public PDFontDescriptor getFontDescriptor()
    {
        return fontDescriptor;
    }

    /**
     * Sets the font descriptor when embedding a font.
     */
    protected final void setFontDescriptor(PDFontDescriptor fontDescriptor)
    {
        this.fontDescriptor = fontDescriptor;
    }

    /**
     * /** Reads a CMap given a COS Stream or Name. May return null if a predefined CMap does not
     * exist.
     *
     * @param base COSName or COSStream
     */
    protected final CMap readCMap(COSBase base) throws IOException
    {
        if (base instanceof COSName)
        {
            // predefined CMap
            String name = ((COSName) base).getName();
            return CMapManager.getPredefinedCMap(name);
        }
        if (base instanceof COSStream)
        {
            // embedded CMap
            InputStream input = null;
            try
            {
                input = ((COSStream) base).getUnfilteredStream();
                return CMapManager.parseCMap(input);
            }
            finally
            {
                IOUtils.closeQuietly(input);
            }
        }
        throw new IOException("Expected Name or Stream");
    }

    @Override
    public COSDictionary getCOSObject()
    {
        return dict;
    }

    @Override
    public Vector getPositionVector(int code)
    {
        throw new UnsupportedOperationException("Horizontal fonts have no position vector");
    }

    /**
     * Returns the displacement vector (w0, w1) in text space, for the given character. For
     * horizontal text only the x component is used, for vertical text only the y component.
     *
     * @param code character code
     * @return displacement vector
     */
    public Vector getDisplacement(int code) throws IOException
    {
        return new Vector(getWidth(code) / 1000, 0);
    }

    @Override
    public float getWidth(int code) throws IOException
    {
        Float width = codeToWidthMap.get(code);
        if (width != null)
        {
            return width;
        }

        // Acrobat overrides the widths in the font program on the conforming reader's system with
        // the widths specified in the font dictionary." (Adobe Supplement to the ISO 32000)
        //
        // Note: The Adobe Supplement says that the override happens "If the font program is not
        // embedded", however PDFBOX-427 shows that it also applies to embedded fonts.

        // Type1, Type1C, Type3
        if (dict.getDictionaryObject(COSName.WIDTHS) != null || dict.containsKey(
                COSName.MISSING_WIDTH))
        {
            int firstChar = dict.getInt(COSName.FIRST_CHAR, -1);
            int lastChar = dict.getInt(COSName.LAST_CHAR, -1);
            int siz = getWidths().size();
            int idx = code - firstChar;
            if (siz > 0 && code >= firstChar && code <= lastChar && idx < siz)
            {
                width = getWidths().get(idx);
                if (width == null)
                {
                    width = 0f;
                }
                codeToWidthMap.put(code, width);
                return width;
            }

            PDFontDescriptor fd = getFontDescriptor();
            if (fd != null)
            {
                // get entry from /MissingWidth entry
                width = fd.getMissingWidth();
                codeToWidthMap.put(code, width);
                return width;
            }
        }

        // standard 14 font widths are specified by an AFM
        if (isStandard14())
        {
            width = getStandard14Width(code);
            codeToWidthMap.put(code, width);
            return width;
        }

        // if there's nothing to override with, then obviously we fall back to the font
        width = getWidthFromFont(code);
        codeToWidthMap.put(code, width);
        return width;
    }

    /**
     * Returns the glyph width from the AFM if this is a Standard 14 font.
     *
     * @param code character code
     * @return width in 1/1000 text space
     */
    protected abstract float getStandard14Width(int code);

    /**
     * Encodes the given string for use in a PDF content stream.
     *
     * @param text Any Unicode text.
     * @return Array of PDF content stream bytes.
     * @throws IOException If the text could not be encoded.
     */
    public final byte[] encode(String text) throws IOException
    {
        FastByteArrayOutputStream out = new FastByteArrayOutputStream();
        int offset = 0;
        while (offset < text.length())
        {
            int codePoint = text.codePointAt(offset);

            // multi-byte encoding with 1 to 4 bytes
            byte[] bytes = encode(codePoint);
            out.write(bytes);

            offset += Character.charCount(codePoint);
        }
        return out.toByteArray();
    }

    /**
     * Similar to encode() but handles leniently cases where fonts don't have a glyph by assuming
     * the identity mapping
     */
    public final byte[] encodeLeniently(String text) throws IOException
    {
        FastByteArrayOutputStream out = new FastByteArrayOutputStream();
        for (int offset = 0; offset < text.length(); )
        {
            int codePoint = text.codePointAt(offset);

            // multi-byte encoding with 1 to 4 bytes
            byte[] bytes;
            try
            {
                bytes = encode(codePoint);
            }
            catch (IllegalArgumentException e)
            {
                if (e.getMessage().contains("No glyph") || e.getMessage()
                        .contains("is not available in this"))
                {
                    bytes = new byte[] { (byte) codePoint };
                }
                else
                {
                    throw e;
                }
            }
            out.write(bytes);

            offset += Character.charCount(codePoint);
        }
        return out.toByteArray();
    }

    /**
     * Encodes the given Unicode code point for use in a PDF content stream. Content streams use a
     * multi-byte encoding with 1 to 4 bytes.
     *
     * 

* This method is called when embedding text in PDFs and when filling in fields. * * @param unicode Unicode code point. * @return Array of 1 to 4 PDF content stream bytes. * @throws IOException If the text could not be encoded. */ protected abstract byte[] encode(int unicode) throws IOException; /** * Returns the width of the given Unicode string. * * @param text The text to get the width of. * @return The width of the string in 1/1000 units of text space. * @throws IOException If there is an error getting the width information. */ public float getStringWidth(String text) throws IOException { byte[] bytes = encode(text); ByteArrayInputStream in = new ByteArrayInputStream(bytes); float width = 0; while (in.available() > 0) { int code = readCode(in); width += getWidth(code); } return width; } /** * Similar to getStringWidth() but handles leniently fonts where glyphs are missing, assuming * the identity mapping of glyphs *

* Uses encodeLeniently() instead of encode() */ public float getStringWidthLeniently(String text) throws IOException { byte[] bytes = encodeLeniently(text); ByteArrayInputStream in = new ByteArrayInputStream(bytes); float width = 0; while (in.available() > 0) { int code = readCode(in); width += getWidth(code); } return width; } /** * This will get the average font width for all characters. * * @return The width is in 1000 unit of text space, ie 333 or 777 */ // todo: this method is highly suspicious, the average glyph width is not usually a good metric @Override public float getAverageFontWidth() { float average; if (avgFontWidth != 0.0f) { average = avgFontWidth; } else { float totalWidth = 0.0f; float characterCount = 0.0f; COSArray widths = dict.getDictionaryObject(COSName.WIDTHS, COSArray.class); if (widths != null) { for (int i = 0; i < widths.size(); i++) { COSNumber fontWidth = widths.getObject(i, COSNumber.class); if (nonNull(fontWidth)) { float floatValue = fontWidth.floatValue(); if (floatValue > 0) { totalWidth += floatValue; characterCount += 1; } } } } if (totalWidth > 0) { average = totalWidth / characterCount; } else { average = 0; } avgFontWidth = average; } return average; } /** * Reads a character code from a content stream string. Codes may be up to 4 bytes long. * * @param in string stream * @return character code * @throws IOException if the CMap or stream cannot be read */ public abstract int readCode(InputStream in) throws IOException; /** * Returns the Unicode character sequence which corresponds to the given character code. * * @param code character code * @param customGlyphList a custom glyph list to use instead of the Adobe Glyph List * @return Unicode character(s) */ public String toUnicode(int code, GlyphList customGlyphList) throws IOException { return toUnicode(code); } /** * Returns the Unicode character sequence which corresponds to the given character code. * * @param code character code * @return Unicode character(s) */ public String toUnicode(int code) throws IOException { // if the font dictionary containsName a ToUnicode CMap, use that CMap if (toUnicodeCMap != null) { if (toUnicodeCMap.getName() != null && toUnicodeCMap.getName().startsWith("Identity-") && (dict.getDictionaryObject(COSName.TO_UNICODE) instanceof COSName || !toUnicodeCMap.hasUnicodeMappings())) { // handle the undocumented case of using Identity-H/V as a ToUnicode CMap, this // isn't actually valid as the Identity-x CMaps are code->CID maps, not // code->Unicode maps. See sample_fonts_solidconvertor.pdf for an example. // PDFBOX-3123: do this only if the /ToUnicode entry is a name // PDFBOX-4322: identity streams are OK too return String.valueOf((char) code); } // proceed as normal return toUnicodeCMap.toUnicode(code); } // if no value has been produced, there is no way to obtain Unicode for the character. // this behaviour can be overridden is subclasses, but this method *must* return null here return null; } /** * This will always return "Font" for fonts. * * @return The type of object that this is. */ public String getType() { return dict.getNameAsString(COSName.TYPE); } /** * This will get the subtype of font. */ public String getSubType() { return dict.getNameAsString(COSName.SUBTYPE); } /** * The widths of the characters. This will be null for the standard 14 fonts. * * @return The widths of the characters. */ protected final List getWidths() { if (widths == null) { COSArray array = dict.getDictionaryObject(COSName.WIDTHS, COSArray.class); if (array != null) { widths = COSArrayList.convertFloatCOSArrayToList(array); } else { widths = Collections.emptyList(); } } return widths; } @Override public Matrix getFontMatrix() { return DEFAULT_FONT_MATRIX; } /** * Determines the width of the space character. * * @return the width of the space character */ public float getSpaceWidth() { if (fontWidthOfSpace == -1f) { try { if (toUnicodeCMap != null && dict.containsKey(COSName.TO_UNICODE)) { int spaceMapping = toUnicodeCMap.getSpaceMapping(); if (spaceMapping > -1) { fontWidthOfSpace = getWidth(spaceMapping); } } else { fontWidthOfSpace = getWidth(32); } // try to get it from the font itself if (fontWidthOfSpace <= 0) { fontWidthOfSpace = getWidthFromFont(32); // use the average font width as fall back if (fontWidthOfSpace <= 0) { fontWidthOfSpace = getAverageFontWidth(); } } } catch (Exception e) { LOG.error("Can't determine the width of the space character, assuming 250", e); fontWidthOfSpace = 250f; } } return fontWidthOfSpace; } /** * Returns true if the font uses vertical writing mode. */ public abstract boolean isVertical(); /** * Returns true if this font is one of the "Standard 14" fonts and receives special handling. */ public boolean isStandard14() { // this logic is based on Acrobat's behaviour, see see PDFBOX-2372 // embedded fonts never get special treatment if (isEmbedded()) { return false; } // if the name matches, this is a Standard 14 font return Standard14Fonts.containsName(getName()); } @Override public boolean equals(Object other) { return other instanceof PDFont && ((PDFont) other).getCOSObject() == this.getCOSObject(); } @Override public int hashCode() { return this.getCOSObject().hashCode(); } @Override public String toString() { return getClass().getSimpleName() + " " + getName(); } protected CMap getToUnicodeCMap() { return toUnicodeCMap; } public Map getTransientMetadata() { return transientMetadata; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy