All downloads are free. The search and download functionality uses the official Maven repository.

org.sejda.sambox.pdmodel.font.FontMapperImpl Maven / Gradle / Ivy

Go to download

An Apache PDFBox fork intended to be used as a PDF processor for Sejda and PDFsam related projects

There is a newer version: 3.0.21
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sejda.sambox.pdmodel.font;

import static java.util.Optional.ofNullable;
import static org.sejda.commons.util.RequireUtils.requireNotNullArg;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.concurrent.CompletableFuture;

import org.apache.fontbox.FontBoxFont;
import org.apache.fontbox.ttf.OpenTypeFont;
import org.apache.fontbox.ttf.TTFParser;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.type1.Type1Font;
import org.sejda.sambox.SAMBox;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Font mapper, locates non-embedded fonts via a pluggable FontProvider.
 *
 * @author John Hewson
 */
final class FontMapperImpl implements FontMapper
{
    private final CompletableFuture> fontInfoByName;
    private final TrueTypeFont lastResortFont;

    private static final Logger LOG = LoggerFactory.getLogger(FontMapperImpl.class);

    /**
     * Map of PostScript name substitutes, in priority order.
     */
    private final Map> substitutes = new HashMap<>();
    
    /**
     * Creates a mapper backed by the provider returned from {@link #loadFontProvider()}.
     */
    FontMapperImpl()
    {
        this(loadFontProvider());
    }

    /**
     * Creates a mapper for the fonts reported by the given provider, registers the substitutes
     * for the Standard 14 fonts and loads the last-resort font.
     *
     * @param fontProvider source of the available fonts
     */
    FontMapperImpl(FontProvider fontProvider)
    {
        // The name index is built asynchronously; lookups wait for it through join().
        // Each font is registered under every name variant, lower-cased.
        fontInfoByName = CompletableFuture.supplyAsync(() -> {
            Map<String, FontInfo> map = new LinkedHashMap<>();
            fontProvider.getFontInfo().forEach(
                    info -> getPostScriptNames(info.getPostScriptName()).forEach(
                            name -> map.put(name.toLowerCase(Locale.ENGLISH), info)));
            return map;
        });
        // substitutes for standard 14 fonts
        addSubstitutes("Courier", new ArrayList<>(
                Arrays.asList("CourierNew", "CourierNewPSMT", "LiberationMono",
                        "NimbusMonL-Regu")));
        addSubstitutes("Courier-Bold", new ArrayList<>(
                Arrays.asList("CourierNewPS-BoldMT", "CourierNew-Bold", "LiberationMono-Bold",
                        "NimbusMonL-Bold")));
        addSubstitutes("Courier-Oblique", new ArrayList<>(
                Arrays.asList("CourierNewPS-ItalicMT", "CourierNew-Italic", "LiberationMono-Italic",
                        "NimbusMonL-ReguObli")));
        addSubstitutes("Courier-BoldOblique", new ArrayList<>(
                Arrays.asList("CourierNewPS-BoldItalicMT", "CourierNew-BoldItalic",
                        "LiberationMono-BoldItalic", "NimbusMonL-BoldObli")));
        addSubstitutes("Helvetica", new ArrayList<>(
                Arrays.asList("ArialMT", "Arial", "LiberationSans", "NimbusSanL-Regu")));
        addSubstitutes("Helvetica-Bold", new ArrayList<>(
                Arrays.asList("Arial-BoldMT", "Arial-Bold", "LiberationSans-Bold",
                        "NimbusSanL-Bold")));
        addSubstitutes("Helvetica-Oblique", new ArrayList<>(
                Arrays.asList("Arial-ItalicMT", "Arial-Italic", "Helvetica-Italic",
                        "LiberationSans-Italic", "NimbusSanL-ReguItal")));
        addSubstitutes("Helvetica-BoldOblique", new ArrayList<>(
                Arrays.asList("Arial-BoldItalicMT", "Helvetica-BoldItalic",
                        "LiberationSans-BoldItalic", "NimbusSanL-BoldItal")));
        addSubstitutes("Times-Roman", new ArrayList<>(
                Arrays.asList("TimesNewRomanPSMT", "TimesNewRoman", "TimesNewRomanPS",
                        "LiberationSerif", "NimbusRomNo9L-Regu")));
        addSubstitutes("Times-Bold", new ArrayList<>(
                Arrays.asList("TimesNewRomanPS-BoldMT", "TimesNewRomanPS-Bold",
                        "TimesNewRoman-Bold", "LiberationSerif-Bold", "NimbusRomNo9L-Medi")));
        addSubstitutes("Times-Italic", new ArrayList<>(
                Arrays.asList("TimesNewRomanPS-ItalicMT", "TimesNewRomanPS-Italic",
                        "TimesNewRoman-Italic", "LiberationSerif-Italic",
                        "NimbusRomNo9L-ReguItal")));
        addSubstitutes("Times-BoldItalic", new ArrayList<>(
                Arrays.asList("TimesNewRomanPS-BoldItalicMT", "TimesNewRomanPS-BoldItalic",
                        "TimesNewRoman-BoldItalic", "LiberationSerif-BoldItalic",
                        "NimbusRomNo9L-MediItal")));
        addSubstitutes("Symbol",
                new ArrayList<>(Arrays.asList("Symbol", "SymbolMT", "StandardSymL", "ChromSymbolOTF")));
        addSubstitutes("ZapfDingbats", new ArrayList<>(
                Arrays.asList("ZapfDingbatsITCbyBT-Regular", "ZapfDingbatsITC", "Dingbats",
                        "MS-Gothic", "ChromDingbatsOTF")));

        // Acrobat also uses alternative names for Standard 14 fonts, which we map to those above
        // these include names such as "Arial" and "TimesNewRoman"
        for (String baseName : Standard14Fonts.getNames())
        {
            // only names without an entry of their own inherit the list of their mapped name
            if (getSubstitutes(baseName).isEmpty())
            {
                String mappedName = Standard14Fonts.getMappedFontName(baseName);
                addSubstitutes(baseName, copySubstitutes(mappedName.toLowerCase(Locale.ENGLISH)));
            }
        }

        // -------------------------

        lastResortFont = loadLastResortFont();
    }

    /**
     * Returns the font this mapper falls back to when no other font can be found.
     *
     * @return the font loaded by the constructor through {@link #loadLastResortFont()}
     */
    public TrueTypeFont getLastResortFont()
    {
        return this.lastResortFont;
    }
    
    /**
     * Parses the LiberationSans-Regular TrueType font from the classpath resource
     * {@code /org/sejda/sambox/resources/ttf/LiberationSans-Regular.ttf}.
     *
     * @return the parsed font
     * @throws RuntimeException wrapping the IOException raised while parsing the resource
     */
    public static TrueTypeFont loadLastResortFont()
    {
        // a missing resource is rejected by requireNotNullArg before any parsing is attempted
        InputStream fontStream = FontMapperImpl.class.getResourceAsStream(
                "/org/sejda/sambox/resources/ttf/LiberationSans-Regular.ttf");
        requireNotNullArg(fontStream,
                "Unable to load org/sejda/sambox/resources/ttf/LiberationSans-Regular.ttf");
        try
        {
            TTFParser parser = new TTFParser();
            return parser.parse(new BufferedInputStream(fontStream));
        }
        catch (IOException parseFailure)
        {
            throw new RuntimeException(parseFailure);
        }
    }
    
    /**
     * Chooses the font provider according to the system property named by
     * {@code SAMBox.FONT_PROVIDER_PROPERTY}.
     * <ul>
     * <li>{@code "noop"} (case-insensitive) selects a {@code NoopFontProvider};</li>
     * <li>any other non-empty value is treated as a class name and instantiated through its
     * no-argument constructor;</li>
     * <li>an unset or empty property, or a class that cannot be instantiated, selects a
     * {@code FileSystemFontProvider}.</li>
     * </ul>
     *
     * @return the provider to use, never null
     */
    public static FontProvider loadFontProvider()
    {
        String providerName = System.getProperty(SAMBox.FONT_PROVIDER_PROPERTY);
        if ("noop".equalsIgnoreCase(providerName))
        {
            return new NoopFontProvider();
        }

        boolean customProviderRequested = providerName != null && !providerName.isEmpty();
        if (customProviderRequested)
        {
            try
            {
                LOG.debug("Trying to use {} as font provider...", providerName);
                Class<?> providerClass = Class.forName(providerName);
                return (FontProvider) providerClass.getDeclaredConstructor().newInstance();
            }
            catch (Exception ex)
            {
                // best effort: report the problem and continue with the default provider
                LOG.error("Failed loading custom font provider", ex);
            }
        }

        return new FileSystemFontProvider();
    }

    /**
     * Gets alternative names, as seen in some PDFs, e.g. PDFBOX-142.
     *
     * @param postScriptName the built-in PostScript name of a font
     * @return the given name plus the same name with all hyphens removed; a single element when
     * the name contains no hyphen
     */
    private Set<String> getPostScriptNames(String postScriptName)
    {
        Set<String> names = new HashSet<>(2);

        // built-in PostScript name
        names.add(postScriptName);

        // remove hyphens (e.g. Arial-Black -> ArialBlack)
        names.add(postScriptName.replace("-", ""));

        return names;
    }

    /**
     * Returns a new, independent list holding the substitutes currently registered under the
     * given key. The key itself is not added to the copy.
     *
     * @param postScriptName key as stored in the substitutes map (used as-is, not normalized);
     * it must already have an entry, as the copy constructor rejects a null source list
     * @return a mutable copy of the registered substitutes, in the same order
     */
    private List<String> copySubstitutes(String postScriptName)
    {
        return new ArrayList<>(substitutes.get(postScriptName));
    }

    /**
     * Registers an additional substitute for the given font. The replacement is appended to the
     * end of the font's substitute list, creating the list when the font has none yet.
     * NOTE(review): the previous documentation called this a "top-priority" substitute, but the
     * name is appended after the existing entries - confirm which behavior is intended.
     *
     * @param match   PostScript name of the font to match (stored lower-cased)
     * @param replace PostScript name of the font to use as a replacement
     */
    public void addSubstitute(String match, String replace)
    {
        String key = match.toLowerCase(Locale.ENGLISH);
        substitutes.computeIfAbsent(key, unused -> new ArrayList<>()).add(replace);
    }

    /**
     * Registers the complete substitute list for a font, replacing any list stored before.
     *
     * @param match        PostScript name of the font to match (stored lower-cased)
     * @param replacements PostScript names to try instead; the list is stored as given, not copied
     */
    private void addSubstitutes(String match, List<String> replacements)
    {
        substitutes.put(match.toLowerCase(Locale.ENGLISH), replacements);
    }

    /**
     * Returns the substitutes for a given font.
     *
     * @param postScriptName PostScript name; spaces are removed and the name is lower-cased
     * before the lookup
     * @return the registered list itself (not a copy), or an empty list when none is registered
     */
    private List<String> getSubstitutes(String postScriptName)
    {
        List<String> subs = substitutes.get(
                postScriptName.replace(" ", "").toLowerCase(Locale.ENGLISH));
        if (subs != null)
        {
            return subs;
        }
        return Collections.emptyList();
    }

    /**
     * Attempts to find a good fallback based on the font descriptor.
     * <p>
     * The family follows the descriptor flags: fixed pitch gives Courier, otherwise serif gives
     * Times, otherwise Helvetica. Boldness is guessed from the font name ("bold", "black" or
     * "heavy", ignoring case) and combined with the italic flag to pick the style suffix.
     *
     * @param fontDescriptor descriptor of the font to replace, may be null
     * @return the name of one of the Standard 14 text fonts; "Times-Roman" when there is no
     * descriptor
     */
    private String getFallbackFontName(PDFontDescriptor fontDescriptor)
    {
        if (fontDescriptor == null)
        {
            // if there is no FontDescriptor then we just fall back to Times Roman
            return "Times-Roman";
        }

        // heuristic detection of bold
        boolean bold = false;
        String name = fontDescriptor.getFontName();
        if (name != null)
        {
            // fixed locale, consistent with the other case conversions in this class
            String lower = name.toLowerCase(Locale.ENGLISH);
            bold = lower.contains("bold") || lower.contains("black") || lower.contains("heavy");
        }
        boolean italic = fontDescriptor.isItalic();

        // font descriptor flags should describe the style
        if (fontDescriptor.isFixedPitch())
        {
            return "Courier" + styleSuffix(bold, italic, "Oblique", "");
        }
        if (fontDescriptor.isSerif())
        {
            return "Times" + styleSuffix(bold, italic, "Italic", "-Roman");
        }
        return "Helvetica" + styleSuffix(bold, italic, "Oblique", "");
    }

    /**
     * Builds the style part of a Standard 14 font name.
     *
     * @param bold          whether the bold variant is wanted
     * @param italic        whether the slanted variant is wanted
     * @param slantName     the family's word for slanted text ("Oblique" or "Italic")
     * @param regularSuffix suffix of the plain variant ("" or "-Roman")
     * @return the suffix to append to the family name
     */
    private static String styleSuffix(boolean bold, boolean italic, String slantName,
            String regularSuffix)
    {
        if (bold && italic)
        {
            return "-Bold" + slantName;
        }
        if (bold)
        {
            return "-Bold";
        }
        if (italic)
        {
            return "-" + slantName;
        }
        return regularSuffix;
    }

    /**
     * Finds a TrueType font with the given PostScript name, or a suitable substitute. A mapping
     * is always returned: when neither the requested name nor the fallback name derived from the
     * descriptor yields a TrueType font, the last-resort font is used.
     *
     * @param baseFont       PostScript name of the requested font
     * @param fontDescriptor FontDescriptor used to choose the fallback name, may be null
     * @return a mapping created with {@code false} when baseFont itself was resolved and with
     * {@code true} when a fallback was needed
     */
    @Override
    public FontMapping<TrueTypeFont> getTrueTypeFont(String baseFont,
            PDFontDescriptor fontDescriptor)
    {
        TrueTypeFont ttf = (TrueTypeFont) findFont(FontFormat.TTF, baseFont);
        if (ttf != null)
        {
            return new FontMapping<>(ttf, false);
        }
        // fallback - todo: i.e. fuzzy match
        String fontName = getFallbackFontName(fontDescriptor);
        ttf = (TrueTypeFont) findFont(FontFormat.TTF, fontName);
        if (ttf == null)
        {
            // we have to return something here as TTFs aren't strictly required on the system
            ttf = lastResortFont;
        }
        return new FontMapping<>(ttf, true);
    }

    /**
     * Finds a font with the given PostScript name, or a suitable substitute. This allows any font
     * to be substituted with a PFB, TTF or OTF. A mapping is always returned: when neither the
     * requested name nor the fallback name derived from the descriptor yields a font, the
     * last-resort font is used.
     *
     * @param baseFont       PostScript name of the requested font
     * @param fontDescriptor the FontDescriptor of the font to find, may be null
     * @return a mapping created with {@code false} when baseFont itself was resolved and with
     * {@code true} when a fallback was needed
     */
    @Override
    public FontMapping<FontBoxFont> getFontBoxFont(String baseFont,
            PDFontDescriptor fontDescriptor)
    {
        FontBoxFont font = findFontBoxFont(baseFont);
        if (font != null)
        {
            return new FontMapping<>(font, false);
        }
        // fallback - todo: i.e. fuzzy match
        String fallbackName = getFallbackFontName(fontDescriptor);
        font = findFontBoxFont(fallbackName);
        if (font == null)
        {
            // we have to return something here as TTFs aren't strictly required on the system
            font = lastResortFont;
        }
        return new FontMapping<>(font, true);
    }

    /**
     * Looks a font up in each supported format in turn: PFB first, then TTF, then OTF.
     *
     * @param postScriptName PostScript font name
     * @return the first font found, or null when no format yields a match
     */
    private FontBoxFont findFontBoxFont(String postScriptName)
    {
        Type1Font type1 = (Type1Font) findFont(FontFormat.PFB, postScriptName);
        if (type1 == null)
        {
            TrueTypeFont trueType = (TrueTypeFont) findFont(FontFormat.TTF, postScriptName);
            if (trueType == null)
            {
                // last format to try; its result (possibly null) is the answer
                return findFont(FontFormat.OTF, postScriptName);
            }
            return trueType;
        }
        return type1;
    }

    /**
     * Resolves a font of the given format by trying, in this order: the name as given, the name
     * without hyphens, each registered substitute, the name with commas turned into hyphens, and
     * the name with "-Regular" appended.
     *
     * @param format         required font format
     * @param postScriptName PostScript font name, may be null
     * @return the font of the first candidate that matches, or null when none does
     */
    private FontBoxFont findFont(FontFormat format, String postScriptName)
    {
        // handle damaged PDFs, see PDFBOX-2884
        if (postScriptName == null)
        {
            return null;
        }

        // exact PostScript name first
        FontInfo found = getFont(format, postScriptName);

        if (found == null)
        {
            // without hyphens (e.g. Arial-Black -> ArialBlack)
            found = getFont(format, postScriptName.replace("-", ""));
        }

        if (found == null)
        {
            // named substitutes, in priority order
            for (String candidate : getSubstitutes(postScriptName))
            {
                found = getFont(format, candidate);
                if (found != null)
                {
                    break;
                }
            }
        }

        if (found == null)
        {
            // Windows names e.g. (ArialNarrow,Bold) -> (ArialNarrow-Bold)
            found = getFont(format, postScriptName.replace(',', '-'));
        }

        if (found == null)
        {
            // appending "-Regular" works for Wingdings on windows
            found = getFont(format, postScriptName + "-Regular");
        }

        return found == null ? null : found.getFont();
    }

    /**
     * Looks up a single name in the font index and accepts the entry only when its format is the
     * requested one. Waits for the index through {@code join()}.
     *
     * @param format         required font format
     * @param postScriptName PostScript font name, must not be null
     * @return the matching entry, or null when the name is unknown or has another format
     */
    private FontInfo getFont(FontFormat format, String postScriptName)
    {
        // strip subset tag (happens when we substitute a corrupt embedded font, see PDFBOX-2642):
        // everything up to and including the first '+' is dropped
        int plusIndex = postScriptName.indexOf('+');
        String lookupName =
                plusIndex >= 0 ? postScriptName.substring(plusIndex + 1) : postScriptName;

        FontInfo candidate = fontInfoByName.join().get(lookupName.toLowerCase(Locale.ENGLISH));
        if (candidate == null || candidate.getFormat() != format)
        {
            return null;
        }
        return candidate;
    }

    /**
     * Finds a CFF CID-Keyed font with the given PostScript name, or a suitable substitute. This
     * method can also map CJK fonts via their CIDSystemInfo (ROS). A mapping is always returned:
     * when nothing else matches, it carries the last-resort font.
     *
     * @param baseFont       PostScript name of the requested font
     * @param fontDescriptor FontDescriptor; it is dereferenced without a null check once the
     *                       character-collection matching is reached
     * @param cidSystemInfo  the CID system info, e.g. "Adobe-Japan1", if any.
     * @return a mapping created with {@code false} for a match on baseFont and with {@code true}
     * for a character-collection or last-resort substitute
     */
    @Override
    public CIDFontMapping getCIDFont(String baseFont, PDFontDescriptor fontDescriptor,
            PDCIDSystemInfo cidSystemInfo)
    {
        // try name match or substitute with OTF
        OpenTypeFont otf1 = (OpenTypeFont) findFont(FontFormat.OTF, baseFont);
        if (otf1 != null)
        {
            return new CIDFontMapping(otf1, null, false);
        }

        // try name match or substitute with TTF
        TrueTypeFont ttf = (TrueTypeFont) findFont(FontFormat.TTF, baseFont);
        if (ttf != null)
        {
            return new CIDFontMapping(null, ttf, false);
        }

        if (cidSystemInfo != null)
        {
            // "In Acrobat 3.0.1 and later, Type 0 fonts that use a CMap whose CIDSystemInfo
            // dictionary defines the Adobe-GB1, Adobe-CNS1 Adobe-Japan1, or Adobe-Korea1 character
            // collection can also be substituted." - Adobe Supplement to the ISO 32000

            String collection = cidSystemInfo.getRegistry() + "-" + cidSystemInfo.getOrdering();

            if (collection.equals("Adobe-GB1") || collection.equals("Adobe-CNS1")
                    || collection.equals("Adobe-Japan1") || collection.equals("Adobe-Korea1"))
            {
                // try automatic substitutes via character collection
                PriorityQueue<FontMatch> queue = getFontMatches(fontDescriptor, cidSystemInfo);
                // the head of the queue is the highest-scored candidate
                FontMatch bestMatch = queue.poll();
                if (bestMatch != null)
                {
                    FontBoxFont font = bestMatch.info.getFont();
                    if (font instanceof OpenTypeFont)
                    {
                        return new CIDFontMapping((OpenTypeFont) font, null, true);
                    }
                    if (font != null)
                    {
                        return new CIDFontMapping(null, font, true);
                    }
                }
            }
        }

        // last-resort fallback
        return new CIDFontMapping(null, lastResortFont, true);
    }

    /**
     * Returns a queue of matching fonts, scored by suitability; the head of the queue is the
     * highest-scored candidate. Positive scores indicate matches for certain attributes, while
     * negative scores indicate mismatches. Zero scores are neutral.
     *
     * @param fontDescriptor FontDescriptor, must not be null (it is dereferenced without a check).
     * @param cidSystemInfo  Font's CIDSystemInfo, may be null; when given, fonts whose character
     *                       set does not match are left out.
     * @return the scored candidates, possibly empty
     */
    private PriorityQueue<FontMatch> getFontMatches(PDFontDescriptor fontDescriptor,
            PDCIDSystemInfo cidSystemInfo)
    {
        PriorityQueue<FontMatch> queue = new PriorityQueue<>(20);
        for (FontInfo info : fontInfoByName.join().values())
        {
            // filter by CIDSystemInfo, if given
            if (cidSystemInfo != null && !isCharSetMatch(cidSystemInfo, info))
            {
                continue;
            }

            FontMatch match = new FontMatch(info);

            // Panose is the most reliable
            if (fontDescriptor.getPanose() != null && info.getPanose() != null)
            {
                PDPanoseClassification panose = fontDescriptor.getPanose().getPanose();
                if (panose.getFamilyKind() == info.getPanose().getFamilyKind())
                {
                    if (panose.getFamilyKind() == 0 && (
                            info.getPostScriptName().toLowerCase().contains("barcode")
                                    || info.getPostScriptName().startsWith("Code"))
                            && !probablyBarcodeFont(fontDescriptor))
                    {
                        // PDFBOX-4268: ignore barcode font if we aren't searching for one.
                        continue;
                    }
                    // serifs
                    if (panose.getSerifStyle() == info.getPanose().getSerifStyle())
                    {
                        // exact match
                        match.score += 2;
                    }
                    else if (panose.getSerifStyle() >= 2 && panose.getSerifStyle() <= 5
                            && info.getPanose().getSerifStyle() >= 2
                            && info.getPanose().getSerifStyle() <= 5)
                    {
                        // cove (serif)
                        match.score += 1;
                    }
                    else if (panose.getSerifStyle() >= 11 && panose.getSerifStyle() <= 13
                            && info.getPanose().getSerifStyle() >= 11
                            && info.getPanose().getSerifStyle() <= 13)
                    {
                        // sans-serif
                        match.score += 1;
                    }
                    else if (panose.getSerifStyle() != 0 && info.getPanose().getSerifStyle() != 0)
                    {
                        // mismatch
                        match.score -= 1;
                    }

                    // weight
                    int weight = info.getPanose().getWeight();
                    int weightClass = info.getWeightClassAsPanose();
                    if (Math.abs(weight - weightClass) > 2)
                    {
                        // inconsistent data in system font, usWeightClass wins
                        weight = weightClass;
                    }

                    if (panose.getWeight() == weight)
                    {
                        // exact match
                        match.score += 2;
                    }
                    else if (panose.getWeight() > 1 && weight > 1)
                    {
                        // partial credit that shrinks (and may turn negative) with the distance
                        float dist = Math.abs(panose.getWeight() - weight);
                        match.score += 1 - dist * 0.5;
                    }

                    // todo: italic
                    // ...
                }
            }
            else if (fontDescriptor.getFontWeight() > 0 && info.getWeightClass() > 0)
            {
                // usWeightClass is pretty reliable
                float dist = Math.abs(fontDescriptor.getFontWeight() - info.getWeightClass());
                match.score += 1 - (dist / 100) * 0.5;
            }
            // todo: italic
            // ...

            queue.add(match);
        }
        return queue;
    }

    /**
     * Guesses whether the descriptor belongs to a barcode font: true when its font family or its
     * font name starts with "Code" or contains "barcode" in any letter case. A missing family or
     * name is treated as an empty string.
     *
     * @param fontDescriptor descriptor to inspect, must not be null
     * @return true when either value looks like a barcode font name
     */
    private static boolean probablyBarcodeFont(PDFontDescriptor fontDescriptor)
    {
        String family = fontDescriptor.getFontFamily();
        if (family == null)
        {
            family = "";
        }
        String name = fontDescriptor.getFontName();
        if (name == null)
        {
            name = "";
        }

        if (family.startsWith("Code") || family.toLowerCase().contains("barcode"))
        {
            return true;
        }
        return name.startsWith("Code") || name.toLowerCase().contains("barcode");
    }

    /**
     * Tells whether the given font covers the character collection described by the
     * CIDSystemInfo. Only applies to Adobe-GB1, Adobe-CNS1, Adobe-Japan1, Adobe-Korea1, as per the
     * PDF spec.
     * <p>
     * A font that carries its own CIDSystemInfo must have the same registry and ordering.
     * Otherwise the decision is taken from bits 17 to 21 of the font's code page range.
     *
     * @param cidSystemInfo the character collection that is wanted
     * @param info          the candidate font
     * @return true when the font supports the collection; false for any other ordering
     */
    private boolean isCharSetMatch(PDCIDSystemInfo cidSystemInfo, FontInfo info)
    {
        if (info.getCIDSystemInfo() != null)
        {
            boolean sameRegistry = info.getCIDSystemInfo().getRegistry()
                    .equals(cidSystemInfo.getRegistry());
            return sameRegistry
                    && info.getCIDSystemInfo().getOrdering().equals(cidSystemInfo.getOrdering());
        }

        // code page range bits examined below
        final long japaneseJis = 1L << 17;
        final long chineseSimplified = 1L << 18;
        final long koreanWansung = 1L << 19;
        final long chineseTraditional = 1L << 20;
        final long koreanJohab = 1L << 21;

        long codePages = info.getCodePageRange();
        if ("MalgunGothic-Semilight".equals(info.getPostScriptName()))
        {
            // PDFBOX-4793 and PDF.js 10699: This font has only Korean, but has bits 17-21 set.
            codePages &= ~(japaneseJis | chineseSimplified | chineseTraditional);
        }

        switch (cidSystemInfo.getOrdering())
        {
        case "GB1":
            return (codePages & chineseSimplified) != 0;
        case "CNS1":
            return (codePages & chineseTraditional) != 0;
        case "Japan1":
            return (codePages & japaneseJis) != 0;
        case "Korea1":
            return (codePages & koreanWansung) != 0 || (codePages & koreanJohab) != 0;
        default:
            return false;
        }
    }

    /**
     * A potential match for a font substitution. Instances order by descending score, so the
     * head of a {@code PriorityQueue} of matches is the highest-scored one.
     */
    private static class FontMatch implements Comparable<FontMatch>
    {
        /** Suitability score; starts at zero and is adjusted by the matching code. */
        double score;
        final FontInfo info;

        FontMatch(FontInfo info)
        {
            this.info = info;
        }

        @Override
        public int compareTo(FontMatch match)
        {
            // arguments deliberately reversed: a higher score sorts first
            return Double.compare(match.score, this.score);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy