Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it as often as you want.
org.sejda.sambox.pdmodel.font.FontMapperImpl Maven / Gradle / Ivy
Go to download
An Apache PDFBox fork intended to be used as PDF processor for Sejda and PDFsam
related projects
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sejda.sambox.pdmodel.font;
import static java.util.Optional.ofNullable;
import static org.sejda.commons.util.RequireUtils.requireNotNullArg;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import org.apache.fontbox.FontBoxFont;
import org.apache.fontbox.ttf.OpenTypeFont;
import org.apache.fontbox.ttf.TTFParser;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.type1.Type1Font;
import org.sejda.sambox.SAMBox;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Font mapper, locates non-embedded fonts via a pluggable FontProvider.
*
* @author John Hewson
*/
final class FontMapperImpl implements FontMapper
{
private final CompletableFuture> fontInfoByName;
private final TrueTypeFont lastResortFont;
private static final Logger LOG = LoggerFactory.getLogger(FontMapperImpl.class);
/**
* Map of PostScript name substitutes, in priority order.
*/
private final Map> substitutes = new HashMap<>();
/**
 * Creates a mapper backed by the provider that {@link #loadFontProvider()} selects.
 */
FontMapperImpl()
{
this(loadFontProvider());
}
/**
 * Creates a mapper backed by the given provider.
 * <p>
 * The font index is built by an asynchronous task; the substitution table for the standard 14
 * fonts and the last-resort font are set up synchronously.
 *
 * @param fontProvider source of the available fonts
 */
FontMapperImpl(FontProvider fontProvider)
{
    // index every font under each of its PostScript name variants, lower-cased
    fontInfoByName = CompletableFuture.supplyAsync(() -> {
        Map<String, FontInfo> map = new LinkedHashMap<>();
        fontProvider.getFontInfo().forEach(
                info -> getPostScriptNames(info.getPostScriptName()).forEach(
                        name -> map.put(name.toLowerCase(Locale.ENGLISH), info)));
        return map;
    });
    // substitutes for standard 14 fonts; the lists are mutable copies because
    // addSubstitute(String, String) appends to them later
    addSubstitutes("Courier", new ArrayList<>(
            Arrays.asList("CourierNew", "CourierNewPSMT", "LiberationMono",
                    "NimbusMonL-Regu")));
    addSubstitutes("Courier-Bold", new ArrayList<>(
            Arrays.asList("CourierNewPS-BoldMT", "CourierNew-Bold", "LiberationMono-Bold",
                    "NimbusMonL-Bold")));
    addSubstitutes("Courier-Oblique", new ArrayList<>(
            Arrays.asList("CourierNewPS-ItalicMT", "CourierNew-Italic", "LiberationMono-Italic",
                    "NimbusMonL-ReguObli")));
    addSubstitutes("Courier-BoldOblique", new ArrayList<>(
            Arrays.asList("CourierNewPS-BoldItalicMT", "CourierNew-BoldItalic",
                    "LiberationMono-BoldItalic", "NimbusMonL-BoldObli")));
    addSubstitutes("Helvetica", new ArrayList<>(
            Arrays.asList("ArialMT", "Arial", "LiberationSans", "NimbusSanL-Regu")));
    addSubstitutes("Helvetica-Bold", new ArrayList<>(
            Arrays.asList("Arial-BoldMT", "Arial-Bold", "LiberationSans-Bold",
                    "NimbusSanL-Bold")));
    addSubstitutes("Helvetica-Oblique", new ArrayList<>(
            Arrays.asList("Arial-ItalicMT", "Arial-Italic", "Helvetica-Italic",
                    "LiberationSans-Italic", "NimbusSanL-ReguItal")));
    addSubstitutes("Helvetica-BoldOblique", new ArrayList<>(
            Arrays.asList("Arial-BoldItalicMT", "Helvetica-BoldItalic",
                    "LiberationSans-BoldItalic", "NimbusSanL-BoldItal")));
    addSubstitutes("Times-Roman", new ArrayList<>(
            Arrays.asList("TimesNewRomanPSMT", "TimesNewRoman", "TimesNewRomanPS",
                    "LiberationSerif", "NimbusRomNo9L-Regu")));
    addSubstitutes("Times-Bold", new ArrayList<>(
            Arrays.asList("TimesNewRomanPS-BoldMT", "TimesNewRomanPS-Bold",
                    "TimesNewRoman-Bold", "LiberationSerif-Bold", "NimbusRomNo9L-Medi")));
    addSubstitutes("Times-Italic", new ArrayList<>(
            Arrays.asList("TimesNewRomanPS-ItalicMT", "TimesNewRomanPS-Italic",
                    "TimesNewRoman-Italic", "LiberationSerif-Italic",
                    "NimbusRomNo9L-ReguItal")));
    addSubstitutes("Times-BoldItalic", new ArrayList<>(
            Arrays.asList("TimesNewRomanPS-BoldItalicMT", "TimesNewRomanPS-BoldItalic",
                    "TimesNewRoman-BoldItalic", "LiberationSerif-BoldItalic",
                    "NimbusRomNo9L-MediItal")));
    addSubstitutes("Symbol",
            new ArrayList<>(Arrays.asList("Symbol", "SymbolMT", "StandardSymL", "ChromSymbolOTF")));
    addSubstitutes("ZapfDingbats", new ArrayList<>(
            Arrays.asList("ZapfDingbatsITCbyBT-Regular", "ZapfDingbatsITC", "Dingbats",
                    "MS-Gothic", "ChromDingbatsOTF")));
    // Acrobat also uses alternative names for Standard 14 fonts, which we map to those above
    // these include names such as "Arial" and "TimesNewRoman"
    for (String baseName : Standard14Fonts.getNames())
    {
        // only names that were not registered explicitly above need an alias entry
        if (getSubstitutes(baseName).isEmpty())
        {
            String mappedName = Standard14Fonts.getMappedFontName(baseName);
            addSubstitutes(baseName, copySubstitutes(mappedName.toLowerCase(Locale.ENGLISH)));
        }
    }
    // font handed out when nothing else matches
    lastResortFont = loadLastResortFont();
}
/**
 * Returns the font that was loaded via {@link #loadLastResortFont()} when this mapper was
 * constructed. The lookup methods of this class use it when nothing else matches.
 */
public TrueTypeFont getLastResortFont()
{
return lastResortFont;
}
/**
 * Parses the LiberationSans-Regular TrueType font shipped as a classpath resource.
 *
 * @return the parsed font
 * @throws RuntimeException wrapping the {@link IOException} raised while parsing
 */
public static TrueTypeFont loadLastResortFont()
{
    try
    {
        InputStream resource = FontMapperImpl.class.getResourceAsStream(
                "/org/sejda/sambox/resources/ttf/LiberationSans-Regular.ttf");
        // presumably rejects a missing resource with the given message -- see RequireUtils
        requireNotNullArg(resource,
                "Unable to load org/sejda/sambox/resources/ttf/LiberationSans-Regular.ttf");
        TTFParser parser = new TTFParser();
        InputStream buffered = new BufferedInputStream(resource);
        return parser.parse(buffered);
    }
    catch (IOException cause)
    {
        throw new RuntimeException(cause);
    }
}
/**
 * Chooses the font provider according to the {@code SAMBox.FONT_PROVIDER_PROPERTY} system
 * property.
 * <ul>
 * <li>{@code "noop"} (any case) selects a {@code NoopFontProvider};</li>
 * <li>any other non-empty value is treated as a class name and instantiated through its
 * no-argument constructor;</li>
 * <li>an unset or empty value, or a failed instantiation (which is logged), selects a
 * {@code FileSystemFontProvider}.</li>
 * </ul>
 *
 * @return the provider to use, never null
 */
public static FontProvider loadFontProvider()
{
    String providerName = System.getProperty(SAMBox.FONT_PROVIDER_PROPERTY);
    if ("noop".equalsIgnoreCase(providerName))
    {
        return new NoopFontProvider();
    }
    boolean customProviderConfigured = providerName != null && !providerName.isEmpty();
    if (customProviderConfigured)
    {
        try
        {
            LOG.debug("Trying to use {} as font provider...", providerName);
            Class<?> providerClass = Class.forName(providerName);
            Object provider = providerClass.getDeclaredConstructor().newInstance();
            return (FontProvider) provider;
        }
        catch (Exception ex)
        {
            // best effort: report the problem and fall through to the default provider
            LOG.error("Failed loading custom font provider", ex);
        }
    }
    return new FileSystemFontProvider();
}
/**
 * Gets alternative names, as seen in some PDFs, e.g. PDFBOX-142.
 *
 * @param postScriptName the font's built-in PostScript name
 * @return the name itself plus the name with all hyphens removed; an empty set when the name
 *         is null, so that a font without a PostScript name is skipped rather than failing
 *         the caller
 */
private Set<String> getPostScriptNames(String postScriptName)
{
    if (postScriptName == null)
    {
        return Collections.emptySet();
    }
    Set<String> names = new HashSet<>(2);
    // built-in PostScript name
    names.add(postScriptName);
    // remove hyphens (e.g. Arial-Black -> ArialBlack)
    names.add(postScriptName.replace("-", ""));
    return names;
}
/**
 * Returns an independent, mutable copy of the substitute list registered under the given key.
 *
 * @param postScriptName key into the substitutes map; used as-is, so callers must pass it
 *        already lower-cased
 * @return a new list holding the registered substitutes in the same order, or a new empty list
 *         when nothing is registered under that key
 */
private List<String> copySubstitutes(String postScriptName)
{
    List<String> registered = substitutes.get(postScriptName);
    if (registered == null)
    {
        return new ArrayList<>();
    }
    return new ArrayList<>(registered);
}
/**
 * Registers an additional substitute for the given font.
 * <p>
 * The replacement is appended to the end of the font's substitute list, so it is tried after
 * every substitute that was registered earlier.
 *
 * @param match PostScript name of the font to match (matched case-insensitively)
 * @param replace PostScript name of the font to use as a replacement
 */
public void addSubstitute(String match, String replace)
{
    String lowerCaseMatch = match.toLowerCase(Locale.ENGLISH);
    // single lookup: create the list on first use, then append
    substitutes.computeIfAbsent(lowerCaseMatch, key -> new ArrayList<>()).add(replace);
}
/**
 * Registers the complete substitute list for a font, replacing any list registered before.
 *
 * @param match PostScript name of the font to match; stored lower-cased
 * @param replacements substitute PostScript names in priority order; the list is stored as
 *        given, not copied
 */
private void addSubstitutes(String match, List<String> replacements)
{
    substitutes.put(match.toLowerCase(Locale.ENGLISH), replacements);
}
/**
 * Returns the substitutes for a given font.
 *
 * @param postScriptName PostScript font name; spaces are removed and the result is lower-cased
 *        before the lookup
 * @return the registered substitute list (the stored instance, not a copy), or an empty list
 *         when none is registered
 */
private List<String> getSubstitutes(String postScriptName)
{
    String key = postScriptName.replace(" ", "").toLowerCase(Locale.ENGLISH);
    List<String> subs = substitutes.get(key);
    if (subs != null)
    {
        return subs;
    }
    return Collections.emptyList();
}
/**
 * Attempts to find a good fallback based on the font descriptor.
 * <p>
 * The family is taken from the descriptor flags (fixed pitch gives Courier, serif gives Times,
 * anything else Helvetica). Boldness is guessed from the font name and italic comes from the
 * descriptor.
 *
 * @param fontDescriptor descriptor of the font to replace, may be null
 * @return the name of one of the Courier, Times or Helvetica standard fonts; "Times-Roman"
 *         when there is no descriptor
 */
private String getFallbackFontName(PDFontDescriptor fontDescriptor)
{
    if (fontDescriptor == null)
    {
        // if there is no FontDescriptor then we just fall back to Times Roman
        return "Times-Roman";
    }
    // heuristic detection of bold
    boolean isBold = false;
    String name = fontDescriptor.getFontName();
    if (name != null)
    {
        // fixed locale, consistent with the other name normalisations in this class
        String lower = name.toLowerCase(Locale.ENGLISH);
        isBold = lower.contains("bold") || lower.contains("black") || lower.contains("heavy");
    }
    // font descriptor flags should describe the style
    if (fontDescriptor.isFixedPitch())
    {
        return "Courier" + fallbackStyleSuffix(isBold, fontDescriptor.isItalic(), "Oblique", "");
    }
    if (fontDescriptor.isSerif())
    {
        return "Times" + fallbackStyleSuffix(isBold, fontDescriptor.isItalic(), "Italic",
                "-Roman");
    }
    return "Helvetica" + fallbackStyleSuffix(isBold, fontDescriptor.isItalic(), "Oblique", "");
}

/**
 * Builds the style suffix of a standard font name.
 *
 * @param bold whether the bold variant is wanted
 * @param italic whether the slanted variant is wanted
 * @param slantWord the family's word for its slanted style ("Oblique" or "Italic")
 * @param regularSuffix suffix used when the font is neither bold nor slanted
 * @return "-Bold" followed by slantWord, "-Bold", "-" followed by slantWord, or regularSuffix
 */
private static String fallbackStyleSuffix(boolean bold, boolean italic, String slantWord,
        String regularSuffix)
{
    if (bold && italic)
    {
        return "-Bold" + slantWord;
    }
    if (bold)
    {
        return "-Bold";
    }
    if (italic)
    {
        return "-" + slantWord;
    }
    return regularSuffix;
}
/**
 * Finds a TrueType font with the given PostScript name, or a suitable substitute. A mapping is
 * always returned: when neither the named font nor the fallback derived from the descriptor
 * is available, the last-resort font is used.
 *
 * @param baseFont PostScript name of the wanted font
 * @param fontDescriptor FontDescriptor, used to choose the fallback; may be null
 * @return the mapping; its second constructor argument is false for a match on baseFont and
 *         true when a fallback was used
 */
@Override
public FontMapping<TrueTypeFont> getTrueTypeFont(String baseFont,
        PDFontDescriptor fontDescriptor)
{
    TrueTypeFont ttf = (TrueTypeFont) findFont(FontFormat.TTF, baseFont);
    if (ttf != null)
    {
        return new FontMapping<>(ttf, false);
    }
    // fallback - todo: i.e. fuzzy match
    String fontName = getFallbackFontName(fontDescriptor);
    ttf = (TrueTypeFont) findFont(FontFormat.TTF, fontName);
    if (ttf == null)
    {
        // we have to return something here as TTFs aren't strictly required on the system
        ttf = lastResortFont;
    }
    return new FontMapping<>(ttf, true);
}
/**
 * Finds a font with the given PostScript name, or a suitable substitute. This allows any font
 * to be substituted with a PFB, TTF or OTF. A mapping is always returned: when neither the
 * named font nor the fallback derived from the descriptor is available, the last-resort font
 * is used.
 *
 * @param baseFont PostScript name of the wanted font
 * @param fontDescriptor the FontDescriptor of the font to find, used to choose the fallback;
 *        may be null
 * @return the mapping; its second constructor argument is false for a match on baseFont and
 *         true when a fallback was used
 */
@Override
public FontMapping<FontBoxFont> getFontBoxFont(String baseFont, PDFontDescriptor fontDescriptor)
{
    FontBoxFont font = findFontBoxFont(baseFont);
    if (font != null)
    {
        return new FontMapping<>(font, false);
    }
    // fallback - todo: i.e. fuzzy match
    String fallbackName = getFallbackFontName(fontDescriptor);
    font = findFontBoxFont(fallbackName);
    if (font == null)
    {
        // we have to return something here as TTFs aren't strictly required on the system
        font = lastResortFont;
    }
    return new FontMapping<>(font, true);
}
/**
 * Looks the name up once per font format, in the order PFB, TTF, OTF, and returns the first
 * hit.
 *
 * @param postScriptName PostScript font name
 * @return the first font found, or null when no format yields a match
 */
private FontBoxFont findFontBoxFont(String postScriptName)
{
    Type1Font type1 = (Type1Font) findFont(FontFormat.PFB, postScriptName);
    if (type1 == null)
    {
        TrueTypeFont trueType = (TrueTypeFont) findFont(FontFormat.TTF, postScriptName);
        if (trueType == null)
        {
            // last attempt, the result (possibly null) is returned as-is
            return findFont(FontFormat.OTF, postScriptName);
        }
        return trueType;
    }
    return type1;
}
/**
 * Resolves a PostScript name to a font of the requested format, trying progressively looser
 * spellings of the name.
 * <p>
 * Attempts, in order: the name as given; the name without hyphens; each registered substitute;
 * the name with commas turned into hyphens; the name with "-Regular" appended.
 *
 * @param format required font format
 * @param postScriptName PostScript font name, may be null
 * @return the first font found, or null when the name is null or nothing matches
 */
private FontBoxFont findFont(FontFormat format, String postScriptName)
{
    // handle damaged PDFs, see PDFBOX-2884
    if (postScriptName == null)
    {
        return null;
    }
    // exact PostScript name
    FontInfo found = getFont(format, postScriptName);
    if (found == null)
    {
        // hyphens removed (e.g. Arial-Black -> ArialBlack)
        found = getFont(format, postScriptName.replace("-", ""));
    }
    if (found == null)
    {
        // named substitutes, in priority order
        Iterator<String> candidates = getSubstitutes(postScriptName).iterator();
        while (found == null && candidates.hasNext())
        {
            found = getFont(format, candidates.next());
        }
    }
    if (found == null)
    {
        // Windows names e.g. (ArialNarrow,Bold) -> (ArialNarrow-Bold)
        found = getFont(format, postScriptName.replaceAll(",", "-"));
    }
    if (found == null)
    {
        // appending "-Regular" works for Wingdings on windows
        found = getFont(format, postScriptName + "-Regular");
    }
    return found != null ? found.getFont() : null;
}
/**
 * Looks a single name up in the font index.
 *
 * @param format required font format
 * @param postScriptName name to look up; everything up to and including the first '+' is
 *        dropped and the rest is lower-cased before the lookup
 * @return the indexed font info when one exists under that name and has the requested format,
 *         otherwise null
 */
private FontInfo getFont(FontFormat format, String postScriptName)
{
    // strip subset tag (happens when we substitute a corrupt embedded font, see PDFBOX-2642)
    int plusAt = postScriptName.indexOf('+');
    String lookupName = plusAt >= 0 ? postScriptName.substring(plusAt + 1) : postScriptName;
    // blocks until the asynchronously built index is available
    FontInfo candidate = fontInfoByName.join().get(lookupName.toLowerCase(Locale.ENGLISH));
    if (candidate == null || candidate.getFormat() != format)
    {
        return null;
    }
    return candidate;
}
/**
 * Finds a CFF CID-Keyed font with the given PostScript name, or a suitable substitute. This
 * method can also map CJK fonts via their CIDSystemInfo (ROS). A mapping is always returned;
 * when nothing matches, it wraps the last-resort font.
 *
 * @param baseFont PostScript name of the wanted font
 * @param fontDescriptor FontDescriptor
 * @param cidSystemInfo the CID system info, e.g. "Adobe-Japan1", if any.
 * @return the mapping; its last constructor argument is false for a name match and true when
 *         a substitute was chosen
 */
@Override
public CIDFontMapping getCIDFont(String baseFont, PDFontDescriptor fontDescriptor,
        PDCIDSystemInfo cidSystemInfo)
{
    // try name match or substitute with OTF
    OpenTypeFont otf1 = (OpenTypeFont) findFont(FontFormat.OTF, baseFont);
    if (otf1 != null)
    {
        return new CIDFontMapping(otf1, null, false);
    }
    // try name match or substitute with TTF
    TrueTypeFont ttf = (TrueTypeFont) findFont(FontFormat.TTF, baseFont);
    if (ttf != null)
    {
        return new CIDFontMapping(null, ttf, false);
    }
    if (cidSystemInfo != null)
    {
        // "In Acrobat 3.0.1 and later, Type 0 fonts that use a CMap whose CIDSystemInfo
        // dictionary defines the Adobe-GB1, Adobe-CNS1 Adobe-Japan1, or Adobe-Korea1 character
        // collection can also be substituted." - Adobe Supplement to the ISO 32000
        String collection = cidSystemInfo.getRegistry() + "-" + cidSystemInfo.getOrdering();
        if (collection.equals("Adobe-GB1") || collection.equals("Adobe-CNS1")
                || collection.equals("Adobe-Japan1") || collection.equals("Adobe-Korea1"))
        {
            // try automatic substitutes via character collection; the queue is ordered so
            // that poll() yields the best-scored candidate
            PriorityQueue<FontMatch> queue = getFontMatches(fontDescriptor, cidSystemInfo);
            FontMatch bestMatch = queue.poll();
            if (bestMatch != null)
            {
                FontBoxFont font = bestMatch.info.getFont();
                if (font instanceof OpenTypeFont)
                {
                    return new CIDFontMapping((OpenTypeFont) font, null, true);
                }
                if (font != null)
                {
                    return new CIDFontMapping(null, font, true);
                }
            }
        }
    }
    // last-resort fallback
    return new CIDFontMapping(null, lastResortFont, true);
}
/**
 * Returns a queue of matching fonts, scored by suitability. Positive scores indicate matches
 * for certain attributes, while negative scores indicate mismatches. Zero scores are neutral.
 * The queue is ordered so that {@code poll()} returns the highest-scored candidate first.
 *
 * @param fontDescriptor FontDescriptor; when null every candidate keeps the neutral score
 * @param cidSystemInfo Font's CIDSystemInfo, may be null.
 * @return the scored candidates, possibly empty
 */
private PriorityQueue<FontMatch> getFontMatches(PDFontDescriptor fontDescriptor,
        PDCIDSystemInfo cidSystemInfo)
{
    PriorityQueue<FontMatch> queue = new PriorityQueue<>(20);
    for (FontInfo info : fontInfoByName.join().values())
    {
        // filter by CIDSystemInfo, if given
        if (cidSystemInfo != null && !isCharSetMatch(cidSystemInfo, info))
        {
            continue;
        }
        FontMatch match = new FontMatch(info);
        if (fontDescriptor == null)
        {
            // the caller hands the descriptor through unchecked: without one there is
            // nothing to score against, so keep the candidate with its neutral score
            queue.add(match);
            continue;
        }
        // Panose is the most reliable
        if (fontDescriptor.getPanose() != null && info.getPanose() != null)
        {
            PDPanoseClassification panose = fontDescriptor.getPanose().getPanose();
            if (panose.getFamilyKind() == info.getPanose().getFamilyKind())
            {
                if (panose.getFamilyKind() == 0 && (
                        info.getPostScriptName().toLowerCase().contains("barcode")
                                || info.getPostScriptName().startsWith("Code"))
                        && !probablyBarcodeFont(fontDescriptor))
                {
                    // PDFBOX-4268: ignore barcode font if we aren't searching for one.
                    continue;
                }
                // serifs
                if (panose.getSerifStyle() == info.getPanose().getSerifStyle())
                {
                    // exact match
                    match.score += 2;
                }
                else if (panose.getSerifStyle() >= 2 && panose.getSerifStyle() <= 5
                        && info.getPanose().getSerifStyle() >= 2
                        && info.getPanose().getSerifStyle() <= 5)
                {
                    // cove (serif)
                    match.score += 1;
                }
                else if (panose.getSerifStyle() >= 11 && panose.getSerifStyle() <= 13
                        && info.getPanose().getSerifStyle() >= 11
                        && info.getPanose().getSerifStyle() <= 13)
                {
                    // sans-serif
                    match.score += 1;
                }
                else if (panose.getSerifStyle() != 0 && info.getPanose().getSerifStyle() != 0)
                {
                    // mismatch
                    match.score -= 1;
                }
                // weight
                int weight = info.getPanose().getWeight();
                int weightClass = info.getWeightClassAsPanose();
                if (Math.abs(weight - weightClass) > 2)
                {
                    // inconsistent data in system font, usWeightClass wins
                    weight = weightClass;
                }
                if (panose.getWeight() == weight)
                {
                    // exact match
                    match.score += 2;
                }
                else if (panose.getWeight() > 1 && weight > 1)
                {
                    // the further apart the weights, the smaller (or more negative) the bonus
                    float dist = Math.abs(panose.getWeight() - weight);
                    match.score += 1 - dist * 0.5;
                }
                // todo: italic
                // ...
            }
        }
        else if (fontDescriptor.getFontWeight() > 0 && info.getWeightClass() > 0)
        {
            // usWeightClass is pretty reliable
            float dist = Math.abs(fontDescriptor.getFontWeight() - info.getWeightClass());
            match.score += 1 - (dist / 100) * 0.5;
        }
        // todo: italic
        // ...
        queue.add(match);
    }
    return queue;
}
/**
 * Heuristic: reports whether the descriptor's font family or font name looks like a barcode
 * font, i.e. starts with "Code" or contains "barcode" in any letter case. A missing family or
 * name is treated as an empty string.
 *
 * @param fontDescriptor descriptor to inspect, must not be null
 * @return true if either the family or the name matches
 */
private static boolean probablyBarcodeFont(PDFontDescriptor fontDescriptor)
{
    String family = ofNullable(fontDescriptor.getFontFamily()).orElse("");
    String fontName = ofNullable(fontDescriptor.getFontName()).orElse("");
    // family is checked before name, each with the same two tests
    for (String candidate : new String[] { family, fontName })
    {
        if (candidate.startsWith("Code") || candidate.toLowerCase().contains("barcode"))
        {
            return true;
        }
    }
    return false;
}
/**
 * Decides whether the given font covers the character collection named by a CIDSystemInfo.
 * <p>
 * When the font declares its own CIDSystemInfo, registry and ordering must both be equal.
 * Otherwise the font's code page range bits are tested against the requested ordering (GB1,
 * CNS1, Japan1 or Korea1); any other ordering never matches.
 *
 * @param cidSystemInfo the requested character collection
 * @param info the candidate font
 * @return true if the font is considered to cover the collection
 */
private boolean isCharSetMatch(PDCIDSystemInfo cidSystemInfo, FontInfo info)
{
    if (info.getCIDSystemInfo() != null)
    {
        boolean sameRegistry = info.getCIDSystemInfo().getRegistry()
                .equals(cidSystemInfo.getRegistry());
        return sameRegistry
                && info.getCIDSystemInfo().getOrdering().equals(cidSystemInfo.getOrdering());
    }
    // code page range bits tested below
    final long jisJapan = 1 << 17;
    final long chineseSimplified = 1 << 18;
    final long koreanWansung = 1 << 19;
    final long chineseTraditional = 1 << 20;
    final long koreanJohab = 1 << 21;
    long codePages = info.getCodePageRange();
    if ("MalgunGothic-Semilight".equals(info.getPostScriptName()))
    {
        // PDFBOX-4793 and PDF.js 10699: This font has only Korean, but has bits 17-21 set.
        codePages &= ~(jisJapan | chineseSimplified | chineseTraditional);
    }
    switch (cidSystemInfo.getOrdering())
    {
    case "GB1":
        return (codePages & chineseSimplified) != 0;
    case "CNS1":
        return (codePages & chineseTraditional) != 0;
    case "Japan1":
        return (codePages & jisJapan) != 0;
    case "Korea1":
        // either Korean code page is sufficient
        return (codePages & koreanWansung) != 0 || (codePages & koreanJohab) != 0;
    default:
        return false;
    }
}
/**
 * A potential match for a font substitution.
 * <p>
 * The natural ordering is by descending score, so the head of a {@code PriorityQueue} of
 * matches is the candidate with the highest score.
 */
private static class FontMatch implements Comparable<FontMatch>
{
    /** Suitability score; starts at zero and is adjusted while candidates are evaluated. */
    double score;
    /** The candidate font this score belongs to. */
    final FontInfo info;

    FontMatch(FontInfo info)
    {
        this.info = info;
    }

    /**
     * Orders matches from highest to lowest score (note the swapped arguments).
     */
    @Override
    public int compareTo(FontMatch match)
    {
        return Double.compare(match.score, this.score);
    }
}
}