org.sejda.impl.sambox.util.FontUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sejda-sambox Show documentation
Show all versions of sejda-sambox Show documentation
Package containing tasks implemented using sambox.
/*
* Copyright 2015 by Andrea Vacondio ([email protected]).
*
* This file is part of the Sejda source code
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.sejda.impl.sambox.util;
import org.apache.commons.lang3.StringUtils;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.sejda.commons.util.IOUtils;
import org.sejda.impl.sambox.component.TextWithFont;
import org.sejda.impl.sambox.component.font.FallbackFontsProvider;
import org.sejda.model.exception.TaskIOException;
import org.sejda.model.exception.UnsupportedTextException;
import org.sejda.model.pdf.StandardType1Font;
import org.sejda.model.pdf.font.FontResource;
import org.sejda.model.pdf.font.Type0FontsProvider;
import org.sejda.sambox.cos.COSBase;
import org.sejda.sambox.cos.COSDictionary;
import org.sejda.sambox.cos.COSName;
import org.sejda.sambox.cos.COSStream;
import org.sejda.sambox.pdmodel.PDDocument;
import org.sejda.sambox.pdmodel.common.PDRectangle;
import org.sejda.sambox.pdmodel.font.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.awt.geom.GeneralPath;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull;
import static java.util.Optional.ofNullable;
import static java.util.stream.StreamSupport.stream;
import static org.sejda.commons.util.RequireUtils.requireNotNullArg;
import static org.sejda.sambox.util.BidiUtils.visualToLogical;
/**
* Utility to map from Sejda font definition to PDFBox.
*
* @author Andrea Vacondio
*/
public final class FontUtils {
private static final Logger LOG = LoggerFactory.getLogger(FontUtils.class);

private FontUtils() {
    // utility class, not meant to be instantiated
}

/**
 * Immutable lookup table from the Sejda standard type 1 font enum to the corresponding SAMBox font instance.
 */
private static final Map<StandardType1Font, PDType1Font> STANDARD_TYPE1_FONTS;
static {
    Map<StandardType1Font, PDType1Font> fontsCache = new EnumMap<>(StandardType1Font.class);
    fontsCache.put(StandardType1Font.CURIER, PDType1Font.COURIER);
    fontsCache.put(StandardType1Font.CURIER_BOLD, PDType1Font.COURIER_BOLD);
    fontsCache.put(StandardType1Font.CURIER_BOLD_OBLIQUE, PDType1Font.COURIER_BOLD_OBLIQUE);
    fontsCache.put(StandardType1Font.CURIER_OBLIQUE, PDType1Font.COURIER_OBLIQUE);
    fontsCache.put(StandardType1Font.HELVETICA, PDType1Font.HELVETICA);
    fontsCache.put(StandardType1Font.HELVETICA_BOLD, PDType1Font.HELVETICA_BOLD);
    fontsCache.put(StandardType1Font.HELVETICA_BOLD_OBLIQUE, PDType1Font.HELVETICA_BOLD_OBLIQUE);
    fontsCache.put(StandardType1Font.HELVETICA_OBLIQUE, PDType1Font.HELVETICA_OBLIQUE);
    fontsCache.put(StandardType1Font.SYMBOL, PDType1Font.SYMBOL);
    fontsCache.put(StandardType1Font.ZAPFDINGBATS, PDType1Font.ZAPF_DINGBATS);
    fontsCache.put(StandardType1Font.TIMES_BOLD, PDType1Font.TIMES_BOLD);
    fontsCache.put(StandardType1Font.TIMES_BOLD_ITALIC, PDType1Font.TIMES_BOLD_ITALIC);
    fontsCache.put(StandardType1Font.TIMES_ITALIC, PDType1Font.TIMES_ITALIC);
    fontsCache.put(StandardType1Font.TIMES_ROMAN, PDType1Font.TIMES_ROMAN);
    // expose a read-only view so the table cannot be altered after class initialization
    STANDARD_TYPE1_FONTS = Collections.unmodifiableMap(fontsCache);
}
/** Font used as the starting point when checking which parts of a text are supported. */
public static PDFont HELVETICA = PDType1Font.HELVETICA;

/**
 * Font resources contributed by every {@link Type0FontsProvider} found through the {@link ServiceLoader}, ordered by
 * ascending {@link FontResource#priority()}.
 */
public static final FontResource[] TYPE0FONTS;
static {
    TYPE0FONTS = stream(ServiceLoader.load(Type0FontsProvider.class).spliterator(), false)
            .flatMap(p -> p.getFonts().stream()).sorted(Comparator.comparingInt(FontResource::priority))
            .toArray(FontResource[]::new);
}

/**
 * Fallback font providers found through the {@link ServiceLoader}, ordered by ascending
 * {@link FallbackFontsProvider#getPriority()}.
 */
public static final List<FallbackFontsProvider> FALLBACK_FONTS_PROVIDERS = stream(
        ServiceLoader.load(FallbackFontsProvider.class).spliterator(), false)
        .sorted(Comparator.comparingInt(FallbackFontsProvider::getPriority)).collect(Collectors.toList());
/**
 * Translates a Sejda standard type 1 font into its SAMBox counterpart.
 *
 * @param st1Font
 *            the Sejda font definition to look up
 * @return the matching SAMBox font, or null when the lookup table has no entry for the given key
 */
public static PDType1Font getStandardType1Font(StandardType1Font st1Font) {
    return STANDARD_TYPE1_FONTS.get(st1Font);
}
/**
 * Returns the given font when it can display the text, otherwise looks for a fallback font.
 *
 * @param text
 *            the text to be written
 * @param font
 *            the preferred font
 * @param document
 *            the document fallback fonts are loaded into
 * @return the given font if it can display the text, otherwise the fallback found (possibly null)
 */
public static PDFont fontOrFallback(String text, PDFont font, PDDocument document) {
    if (canDisplay(text, font)) {
        return font;
    }
    // keep the weight of the preferred font when searching for a replacement
    PDFont replacement = findFontFor(document, text, isBold(font), font);
    String replacementName = null;
    if (replacement != null) {
        replacementName = replacement.getName();
    }
    LOG.debug("Text '{}' cannot be written with font {}, using fallback {}", text, font.getName(),
            replacementName);
    return replacement;
}
// caches fonts, PER DOCUMENT
// has no auto-magical way to clear the cache when doc processing is done
// if you use this in a long lived process, call the cache clear method to avoid leaking memory
// if we get some issue we could consider something like com.twelvemonkeys.util.WeakWeakMap
private static Map> loadedFontCache = new HashMap<>();
public static void clearLoadedFontCache() {
loadedFontCache.clear();
}
public static void clearLoadedFontCache(PDDocument document) {
loadedFontCache.remove(document);
}
/** Transient metadata key set on fonts that were loaded from a Sejda font resource. */
public static final String REMARK_FROM_SEJDA_FONT_RESOURCE = "FromSejdaFontResource";

/**
 * Loads the given font resource into the document, reusing the instance cached for that document if there is one.
 *
 * @param document
 *            the document the font is loaded into; also the key of the per-document cache
 * @param font
 *            the font resource to load
 * @return the loaded font, or null if reading the font stream failed
 */
public static PDFont loadFont(PDDocument document, FontResource font) {
    Map<String, PDFont> docCache = loadedFontCache.computeIfAbsent(document, d -> new HashMap<>());
    // only successfully loaded (non-null) fonts are ever cached, so a null lookup means "not loaded yet"
    PDFont cached = docCache.get(font.getResource());
    if (cached != null) {
        return cached;
    }
    try (InputStream in = font.getFontStream()) {
        PDType0Font loaded = PDType0Font.load(document, in);
        // mark this font as loaded from a sejda font resource
        loaded.getTransientMetadata().put(REMARK_FROM_SEJDA_FONT_RESOURCE, "true");
        LOG.trace("Loaded font {}", loaded.getName());
        docCache.put(font.getResource(), loaded);
        return loaded;
    } catch (IOException e) {
        LOG.warn("Failed to load font {}", font, e);
        return null;
    }
}
/**
 * Same as {@link #findFontFor(PDDocument, String, boolean, PDFont)} asking for a non bold font and without an
 * original font.
 */
public static PDFont findFontFor(PDDocument document, String text) {
    return findFontFor(document, text, false, null);
}

/**
 * Searches for a font able to display the given text. Fallback font providers are asked first, then the available
 * type 0 font resources are tried in order.
 *
 * @param document
 *            the document fonts are loaded into
 * @param text
 *            the text that has to be displayed
 * @param bold
 *            whether a bold font is preferred
 * @param originalFont
 *            the font the text was meant to be written with, handed over to the fallback providers
 * @return a font capable of displaying the given string or null
 */
public static PDFont findFontFor(PDDocument document, String text, boolean bold, PDFont originalFont) {
    // providers take precedence over the bundled font resources
    for (FallbackFontsProvider provider : FALLBACK_FONTS_PROVIDERS) {
        PDFont candidate = provider.findFallbackFont(originalFont, document, text, bold);
        if (candidate == null) {
            continue;
        }
        LOG.debug("Found suitable font {} to display '{}', via provider {}", candidate.getName(), text,
                provider.getClass().getName());
        return candidate;
    }
    // NOTE(review): this keeps the LAST displayable font with a non matching style, not the first one - confirm
    // whether that is intended
    PDFont styleMismatch = null;
    for (FontResource resource : TYPE0FONTS) {
        PDFont candidate = loadFont(document, resource);
        if (!canDisplay(text, candidate)) {
            continue;
        }
        LOG.debug("Found suitable font {} to display '{}'", candidate, text);
        if (isBold(candidate) == bold) {
            // style matches as well, no need to look any further
            return candidate;
        }
        // remember it in case no font with a matching style shows up
        styleMismatch = candidate;
    }
    return styleMismatch;
}
/**
 * Tells if the text is made exclusively of characters in the unicode "space separator" (Zs) category. An empty text
 * qualifies as well.
 *
 * @param text
 *            the text to inspect
 * @return true if every code point of the text is a unicode space separator
 */
public static boolean isOnlyWhitespace(String text) {
    return text.codePoints().allMatch(codePoint -> Character.getType(codePoint) == Character.SPACE_SEPARATOR);
}
/**
 * Strips unicode space separators (Zs category) and line breaks (CRLF and LF) from the input string.
 *
 * @param text
 *            the text to clean up
 * @return the resulting string
 */
public static String removeWhitespace(String text) {
    String withoutSpaces = text.replaceAll("\\p{Zs}", "");
    // CRLF pairs go first so that only the remaining lone LF are left to handle
    String withoutCrLf = withoutSpaces.replace("\r\n", "");
    return withoutCrLf.replace("\n", "");
}
/**
 * Tells if the given font is able to render a space character: the space must be encodable, have a positive width
 * and decode back to a space.
 *
 * @param font
 *            the font to check
 * @return true if a space can be written using the font
 */
public static boolean canDisplaySpace(PDFont font) {
    final String space = " ";
    try {
        // encoding comes first: a font unable to encode the space fails right here
        byte[] encoded = font.encode(space);
        if (font.getStringWidth(space) <= 0) {
            return false;
        }
        // what gets displayed has to match the input, eg: no cmap gibberish issues
        return areEncodeDecodeSame(font, space, encoded);
    } catch (IllegalArgumentException | IOException | UnsupportedOperationException | NullPointerException e) {
        // the font cannot handle the space
        return false;
    }
}
/**
 * Tells if the font is able to display the text. IMPORTANT: whitespace in the text is ignored.
 *
 * @param text
 *            the text to check
 * @param font
 *            the font to check, can be null
 * @return true if the given font can display the given text
 */
public static boolean canDisplay(String text, PDFont font) {
    String withoutWhitespace = removeWhitespace(text);
    return canDisplayString(withoutWhitespace, font);
}

/**
 * Checks that the text can be encoded with the font, that every resulting code has a glyph with a non zero width
 * and, for fonts not loaded from a Sejda font resource, that decoding the encoded bytes gives back the text.
 *
 * @return true if the text can be displayed, false if it cannot or if the font is null
 */
private static boolean canDisplayString(String text, PDFont font) {
    if (font == null) {
        return false;
    }
    try {
        byte[] encoded = font.encode(text);
        int[] cid2gid = null;
        if (font instanceof PDType0Font type0Font) {
            try {
                cid2gid = readCIDToGIDMap(type0Font.getDescendantFont());
            } catch (Exception e) {
                // best effort: without the map the glyph id check below is simply skipped
                LOG.warn("Exception reading CIDToGIDMap: " + e.getMessage());
            }
        }
        if (font instanceof PDVectorFont vectorFont) {
            InputStream codes = new ByteArrayInputStream(encoded);
            while (codes.available() > 0) {
                int code = font.readCode(codes);
                GeneralPath outline = vectorFont.getPath(code);
                boolean invisible = outline == null || outline.getBounds2D().getWidth() == 0;
                if (invisible) {
                    return false;
                }
                // a code mapped to glyph id 0 is treated as not displayable
                boolean mappedToGlyphZero = cid2gid != null && code < cid2gid.length && cid2gid[code] == 0;
                if (mappedToGlyphZero) {
                    return false;
                }
            }
        }
        // fonts loaded from sejda font resources are trusted, for the others make sure the displayed
        // text is the same as the input, eg: no cmap gibberish issues
        boolean trusted = "true".equals(font.getTransientMetadata().get(REMARK_FROM_SEJDA_FONT_RESOURCE));
        return trusted || areEncodeDecodeSame(font, text, encoded);
    } catch (IllegalArgumentException | IOException | UnsupportedOperationException | NullPointerException e) {
        // the font cannot deal with the text
        return false;
    }
}
/**
 * Reads the CIDToGIDMap entry of the given font dictionary, when it is a stream, as an array of glyph ids indexed by
 * CID. Every glyph id is stored in the stream as two bytes, high order byte first.
 *
 * @param font
 *            the font whose dictionary is inspected
 * @return the glyph ids indexed by CID, or null if the CIDToGIDMap entry is not a stream
 * @throws IOException
 *             if the stream cannot be read
 */
private static int[] readCIDToGIDMap(PDCIDFont font) throws IOException {
    COSDictionary dict = font.getCOSObject();
    COSBase map = dict.getDictionaryObject(COSName.CID_TO_GID_MAP);
    if (!(map instanceof COSStream)) {
        return null;
    }
    byte[] mapAsBytes;
    InputStream in = ((COSStream) map).getUnfilteredStream();
    try {
        mapAsBytes = IOUtils.toByteArray(in);
    } finally {
        // close even when reading fails, otherwise the stream would leak
        IOUtils.closeQuietly(in);
    }
    // a trailing odd byte, if any, is ignored
    int numberOfInts = mapAsBytes.length / 2;
    int[] cid2gid = new int[numberOfInts];
    for (int index = 0; index < numberOfInts; index++) {
        int offset = index * 2;
        cid2gid[index] = (mapAsBytes[offset] & 0xff) << 8 | mapAsBytes[offset + 1] & 0xff;
    }
    return cid2gid;
}
/**
 * Finds the height of the tallest glyph used to write the text with the font. The height comes from the glyph bbox
 * for type 3 fonts and from the bounds of the glyph path for vector and simple fonts.
 *
 * @param text
 *            the text to measure
 * @param font
 *            the font used to write the text, cannot be null
 * @return the height of the tallest glyph, 0 if none could be determined. If an I/O error occurs it is logged and
 *         the height found up to that point is returned
 */
public static double calculateBBoxHeight(String text, PDFont font) {
    requireNotNullArg(font, "Font cannot be null");
    double tallest = 0;
    try {
        InputStream codes = new ByteArrayInputStream(font.encode(text));
        while (codes.available() > 0) {
            int code = font.readCode(codes);
            double glyphHeight = 0d;
            if (font instanceof PDType3Font type3Font) {
                glyphHeight = ofNullable(type3Font.getCharProc(code)).map(PDType3CharProc::getGlyphBBox)
                        .map(PDRectangle::toGeneralPath).map(p -> p.getBounds2D().getHeight()).orElse(0d);
            } else if (font instanceof PDVectorFont vectorFont) {
                glyphHeight = ofNullable(vectorFont.getPath(code)).map(p -> p.getBounds2D().getHeight())
                        .orElse(0d);
            } else if (font instanceof PDSimpleFont simpleFont) {
                // simple fonts are looked up by glyph name, which requires an encoding
                String glyphName = ofNullable(simpleFont.getEncoding()).map(e -> e.getName(code)).orElse(null);
                if (nonNull(glyphName)) {
                    glyphHeight = simpleFont.getPath(glyphName).getBounds2D().getHeight();
                }
            }
            tallest = Math.max(tallest, glyphHeight);
        }
    } catch (IOException e) {
        LOG.warn("An error occurred while calculating the highest glyph bbox", e);
    }
    return tallest;
}
/**
 * Guesses if the font is bold, based on its name.
 *
 * @param font
 *            the font to inspect
 * @return true if the font name contains "bold", ignoring case. false if the font has no name
 */
public static boolean isBold(PDFont font) {
    String name = font.getName();
    if (name == null) {
        return false;
    }
    // Locale.ROOT keeps the result independent from the default locale of the JVM
    return name.toLowerCase(Locale.ROOT).contains("bold");
}

/**
 * Guesses if the font is italic, based on its name.
 *
 * @param font
 *            the font to inspect
 * @return true if the font name contains "italic" or "oblique", ignoring case. false if the font has no name
 */
public static boolean isItalic(PDFont font) {
    String name = font.getName();
    if (name == null) {
        return false;
    }
    // Locale.ROOT is required here: with a Turkish default locale 'I' is lowercased to a dotless 'i' and names
    // like "ITALIC" or "OBLIQUE" would not be recognized
    String lowercasedName = name.toLowerCase(Locale.ROOT);
    return lowercasedName.contains("italic") || lowercasedName.contains("oblique");
}
/**
 * Helper for subset fonts. Determines if a font is subset, computes original font name. Provides methods for loading
 * the original full font from the system, if available, or loading a fallback font.
 */
public static class FontSubsetting {
    /** Name of the full font the subset was taken from, null if the font is not a subset. */
    public final String fontName;
    /** True if the font name follows the subset naming pattern (six characters tag, a plus sign, the name). */
    public final boolean isSubset;
    public final PDFont subsetFont;

    /**
     * @param subsetFont
     *            the font to inspect, cannot be null
     */
    public FontSubsetting(PDFont subsetFont) {
        this.subsetFont = subsetFont;
        // is it a subset font? ABCDEF+Verdana
        String name = StringUtils.trimToEmpty(subsetFont.getName());
        String[] fragments = name.split("\\+");
        if (fragments.length == 2 && fragments[0].length() == 6) {
            this.isSubset = true;
            this.fontName = fragments[1];
        } else {
            this.isSubset = false;
            this.fontName = null;
        }
    }

    /**
     * Tries to load the original full font from the system and, if that fails, a similar one.
     *
     * @return the loaded font or null if nothing suitable was found
     */
    public PDFont loadOriginalOrSimilar(PDDocument document) {
        PDFont original = loadOriginal(document);
        if (original == null) {
            return loadSimilar(document);
        }
        return original;
    }

    /**
     * Tries to load the original full font from the system
     *
     * @return the loaded font, or null if the font is not a subset, is not available or fails to load
     */
    public PDFont loadOriginal(PDDocument document) {
        if (fontName == null) {
            // not a subset font, there is no original name to look for
            return null;
        }
        String lookupName = fontName.replace("-", " ");
        LOG.debug("Searching the system for a font matching name '{}'", lookupName);
        FontMapping<TrueTypeFont> fontMapping = FontMappers.instance().getTrueTypeFont(lookupName, null);
        // a fallback mapping is not the original font, leave it to loadSimilar
        if (fontMapping != null && fontMapping.getFont() != null && !fontMapping.isFallback()) {
            TrueTypeFont mappedFont = fontMapping.getFont();
            try {
                LOG.debug("Original font available on the system: {}", fontName);
                return PDType0Font.load(document, mappedFont.getOriginalData());
            } catch (IOException ioe) {
                LOG.warn("Failed to load font from system", ioe);
                closeQuietly(mappedFont);
            }
        }
        return null;
    }

    /**
     * Tries to load a similar full font from the system
     *
     * @return the loaded font, or null if the font is not a subset, nothing similar is available or loading fails
     */
    public PDFont loadSimilar(PDDocument document) {
        if (fontName == null) {
            // not a subset font, there is no original name to look for
            return null;
        }
        String lookupName = fontName.replace("-", " ");
        // Eg: Arial-BoldMT
        PDFontDescriptor descriptor = new PDFontDescriptor(new COSDictionary());
        descriptor.setFontName(fontName.split("-")[0]);
        descriptor.setForceBold(FontUtils.isBold(subsetFont));
        descriptor.setItalic(FontUtils.isItalic(subsetFont));
        LOG.debug(
                "Searching the system for a font matching name '{}' and description [name:{}, bold:{}, italic:{}]",
                lookupName, descriptor.getFontName(), descriptor.isForceBold(), descriptor.isItalic());
        FontMapping<TrueTypeFont> fontMapping = FontMappers.instance().getTrueTypeFont(lookupName, descriptor);
        if (fontMapping != null && fontMapping.getFont() != null) {
            TrueTypeFont mappedFont = fontMapping.getFont();
            try {
                if (fontMapping.isFallback()) {
                    LOG.debug("Fallback font available on the system: {} (for {})", mappedFont.getName(), fontName);
                } else {
                    LOG.debug("Original font available on the system: {}", fontName);
                }
                return PDType0Font.load(document, mappedFont.getOriginalData());
            } catch (IOException ioe) {
                LOG.warn("Failed to load font from system", ioe);
                closeQuietly(mappedFont);
            }
        }
        return null;
    }

    /**
     * Closes a system font that failed to load, logging any failure instead of propagating it.
     */
    private static void closeQuietly(TrueTypeFont mappedFont) {
        try {
            mappedFont.close();
        } catch (IOException | RuntimeException e) {
            LOG.warn("Failed closing font", e);
        }
    }
}
/**
 * Wraps the given text on multiple lines, if it does not fit within the given maxWidth. It will try to determine if
 * all text can be written with given font and find a fallback for parts that are not supported.
 *
 * @param rawLabel
 *            the text to wrap, its whitespace is normalized first
 * @param font
 *            the preferred font
 * @param fontSize
 *            the font size used to compute widths
 * @param maxWidth
 *            the maximum width of a line
 * @param document
 *            the document fallback fonts are loaded into
 * @return the lines, each one trimmed
 * @throws TaskIOException
 *             if computing a width fails or if no suitable font is found for part of the text
 */
public static List<String> wrapLines(String rawLabel, PDFont font, float fontSize, double maxWidth,
        PDDocument document) throws TaskIOException {
    List<String> lines = new ArrayList<>();
    String label = org.sejda.commons.util.StringUtils.normalizeWhitespace(rawLabel);
    StringBuilder currentString = new StringBuilder();
    double currentWidth = 0;
    List<TextWithFont> resolvedStringsToFonts = FontUtils.resolveFonts(label, font, document);
    for (TextWithFont stringAndFont : resolvedStringsToFonts) {
        try {
            PDFont resolvedFont = stringAndFont.getFont();
            String resolvedLabel = stringAndFont.getText();
            if (isNull(resolvedFont)) {
                throw new UnsupportedTextException(
                        "Unable to find suitable font for string \"" + resolvedLabel + "\"", resolvedLabel);
            }
            // split at word boundaries, keeping the separators
            String[] words = visualToLogical(resolvedLabel).split("(?<=\\b)");
            for (String word : words) {
                double textWidth = getSimpleStringWidth(word, resolvedFont, fontSize);
                if (textWidth > maxWidth || word.length() > 10) {
                    // this is a giant word that has no breaks and exceeds max width (or is longer than 10 chars)
                    // check for each char if it can be added to current line, wrap on new line if not
                    for (int codePoint : word.codePoints().toArray()) {
                        String ch = new String(Character.toChars(codePoint));
                        double chWidth = getSimpleStringWidth(ch, resolvedFont, fontSize);
                        if (currentWidth + chWidth > maxWidth) {
                            // hyphenate the word that continues on the next line
                            currentString.append("-");
                            lines.add(currentString.toString().trim());
                            currentString = new StringBuilder();
                            currentWidth = 0;
                        }
                        currentWidth += chWidth;
                        currentString.append(ch);
                    }
                } else {
                    // regular scenario: check if word can be added to current line, wrap on new line if not
                    if (currentWidth + textWidth > maxWidth) {
                        lines.add(currentString.toString().trim());
                        currentString = new StringBuilder();
                        currentWidth = 0;
                    }
                    currentWidth += textWidth;
                    currentString.append(word);
                }
            }
        } catch (IOException e) {
            throw new TaskIOException(e);
        }
    }
    if (currentString.length() > 0) {
        lines.add(currentString.toString().trim());
    }
    return lines;
}
/**
 * Computes the width of the text written with the given font at the given size. No check is made on whether the
 * font can actually display the text and no fallback font is searched.
 *
 * @param text
 *            the text to measure
 * @param font
 *            the font used to measure the text
 * @param fontSize
 *            the font size
 * @return the width of the text, or a width based on the average font width if the text width is reported as zero
 * @throws IOException
 *             if the font fails to compute the width
 */
public static double getSimpleStringWidth(String text, PDFont font, double fontSize) throws IOException {
    double measured = font.getStringWidth(text) / 1000 * fontSize;
    if (measured != 0) {
        return measured;
    }
    // sometimes the string width is reported incorrectly, too small. when writing ' ' (space) it leads to
    // missing spaces, so the average font width is used in place of a zero width
    // TODO: replace zero with heuristic based "small value"
    return font.getAverageFontWidth() / 1000 * fontSize;
}
/**
 * Supports writing labels which require multiple fonts (eg: mixing thai and english words). Returns a list of text
 * with associated font.
 *
 * @param label
 *            the text to write
 * @param font
 *            the preferred font, used whenever it can display a character
 * @param document
 *            the document fallback fonts are loaded into
 * @return the fragments of the label, in order, each with the font to use. The font of a fragment is null when no
 *         suitable font was found for it
 */
public static List<TextWithFont> resolveFonts(String label, PDFont font, PDDocument document) {
    PDFont currentFont = font;
    StringBuilder currentString = new StringBuilder();
    // we want to keep the insertion order
    List<TextWithFont> result = new ArrayList<>();
    for (int codePoint : label.codePoints().toArray()) {
        String s = new String(Character.toChars(codePoint));
        PDFont f = fontOrFallback(s, font, document);
        if (s.equals(" ")) {
            // we want space to be a separate text item because some fonts are missing the space glyph,
            // so we'll handle it separate from the other chars.
            // figure out if that's the case and switch to a standard font as fallback
            if (!FontUtils.canDisplaySpace(f)) {
                f = FontUtils.getStandardType1Font(StandardType1Font.HELVETICA);
            }
            if (f != currentFont) {
                // end current string, before space
                if (currentString.length() > 0) {
                    result.add(new TextWithFont(currentString.toString(), currentFont));
                }
                // add space
                result.add(new TextWithFont(" ", f));
                currentString = new StringBuilder();
                currentFont = f;
            } else {
                currentString.append(s);
            }
        } else if (currentFont == f) {
            currentString.append(s);
        } else {
            // font change: close the current fragment and start a new one
            if (currentString.length() > 0) {
                result.add(new TextWithFont(currentString.toString(), currentFont));
            }
            currentString = new StringBuilder(s);
            currentFont = f;
        }
    }
    result.add(new TextWithFont(currentString.toString(), currentFont));
    for (TextWithFont each : result) {
        LOG.trace("Will write '{}' with {}", each.getText(), each.getFont());
    }
    return result;
}
/**
 * Splits an input string into multiple fragments, when glyphs with 0 width are detected. Each zero width code point
 * becomes a fragment on its own, the text between them is kept together. Right to left text is never split.
 *
 * @param input
 *            the text to split
 * @param font
 *            the font used to find out the width of each code point
 * @return the fragments, in order
 * @throws RuntimeException
 *             wrapping the IOException thrown by the font while computing a width
 */
public static List<String> resolveTextFragments(String input, PDFont font) {
    List<String> result = new ArrayList<>();
    // splitting text breaks arabic cursive writing
    if (org.sejda.core.support.util.StringUtils.isRtl(input)) {
        result.add(input);
        return result;
    }
    // code points with a non zero width collected since the last split
    StringBuilder current = new StringBuilder();
    for (int codePoint : input.codePoints().toArray()) {
        try {
            if (font.getWidth(codePoint) == 0) {
                if (current.length() > 0) {
                    result.add(current.toString());
                    current.setLength(0);
                }
                result.add(new String(Character.toChars(codePoint)));
            } else {
                current.appendCodePoint(codePoint);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
    if (current.length() > 0) {
        result.add(current.toString());
    }
    return result;
}
/**
 * Removes from the text the parts for which no suitable font is found.
 *
 * @param text
 *            the text to clean up
 * @param doc
 *            the document fallback fonts are loaded into
 * @return the text without the unsupported parts
 */
public static String removeUnsupportedCharacters(String text, PDDocument doc) {
    return replaceUnsupportedCharacters(text, doc, "");
}

/**
 * Replaces the parts of the text for which no suitable font is found with the given replacement.
 *
 * @param text
 *            the text to clean up
 * @param doc
 *            the document fallback fonts are loaded into
 * @param replacement
 *            the text written in place of each unsupported part. It is used literally, characters like '$' or '\'
 *            have no special meaning
 * @return the text with the unsupported parts replaced
 */
public static String replaceUnsupportedCharacters(String text, PDDocument doc, String replacement) {
    Set<String> unsupported = new HashSet<>();
    for (TextWithFont tf : resolveFonts(text, HELVETICA, doc)) {
        // a fragment without a font is a fragment that cannot be written
        if (tf.getFont() == null) {
            unsupported.add(tf.getText());
        }
    }
    // replace unsupported groups of text longer ones first
    // eg: first replace "ääç" and then "ä"
    List<String> unsupportedSortedByLength = new ArrayList<>(unsupported);
    unsupportedSortedByLength.sort(Comparator.<String>comparingInt(String::length).reversed());
    String result = text;
    for (String s : unsupportedSortedByLength) {
        // literal replacement: replaceAll would interpret '$' and '\' found in the replacement
        result = result.replace(s, replacement);
    }
    return result;
}
/**
 * Tells if encoding the text with the font and decoding the result gives back the original text.
 *
 * @return true if the decoded text equals, once normalized, the given text
 * @throws IOException
 *             if the font fails to encode or decode
 */
public static boolean areEncodeDecodeSame(PDFont font, String text) throws IOException {
    byte[] encoded = font.encode(text);
    return areEncodeDecodeSame(font, text, encoded);
}

/**
 * Same as {@link #areEncodeDecodeSame(PDFont, String)} for a text that has already been encoded. A mismatch is
 * logged at info level.
 */
private static boolean areEncodeDecodeSame(PDFont font, String text, byte[] encoded) throws IOException {
    String decoded = decode(font, encoded);
    // japanese requires normalization
    // see https://stackoverflow.com/questions/45447843/testing-equality-for-visually-identical-characters-with-different-utf-8-encoding
    if (org.sejda.core.support.util.StringUtils.equalsNormalized(text, decoded)) {
        return true;
    }
    LOG.info("Font " + font.getName() + " cannot encode/decode text: '" + text + "', decoded was: '" + decoded
            + "' " + org.sejda.commons.util.StringUtils.asUnicodes(text) + " "
            + org.sejda.commons.util.StringUtils.asUnicodes(decoded));
    return false;
}

/**
 * Makes sure that encoding the text with the font and decoding the result gives back the original text.
 *
 * @throws IllegalStateException
 *             if the decoded text differs from the given text
 * @throws IOException
 *             if the font fails to encode or decode
 */
public static void assertEncodeDecodeSame(PDFont font, String text) throws IOException {
    boolean same = areEncodeDecodeSame(font, text);
    if (same) {
        return;
    }
    throw new IllegalStateException("Font " + font.getName() + " cannot encode/decode text: " + text);
}
/**
 * Decodes the given bytes to text using the font: codes are read from the bytes one at a time and the unicode
 * string the font maps each code to is appended to the result.
 *
 * @param font
 *            the font used to read and map the codes
 * @param bytes
 *            the encoded text
 * @return the decoded text
 * @throws IOException
 *             if the font fails to read a code or map it to unicode
 */
public static String decode(PDFont font, byte[] bytes) throws IOException {
    StringBuilder decoded = new StringBuilder();
    InputStream codes = new ByteArrayInputStream(bytes);
    while (codes.available() > 0) {
        String unicode = font.toUnicode(font.readCode(codes));
        decoded.append(unicode);
    }
    return decoded.toString();
}
}