org.sejda.sambox.cos.PDFDocEncoding Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sejda.sambox.cos;
import java.io.ByteArrayOutputStream;
import java.util.HashMap;
import java.util.Map;
/**
* The "PDFDocEncoding" encoding. Note that this is *not* a Type 1 font encoding, it is used only within PDF
* "text strings".
*/
final class PDFDocEncoding
{
private static final char REPLACEMENT_CHARACTER = '\uFFFD';
private static final int[] CODE_TO_UNI;
private static final Map UNI_TO_CODE;
static
{
CODE_TO_UNI = new int[256];
UNI_TO_CODE = new HashMap<>(256);
// initialize with basically ISO-8859-1
for (int i = 0; i < 256; i++)
{
// skip entries not in Unicode column
if (i > 0x17 && i < 0x20)
{
continue;
}
if (i > 0x7E && i < 0xA1)
{
continue;
}
if (i == 0xAD)
{
continue;
}
set(i, (char) i);
}
// then do all deviations (based on the table in ISO 32000-1:2008)
// block 1
set(0x18, '\u02D8'); // BREVE
set(0x19, '\u02C7'); // CARON
set(0x1A, '\u02C6'); // MODIFIER LETTER CIRCUMFLEX ACCENT
set(0x1B, '\u02D9'); // DOT ABOVE
set(0x1C, '\u02DD'); // DOUBLE ACUTE ACCENT
set(0x1D, '\u02DB'); // OGONEK
set(0x1E, '\u02DA'); // RING ABOVE
set(0x1F, '\u02DC'); // SMALL TILDE
// block 2
set(0x7F, REPLACEMENT_CHARACTER); // undefined
set(0x80, '\u2022'); // BULLET
set(0x81, '\u2020'); // DAGGER
set(0x82, '\u2021'); // DOUBLE DAGGER
set(0x83, '\u2026'); // HORIZONTAL ELLIPSIS
set(0x84, '\u2014'); // EM DASH
set(0x85, '\u2013'); // EN DASH
set(0x86, '\u0192'); // LATIN SMALL LETTER SCRIPT F
set(0x87, '\u2044'); // FRACTION SLASH (solidus)
set(0x88, '\u2039'); // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
set(0x89, '\u203A'); // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
set(0x8A, '\u2212'); // MINUS SIGN
set(0x8B, '\u2030'); // PER MILLE SIGN
set(0x8C, '\u201E'); // DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
set(0x8D, '\u201C'); // LEFT DOUBLE QUOTATION MARK (quotedblleft)
set(0x8E, '\u201D'); // RIGHT DOUBLE QUOTATION MARK (quotedblright)
set(0x8F, '\u2018'); // LEFT SINGLE QUOTATION MARK (quoteleft)
set(0x90, '\u2019'); // RIGHT SINGLE QUOTATION MARK (quoteright)
set(0x91, '\u201A'); // SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
set(0x92, '\u2122'); // TRADE MARK SIGN
set(0x93, '\uFB01'); // LATIN SMALL LIGATURE FI
set(0x94, '\uFB02'); // LATIN SMALL LIGATURE FL
set(0x95, '\u0141'); // LATIN CAPITAL LETTER L WITH STROKE
set(0x96, '\u0152'); // LATIN CAPITAL LIGATURE OE
set(0x97, '\u0160'); // LATIN CAPITAL LETTER S WITH CARON
set(0x98, '\u0178'); // LATIN CAPITAL LETTER Y WITH DIAERESIS
set(0x99, '\u017D'); // LATIN CAPITAL LETTER Z WITH CARON
set(0x9A, '\u0131'); // LATIN SMALL LETTER DOTLESS I
set(0x9B, '\u0142'); // LATIN SMALL LETTER L WITH STROKE
set(0x9C, '\u0153'); // LATIN SMALL LIGATURE OE
set(0x9D, '\u0161'); // LATIN SMALL LETTER S WITH CARON
set(0x9E, '\u017E'); // LATIN SMALL LETTER Z WITH CARON
set(0x9F, REPLACEMENT_CHARACTER); // undefined
set(0xA0, '\u20AC'); // EURO SIGN
// end of deviations
}
private PDFDocEncoding()
{
}
private static void set(int code, char unicode)
{
CODE_TO_UNI[code] = unicode;
UNI_TO_CODE.put(unicode, code);
}
/**
* @return the string representation of the given PDFDocEncoded bytes.
*/
public static String toString(byte[] bytes)
{
StringBuilder sb = new StringBuilder();
for (byte b : bytes)
{
if ((b & 0xff) >= CODE_TO_UNI.length)
{
sb.append('?');
}
else
{
sb.append((char) CODE_TO_UNI[b & 0xff]);
}
}
return sb.toString();
}
/**
* @return the given string encoded with PDFDocEncoding or null if the string cannot be encoded
*/
public static byte[] getBytes(String text)
{
ByteArrayOutputStream out = new ByteArrayOutputStream();
for (char c : text.toCharArray())
{
Integer code = UNI_TO_CODE.get(c);
if (code == null)
{
return null;
}
out.write(code);
}
return out.toByteArray();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy