// com.sun.pdfrenderer.tools.charset.PDFDocEncodingMapGenerator Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of pdf-renderer Show documentation
// Show all versions of pdf-renderer Show documentation
// PDF renderer implementation supporting the subset of PDF 1.4 specification.
// The newest version!
/*
* Copyright 2008 Pirion Systems Pty Ltd, 139 Warry St,
* Fortitude Valley, Queensland, Australia
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
package com.sun.pdfrenderer.tools.charset;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.*;
/**
 * Parses text from the PDF reference describing the PDFDocEncoding
 * and verifies it against standard Unicode character names and
 * a few other heuristics to establish correctness. Outputs
 * a table to be used for decoding, destined for PDFStringUtil.
 *
 * <p>Two class-path resources located next to this class are read:
 * {@code PDFDocEncodingMap.txt} (the encoding description) and
 * {@code UnicodeData.txt} (the Unicode character database). All output,
 * including diagnostics, goes to standard output.
 *
 * @author Luke Kirby
 */
public class PDFDocEncodingMapGenerator {

    /** Number of byte values described by the encoding table. */
    private static final int ENCODING_SIZE = 256;

    /** Resource holding the textual description of PDFDocEncoding. */
    private static final String ENCODING_RESOURCE = "PDFDocEncodingMap.txt";

    /** Resource holding the Unicode character database. */
    private static final String UNICODE_DATA_RESOURCE = "UnicodeData.txt";

    /**
     * Reads the encoding description, cross-checks every entry and prints
     * either the generated {@code char[]} table or the list of byte values
     * whose mapping looked wrong.
     *
     * @param args ignored
     * @throws IOException if a resource is missing, ends before all 256
     *  entries were read, or cannot be read
     */
    public final static void main(String[] args)
            throws IOException {

        final Map<String, Integer> unicodeNames = readUnicodeNames();

        // the unicode character for each value in PDFDocEncoding
        final int[] unicodeCharacters = new int[ENCODING_SIZE];

        final Set<Integer> mappedCharacters = new HashSet<Integer>();
        final List<Integer> problemMappings = new ArrayList<Integer>();

        // The two leading lines of an entry (character and decimal value)
        // may already have been consumed while looking ahead during the
        // previous iteration; null means "not read yet".
        String charLine = null;
        String decLine = null;

        // Look at PDFDocEncodingMap.txt for a description of its contents
        final BufferedReader r = openResource(ENCODING_RESOURCE);
        try {
            for (int i = 0; i < ENCODING_SIZE; ++i) {

                if (charLine == null) {
                    charLine = readEncodingLine(r);
                }
                if (decLine == null) {
                    decLine = readEncodingLine(r);
                }

                // hex and octal representation of the value in the encoding
                final String hexLine = readEncodingLine(r);
                final String octalLine = readEncodingLine(r);

                // the unicode value expressed as U+XXXX, or "Undefined"
                final String unicodeLine = readEncodingLine(r);
                if (unicodeLine == null) {
                    throw new IOException(ENCODING_RESOURCE +
                            " ended unexpectedly while reading entry " + i);
                }

                // the unicode name and the notes; neither is present for
                // all characters, so these may really be lines belonging
                // to the next entry
                String nameLine = readEncodingLine(r);
                String notesLine = readEncodingLine(r);

                // remember the lines that belong to this entry before the
                // look-ahead below re-assigns charLine and decLine
                final String ourCharLine = charLine;
                final String ourDecLine = decLine;

                if (notesLine != null && parseDecLine(notesLine) == (i + 1)) {
                    // what we read as notesLine is the decLine of the next
                    // entry, so nameLine is the next entry's charLine:
                    // this entry has neither a name nor notes
                    charLine = nameLine;
                    decLine = notesLine;
                    nameLine = null;
                    notesLine = null;
                } else {
                    // there was, at least, a nameLine. What we read as
                    // notesLine might still be the next charLine; one more
                    // line tells us whether it is followed by the decimal
                    // value expected for the next entry
                    final String extraLine = readEncodingLine(r);
                    if (extraLine != null
                            && parseDecLine(extraLine) == (i + 1)) {
                        // no notes; what we read as notes is the next
                        // charLine
                        charLine = notesLine;
                        decLine = extraLine;
                        notesLine = null;
                    } else {
                        // name and notes both present; extraLine starts
                        // the next entry and its decLine is still unread
                        charLine = extraLine;
                        decLine = null;
                    }
                }

                System.out.println(ourCharLine + " " +
                        ourDecLine + " " + hexLine + " " + octalLine + " " +
                        unicodeLine + " " + nameLine + " " + notesLine);

                final boolean undefinedCh = "Undefined".equals(unicodeLine);
                final int unicodeCh;
                if (!undefinedCh) {
                    unicodeCh = Integer.parseInt(unicodeLine.substring(2), 16);
                    if (mappedCharacters.contains(unicodeCh)) {
                        problemMappings.add(i);
                        System.out.println(
                                " !!! " + unicodeCh + " is already mapped to");
                    }
                    mappedCharacters.add(unicodeCh);
                } else {
                    // conventional unmarked character
                    unicodeCh = '\uFFFD';
                }

                if (nameLine != null) {
                    if (!undefinedCh) {
                        // check that the offered unicode name matches
                        // the value for that name in the standard unicode
                        // mappings.
                        // the nameLine may have the equivalent HTML entity
                        // in brackets after the unicode name, so we need
                        // to strip it. The only names that contain brackets
                        // themselves are those of control characters, and
                        // they start with a bracket, so they are left alone.
                        final String unicodeName = nameLine.startsWith("(") ?
                                nameLine :
                                nameLine.replaceFirst("\\s*\\(.*$", "");
                        final Integer mappedCh = unicodeNames.get(unicodeName);
                        if (mappedCh == null) {
                            System.out.println(" !!! unmapped name");
                            problemMappings.add(i);
                        } else if (mappedCh.intValue() != unicodeCh) {
                            System.out.println(" !!! name maps to " +
                                    new String(Character.toChars(
                                            mappedCh.intValue())) +
                                    " (" + mappedCh + ")");
                            problemMappings.add(i);
                        }
                    }
                } else {
                    // if it's nameless, we expect that the character is
                    // alphanum, and identical to the ASCII/ISO-8859 and the
                    // lower-byte of UTF-16BE. A missing or empty character
                    // line cannot satisfy that, so it is reported as well
                    // instead of failing with an exception.
                    if (ourCharLine == null
                            || ourCharLine.length() == 0
                            || ourCharLine.charAt(0) != (char) i) {
                        System.out.println(" !!! Unnamed character is not " +
                                "same as Latin1 encoding");
                        problemMappings.add(i);
                    }
                }

                if (unicodeCh >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
                    // needs to use a surrogate, so can't be expressed as a
                    // single character, which is how we construct the
                    // table; record it so that no table is generated
                    System.out.println(" !!! supplementary code point! " +
                            "Cannot generate with existing system");
                    problemMappings.add(i);
                }

                unicodeCharacters[i] = unicodeCh;
            }
        } finally {
            r.close();
        }

        if (!problemMappings.isEmpty()) {
            System.out.println(problemMappings.size() +
                    " problems for decimal encodings:");
            for (final Integer problemMapping : problemMappings) {
                System.out.println(" " + problemMapping);
            }
            System.out.println("Did not generate table due to errors");
        } else {
            printTable(unicodeCharacters);
        }
    }

    /**
     * Prints the decoding table as Java source, eight entries per row,
     * each row followed by a comment naming the byte range it covers.
     *
     * @param unicodeCharacters the unicode value for each of the 256 bytes
     */
    private static void printTable(int[] unicodeCharacters) {
        System.out.println(
                "\tprivate final static char[] PDF_DOC_ENCODING_MAP = " +
                "new char[] {");
        for (int i = 0; i < ENCODING_SIZE; i += 8) {
            System.out.print("\t ");
            for (int j = i; j < i + 8; ++j) {
                System.out.print(formatArrayEntry(unicodeCharacters[j]));
            }
            System.out.println(
                    " //" + String.format("%02X-%02X", i, i + 7));
        }
        System.out.println("\t};");
    }

    /**
     * Formats one table entry as a hexadecimal literal of at least four
     * digits followed by a comma and a space.
     *
     * @param charVal the value to format
     * @return the formatted entry
     */
    private static String formatArrayEntry(int charVal) {
        return String.format("0x%04X, ", charVal);
    }

    /**
     * Opens a resource located next to this class for line-wise reading.
     *
     * <p>NOTE(review): the reader uses the platform default charset, as the
     * code did before; the encoding of the bundled files is not visible
     * from here, so confirm before pinning one.
     *
     * @param resourceName name of the resource, relative to this class
     * @return a reader the caller must close
     * @throws IOException if the resource cannot be found
     */
    private static BufferedReader openResource(String resourceName)
            throws IOException {
        final InputStream in =
                PDFDocEncodingMapGenerator.class.getResourceAsStream(
                        resourceName);
        if (in == null) {
            throw new IOException(
                    "Resource not found on class path: " + resourceName);
        }
        return new BufferedReader(new InputStreamReader(in));
    }

    /**
     * Returns the next line that does not start with {@code ##}.
     *
     * @param r the reader to take lines from
     * @return the line, or {@code null} at the end of the input
     * @throws IOException if reading fails
     */
    private static String readEncodingLine(BufferedReader r)
            throws IOException {
        String line;
        do {
            line = r.readLine();
        } while (line != null && line.startsWith("##"));
        return line;
    }

    /**
     * Interprets a line as the decimal value of an entry.
     *
     * @param candidate the line to interpret; may be {@code null}
     * @return the parsed value, or -1 if the line is not a decimal integer
     */
    private static int parseDecLine(String candidate) {
        try {
            return Integer.parseInt(candidate);
        } catch (NumberFormatException e) {
            // not actually a dec line!
            return -1;
        }
    }

    /**
     * Shorthand for creating a {@link CharChange}; not referenced elsewhere
     * in this class.
     */
    private static CharChange c(int val, String name) {
        return new CharChange(name, (char) val);
    }

    /**
     * Pairs a character name with a character value; only created by
     * {@link #c(int, String)}.
     */
    private static class CharChange {
        final String name;
        final char value;

        private CharChange(String name, char value) {
            this.name = name;
            this.value = value;
        }
    }

    /**
     * Reads UnicodeData.txt to make a mapping from character name
     * to unicode code point.
     *
     * <p>UnicodeData.txt is from
     * http://unicode.org/Public/UNIDATA/UnicodeData.txt
     * used as per the Terms of Use: http://www.unicode.org/copyright.html
     *
     * @return the code point for each character name
     * @throws IOException if the resource is missing, malformed, cannot be
     *  read, or contains the same name twice
     */
    private static Map<String, Integer> readUnicodeNames()
            throws IOException {
        final BufferedReader r = openResource(UNICODE_DATA_RESOURCE);
        try {
            return parseUnicodeNames(r);
        } finally {
            r.close();
        }
    }

    /**
     * Parses semicolon-separated Unicode character database lines: column 0
     * is the hexadecimal code point and column 1 the name.
     *
     * <p>Characters whose name column is empty or {@code <control>} have no
     * unique name of their own. They are keyed by column 10 wrapped in
     * brackets when that column is present, and by {@code (control-X)}
     * otherwise, where X is the character itself.
     *
     * @param r source of the database lines; not closed by this method
     * @return the code point for each character name
     * @throws IOException if a line is malformed, reading fails, or the
     *  same name occurs twice
     */
    private static Map<String, Integer> parseUnicodeNames(BufferedReader r)
            throws IOException {
        final Map<String, Integer> names = new HashMap<String, Integer>();
        String line;
        while ((line = r.readLine()) != null) {
            if (line.length() == 0) {
                // tolerate blank lines, e.g. at the end of the file
                continue;
            }
            final String[] cols = line.split(";");
            if (cols.length < 2) {
                throw new IOException(
                        "Malformed " + UNICODE_DATA_RESOURCE + " line: " +
                        line);
            }
            // kept as an int so that code points beyond U+FFFF are not
            // truncated to an unrelated character
            final int codePoint = Integer.parseInt(cols[0], 16);
            String name = cols[1];
            if ("".equals(name) || "<control>".equals(name)) {
                if (cols.length >= 11) {
                    name = "(" + cols[10] + ")";
                } else {
                    name = "(control-" +
                            new String(Character.toChars(codePoint)) + ")";
                }
            }
            if (names.containsKey(name)) {
                throw new IOException("Already found name " + name);
            }
            names.put(name, codePoint);
        }
        return names;
    }
}