// Source listing of org.apache.pdfbox.encoding.Encoding (artifact: pdfbox; available via Maven / Gradle / Ivy).
// The Apache PDFBox library is an open source Java tool for working with PDF documents.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.encoding;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.StringTokenizer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.util.ResourceLoader;
/**
 * Abstract base class for text encodings.
 *
 * An encoding instance holds a two-way mapping between character codes and
 * glyph names. The class additionally keeps a static, shared table that maps
 * glyph names to their character strings; that table is filled from the glyph
 * list resources when the class is initialized and is extended lazily by
 * {@link #getCharacter(String)}.
 *
 * @author Ben Litchfield
 * @version $Revision: 1.15 $
 */
public abstract class Encoding implements COSObjectable
{
    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(Encoding.class);

    /** Identifies a non-mapped character. */
    public static final String NOTDEF = ".notdef";

    /**
     * This is a mapping from a character code to a character name.
     */
    protected final Map<Integer, String> codeToName =
        new HashMap<Integer, String>();

    /**
     * This is a mapping from a character name to a character code.
     */
    protected final Map<String, Integer> nameToCode =
        new HashMap<String, Integer>();

    /** Shared mapping from a glyph name to the character string it represents. */
    private static final Map<String, String> NAME_TO_CHARACTER =
        new HashMap<String, String>();

    /** Shared reverse mapping from a character string to a glyph name. */
    private static final Map<String, String> CHARACTER_TO_NAME =
        new HashMap<String, String>();

    static
    {
        //Loads the official Adobe Glyph List
        loadGlyphList("org/apache/pdfbox/resources/glyphlist.txt");
        //Loads some additional glyph mappings
        loadGlyphList("org/apache/pdfbox/resources/additional_glyphlist.txt");

        // Load an external glyph list file that user can give as JVM property
        try
        {
            String location = System.getProperty("glyphlist_ext");
            if(location != null)
            {
                File external = new File(location);
                if(external.exists())
                {
                    loadGlyphList(location);
                }
            }
        }
        catch (SecurityException e)  // can occur on System.getProperty
        {
            // PDFBOX-1946 ignore and continue
        }

        // These entries are put unconditionally, so they replace any value
        // that one of the glyph lists supplied for the same name.
        NAME_TO_CHARACTER.put( NOTDEF, "" );
        NAME_TO_CHARACTER.put( "fi", "fi" );
        NAME_TO_CHARACTER.put( "fl", "fl" );
        NAME_TO_CHARACTER.put( "ffi", "ffi" );
        NAME_TO_CHARACTER.put( "ff", "ff" );
        NAME_TO_CHARACTER.put( "pi", "pi" );

        // Build the reverse table. When several names share one character
        // string, whichever entry is visited last wins.
        for( Map.Entry<String, String> entry : NAME_TO_CHARACTER.entrySet() )
        {
            CHARACTER_TO_NAME.put( entry.getValue(), entry.getKey() );
        }
    }

    /**
     * Loads a glyph list from a given location and populates the NAME_TO_CHARACTER hashmap
     * for character lookups.
     *
     * Each non-comment line is expected to look like {@code name;XXXX XXXX ...}, where
     * every token after the semicolon is a hexadecimal character code. A name that is
     * already present is not overwritten; a warning is logged instead. Lines whose
     * codes are not valid hexadecimal numbers are logged and skipped. I/O errors
     * while reading are logged and end the load.
     *
     * @param location - The string location of the glyphlist file
     * @throws MissingResourceException if the resource loader returns no stream for the location
     */
    private static void loadGlyphList(String location)
    {
        BufferedReader glyphStream = null;
        try
        {
            InputStream resource = ResourceLoader.loadResource( location );
            if (resource == null)
            {
                throw new MissingResourceException("Glyphlist not found: " + location,
                        Encoding.class.getName(), location);
            }
            glyphStream = new BufferedReader( new InputStreamReader( resource, "ISO-8859-1" ) );
            String line = null;
            while( (line = glyphStream.readLine()) != null )
            {
                line = line.trim();
                //lines starting with # are comments which we can ignore.
                if( !line.startsWith("#" ) )
                {
                    int semicolonIndex = line.indexOf( ';' );
                    if( semicolonIndex >= 0 )
                    {
                        // declared outside the try so the error message can show it
                        String unicodeValue = null;
                        try
                        {
                            String characterName = line.substring( 0, semicolonIndex );
                            unicodeValue = line.substring( semicolonIndex+1, line.length() );
                            StringTokenizer tokenizer = new StringTokenizer( unicodeValue, " ", false );
                            StringBuilder value = new StringBuilder();
                            while(tokenizer.hasMoreTokens())
                            {
                                int characterCode = Integer.parseInt( tokenizer.nextToken(), 16 );
                                value.append((char)characterCode);
                            }
                            if (NAME_TO_CHARACTER.containsKey(characterName))
                            {
                                LOG.warn("duplicate value for characterName="+characterName+","+value);
                            }
                            else
                            {
                                NAME_TO_CHARACTER.put( characterName, value.toString() );
                            }
                        }
                        catch( NumberFormatException nfe )
                        {
                            LOG.error("malformed unicode value "+ unicodeValue, nfe);
                        }
                    }
                }
            }
        }
        catch( IOException io )
        {
            LOG.error("error while reading the glyph list.", io);
        }
        finally
        {
            if( glyphStream != null )
            {
                try
                {
                    glyphStream.close();
                }
                catch( IOException e )
                {
                    LOG.error("error when closing the glyph list.", e);
                }
            }
        }
    }

    /**
     * Returns an unmodifiable view of the Code2Name mapping.
     * @return the Code2Name map
     */
    public Map<Integer, String> getCodeToNameMap()
    {
        return Collections.unmodifiableMap(codeToName);
    }

    /**
     * Returns an unmodifiable view of the Name2Code mapping.
     * @return the Name2Code map
     */
    public Map<String, Integer> getNameToCodeMap()
    {
        return Collections.unmodifiableMap(nameToCode);
    }

    /**
     * This will add a character encoding. Both directions of the mapping are
     * updated; an existing entry for the same code or name is replaced.
     *
     * @param code The character code that matches the character.
     * @param name The name of the character.
     */
    public void addCharacterEncoding( int code, String name )
    {
        codeToName.put( code, name );
        nameToCode.put( name, code );
    }

    /**
     * Determines if the encoding has a mapping for the given name value.
     *
     * @param name the source value for the mapping
     * @return true if a character code is registered for the name
     */
    public boolean hasCodeForName(String name)
    {
        return nameToCode.containsKey(name);
    }

    /**
     * Determines if the encoding has a mapping for the given code value.
     *
     * @param code the source value for the mapping
     * @return true if a character name is registered for the code
     */
    public boolean hasNameForCode(int code)
    {
        return codeToName.containsKey(code);
    }

    /**
     * This will get the character code for the name.
     *
     * @param name The name of the character.
     *
     * @return The code for the character.
     *
     * @throws IOException If there is no character code for the name.
     */
    public int getCode( String name ) throws IOException
    {
        Integer code = nameToCode.get( name );
        if( code == null )
        {
            throw new IOException( "No character code for character name '" + name + "'" );
        }
        return code;
    }

    /**
     * This will take a character code and get the name from the code.
     *
     * @param code The character code.
     *
     * @return The name of the character, or null if this encoding has no
     * name for the code.
     *
     * @throws IOException Declared for callers and subclasses; this
     * implementation does not throw it.
     */
    public String getName( int code ) throws IOException
    {
        return codeToName.get( code );
    }

    /**
     * This will take a glyph name and look up its character string in the
     * shared glyph table. Unlike {@link #getCharacter(String)} no fallback
     * rules are applied.
     *
     * @param name The name.
     *
     * @return The character string for the name, or null if the name is unknown.
     *
     */
    public static String getCharacterForName(String name)
    {
        if (NAME_TO_CHARACTER.containsKey(name))
        {
            return NAME_TO_CHARACTER.get(name);
        }
        // only a miss is worth reporting
        if (LOG.isDebugEnabled())
        {
            LOG.debug("No character for name " + name);
        }
        return null;
    }

    /**
     * This will take a character and get the glyph name registered for it in
     * the shared reverse glyph table.
     *
     * @param c The character.
     *
     * @return The name of the character.
     *
     * @throws IOException If there is no name for the character.
     */
    public String getNameFromCharacter( char c ) throws IOException
    {
        String name = CHARACTER_TO_NAME.get( Character.toString(c) );
        if( name == null )
        {
            throw new IOException( "No name for character '" + c + "'" );
        }
        return name;
    }

    /**
     * This will get the character from the code.
     *
     * @param code The character code.
     *
     * @return The printable character for the code, or null if this encoding
     * has no name for the code.
     *
     * @throws IOException If the name lookup fails.
     */
    public String getCharacter( int code ) throws IOException
    {
        String name = getName( code );
        if (name != null)
        {
            return getCharacter( name );
        }
        return null;
    }

    /**
     * This will get the character from the name.
     *
     * The shared glyph table is consulted first. If the name is not found
     * there, the following rules are tried in order:
     * <ol>
     * <li>a name containing a '.' after its first character is retried with
     * everything from the first '.' onwards removed;</li>
     * <li>a name starting with "uni" is read as a sequence of four-digit
     * hexadecimal UTF-16 values;</li>
     * <li>a name starting with "u" is read as one hexadecimal code point;</li>
     * <li>a name known to this encoding is converted from its character code;</li>
     * <li>otherwise the name itself is returned.</li>
     * </ol>
     * Results of the "uni" and "u" rules are cached in the shared glyph table.
     *
     * @param name The name of the character.
     *
     * @return The printable character for the name. This is null when a
     * "u" name denotes a surrogate or an invalid code point.
     */
    public String getCharacter( String name )
    {
        String character = NAME_TO_CHARACTER.get( name );
        if( character == null )
        {
            // test if we have a suffix and if so remove it
            if ( name.indexOf('.') > 0 )
            {
                character = getCharacter(name.substring( 0, name.indexOf('.') ));
            }
            // test for Unicode name
            // (uniXXXX - XXXX must be a multiple of four;
            // each representing a hexadecimal Unicode code point)
            else if ( name.startsWith( "uni" ) )
            {
                int nameLength = name.length();
                StringBuilder uniStr = new StringBuilder();
                try
                {
                    // trailing digits that do not fill a group of four are ignored
                    for ( int chPos = 3; chPos + 4 <= nameLength; chPos += 4 )
                    {
                        int characterCode = Integer.parseInt( name.substring( chPos, chPos + 4), 16 );
                        if ( characterCode > 0xD7FF && characterCode < 0xE000 )
                        {
                            LOG.warn( "Unicode character name with not allowed code area: " + name );
                        }
                        else
                        {
                            uniStr.append( (char) characterCode );
                        }
                    }
                    character = uniStr.toString();
                    NAME_TO_CHARACTER.put(name, character);
                }
                catch (NumberFormatException nfe)
                {
                    LOG.warn( "Not a number in Unicode character name: " + name );
                    character = name;
                }
            }
            // test for an alternate Unicode name representation
            else if ( name.startsWith( "u" ) )
            {
                try
                {
                    int characterCode = Integer.parseInt( name.substring( 1 ), 16 );
                    // Reject surrogates and anything outside the Unicode range
                    // (including negative values) instead of truncating to a char.
                    if ( !Character.isValidCodePoint( characterCode )
                            || ( characterCode > 0xD7FF && characterCode < 0xE000 ) )
                    {
                        LOG.warn( "Unicode character name with not allowed code area: " + name );
                    }
                    else
                    {
                        // toChars yields a surrogate pair for code points above U+FFFF
                        character = new String( Character.toChars( characterCode ) );
                        NAME_TO_CHARACTER.put(name, character);
                    }
                }
                catch (NumberFormatException nfe)
                {
                    LOG.warn( "Not a number in Unicode character name: " + name );
                    character = name;
                }
            }
            else if (nameToCode.containsKey(name))
            {
                int code = nameToCode.get(name);
                character = Character.toString((char)code);
            }
            else
            {
                character = name;
            }
        }
        return character;
    }
}