All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.pdfbox.encoding.Encoding Maven / Gradle / Ivy

Go to download

The Apache PDFBox library is an open source Java tool for working with PDF documents.

There is a newer version: 3.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.encoding;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.StringTokenizer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.util.ResourceLoader;

/**
 * Abstract base class for PDF text encodings.
 *
 * <p>Each instance holds a two-way mapping between character codes and glyph
 * names ({@link #codeToName} / {@link #nameToCode}). In addition, the class
 * keeps two static tables shared by all instances that map glyph names to
 * their character strings and back; those tables are filled once, in the
 * static initializer, from the bundled glyph lists and an optional external
 * glyph list.</p>
 *
 * <p>NOTE(review): the shared name-to-character table is a plain
 * {@link HashMap} and {@link #getCharacter(String)} adds entries to it without
 * any synchronization visible in this class; confirm whether concurrent use
 * is expected before relying on it from multiple threads.</p>
 *
 * @author Ben Litchfield
 * @version $Revision: 1.15 $
 */
public abstract class Encoding implements COSObjectable
{

    /**
     * Log instance.
     */
    private static final Log LOG = LogFactory.getLog(Encoding.class);

    /** Identifies a non-mapped character. */
    public static final String NOTDEF = ".notdef";

    /**
     * This is a mapping from a character code to a character name.
     */
    protected final Map<Integer, String> codeToName =
        new HashMap<Integer, String>();

    /**
     * This is a mapping from a character name to a character code.
     */
    protected final Map<String, Integer> nameToCode =
        new HashMap<String, Integer>();

    /**
     * Shared mapping from a glyph name to the character string it represents.
     */
    private static final Map<String, String> NAME_TO_CHARACTER =
        new HashMap<String, String>();

    /**
     * Shared reverse mapping from a character string to a glyph name, built
     * once at the end of the static initializer.
     */
    private static final Map<String, String> CHARACTER_TO_NAME =
        new HashMap<String, String>();

    static
    {
        //Loads the official Adobe Glyph List
        loadGlyphList("org/apache/pdfbox/resources/glyphlist.txt");
        //Loads some additional glyph mappings
        loadGlyphList("org/apache/pdfbox/resources/additional_glyphlist.txt");

        // Load an external glyph list file that user can give as JVM property
        try
        {
            String location = System.getProperty("glyphlist_ext");
            if(location != null)
            {
                File external = new File(location);
                if(external.exists())
                {
                    loadGlyphList(location);
                }
            }
        }
        catch (SecurityException e)  // can occur on System.getProperty
        {
            // PDFBOX-1946 ignore and continue
        }

        // These puts run after the glyph lists are loaded, so they replace
        // any value a glyph list supplied for the same name.
        NAME_TO_CHARACTER.put( NOTDEF, "" );
        NAME_TO_CHARACTER.put( "fi", "fi" );
        NAME_TO_CHARACTER.put( "fl", "fl" );
        NAME_TO_CHARACTER.put( "ffi", "ffi" );
        NAME_TO_CHARACTER.put( "ff", "ff" );
        NAME_TO_CHARACTER.put( "pi", "pi" );

        // Build the reverse table. When several names share one character
        // string, whichever entry is visited last wins.
        for( Map.Entry<String, String> entry : NAME_TO_CHARACTER.entrySet() )
        {
            CHARACTER_TO_NAME.put( entry.getValue(), entry.getKey() );
        }
    }

    /**
     * Loads a glyph list from a given location and populates the NAME_TO_CHARACTER hashmap
     * for character lookups.
     *
     * <p>Each non-comment line is expected to have the form
     * {@code name;XXXX[ XXXX...]}, where every {@code XXXX} is a hexadecimal
     * character code. Lines without a semicolon are skipped, a line with a
     * malformed hex value is logged and skipped, and a name that is already
     * present keeps its first value (a warning is logged).</p>
     *
     * @param location - The string location of the glyphlist file
     * @throws MissingResourceException if the resource loader returns no stream for the location
     */
    private static void loadGlyphList(String location)
    {
        BufferedReader glyphStream = null;
        try
        {
            InputStream resource = ResourceLoader.loadResource( location );
            if (resource == null)
            {
                throw new MissingResourceException("Glyphlist not found: " + location,
                        Encoding.class.getName(), location);
            }
            glyphStream = new BufferedReader( new InputStreamReader( resource, "ISO-8859-1" ) );
            String line = null;
            while( (line = glyphStream.readLine()) != null )
            {
                line = line.trim();
                //lines starting with # are comments which we can ignore.
                if( !line.startsWith("#" ) )
                {
                    int semicolonIndex = line.indexOf( ';' );
                    if( semicolonIndex >= 0 )
                    {
                        String unicodeValue = null;
                        try
                        {
                            String characterName = line.substring( 0, semicolonIndex );
                            unicodeValue = line.substring( semicolonIndex+1, line.length() );
                            StringTokenizer tokenizer = new StringTokenizer( unicodeValue, " ", false );
                            StringBuilder value = new StringBuilder();
                            while(tokenizer.hasMoreTokens())
                            {
                                int characterCode = Integer.parseInt( tokenizer.nextToken(), 16 );
                                value.append((char)characterCode);
                            }
                            if (NAME_TO_CHARACTER.containsKey(characterName))
                            {
                                LOG.warn("duplicate value for characterName="+characterName+","+value);
                            }
                            else
                            {
                                NAME_TO_CHARACTER.put( characterName, value.toString() );
                            }
                        }
                        catch( NumberFormatException nfe )
                        {
                            LOG.error("malformed unicode value "+ unicodeValue, nfe);
                        }
                    }
                }
            }
        }
        catch( IOException io )
        {
            LOG.error("error while reading the glyph list.", io);
        }
        finally
        {
            if( glyphStream != null )
            {
                try
                {
                    glyphStream.close();
                }
                catch( IOException e )
                {
                    LOG.error("error when closing the glyph list.", e);
                }

            }
        }
    }

    /**
     * Returns an unmodifiable view of the Code2Name mapping.
     * @return the Code2Name map
     */
    public Map<Integer, String> getCodeToNameMap()
    {
        return Collections.unmodifiableMap(codeToName);
    }

    /**
     * Returns an unmodifiable view of the Name2Code mapping.
     * @return the Name2Code map
     */
    public Map<String, Integer> getNameToCodeMap()
    {
        return Collections.unmodifiableMap(nameToCode);
    }

    /**
     * This will add a character encoding. Both directions of the mapping are
     * updated; an existing entry for the same code or the same name is
     * replaced.
     *
     * @param code The character code that matches the character.
     * @param name The name of the character.
     */
    public void addCharacterEncoding( int code, String name )
    {
        codeToName.put( code, name );
        nameToCode.put( name, code );
    }

    /**
     * Determines if the encoding has a mapping for the given name value.
     *
     * @param name the source value for the mapping
     * @return true if a character code is registered for the name
     */
    public boolean hasCodeForName(String name)
    {
        return nameToCode.containsKey(name);
    }

    /**
     * Determines if the encoding has a mapping for the given code value.
     *
     * @param code the source value for the mapping
     * @return true if a character name is registered for the code
     */
    public boolean hasNameForCode(int code)
    {
        return codeToName.containsKey(code);
    }

    /**
     * This will get the character code for the name.
     *
     * @param name The name of the character.
     *
     * @return The code for the character.
     *
     * @throws IOException If there is no character code for the name.
     */
    public int getCode( String name ) throws IOException
    {
        Integer code = nameToCode.get( name );
        if( code == null )
        {
            throw new IOException( "No character code for character name '" + name + "'" );
        }
        return code;
    }

    /**
     * This will take a character code and get the name from the code.
     *
     * @param code The character code.
     *
     * @return The name of the character, or null if this encoding has no name for the code.
     *
     * @throws IOException Never thrown by this implementation; the clause is
     * kept so that existing callers and overriding subclasses stay compatible.
     */
    public String getName( int code ) throws IOException
    {
        return codeToName.get( code );
    }

    /**
     * This will take a glyph name and get the character string registered for
     * it in the shared glyph table. Unlike {@link #getCharacter(String)}, no
     * fallback rules (suffix removal, uniXXXX / uXXXX names) are applied.
     *
     * @param name The name.
     *
     * @return The character string for the name, or null if the name is unknown.
     *
     */
    public static String getCharacterForName(String name)
    {
        if (NAME_TO_CHARACTER.containsKey(name))
        {
            return NAME_TO_CHARACTER.get(name);
        }
        // Only report a miss when the lookup actually failed.
        LOG.debug("No character for name " + name);
        return null;
    }

    /**
     * This will take a character and get the glyph name registered for it in
     * the shared reverse glyph table.
     *
     * @param c The character.
     *
     * @return The name of the character.
     *
     * @throws IOException If there is no name for the character.
     */
    public String getNameFromCharacter( char c ) throws IOException
    {
        String name = CHARACTER_TO_NAME.get( Character.toString(c) );
        if( name == null )
        {
            throw new IOException( "No name for character '" + c + "'" );
        }
        return name;
    }

    /**
     * This will get the character from the code.
     *
     * @param code The character code.
     *
     * @return The printable character for the code, or null if this encoding
     * has no name for the code.
     *
     * @throws IOException Declared because {@link #getName(int)} declares it.
     */
    public String getCharacter( int code ) throws IOException
    {
        String name = getName( code );
        if (name != null)
        {
            return getCharacter( name );
        }
        return null;
    }

    /**
     * This will get the character from the name.
     *
     * <p>The shared glyph table is consulted first. If the name is unknown,
     * the following fallbacks are tried in order:</p>
     * <ol>
     * <li>a name containing a '.' after its first character is retried with
     * the part before the first '.';</li>
     * <li>a name starting with "uni" is read as a sequence of four-digit
     * hexadecimal UTF-16 code units; surrogate values are skipped with a
     * warning;</li>
     * <li>a name starting with "u" is read as a single hexadecimal code
     * point; surrogates and values outside the Unicode range are rejected
     * with a warning and null is returned;</li>
     * <li>a name registered in this encoding is converted from its character
     * code;</li>
     * <li>otherwise the name itself is returned.</li>
     * </ol>
     * <p>Successful "uni"/"u" conversions are cached in the shared glyph
     * table. A "uni"/"u" name whose hex part does not parse is returned
     * unchanged.</p>
     *
     * @param name The name of the character.
     *
     * @return The printable character for the name; may be null for a
     * rejected "u" name.
     */
    public String getCharacter( String name )
    {
        String character = NAME_TO_CHARACTER.get( name );
        if( character == null )
        {
            // test if we have a suffix and if so remove it
            if ( name.indexOf('.') > 0 )
            {
                character = getCharacter(name.substring( 0, name.indexOf('.') ));
            }
            // test for Unicode name
            // (uniXXXX - XXXX must be a multiple of four;
            // each representing a hexadecimal Unicode code point)
            else if ( name.startsWith( "uni" ) )
            {
                int nameLength = name.length();
                StringBuilder uniStr = new StringBuilder();
                try
                {
                    for ( int chPos = 3; chPos + 4 <= nameLength; chPos += 4 )
                    {
                        int characterCode = Integer.parseInt( name.substring( chPos, chPos + 4), 16 );

                        if ( characterCode > 0xD7FF && characterCode < 0xE000 )
                        {
                            LOG.warn( "Unicode character name with not allowed code area: " + name );
                        }
                        else
                        {
                            uniStr.append( (char) characterCode );
                        }
                    }
                    character = uniStr.toString();
                    NAME_TO_CHARACTER.put(name, character);
                }
                catch (NumberFormatException nfe)
                {
                    LOG.warn( "Not a number in Unicode character name: " + name );
                    character = name;
                }
            }
            // test for an alternate Unicode name representation
            else if ( name.startsWith( "u" ) )
            {
                try
                {
                    int characterCode = Integer.parseInt( name.substring( 1 ), 16 );
                    boolean surrogate = characterCode > 0xD7FF && characterCode < 0xE000;
                    if ( surrogate || !Character.isValidCodePoint( characterCode ) )
                    {
                        LOG.warn( "Unicode character name with not allowed code area: " + name );
                    }
                    else
                    {
                        // toChars yields a surrogate pair for code points above
                        // 0xFFFF; a plain (char) cast would truncate them.
                        character = new String( Character.toChars( characterCode ) );
                        NAME_TO_CHARACTER.put(name, character);
                    }
                }
                catch (NumberFormatException nfe)
                {
                    LOG.warn( "Not a number in Unicode character name: " + name );
                    character = name;
                }
            }
            else if (nameToCode.containsKey(name))
            {
                int code = nameToCode.get(name);
                character = Character.toString((char)code);
            }
            else
            {
                character = name;
            }
        }
        return character;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy