org.apache.pdfbox.util.TextNormalize Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of pdfbox Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents.
There is a newer version: 3.0.2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import java.util.HashMap;

/**
 * This class allows a caller to normalize text in various ways. It will load the ICU4J jar file if it is defined on the
 * classpath.
 * 
 * @author Brian Carrier
 * 
 */
public class TextNormalize
{
    /** ICU4J-backed helper, or null when the ICU4J classes could not be loaded from the classpath. */
    private final ICU4JImpl icu4j;

    /**
     * Maps the code point of a spacing (non-combining) diacritic to the string holding its combining counterpart.
     * Filled once during class initialization and only read afterwards.
     */
    private static final HashMap<Integer, String> DIACHASH = new HashMap<Integer, String>();

    /** Encoding the text will eventually be written as; may be null. */
    private final String outputEncoding;

    static
    {
        populateDiacHash();
    }

    /**
     * Creates a normalizer and probes the classpath for ICU4J.
     * 
     * @param encoding The Encoding that the text will eventually be written as (or null)
     */
    public TextNormalize(String encoding)
    {
        icu4j = findICU4J();
        outputEncoding = encoding;
    }

    /**
     * Tries to load the ICU4J classes this class depends on through this class' class loader.
     * 
     * @return a new ICU4J helper if both classes could be loaded, otherwise null
     */
    private ICU4JImpl findICU4J()
    {
        // see if we can load the icu4j classes from the classpath
        try
        {
            ClassLoader loader = this.getClass().getClassLoader();
            loader.loadClass("com.ibm.icu.text.Bidi");
            loader.loadClass("com.ibm.icu.text.Normalizer");
            return new ICU4JImpl();
        }
        catch (ClassNotFoundException e)
        {
            // ICU4J is optional: without it the normalize methods fall back to returning their input
            return null;
        }
    }

    /*
     * Adds non-decomposing diacritics to the hash with their related combining character. These are values that the
     * unicode spec claims are equivalent but are not mapped in the form NFKC normalization method. Determined by going
     * through the Combining Diacritical Marks section of the Unicode spec and identifying which characters are not
     * mapped to by the normalization.
     */
    private static void populateDiacHash()
    {
        addDiac(0x0060, "\u0300");
        addDiac(0x02CB, "\u0300");
        addDiac(0x0027, "\u0301");
        addDiac(0x02B9, "\u0301");
        addDiac(0x02CA, "\u0301");
        addDiac(0x005e, "\u0302");
        addDiac(0x02C6, "\u0302");
        addDiac(0x007E, "\u0303");
        addDiac(0x02C9, "\u0304");
        addDiac(0x00B0, "\u030A");
        addDiac(0x02BA, "\u030B");
        addDiac(0x02C7, "\u030C");
        addDiac(0x02C8, "\u030D");
        addDiac(0x0022, "\u030E");
        addDiac(0x02BB, "\u0312");
        addDiac(0x02BC, "\u0313");
        addDiac(0x0486, "\u0313");
        addDiac(0x055A, "\u0313");
        addDiac(0x02BD, "\u0314");
        addDiac(0x0485, "\u0314");
        addDiac(0x0559, "\u0314");
        addDiac(0x02D4, "\u031D");
        addDiac(0x02D5, "\u031E");
        addDiac(0x02D6, "\u031F");
        addDiac(0x02D7, "\u0320");
        addDiac(0x02B2, "\u0321");
        addDiac(0x02CC, "\u0329");
        addDiac(0x02B7, "\u032B");
        addDiac(0x02CD, "\u0331");
        addDiac(0x005F, "\u0332");
        addDiac(0x204E, "\u0359");
    }

    /**
     * Registers one spacing-diacritic to combining-diacritic mapping.
     * 
     * @param spacing code point of the non-combining diacritic
     * @param combining string holding the equivalent combining character
     */
    private static void addDiac(int spacing, String combining)
    {
        DIACHASH.put(Integer.valueOf(spacing), combining);
    }

    /**
     * Takes a line of text in presentation order and converts it to logical order. For most text other than Arabic and
     * Hebrew, the presentation and logical orders are the same. However, for Arabic and Hebrew, they are different and
     * if the text involves both RTL and LTR text then the Unicode BIDI algorithm must be used to determine how to map
     * between them.
     * 
     * @param str Presentation form of line to convert (i.e. left most char is first char)
     * @param isRtlDominant true if the PAGE has a dominant right to left ordering
     * @return Logical form of string (or original string if ICU4J library is not on classpath)
     * 
     * @deprecated isn't used anymore
     */
    @Deprecated
    public String makeLineLogicalOrder(String str, boolean isRtlDominant)
    {
        if (icu4j == null)
        {
            return str;
        }
        return icu4j.makeLineLogicalOrder(str, isRtlDominant);
    }

    /**
     * Normalize the presentation forms of characters in the string. For example, convert the single "fi" ligature to
     * "f" and "i".
     * 
     * @param str String to normalize
     * @return Normalized string (or original string if ICU4J library is not on classpath)
     */
    public String normalizePres(String str)
    {
        if (icu4j == null)
        {
            return str;
        }
        return icu4j.normalizePres(str);
    }

    /**
     * Normalize the diacritic, for example, convert non-combining diacritic characters to their combining counterparts.
     * <p>
     * Nothing is converted unless the output encoding given to the constructor starts with "UTF" (case-insensitive).
     * Only the first char of the string is inspected: if it has an entry in the internal table, the combining
     * character from the table is returned on its own; otherwise the string is passed to ICU4J when that is available.
     * 
     * @param str String to normalize; a null or empty string is returned unchanged
     * @return Normalized string (or original string if no conversion applies or the ICU4J library is not on classpath)
     */
    public String normalizeDiac(String str)
    {
        // Nothing to inspect: avoids the exception charAt(0) would throw on an empty or null string.
        if (str == null || str.length() == 0)
        {
            return str;
        }

        /*
         * Unicode contains special combining forms of the diacritic characters and we want to use these.
         */
        if (outputEncoding == null || !outputEncoding.toUpperCase().startsWith("UTF"))
        {
            return str;
        }

        // convert the characters not defined in the Unicode spec
        String combining = DIACHASH.get(Integer.valueOf(str.charAt(0)));
        if (combining != null)
        {
            return combining;
        }
        if (icu4j != null)
        {
            return icu4j.normalizeDiac(str);
        }
        return str;
    }
}