All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jpedal.parser.text.CIDTextUtils Maven / Gradle / Ivy

/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2017 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
     This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


 *
 * ---------------
 * CIDTextUtils.java
 * ---------------
 */

package org.jpedal.parser.text;

import org.jpedal.fonts.PdfFont;
import org.jpedal.fonts.StandardFonts;
import org.jpedal.fonts.tt.FontFile2;
import org.jpedal.parser.ParserOptions;

/**
 * Static helpers used while reading character codes from a PDF text string
 * for fonts whose codes may be one or two bytes long. The main entry point
 * works out which of the two applies for the current character and records
 * the outcome on the supplied {@code GlyphData}.
 *
 * @author markee
 */
public class CIDTextUtils {

    /** ASCII code of the backslash that introduces an escape sequence in a string. */
    private static final int BACKSLASH = 92;

    /** ASCII code of carriage return. */
    private static final int CARRIAGE_RETURN = 13;

    /**
     * Decides whether the character at the current position is a one byte or a
     * two byte code, resolves any escape sequences on the way, and stores the
     * raw value, display value, width and (optionally) unicode value on
     * {@code glyphData}.
     * <p>
     * The first byte is always looked up on its own in {@code StandardFonts.CMAP}.
     * A second byte is then read speculatively when the font flags, the lookup
     * result or the raw value suggest a double byte code; if the combination is
     * rejected the stream index is rewound so the second byte is left for the
     * next call.
     * <p>
     * NOTE(review): only the octal-escape test consults {@code streamLength};
     * every other {@code stream[...]} read is unchecked, so a string that ends
     * straight after a backslash or after the first byte of a double byte code
     * will throw {@link ArrayIndexOutOfBoundsException}.
     *
     * @param i               index into {@code stream} of the current byte
     *                        (assumes the caller already loaded that byte into
     *                        {@code glyphData} - TODO confirm against caller)
     * @param stream          bytes of the text being decoded
     * @param streamLength    length used to decide whether there is room for an
     *                        octal escape
     * @param glyphData       holds the raw value on entry; receives raw int/char,
     *                        actual width, display value, unicode value and, for a
     *                        raw backslash, the HTML value
     * @param currentFontData font used for CMap, double byte and width lookups
     * @param parserOptions   only {@code isTextExtracted()} is consulted
     * @return the stream index after decoding: unchanged when nothing beyond the
     *         current byte was consumed, otherwise the index of the last byte
     *         that was used for this character
     */
    static int getCIDCharValues(int i, final byte[] stream, final int streamLength, final GlyphData glyphData, final PdfFont currentFontData, final ParserOptions parserOptions) {

        /*
         * first time we read the first 2 values and then decide if we are in single or
         * double byte mode
         * (ie is there a 0 x 0y pattern)
         * (or do the 2 values on their own form valid settings)
         */
        final boolean debug = false;

        //stays 0 for embedded fonts in the 2 byte path and for OneByteIdentityH
        float actualWidth = 0;

        //lazy init if needed
        if (StandardFonts.CMAP == null) {
            StandardFonts.readCMAP();
        }

        int firstVal = glyphData.getRawInt();
        String newValue = null;

        //System.out.println(">>"+Integer.toHexString(firstVal));

        //if escaped roll on
        if (firstVal == BACKSLASH) {

            //decode whatever follows the backslash and move i onto its last byte
            final DecodedEscape escape = decodeEscape(i + 1, stream, streamLength);
            firstVal = escape.value;
            i = escape.lastIndex;

            glyphData.setRaw(firstVal);

        } else {
            firstVal = glyphData.getRawChar();
        }

        //get as 1 byte value
        final String firstValue = StandardFonts.CMAP[glyphData.getRawChar()];

        if (debug) {
            System.out.println("1 byte values=" + (int) glyphData.getRawChar() + " val=" + firstValue + " isDouble=" + currentFontData.isCIDFont() + " currentFontData.hasDoubleBytes=" + currentFontData.hasDoubleBytes + ' ' + currentFontData.isDoubleBytes()); //+" "+(char)stream[i-2]+" "+(char)stream[i-1]+" "+(char)stream[i]+" "+(char)stream[i+1]+" "+(char)stream[i+2]+" "+(char)stream[i+3]);
        }

        /*
         * read second byte if needed (we always read first time to see if double byte or single)
         */
        final boolean isEmbedded = currentFontData.isFontEmbedded;

        //also check if mapped in Charstring
        //separates out
        // PDFdata/baseline_screens/customersDec2012/5771020130000784D.pdf and
        //PDFdata/sample_pdfs_html/general/JavaMagazine glassfish article.pdf
        final boolean hasCharString = glyphData.getRawInt() > 0 && currentFontData.CMapName != null && currentFontData.getFontType() == StandardFonts.CIDTYPE0 && currentFontData.getGlyphData().getCharStrings().containsKey(String.valueOf(glyphData.getRawInt()));

        final boolean isOneByteIdentity = currentFontData.CMapName != null && currentFontData.CMapName.equals("OneByteIdentityH");

        if (isOneByteIdentity) {
            //ignore this case - value stays as the single byte already read
            //System.out.println(currentFontData.CMapName);

        } else if (!hasCharString && (currentFontData.hasDoubleBytes || firstValue == null || currentFontData.isDoubleBytes() != 0 || (glyphData.getRawInt() > 128 && glyphData.getRawInt() != 233))) {
            //NOTE(review): the 233 exclusion is undocumented - presumably a sample-driven workaround

            //flag incase we are wrong and need to switch back
            final int iBefore = i;

            i++;

            int secondVal = stream[i] & 255;

            boolean secondByteIsEscaped = false;

            //if escaped roll on as workaround hack
            if (stream[i] == BACKSLASH) {
                i++;

                secondByteIsEscaped = true;

                if (glyphData.getRawInt() == 0) {
                    while (stream[i] == CARRIAGE_RETURN || (stream[i] == BACKSLASH && stream[i - 1] == CARRIAGE_RETURN)) { //allow for garbage in stream
                        i++;
                    }
                }

                //same decoding as for the first byte
                final DecodedEscape escape = decodeEscape(i, stream, streamLength);
                secondVal = escape.value;
                i = escape.lastIndex;
            }

            final int secondByte = secondVal;

            //first value becomes the high byte; the cast keeps the low 16 bits only
            final char combinedVal = (char) ((glyphData.getRawChar() << 8) + secondVal);

            //lookup in 2 byte version
            newValue = StandardFonts.CMAP[combinedVal];

            //-1 means not decided yet
            int isDouble = -1;

            //if CIDtoGID use that first to see if double byte
            if (currentFontData.isCIDFont() && currentFontData.getGlyphData().getTable(FontFile2.CMAP) == null) {
                final int first = currentFontData.getEncodedCMAPValue(firstVal);
                final int second = currentFontData.getEncodedCMAPValue(secondVal);
                final int combined = currentFontData.getEncodedCMAPValue(combinedVal);

                //pair has no mapping but at least one byte does on its own, so treat as single byte
                if (combined <= 0 && (first > 0 || second > 0)) {
                    newValue = null;
                    isDouble = 0;
                }
            }

            if (isDouble == -1) {
                isDouble = currentFontData.isDoubleBytes(firstVal, secondByte, secondByteIsEscaped);
            }

            if (debug) {
                System.out.println("2 byte values=" + newValue + ' ' + " isDouble=" + isDouble + ' ' + combinedVal + ' ' + firstValue);
            }

            //if no 2 byte value either default to 1 byte
            if (isEmbedded && (isDouble == 1 || combinedVal < 256 || newValue != null)) { // || (!secondByteIsEscaped && secondByte!=')'))){
                //embedded font: accept the pair
                glyphData.setRawInt(combinedVal);
                glyphData.setRawChar(combinedVal);

                if (debug) {
                    System.out.println("use 2 values=" + Integer.toHexString(combinedVal) + " new value=" + newValue + " isEmbedded=" + isEmbedded + ' ' + (!secondByteIsEscaped && secondByte != ')'));
                }

            } else if (!isEmbedded && isDouble == 1 && (newValue != null || combinedVal < 256 || (!secondByteIsEscaped && secondByte != ')'))) {
                //non-embedded font reported as double byte: accept the pair
                glyphData.setRawInt(combinedVal);
                glyphData.setRawChar(combinedVal);

                if (debug) {
                    System.out.println("use 2 values=" + combinedVal + ' ' + newValue);
                }

            } else if (isDouble == 0 && !isEmbedded && firstVal > 128 && newValue != null && firstValue == null) {
                //only the 2 byte lookup produced a value: accept the pair
                glyphData.setRawInt(combinedVal);
                glyphData.setRawChar(combinedVal);

                if (debug) {
                    System.out.println("TEST2 " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
                }

            } else if (isDouble == 0 && !isEmbedded && firstVal > 128 && newValue == null && firstValue != null) {
                //only the 1 byte lookup produced a value: rewind and display that

                i = iBefore;
                //glyphData.rawInt=combinedVal;
                //rawChar=(char)f;
                // newValue = String.valueOf(rawChar);
                newValue = firstValue;
                if (debug) {
                    System.out.println("TEST2 " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
                }

            } else {
                //rewind so the second byte is not consumed; newValue is left as looked up
                i = iBefore;

                if (debug) {
                    System.out.println("reset " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
                }
            }

            if (!isEmbedded) {
                actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());

                //no width for this value so fall back to the value returned for -1
                if (actualWidth == -1) {
                    actualWidth = currentFontData.getDefaultWidth(-1);
                }
            }

        } else {

            //single byte path: -1 unless a non-embedded CID font supplies a width
            actualWidth = -1;

            if ((!isEmbedded) &&
                    (currentFontData.getFontType() == StandardFonts.CIDTYPE0 || currentFontData.getFontType() == StandardFonts.CIDTYPE2)) {
                actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());

                //fallback here is half of the value returned for -1
                if (actualWidth == -1) {
                    actualWidth = currentFontData.getDefaultWidth(-1) / 2;
                }
            }
        }


        glyphData.setActualWidth(actualWidth);

        //if no value ignore for moment
        if (newValue != null) {
            glyphData.setDisplayValue(newValue);
        } else {  //default if no value
            glyphData.setDisplayValue(String.valueOf(glyphData.getRawChar()));
        }

        if (parserOptions.isTextExtracted()) { //(not sure if this is correct - may need more samples)
            glyphData.setUnicodeValue(currentFontData.getUnicodeValue(glyphData.getDisplayValue(), glyphData.getRawChar()));
        }

        //fix for \\) at end of stream
        //HTML value keeps the backslash (92) while the raw char is replaced with 120 ('x')
        if (glyphData.getRawChar() == BACKSLASH) {
            glyphData.setValueForHTML(92);
            glyphData.setRawChar((char) 120);
        }

        if (debug) {
            System.out.println("returns =" + glyphData.getDisplayValue() + ' ' + glyphData.getUnicodeValue() + " int=" + glyphData.getRawInt() + " actualWidth=" + actualWidth);
        }

        return i;
    }

    /**
     * Decodes the escape sequence whose first byte (the one straight after the
     * backslash) is at {@code start}.
     * <ul>
     * <li>one to three digits: read as an octal number via
     * {@code TD.readEscapeValue}; 256 is subtracted from results above 255.
     * Only attempted when {@code streamLength > start + 2}.</li>
     * <li>{@code u}: the following four bytes are read as a hex number.</li>
     * <li>anything else: passed through {@link #convertEscapeChar(int)}.</li>
     * </ul>
     * NOTE(review): {@link Character#isDigit(char)} also accepts 8 and 9, which
     * are not octal digits - what {@code TD.readEscapeValue} does with them
     * cannot be seen from here; confirm before tightening the check.
     *
     * @param start        index of the byte following the backslash
     * @param stream       bytes being decoded
     * @param streamLength length used for the octal look-ahead test
     * @return the decoded value together with the index of the last byte used
     */
    private static DecodedEscape decodeEscape(final int start, final byte[] stream, final int streamLength) {

        int pos = start;
        int value = stream[pos] & 255;

        if ((streamLength > (pos + 2)) && (Character.isDigit((char) stream[pos]))) {

            //see how long number is
            int numberCount = 1;
            if (Character.isDigit((char) stream[pos + 1])) {
                numberCount++;
                if (Character.isDigit((char) stream[pos + 2])) {
                    numberCount++;
                }
            }

            // convert octal escapes
            value = TD.readEscapeValue(pos, numberCount, 8, stream);
            pos = pos + numberCount - 1;

            if (value > 255) {
                value -= 256;
            }

        } else if (value == 'u') { //convert unicode of format uxxxx to char value
            value = TD.readEscapeValue(pos + 1, 4, 16, stream);
            pos += 4;

        } else {

            value = convertEscapeChar(value);
        }

        return new DecodedEscape(value, pos);
    }

    /**
     * Maps the letter of a single character escape to the control character it
     * stands for.
     *
     * @param secondVal code of the character that followed the backslash
     * @return newline, backspace, tab, carriage return or form feed for
     *         {@code n}, {@code b}, {@code t}, {@code r}, {@code f}; any other
     *         value is returned unchanged
     */
    static int convertEscapeChar(final int secondVal) {
        switch (secondVal) {
            case 'n':
                return '\n';
            case 'b':
                return '\b';
            case 't':
                return '\t';
            case 'r':
                return '\r';
            case 'f':
                return '\f';
            default:
                return secondVal;
        }
    }

    /**
     * Result of decoding one escape sequence: the decoded value and the index
     * of the last stream byte that belonged to it.
     */
    private static final class DecodedEscape {

        final int value;
        final int lastIndex;

        DecodedEscape(final int value, final int lastIndex) {
            this.value = value;
            this.lastIndex = lastIndex;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy