All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jpedal.parser.text.CIDTextUtils Maven / Gradle / Ivy

There is a newer version: 7.15.25
Show newest version
/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2017 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
 @LICENSE@
 *
 * ---------------
 * CIDTextUtils.java
 * ---------------
 */
package org.jpedal.parser.text;

import org.jpedal.fonts.CodeSpaceRange;
import org.jpedal.fonts.PdfFont;
import org.jpedal.fonts.StandardFonts;
import org.jpedal.fonts.tt.FontFile2;
import org.jpedal.parser.ParserOptions;

/**
 * Static helpers used by the text parser to decode the character codes of
 * CID fonts from the raw bytes of a text string.
 *
 * @author markee
 */
public class CIDTextUtils {

	/**
	 * Decodes one character code starting at {@code stream[i]} by accumulating
	 * bytes (most significant first) until the code space range held by
	 * {@code glyphData} accepts the value, then stores the raw value, its
	 * display/unicode text and (when positive) its width on {@code glyphData}.
	 * <p>
	 * A backslash (92) introduces an escape: one to three <em>octal</em> digits,
	 * or a single character translated by {@link #convertEscapeChar(int)}.
	 * Octal values above 255 are reduced to their low byte, in line with how
	 * {@code getCIDCharValues} folds such values back into one byte.
	 * <p>
	 * If the bytes run out before any code space range matches, decoding stops
	 * at the end of the array and the bytes gathered so far are used, rather
	 * than running off the end of {@code stream}.
	 *
	 * @param i               index of the first byte of the code in {@code stream}
	 * @param stream          bytes being decoded
	 * @param streamLength    limit used when looking ahead for the optional second
	 *                        and third octal digit of an escape
	 * @param glyphData       supplies the code space range and receives the result
	 * @param currentFontData font queried for glyph widths
	 * @param parserOptions   not used by this method
	 * @return index of the last byte consumed (the caller advances by one itself)
	 * @throws ArrayIndexOutOfBoundsException if {@code i} itself is outside {@code stream}
	 */
	static int getNonEmbedCIDCharValues(final int i, final byte[] stream, final int streamLength, final GlyphData glyphData, final PdfFont currentFontData, final ParserOptions parserOptions) {
		final CodeSpaceRange cmap = glyphData.getCodeSpaceRange();

		// Code bytes are bounded by the array only, so every input that decoded
		// successfully before still decodes to the same value; streamLength keeps
		// its original role of limiting the octal digit look-ahead.
		final int limit = stream.length;
		final int lookaheadLimit = Math.min(streamLength, limit);

		int pos = i;
		int v = 0; // code value accumulated so far
		int p = 0; // number of code bytes accumulated so far
		boolean matched;
		do {
			int b1 = stream[pos] & 0xff;
			pos++;

			// a backslash with nothing after it is kept as a literal byte
			if (b1 == 92 && pos < limit) {
				b1 = stream[pos] & 0xff;
				pos++;
				if (isOctalDigit(b1)) {
					int octal = b1 - '0';
					for (int digits = 1; digits < 3 && pos < lookaheadLimit && isOctalDigit(stream[pos] & 0xff); digits++) {
						octal = (octal << 3) | ((stream[pos] & 0xff) - '0');
						pos++;
					}
					// an escape describes a single byte: drop any high-order overflow (e.g. \777)
					b1 = octal & 0xff;
				} else {
					b1 = convertEscapeChar(b1);
				}
			}

			v = (v << 8) | b1;
			p++;
			matched = cmap.isInCodeSpaceRange(v, p);
		} while (!matched && pos < limit);

		final int uni = cmap.getEncoding().getUnicodeValue(v);
		glyphData.setRawInt(v);
		glyphData.setRawChar((char) v);

		final String text = String.valueOf((char) uni);
		glyphData.setDisplayValue(text);
		glyphData.setUnicodeValue(text);

		float actualWidth;
		if (p > 1) {
			// multi-byte code: use the font's width, else its overall default
			actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());
			if (actualWidth == -1) {
				actualWidth = currentFontData.getDefaultWidth(-1);
			}
		} else {
			// single-byte code: only CID fonts supply a width, defaulting to half the overall default
			actualWidth = -1;
			if ((currentFontData.getFontType() == StandardFonts.CIDTYPE0 || currentFontData.getFontType() == StandardFonts.CIDTYPE2)) {
				actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());
				if (actualWidth == -1) {
					actualWidth = currentFontData.getDefaultWidth(-1) / 2;
				}
			}
		}
		if (actualWidth > 0) {
			glyphData.setActualWidth(actualWidth);
		}

		return pos - 1; // the parent code increasing by 1 so reduce 1 here
	}

	/**
	 * Tells whether {@code v} is the character code of an octal digit
	 * ({@code '0'} to {@code '7'}).
	 */
	private static boolean isOctalDigit(final int v) {
		return v >= '0' && v <= '7';
	}

	/**
	 * Decodes the character code whose first byte has already been stored in
	 * {@code glyphData}, deciding whether it is a one or two byte code.
	 * <p>
	 * The first value is taken from {@code glyphData.getRawInt()}; when it is a
	 * backslash (92) the escape that follows in {@code stream} is decoded (up to
	 * three digits read with radix 8, {@code u} followed by four hex digits, or a
	 * single character via {@link #convertEscapeChar(int)}). Unless the font's
	 * CMap is named {@code OneByteIdentityH} or the value is found in the font's
	 * char strings, a second byte is read when the conditions below ask for it,
	 * and the combined value replaces the single byte one when the font and the
	 * {@code StandardFonts.CMAP} table indicate a two byte code; otherwise the
	 * index is rolled back. Finally the width, display value and (when text
	 * extraction is enabled) unicode value are stored on {@code glyphData}.
	 *
	 * @param i               current index into {@code stream}; presumably the position
	 *                        of the byte already loaded into {@code glyphData} - confirm
	 *                        against the caller
	 * @param stream          bytes being decoded
	 * @param streamLength    limit used before looking ahead for octal digits
	 * @param glyphData       holds the first byte on entry and receives the result
	 * @param currentFontData font supplying CMap, double-byte and width information
	 * @param parserOptions   consulted for {@code isTextExtracted()}
	 * @return index of the last byte consumed
	 */
	static int getCIDCharValues(int i, final byte[] stream, final int streamLength, final GlyphData glyphData, final PdfFont currentFontData, final ParserOptions parserOptions) {

		/*
		 * first time we read the first 2 values and then decide if we are in single or
		 * double byte mode
		 * (ie is there a 0 x 0y pattern)
		 * (or do the 2 values on their own form valid settings)
		 */
		final boolean debug = false;

		float actualWidth = 0;

		//lazy init if needed
		if (StandardFonts.CMAP == null) {
			StandardFonts.readCMAP();
		}

		int firstVal = glyphData.getRawInt();
		final String firstValue;
		String newValue = null;

		//System.out.println(">>"+Integer.toHexString(firstVal));
		//if escaped roll on (92 is the backslash character)
		if (firstVal == 92) {

			i++;

			// NOTE(review): no bounds check here - a backslash as the final byte would read past the array
			firstVal = stream[i] & 255;

			if ((streamLength > (i + 2)) && (Character.isDigit((char) stream[i]))) {

				//see how long number is
				int numberCount = 1;
				if (Character.isDigit((char) stream[i + 1])) {
					numberCount++;
					if (Character.isDigit((char) stream[i + 2])) {
						numberCount++;
					}
				}

				// convert octal escapes
				firstVal = TD.readEscapeValue(i, numberCount, 8, stream);
				// leave i on the last digit consumed
				i = i + numberCount - 1;

				// fold values above one byte back into the 0-255 range
				if (firstVal > 255) {
					firstVal -= 256;
				}

			} else if (firstVal == 'u') { //convert unicode of format uxxxx to char value
				firstVal = TD.readEscapeValue(i + 1, 4, 16, stream);
				i += 4;

			} else {

				firstVal = convertEscapeChar(firstVal);
			}

			glyphData.setRaw(firstVal);

		} else {
			firstVal = glyphData.getRawChar();
		}

		//get as 1 byte value
		firstValue = StandardFonts.CMAP[glyphData.getRawChar()];

		if (debug) {
			System.out.println("1 byte values=" + (int) glyphData.getRawChar() + " val=" + firstValue + " isDouble=" + currentFontData.isCIDFont() + " currentFontData.hasDoubleBytes=" + currentFontData.hasDoubleBytes + ' ' + currentFontData.isDoubleBytes()); //+" "+(char)stream[i-2]+" "+(char)stream[i-1]+" "+(char)stream[i]+" "+(char)stream[i+1]+" "+(char)stream[i+2]+" "+(char)stream[i+3]);
		}

		/*
		 * read second byte if needed (we always read first time to see if double byte or single)
		 */
		final boolean isEmbedded = currentFontData.isFontEmbedded;

		//also check if mapped in Charstring
		//separates out
		// PDFdata/baseline_screens/customersDec2012/5771020130000784D.pdf and
		//PDFdata/sample_pdfs_html/general/JavaMagazine glassfish article.pdf
		final boolean hasCharString = glyphData.getRawInt() > 0 && currentFontData.CMapName != null && currentFontData.getFontType() == StandardFonts.CIDTYPE0 && currentFontData.getGlyphData().getCharStrings().containsKey(String.valueOf(glyphData.getRawInt()));

		//ignore this case
		if (currentFontData.CMapName != null && currentFontData.CMapName.equals("OneByteIdentityH")) {
			//System.out.println(currentFontData.CMapName);

		// NOTE(review): the reason 233 is excluded below is not visible in this file
		} else if (!hasCharString && (currentFontData.hasDoubleBytes || firstValue == null || currentFontData.isDoubleBytes() != 0 || (glyphData.getRawInt() > 128 && glyphData.getRawInt() != 233))) {

			//flag incase we are wrong and need to switch back
			final int iBefore = i;

			i++;

			int secondVal = stream[i] & 255;

			boolean secondByteIsEscaped = false;

			//if escaped roll on as workaround hack
			if (stream[i] == 92) {
				i++;

				secondByteIsEscaped = true;

				if (glyphData.getRawInt() == 0) {
					// 13 is carriage return; skips CR and a backslash directly following a CR
					while (stream[i] == 13 || (stream[i] == 92 && stream[i - 1] == 13)) { //allow for garbage in stream
						i++;
					}
				}
				secondVal = stream[i] & 255;

				if ((streamLength > (i + 2)) && (Character.isDigit((char) stream[i]))) {

					//see how long number is
					int numberCount = 1;
					if (Character.isDigit((char) stream[i + 1])) {
						numberCount++;
						if (Character.isDigit((char) stream[i + 2])) {
							numberCount++;
						}
					}

					// convert octal escapes
					secondVal = TD.readEscapeValue(i, numberCount, 8, stream);
					i = i + numberCount - 1;

					if (secondVal > 255) {
						secondVal -= 256;
					}

				} else if (secondVal == 'u') { //convert unicode of format uxxxx to char value
					secondVal = TD.readEscapeValue(i + 1, 4, 16, stream);
					i += 4;

				} else {

					secondVal = convertEscapeChar(secondVal);
				}
			}

			final int secondByte = secondVal;

			// first byte becomes the high byte, second value the low part
			final char combinedVal = (char) ((glyphData.getRawChar() << 8) + secondVal);

			//lookup in 2 byte version
			newValue = StandardFonts.CMAP[combinedVal];

			// -1 means "not decided yet"; otherwise the result of the double byte test
			int isDouble = -1;

			//if CIDtoGID use that first to see if double byte
			if (currentFontData.isCIDFont() && currentFontData.getGlyphData().getTable(FontFile2.CMAP) == null) {
				final int first = currentFontData.getEncodedCMAPValue(firstVal);
				final int second = currentFontData.getEncodedCMAPValue(secondVal);
				final int combined = currentFontData.getEncodedCMAPValue(combinedVal);
				if (combined <= 0 && (first > 0 || second > 0)) {
					newValue = null;
					isDouble = 0;
				}
			}

			if (isDouble == -1) {
				isDouble = currentFontData.isDoubleBytes(firstVal, secondByte, secondByteIsEscaped);
			}

			if (debug) {
				System.out.println("2 byte values=" + newValue + ' ' + " isDouble=" + isDouble + ' ' + combinedVal + ' ' + firstValue);
			}

			//if no 2 byte value either default to 1 byte
			if (isEmbedded && (isDouble == 1 || combinedVal < 256 || newValue != null)) { // || (!secondByteIsEscaped && secondByte!=')'))){
				glyphData.setRawInt(combinedVal);
				glyphData.setRawChar(combinedVal);

				if (debug) {
					System.out.println("use 2 values=" + Integer.toHexString(combinedVal) + " new value=" + newValue + " isEmbedded=" + isEmbedded + ' ' + (!secondByteIsEscaped && secondByte != ')'));
				}

			} else if (!isEmbedded && isDouble == 1 && (newValue != null || combinedVal < 256 || (!secondByteIsEscaped && secondByte != ')'))) {
				glyphData.setRawInt(combinedVal);
				glyphData.setRawChar(combinedVal);

				if (debug) {
					System.out.println("use 2 values=" + combinedVal + ' ' + newValue);
				}

			} else if (isDouble == 0 && !isEmbedded && firstVal > 128 && newValue != null && firstValue == null) {
				glyphData.setRawInt(combinedVal);
				glyphData.setRawChar(combinedVal);

				if (debug) {
					System.out.println("TEST2 " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
				}

			} else if (isDouble == 0 && !isEmbedded && firstVal > 128 && newValue == null && firstValue != null) {

				// single byte after all: rewind and show the 1 byte mapping
				i = iBefore;
				newValue = firstValue;
				if (debug) {
					System.out.println("TEST2 " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
				}

			} else {
				// single byte after all: rewind so the second byte is decoded on its own
				i = iBefore;

				if (debug) {
					System.out.println("reset " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
				}
			}

			if (!isEmbedded) {
				actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());

				if (actualWidth == -1) {
					actualWidth = currentFontData.getDefaultWidth(-1);
				}
			}

		} else {

			actualWidth = -1;

			if ((!isEmbedded)
					&& (currentFontData.getFontType() == StandardFonts.CIDTYPE0 || currentFontData.getFontType() == StandardFonts.CIDTYPE2)) {
				actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());

				if (actualWidth == -1) {
					actualWidth = currentFontData.getDefaultWidth(-1) / 2;
				}
			}
		}

		glyphData.setActualWidth(actualWidth);

		//if no value ignore for moment
		if (newValue != null) {
			glyphData.setDisplayValue(newValue);
		} else {  //default if no value
			glyphData.setDisplayValue(String.valueOf(glyphData.getRawChar()));
		}

		if (parserOptions.isTextExtracted()) { //(not sure if this is correct - may need more samples)
			glyphData.setUnicodeValue(currentFontData.getUnicodeValue(glyphData.getDisplayValue(), glyphData.getRawChar()));
		}

		//fix for \\) at end of stream
		if (glyphData.getRawChar() == 92) {
			glyphData.setValueForHTML(92);
			glyphData.setRawChar((char) 120);
		}

		if (debug) {
			System.out.println("returns =" + glyphData.getDisplayValue() + ' ' + glyphData.getUnicodeValue() + " int=" + glyphData.getRawInt() + " actualWidth=" + actualWidth);
		}

		return i;
	}

	/**
	 * Translates the character following a backslash into the value it stands
	 * for: {@code n}, {@code b}, {@code t}, {@code r} and {@code f} become line
	 * feed, backspace, tab, carriage return and form feed respectively; any
	 * other value is returned unchanged.
	 *
	 * @param secondVal character code that followed the backslash
	 * @return the translated character code
	 */
	static int convertEscapeChar(final int secondVal) {
		switch (secondVal) {
			case 'n':
				return '\n';
			case 'b':
				return '\b';
			case 't':
				return '\t';
			case 'r':
				return '\r';
			case 'f':
				return '\f';
			default:
				return secondVal;
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy