![JAR search and dependency download from the Maven repository](/logo.png)
org.jpedal.parser.text.CIDTextUtils Maven / Gradle / Ivy
/*
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info: http://www.idrsolutions.com
* Help section for developers at http://www.idrsolutions.com/support/
*
* (C) Copyright 1997-2017 IDRsolutions and Contributors.
*
* This file is part of JPedal/JPDF2HTML5
*
@LICENSE@
*
* ---------------
* CIDTextUtils.java
* ---------------
*/
package org.jpedal.parser.text;
import org.jpedal.fonts.CodeSpaceRange;
import org.jpedal.fonts.PdfFont;
import org.jpedal.fonts.StandardFonts;
import org.jpedal.fonts.tt.FontFile2;
import org.jpedal.parser.ParserOptions;
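/**
* Static helpers used by the text parser to decode CID font character codes
* from a raw byte stream into {@code GlyphData} values.
*
* @author markee
*/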
public class CIDTextUtils {
/**
 * Reads one character code from {@code stream}, starting at index {@code i}, and
 * stores the decoded values in {@code glyphData}.
 * <p>
 * Bytes are folded big-endian into a code ({@code v = (v << 8) | byte}) until the
 * code space range held by {@code glyphData} accepts the code for the number of
 * bytes read so far. A backslash escape (octal {@code \ddd} or a single escaped
 * character) counts as one byte of the code.
 * <p>
 * The code is then looked up in {@code cidMap} and {@code uniMap} of the code space
 * range; raw value, display value, unicode value and (when positive) the width are
 * written to {@code glyphData}.
 *
 * @param i               index in {@code stream} of the first byte of the code
 * @param stream          raw bytes being parsed
 * @param streamLength    bound used when looking ahead for further octal digits
 * @param glyphData       supplies the code space range and receives the results
 * @param currentFontData font queried for glyph widths and font type
 * @param parserOptions   not used by this method
 * @return index of the last byte consumed (the caller advances by one itself)
 */
static int getNonEmbedCIDCharValues(final int i, final byte[] stream, final int streamLength, final GlyphData glyphData, final PdfFont currentFontData, final ParserOptions parserOptions) {
    final CodeSpaceRange cmap = glyphData.getCodeSpaceRange();

    int nn = 0; // number of stream bytes consumed so far
    int v = 0;  // character code assembled so far
    int p = 1;  // number of code bytes folded into v

    // NOTE(review): the loop only ends when the code space accepts the code; there is
    // no explicit end-of-stream guard, so a stream without a matching code runs off
    // the array - confirm callers guarantee a match.
    while (true) {
        int b1 = stream[i + nn] & 0xff;
        nn++;

        if (b1 == 92) { // backslash introduces an escape
            b1 = stream[i + nn] & 0xff;
            nn++;

            if (isOctalDigit(b1)) {
                // \d, \dd or \ddd - only 0-7 are octal digits, so an '8' or '9'
                // ends the sequence instead of breaking the conversion
                int octal = b1 - '0';
                for (int extra = 0; extra < 2 && (i + nn) < streamLength && isOctalDigit(stream[i + nn] & 0xff); extra++) {
                    octal = (octal << 3) | ((stream[i + nn] & 0xff) - '0');
                    nn++;
                }
                // an escape denotes a single byte: drop overflow above 8 bits so it
                // cannot spill into the previously shifted bytes of v
                b1 = octal & 0xff;
            } else {
                b1 = convertEscapeChar(b1);
            }
        }

        v = (v << 8) | b1;
        if (cmap.isInCodeSpaceRange(v, p)) {
            break;
        }
        p++;
    }

    final int cid = cmap.cidMap[v];
    final int uni = cmap.uniMap[cid];

    glyphData.setRawInt(v);
    glyphData.setRawChar((char) v);

    final String text = String.valueOf((char) uni);
    glyphData.setDisplayValue(text);
    glyphData.setUnicodeValue(text);

    float actualWidth;
    if (p > 1) {
        // multi-byte code: font width for the CID, else the font-wide default
        actualWidth = currentFontData.getDefaultWidth(cid);
        if (actualWidth == -1) {
            actualWidth = currentFontData.getDefaultWidth(-1);
        }
    } else {
        // single-byte code: only CID fonts get a width, falling back to half the default
        actualWidth = -1;
        if ((currentFontData.getFontType() == StandardFonts.CIDTYPE0 || currentFontData.getFontType() == StandardFonts.CIDTYPE2)) {
            actualWidth = currentFontData.getDefaultWidth(cid);
            if (actualWidth == -1) {
                actualWidth = currentFontData.getDefaultWidth(-1) / 2;
            }
        }
    }

    if (actualWidth > 0) {
        glyphData.setActualWidth(actualWidth);
    }

    return i + nn - 1; // the parent code increasing by 1 so reduce 1 here
}

/**
 * Tells whether {@code v} is the character code of an octal digit ('0' to '7').
 */
private static boolean isOctalDigit(final int v) {
    return v >= '0' && v <= '7';
}
/**
 * Decodes the character code currently held in {@code glyphData} (and, when the
 * heuristics below decide the code is double byte, the following byte of
 * {@code stream}) and stores raw value, display value, width and - when text is
 * being extracted - unicode value back into {@code glyphData}.
 * <p>
 * Backslash escapes (octal, {@code uxxxx} and single escaped characters) are
 * resolved for both the first and the second byte.
 *
 * @param i               current index in {@code stream}
 *                        (presumably the byte whose value is in {@code glyphData.getRawInt()} - confirm against caller)
 * @param stream          raw bytes being parsed
 * @param streamLength    bound used before looking ahead for octal digits
 * @param glyphData       supplies the first value and receives the results
 * @param currentFontData font consulted for encoding, double-byte flags and widths
 * @param parserOptions   consulted to see whether text is being extracted
 * @return the updated index into {@code stream}; it is reset to its earlier value
 *         when the second byte turns out not to belong to this code
 */
static int getCIDCharValues(int i, final byte[] stream, final int streamLength, final GlyphData glyphData, final PdfFont currentFontData, final ParserOptions parserOptions) {
/*
* first time we read the first 2 values and then decide if we are in single or
* double byte mode
* (ie is there a 0 x 0y pattern)
* (or do the 2 values on their own form valid settings)
*/
// compile-time switch for the diagnostic println calls below
final boolean debug = false;
float actualWidth = 0;
//lazy init if needed
if (StandardFonts.CMAP == null) {
StandardFonts.readCMAP();
}
int firstVal = glyphData.getRawInt();
final String firstValue;
String newValue = null;
//System.out.println(">>"+Integer.toHexString(firstVal));
//if escaped roll on (92 is the backslash character)
if (firstVal == 92) {
i++;
firstVal = stream[i] & 255;
if ((streamLength > (i + 2)) && (Character.isDigit((char) stream[i]))) {
//see how long number is (1 to 3 digits)
int numberCount = 1;
if (Character.isDigit((char) stream[i + 1])) {
numberCount++;
if (Character.isDigit((char) stream[i + 2])) {
numberCount++;
}
}
// convert octal escapes
firstVal = TD.readEscapeValue(i, numberCount, 8, stream);
// leave i on the last digit of the escape
i = i + numberCount - 1;
// wrap values above 255 back into the single-byte range
if (firstVal > 255) {
firstVal -= 256;
}
} else if (firstVal == 'u') { //convert unicode of format uxxxx to char value
firstVal = TD.readEscapeValue(i + 1, 4, 16, stream);
i += 4;
} else {
firstVal = convertEscapeChar(firstVal);
}
// store the unescaped value so the lookups below see it
glyphData.setRaw(firstVal);
} else {
firstVal = glyphData.getRawChar();
}
//get as 1 byte value
firstValue = StandardFonts.CMAP[glyphData.getRawChar()];
if (debug) {
System.out.println("1 byte values=" + (int) glyphData.getRawChar() + " val=" + firstValue + " isDouble=" + currentFontData.isCIDFont() + " currentFontData.hasDoubleBytes=" + currentFontData.hasDoubleBytes + ' ' + currentFontData.isDoubleBytes()); //+" "+(char)stream[i-2]+" "+(char)stream[i-1]+" "+(char)stream[i]+" "+(char)stream[i+1]+" "+(char)stream[i+2]+" "+(char)stream[i+3]);
}
/*
* read second byte if needed (we always read first time to see if double byte or single)
*/
final boolean isEmbedded = currentFontData.isFontEmbedded;
//also check if mapped in Charstring
//separates out
// PDFdata/baseline_screens/customersDec2012/5771020130000784D.pdf and
//PDFdata/sample_pdfs_html/general/JavaMagazine glassfish article.pdf
final boolean hasCharString = glyphData.getRawInt() > 0 && currentFontData.CMapName != null && currentFontData.getFontType() == StandardFonts.CIDTYPE0 && currentFontData.getGlyphData().getCharStrings().containsKey(String.valueOf(glyphData.getRawInt()));
//ignore this case - no second byte is read and actualWidth keeps its initial value of 0
if (currentFontData.CMapName != null && currentFontData.CMapName.equals("OneByteIdentityH")) {
//System.out.println(currentFontData.CMapName);
} else if (!hasCharString && (currentFontData.hasDoubleBytes || firstValue == null || currentFontData.isDoubleBytes() != 0 || (glyphData.getRawInt() > 128 && glyphData.getRawInt() != 233))) {
//flag incase we are wrong and need to switch back
final int iBefore = i;
i++;
int secondVal = stream[i] & 255;
boolean secondByteIsEscaped = false;
//if escaped roll on as workaround hack
if (stream[i] == 92) {
i++;
secondByteIsEscaped = true;
if (glyphData.getRawInt() == 0) {
// 13 is carriage return, 92 is backslash
while (stream[i] == 13 || (stream[i] == 92 && stream[i - 1] == 13)) { //allow for garbage in stream
i++;
}
}
secondVal = stream[i] & 255;
if ((streamLength > (i + 2)) && (Character.isDigit((char) stream[i]))) {
//see how long number is (1 to 3 digits)
int numberCount = 1;
if (Character.isDigit((char) stream[i + 1])) {
numberCount++;
if (Character.isDigit((char) stream[i + 2])) {
numberCount++;
}
}
// convert octal escapes
secondVal = TD.readEscapeValue(i, numberCount, 8, stream);
// leave i on the last digit of the escape
i = i + numberCount - 1;
// wrap values above 255 back into the single-byte range
if (secondVal > 255) {
secondVal -= 256;
}
} else if (secondVal == 'u') { //convert unicode of format uxxxx to char value
secondVal = TD.readEscapeValue(i + 1, 4, 16, stream);
i += 4;
} else {
secondVal = convertEscapeChar(secondVal);
}
}
final int secondByte = secondVal;
// first value becomes the high byte, second value is added as the low part
final char combinedVal = (char) ((glyphData.getRawChar() << 8) + secondVal);
//lookup in 2 byte version
newValue = StandardFonts.CMAP[combinedVal];
// -1 means "not decided yet"
int isDouble = -1;
//if CIDtoGID use that first to see if double byte
if (currentFontData.isCIDFont() && currentFontData.getGlyphData().getTable(FontFile2.CMAP) == null) {
final int first = currentFontData.getEncodedCMAPValue(firstVal);
final int second = currentFontData.getEncodedCMAPValue(secondVal);
final int combined = currentFontData.getEncodedCMAPValue(combinedVal);
// combined code unmapped but at least one single byte mapped: treat as single byte
if (combined <= 0 && (first > 0 || second > 0)) {
newValue = null;
isDouble = 0;
}
}
if (isDouble == -1) {
isDouble = currentFontData.isDoubleBytes(firstVal, secondByte, secondByteIsEscaped);
}
if (debug) {
System.out.println("2 byte values=" + newValue + ' ' + " isDouble=" + isDouble + ' ' + combinedVal + ' ' + firstValue);
}
//if no 2 byte value either default to 1 byte
if (isEmbedded && (isDouble == 1 || combinedVal < 256 || newValue != null)) { // || (!secondByteIsEscaped && secondByte!=')'))){
glyphData.setRawInt(combinedVal);
glyphData.setRawChar(combinedVal);
if (debug) {
System.out.println("use 2 values=" + Integer.toHexString(combinedVal) + " new value=" + newValue + " isEmbedded=" + isEmbedded + ' ' + (!secondByteIsEscaped && secondByte != ')'));
}
} else if (!isEmbedded && isDouble == 1 && (newValue != null || combinedVal < 256 || (!secondByteIsEscaped && secondByte != ')'))) {
glyphData.setRawInt(combinedVal);
glyphData.setRawChar(combinedVal);
if (debug) {
System.out.println("use 2 values=" + combinedVal + ' ' + newValue);
}
} else if (isDouble == 0 && !isEmbedded && firstVal > 128 && newValue != null && firstValue == null) {
glyphData.setRawInt(combinedVal);
glyphData.setRawChar(combinedVal);
if (debug) {
System.out.println("TEST2 " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
}
} else if (isDouble == 0 && !isEmbedded && firstVal > 128 && newValue == null && firstValue != null) {
// single byte after all: rewind so the second byte is not consumed
i = iBefore;
//glyphData.rawInt=combinedVal;
//rawChar=(char)f;
// newValue = String.valueOf(rawChar);
newValue = firstValue;
if (debug) {
System.out.println("TEST2 " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
}
} else {
// single byte after all: rewind so the second byte is not consumed
i = iBefore;
if (debug) {
System.out.println("reset " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
}
}
// non-embedded font: width for the raw value, else the font-wide default
if (!isEmbedded) {
actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());
if (actualWidth == -1) {
actualWidth = currentFontData.getDefaultWidth(-1);
}
}
} else {
// single-byte path: only non-embedded CID fonts get a width, falling back to half the default
actualWidth = -1;
if ((!isEmbedded)
&& (currentFontData.getFontType() == StandardFonts.CIDTYPE0 || currentFontData.getFontType() == StandardFonts.CIDTYPE2)) {
actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());
if (actualWidth == -1) {
actualWidth = currentFontData.getDefaultWidth(-1) / 2;
}
}
}
glyphData.setActualWidth(actualWidth);
//if no value ignore for moment
if (newValue != null) {
glyphData.setDisplayValue(newValue);
} else { //default if no value
glyphData.setDisplayValue(String.valueOf(glyphData.getRawChar()));
}
if (parserOptions.isTextExtracted()) { //(not sure if this is correct - may need more samples)
glyphData.setUnicodeValue(currentFontData.getUnicodeValue(glyphData.getDisplayValue(), glyphData.getRawChar()));
}
//fix for \\) at end of stream: record 92 (backslash) as the HTML value and swap the raw char for 120 ('x')
if (glyphData.getRawChar() == 92) {
glyphData.setValueForHTML(92);
glyphData.setRawChar((char) 120);
}
if (debug) {
System.out.println("returns =" + glyphData.getDisplayValue() + ' ' + glyphData.getUnicodeValue() + " int=" + glyphData.getRawInt() + " actualWidth=" + actualWidth);
}
return i;
}
/**
 * Translates the character that follows a backslash into the control character
 * it stands for.
 * <p>
 * {@code n}, {@code b}, {@code t}, {@code r} and {@code f} become newline, backspace,
 * tab, carriage return and form feed; every other value is returned unchanged.
 *
 * @param secondVal character code found after the backslash
 * @return the control character code, or {@code secondVal} itself when it is not
 *         one of the recognised escape letters
 */
static int convertEscapeChar(final int secondVal) {
    // parallel tables: the letter at position k maps to the control char at position k
    final String escapeLetters = "nbtrf";
    final String controlChars = "\n\b\t\r\f";

    final int slot = escapeLetters.indexOf(secondVal);
    if (slot < 0) {
        return secondVal;
    }
    return controlChars.charAt(slot);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy