org.jpedal.parser.text.CIDTextUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of OpenViewerFX Show documentation
Show all versions of OpenViewerFX Show documentation
Open Source (LGPL) JavaFX PDF Viewer
/*
* ===========================================
* Java Pdf Extraction Decoding Access Library
* ===========================================
*
* Project Info: http://www.idrsolutions.com
* Help section for developers at http://www.idrsolutions.com/support/
*
* (C) Copyright 1997-2017 IDRsolutions and Contributors.
*
* This file is part of JPedal/JPDF2HTML5
*
@LICENSE@
*
* ---------------
* CIDTextUtils.java
* ---------------
*/
package org.jpedal.parser.text;
import org.jpedal.fonts.CodeSpaceRange;
import org.jpedal.fonts.PdfFont;
import org.jpedal.fonts.StandardFonts;
import org.jpedal.fonts.tt.FontFile2;
import org.jpedal.parser.ParserOptions;
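/**
 * Static helpers that decode character codes for CID fonts from a text byte stream
 * and store the decoded values (raw code, display/unicode value, width) in a GlyphData.
 *
 * @author markee
 */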
public class CIDTextUtils {
/**
 * Reads one character code from {@code stream}, starting at index {@code i}, growing the code one
 * byte at a time until the glyph's {@link CodeSpaceRange} accepts it, then stores the raw code,
 * display/unicode value and (when positive) the width in {@code glyphData}.
 * <p>
 * Backslash escapes are resolved per byte: {@code \ddd} is read as 1-3 OCTAL digits (only
 * {@code 0-7} are accepted, and the result is masked to a single byte so high-order overflow is
 * ignored); any other escaped character goes through {@link #convertEscapeChar(int)}, which
 * returns unknown characters unchanged (so {@code \8} yields {@code '8'}).
 *
 * @param i               index in {@code stream} of the first byte of the character code
 * @param stream          bytes being decoded
 * @param streamLength    upper bound used when looking ahead for further octal digits
 * @param glyphData       supplies the code space range and receives the decoded values
 * @param currentFontData font used to look up widths
 * @param parserOptions   not used by this method
 * @return index of the last byte consumed (the caller is expected to advance by one itself)
 */
static int getNonEmbedCIDCharValues(final int i, final byte[] stream, final int streamLength, final GlyphData glyphData, final PdfFont currentFontData, final ParserOptions parserOptions) {
    final CodeSpaceRange cmap = glyphData.getCodeSpaceRange();

    int consumed = 0;   // bytes read from stream so far (escape sequences included)
    int code = 0;       // character code accumulated big-endian, one byte per iteration
    int byteCount = 1;  // number of code bytes accumulated so far

    // NOTE(review): the loop only ends when isInCodeSpaceRange(...) accepts the code; there is no
    // cap on byteCount and the first read of each iteration is not checked against streamLength.
    while (true) {
        int b = stream[i + consumed] & 0xff;
        consumed++;

        if (b == '\\') {
            b = stream[i + consumed] & 0xff;
            consumed++;

            if (isOctalDigit(b)) {
                // up to three octal digits in total; stop at the first non-octal byte
                int octal = b - '0';
                for (int digits = 1; digits < 3 && (i + consumed) < streamLength && isOctalDigit(stream[i + consumed] & 0xff); digits++) {
                    octal = (octal << 3) | ((stream[i + consumed] & 0xff) - '0');
                    consumed++;
                }
                // \400-\777 overflow a byte: drop the high-order bit rather than corrupting
                // the neighbouring code byte
                b = octal & 0xff;
            } else {
                b = convertEscapeChar(b);
            }
        }

        code = (code << 8) | b;
        if (cmap.isInCodeSpaceRange(code, byteCount)) {
            break;
        }
        byteCount++;
    }

    final int uni = cmap.getEncoding().getUnicodeValue(code);
    glyphData.setRawInt(code);
    glyphData.setRawChar((char) code);
    glyphData.setDisplayValue("" + (char) uni);
    glyphData.setUnicodeValue("" + (char) uni);

    float actualWidth;
    if (byteCount > 1) {
        // multi-byte code: width for this code, else the font-wide default
        actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());
        if (actualWidth == -1) {
            actualWidth = currentFontData.getDefaultWidth(-1);
        }
    } else {
        // single-byte code: only CID fonts get a width, falling back to half the default
        actualWidth = -1;
        if ((currentFontData.getFontType() == StandardFonts.CIDTYPE0 || currentFontData.getFontType() == StandardFonts.CIDTYPE2)) {
            actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());
            if (actualWidth == -1) {
                actualWidth = currentFontData.getDefaultWidth(-1) / 2;
            }
        }
    }

    if (actualWidth > 0) {
        glyphData.setActualWidth(actualWidth);
    }

    return i + consumed - 1; // the parent code increasing by 1 so reduce 1 here
}

/** Returns true when {@code v} is an ASCII octal digit ('0'-'7'). */
private static boolean isOctalDigit(final int v) {
    return isDigit(v) && v <= '7';
}
/**
 * Tells whether the given byte value is an ASCII decimal digit.
 *
 * @param v value to test
 * @return true for '0' (48) through '9' (57), false for anything else
 */
private static boolean isDigit(final int v) {
    final boolean outsideRange = v < '0' || v > '9';
    return !outsideRange;
}
/**
 * Resolves the character code at the current position for a CID font, deciding whether it is a
 * 1-byte or a 2-byte code, and stores the outcome in {@code glyphData}.
 * <p>
 * The first byte value is taken from {@code glyphData}; if it is a backslash (92) the escape
 * sequence is read from {@code stream} and un-escaped. Unless the font's CMap name is
 * "OneByteIdentityH" or the value is found in the font's charstrings, the next byte is read as
 * well and combined as {@code (first << 8) + second}. Whether the combined value is kept depends on
 * whether the font is embedded, on {@code currentFontData.isDoubleBytes(...)} and on the
 * {@code StandardFonts.CMAP} lookups; when it is not kept, {@code i} is rewound to where it was
 * before the second byte was read.
 * <p>
 * Before returning, the actual width and display value are always set on {@code glyphData}, and the
 * unicode value is set when {@code parserOptions.isTextExtracted()} is true.
 *
 * @param i               current index in {@code stream} (presumably the index of the byte whose
 *                        value is already held in {@code glyphData} - confirm against the caller)
 * @param stream          bytes being decoded
 * @param streamLength    used only to guard the look-ahead for octal digits
 * @param glyphData       supplies the first byte value and receives the decoded values
 * @param currentFontData font consulted for CMap name, embedding, double-byte tests and widths
 * @param parserOptions   only {@code isTextExtracted()} is read
 * @return {@code i}, advanced past any escape sequence and second byte that were consumed
 */
static int getCIDCharValues(int i, final byte[] stream, final int streamLength, final GlyphData glyphData, final PdfFont currentFontData, final ParserOptions parserOptions) {
/*
* first time we read the first 2 values and then decide if we are in single or
* double byte mode
* (ie is there a 0 x 0y pattern)
* (or do the 2 values on their own form valid settings)
*/
// compile-time switch for the System.out tracing below
final boolean debug = false;
// stays 0 when neither width branch below assigns it (e.g. OneByteIdentityH, or embedded font in the 2-byte branch)
float actualWidth = 0;
//lazy init if needed
if (StandardFonts.CMAP == null) {
StandardFonts.readCMAP();
}
int firstVal = glyphData.getRawInt();
// 1-byte lookup result from StandardFonts.CMAP (may be null)
final String firstValue;
// 2-byte lookup result, or the 1-byte value when we fall back to it; null means "no mapping"
String newValue = null;
//System.out.println(">>"+Integer.toHexString(firstVal));
//if escaped roll on
// 92 is the backslash character
if (firstVal == 92) {
// NOTE(review): stream[i] is read here without checking i against streamLength
i++;
firstVal = stream[i] & 255;
if ((streamLength > (i + 2)) && (Character.isDigit((char) stream[i]))) {
//see how long number is
// up to three digits are treated as one octal escape
int numberCount = 1;
if (Character.isDigit((char) stream[i + 1])) {
numberCount++;
if (Character.isDigit((char) stream[i + 2])) {
numberCount++;
}
}
// convert octal escapes
firstVal = TD.readEscapeValue(i, numberCount, 8, stream);
// leave i on the last digit of the escape
i = i + numberCount - 1;
// three octal digits can exceed one byte; reduce by 256 when that happens
if (firstVal > 255) {
firstVal -= 256;
}
} else if (firstVal == 'u') { //convert unicode of format uxxxx to char value
firstVal = TD.readEscapeValue(i + 1, 4, 16, stream);
i += 4;
} else {
// single-character escape such as \n or \t; other characters are returned unchanged
firstVal = convertEscapeChar(firstVal);
}
// store the un-escaped value back into glyphData so the getRawChar()/getRawInt() reads below see it
// (presumably setRaw updates both - confirm in GlyphData)
glyphData.setRaw(firstVal);
} else {
firstVal = glyphData.getRawChar();
}
//get as 1 byte value
firstValue = StandardFonts.CMAP[glyphData.getRawChar()];
if (debug) {
System.out.println("1 byte values=" + (int) glyphData.getRawChar() + " val=" + firstValue + " isDouble=" + currentFontData.isCIDFont() + " currentFontData.hasDoubleBytes=" + currentFontData.hasDoubleBytes + ' ' + currentFontData.isDoubleBytes()); //+" "+(char)stream[i-2]+" "+(char)stream[i-1]+" "+(char)stream[i]+" "+(char)stream[i+1]+" "+(char)stream[i+2]+" "+(char)stream[i+3]);
}
/*
* read second byte if needed (we always read first time to see if double byte or single)
*/
final boolean isEmbedded = currentFontData.isFontEmbedded;
//also check if mapped in Charstring
//separates out
// PDFdata/baseline_screens/customersDec2012/5771020130000784D.pdf and
//PDFdata/sample_pdfs_html/general/JavaMagazine glassfish article.pdf
// true only for a CIDTYPE0 font with a CMap name whose charstrings contain the (positive) raw value as a key
final boolean hasCharString = glyphData.getRawInt() > 0 && currentFontData.CMapName != null && currentFontData.getFontType() == StandardFonts.CIDTYPE0 && currentFontData.getGlyphData().getCharStrings().containsKey(String.valueOf(glyphData.getRawInt()));
//ignore this case
if (currentFontData.CMapName != null && currentFontData.CMapName.equals("OneByteIdentityH")) {
//System.out.println(currentFontData.CMapName);
// a second byte is tried when the value is not a known charstring and any of: the font is flagged as having
// double bytes, the 1-byte lookup failed, isDoubleBytes() is non-zero, or the raw value is above 128 (233 excepted)
} else if (!hasCharString && (currentFontData.hasDoubleBytes || firstValue == null || currentFontData.isDoubleBytes() != 0 || (glyphData.getRawInt() > 128 && glyphData.getRawInt() != 233))) {
//flag incase we are wrong and need to switch back
final int iBefore = i;
// NOTE(review): no bounds check before reading the second byte
i++;
int secondVal = stream[i] & 255;
boolean secondByteIsEscaped = false;
//if escaped roll on as workaround hack
if (stream[i] == 92) {
i++;
secondByteIsEscaped = true;
// only when the first value is 0: skip CR bytes (13) and a backslash that directly follows a CR
if (glyphData.getRawInt() == 0) {
while (stream[i] == 13 || (stream[i] == 92 && stream[i - 1] == 13)) { //allow for garbage in stream
i++;
}
}
secondVal = stream[i] & 255;
// same escape decoding as for the first byte: octal, uXXXX, or single-character escape
if ((streamLength > (i + 2)) && (Character.isDigit((char) stream[i]))) {
//see how long number is
int numberCount = 1;
if (Character.isDigit((char) stream[i + 1])) {
numberCount++;
if (Character.isDigit((char) stream[i + 2])) {
numberCount++;
}
}
// convert octal escapes
secondVal = TD.readEscapeValue(i, numberCount, 8, stream);
i = i + numberCount - 1;
if (secondVal > 255) {
secondVal -= 256;
}
} else if (secondVal == 'u') { //convert unicode of format uxxxx to char value
secondVal = TD.readEscapeValue(i + 1, 4, 16, stream);
i += 4;
} else {
secondVal = convertEscapeChar(secondVal);
}
}
final int secondByte = secondVal;
// the cast to char keeps only the low 16 bits of (first << 8) + second
final char combinedVal = (char) ((glyphData.getRawChar() << 8) + secondVal);
//lookup in 2 byte version
newValue = StandardFonts.CMAP[combinedVal];
// -1 = not decided yet, 0 = single byte, 1 = double byte (as tested by the branches below)
int isDouble = -1;
//if CIDtoGID use that first to see if double byte
if (currentFontData.isCIDFont() && currentFontData.getGlyphData().getTable(FontFile2.CMAP) == null) {
final int first = currentFontData.getEncodedCMAPValue(firstVal);
final int second = currentFontData.getEncodedCMAPValue(secondVal);
final int combined = currentFontData.getEncodedCMAPValue(combinedVal);
// the combined code has no encoded value but at least one of the single bytes does: treat as single byte
if (combined <= 0 && (first > 0 || second > 0)) {
newValue = null;
isDouble = 0;
}
}
// otherwise let the font decide from the two byte values
if (isDouble == -1) {
isDouble = currentFontData.isDoubleBytes(firstVal, secondByte, secondByteIsEscaped);
}
if (debug) {
System.out.println("2 byte values=" + newValue + ' ' + " isDouble=" + isDouble + ' ' + combinedVal + ' ' + firstValue);
}
//if no 2 byte value either default to 1 byte
// case 1: embedded font - keep the combined value if flagged double, below 256, or found in the 2-byte table
if (isEmbedded && (isDouble == 1 || combinedVal < 256 || newValue != null)) { // || (!secondByteIsEscaped && secondByte!=')'))){
glyphData.setRawInt(combinedVal);
glyphData.setRawChar(combinedVal);
if (debug) {
System.out.println("use 2 values=" + Integer.toHexString(combinedVal) + " new value=" + newValue + " isEmbedded=" + isEmbedded + ' ' + (!secondByteIsEscaped && secondByte != ')'));
}
// case 2: non-embedded font flagged double - keep the combined value if found, below 256, or the second byte is an unescaped non-')'
} else if (!isEmbedded && isDouble == 1 && (newValue != null || combinedVal < 256 || (!secondByteIsEscaped && secondByte != ')'))) {
glyphData.setRawInt(combinedVal);
glyphData.setRawChar(combinedVal);
if (debug) {
System.out.println("use 2 values=" + combinedVal + ' ' + newValue);
}
// case 3: non-embedded, flagged single, but only the 2-byte lookup has a mapping - keep the combined value
} else if (isDouble == 0 && !isEmbedded && firstVal > 128 && newValue != null && firstValue == null) {
glyphData.setRawInt(combinedVal);
glyphData.setRawChar(combinedVal);
if (debug) {
System.out.println("TEST2 " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
}
// case 4: non-embedded, flagged single, only the 1-byte lookup has a mapping - rewind and use the 1-byte value
} else if (isDouble == 0 && !isEmbedded && firstVal > 128 && newValue == null && firstValue != null) {
i = iBefore;
//glyphData.rawInt=combinedVal;
//rawChar=(char)f;
// newValue = String.valueOf(rawChar);
newValue = firstValue;
if (debug) {
System.out.println("TEST2 " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
}
// case 5: anything else - rewind to before the second byte and keep the 1-byte raw value
// NOTE(review): newValue is not cleared here, so a non-null 2-byte lookup is still used as the display value below - confirm intended
} else {
i = iBefore;
if (debug) {
System.out.println("reset " + newValue + ' ' + StandardFonts.CMAP[secondByte]);
}
}
// non-embedded fonts: width for the chosen raw value, else the font default (lookup with -1)
if (!isEmbedded) {
actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());
if (actualWidth == -1) {
actualWidth = currentFontData.getDefaultWidth(-1);
}
}
} else {
// single-byte path: width stays -1 unless the font is a non-embedded CIDTYPE0/CIDTYPE2 font,
// in which case a missing width falls back to half of the default width
actualWidth = -1;
if ((!isEmbedded)
&& (currentFontData.getFontType() == StandardFonts.CIDTYPE0 || currentFontData.getFontType() == StandardFonts.CIDTYPE2)) {
actualWidth = currentFontData.getDefaultWidth(glyphData.getRawInt());
if (actualWidth == -1) {
actualWidth = currentFontData.getDefaultWidth(-1) / 2;
}
}
}
// set unconditionally, so 0 or -1 may be stored here
glyphData.setActualWidth(actualWidth);
//if no value ignore for moment
if (newValue != null) {
glyphData.setDisplayValue(newValue);
} else { //default if no value
glyphData.setDisplayValue(String.valueOf(glyphData.getRawChar()));
}
if (parserOptions.isTextExtracted()) { //(not sure if this is correct - may need more samples)
glyphData.setUnicodeValue(currentFontData.getUnicodeValue(glyphData.getDisplayValue(), glyphData.getRawChar()));
}
//fix for \\) at end of stream
// a raw backslash is passed to setValueForHTML and the raw char is replaced with 120 ('x')
if (glyphData.getRawChar() == 92) {
glyphData.setValueForHTML(92);
glyphData.setRawChar((char) 120);
}
if (debug) {
System.out.println("returns =" + glyphData.getDisplayValue() + ' ' + glyphData.getUnicodeValue() + " int=" + glyphData.getRawInt() + " actualWidth=" + actualWidth);
}
return i;
}
/**
 * Maps the character following a backslash to the control character it stands for.
 *
 * @param secondVal character code that followed the backslash
 * @return line feed for 'n', backspace for 'b', tab for 't', carriage return for 'r',
 *         form feed for 'f'; any other value is returned unchanged
 */
static int convertEscapeChar(final int secondVal) {
    if (secondVal == 'n') {
        return '\n';
    }
    if (secondVal == 'b') {
        return '\b';
    }
    if (secondVal == 't') {
        return '\t';
    }
    if (secondVal == 'r') {
        return '\r';
    }
    if (secondVal == 'f') {
        return '\f';
    }
    // not a recognised escape letter: hand the value back as-is
    return secondVal;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy