// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
*
* This file is part of the iText (R) project.
* Copyright (c) 1998-2019 iText Group NV
* Authors: Bruno Lowagie, et al.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License version 3
* as published by the Free Software Foundation with the addition of the
* following permission added to Section 15 as permitted in Section 7(a):
* FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
* ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT
* OF THIRD PARTY RIGHTS
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses or write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA, 02110-1301 USA, or download the license from the following URL:
* http://itextpdf.com/terms-of-use/
*
* The interactive user interfaces in modified source and object code versions
* of this program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU Affero General Public License.
*
* In accordance with Section 7(b) of the GNU Affero General Public License,
* a covered work must retain the producer line in every PDF that is created
* or manipulated using iText.
*
* You can be released from the requirements of the license by purchasing
* a commercial license. Buying such a license is mandatory as soon as you
* develop commercial activities involving the iText software without
* disclosing the source code of your own applications.
* These activities include: offering paid services to customers as an ASP,
* serving PDFs on the fly in a web application, shipping iText with a closed
* source product.
*
* For more information, please contact iText Software Corp. at this
* address: sales@itextpdf.com
*/
package com.itextpdf.text.pdf.parser;
import com.itextpdf.text.error_messages.MessageLocalization;
import com.itextpdf.text.pdf.*;
import com.itextpdf.text.xml.XMLUtil;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Set;
/**
* Converts a tagged PDF document into an XML file.
*
* @since 5.0.2
*/
public class TaggedPdfReaderTool {

    /** The reader object from which the content streams are read. */
    protected PdfReader reader;
    /** The writer object to which the XML will be written. */
    protected PrintWriter out;

    /**
     * Walks the structure tree of a tagged PDF and writes it as XML.
     *
     * The writer wrapped around {@code os} is always closed when this method
     * returns, whether it completes normally or throws.
     *
     * @param reader
     *            the PdfReader that has access to the PDF file
     * @param os
     *            the OutputStream to which the resulting xml will be written
     * @param charset
     *            the charset to encode the data
     * @throws IOException
     *             if the catalog has no StructTreeRoot entry, if the charset
     *             is not supported, or if reading page content fails
     * @since 5.0.5
     */
    public void convertToXml(PdfReader reader, OutputStream os, String charset)
            throws IOException {
        this.reader = reader;
        OutputStreamWriter outs = new OutputStreamWriter(os, charset);
        out = new PrintWriter(outs);
        try {
            // get the StructTreeRoot from the root object
            PdfDictionary catalog = reader.getCatalog();
            PdfDictionary struct = catalog.getAsDict(PdfName.STRUCTTREEROOT);
            if (struct == null) {
                throw new IOException(MessageLocalization.getComposedMessage("no.structtreeroot.found"));
            }
            // Inspect the child or children of the StructTreeRoot
            inspectChild(struct.getDirectObject(PdfName.K));
            out.flush();
        } finally {
            // Release the writer even when the structure root is missing or
            // parsing fails part-way through.
            out.close();
        }
    }

    /**
     * Walks the structure tree of a tagged PDF and writes it as XML encoded
     * in UTF-8.
     *
     * @param reader
     *            the PdfReader that has access to the PDF file
     * @param os
     *            the OutputStream to which the resulting xml will be written
     * @throws IOException
     *             see {@link #convertToXml(PdfReader, OutputStream, String)}
     */
    public void convertToXml(PdfReader reader, OutputStream os)
            throws IOException {
        convertToXml(reader, os, "UTF-8");
    }

    /**
     * Inspects a child of a structured element. This can be an array or a
     * dictionary; any other kind of object (or {@code null}) is ignored.
     *
     * @param k
     *            the child to inspect
     * @throws IOException
     *             if inspecting a nested child fails
     */
    public void inspectChild(PdfObject k) throws IOException {
        if (k == null) {
            return;
        }
        if (k instanceof PdfArray) {
            inspectChildArray((PdfArray) k);
        } else if (k instanceof PdfDictionary) {
            inspectChildDictionary((PdfDictionary) k);
        }
    }

    /**
     * If the child of a structured element is an array, we need to loop over
     * the elements.
     *
     * @param k
     *            the child array to inspect; {@code null} is ignored
     * @throws IOException
     *             if inspecting an element fails
     */
    public void inspectChildArray(PdfArray k) throws IOException {
        if (k == null) {
            return;
        }
        for (int i = 0; i < k.size(); i++) {
            inspectChild(k.getDirectObject(i));
        }
    }

    /**
     * If the child of a structured element is a dictionary, we inspect the
     * child; we may also draw a tag. Attributes are not written.
     *
     * @param k
     *            the child dictionary to inspect
     * @throws IOException
     *             if inspecting the dictionary fails
     */
    public void inspectChildDictionary(PdfDictionary k) throws IOException {
        inspectChildDictionary(k, false);
    }

    /**
     * If the child of a structured element is a dictionary, we inspect the
     * child; we may also draw a tag.
     *
     * When the dictionary has an /S entry, an element named after it is
     * opened, its content and children are written, and the element is closed
     * again. Without /S only the children (/K) are inspected.
     *
     * @param k
     *            the child dictionary to inspect; {@code null} is ignored
     * @param inspectAttributes
     *            if {@code true}, the entries of the /A dictionary are written
     *            as XML attributes of the element
     * @throws IOException
     *             if extracting content or inspecting a child fails
     */
    public void inspectChildDictionary(PdfDictionary k, boolean inspectAttributes) throws IOException {
        if (k == null) {
            return;
        }
        PdfName s = k.getAsName(PdfName.S);
        if (s == null) {
            // No structure type: nothing to tag, just descend.
            inspectChild(k.getDirectObject(PdfName.K));
            return;
        }

        String tagN = PdfName.decodeName(s.toString());
        String tag = fixTagName(tagN);
        out.print("<");
        out.print(tag);
        if (inspectAttributes) {
            PdfDictionary a = k.getAsDict(PdfName.A);
            if (a != null) {
                Set<PdfName> keys = a.getKeys();
                for (PdfName key : keys) {
                    PdfObject value = PdfReader.getPdfObject(a.get(key));
                    if (value == null) {
                        // Reference that could not be resolved: skip the
                        // attribute instead of failing on value.toString().
                        continue;
                    }
                    out.print(' ');
                    out.print(xmlName(key));
                    out.print("=\"");
                    // Escape so quotes or markup characters in the value
                    // cannot break the attribute.
                    out.print(XMLUtil.escapeXML(value.toString(), true));
                    out.print("\"");
                }
            }
        }
        out.print(">");

        // Alternate description, resolved in case it is stored indirectly.
        PdfObject alt = k.getDirectObject(PdfName.ALT);
        if (alt != null && alt.toString() != null) {
            out.print("<alt>");
            // NUL characters are not allowed in XML, so they are removed.
            out.print(XMLUtil.escapeXML(alt.toString().replace("\0", ""), true));
            out.print("</alt>");
        }

        PdfDictionary dict = k.getAsDict(PdfName.PG);
        if (dict != null) {
            parseTag(tagN, k.getDirectObject(PdfName.K), dict);
        }
        inspectChild(k.getDirectObject(PdfName.K));

        // Close the element opened above so the output stays well-formed.
        out.print("</");
        out.print(tag);
        out.println(">");
    }

    /**
     * Turns a PDF name into an attribute name: the first "/" is removed and
     * the first remaining character is lower-cased.
     *
     * @param name
     *            the PDF name of the attribute
     * @return the attribute name, or "_" when nothing is left after removing
     *         the slash
     */
    protected String xmlName(PdfName name) {
        String xmlName = name.toString().replaceFirst("/", "");
        if (xmlName.length() == 0) {
            // An empty PDF name would otherwise fail on charAt(0).
            return "_";
        }
        xmlName = Character.toLowerCase(xmlName.charAt(0))
                + xmlName.substring(1);
        return xmlName;
    }

    /**
     * Makes a structure type usable as an XML element name. A first character
     * that is not accepted as a name start is replaced by '_'; any later
     * character that is not accepted inside a name is replaced by '-'. The
     * accepted ranges are the ones tested below (they correspond to the
     * NameStartChar / NameChar character classes of XML 1.0 within the BMP).
     *
     * @param tag
     *            the decoded structure type
     * @return the sanitized name, or "_" for an empty input
     */
    private static String fixTagName(String tag) {
        if (tag.length() == 0) {
            // "<>" is not a valid element; use the same placeholder that
            // replaces an invalid first character.
            return "_";
        }
        StringBuilder sb = new StringBuilder();
        for (int k = 0; k < tag.length(); ++k) {
            char c = tag.charAt(k);
            boolean nameStart =
                c == ':'
                || (c >= 'A' && c <= 'Z')
                || c == '_'
                || (c >= 'a' && c <= 'z')
                || (c >= '\u00c0' && c <= '\u00d6')
                || (c >= '\u00d8' && c <= '\u00f6')
                || (c >= '\u00f8' && c <= '\u02ff')
                || (c >= '\u0370' && c <= '\u037d')
                || (c >= '\u037f' && c <= '\u1fff')
                || (c >= '\u200c' && c <= '\u200d')
                || (c >= '\u2070' && c <= '\u218f')
                || (c >= '\u2c00' && c <= '\u2fef')
                || (c >= '\u3001' && c <= '\ud7ff')
                || (c >= '\uf900' && c <= '\ufdcf')
                || (c >= '\ufdf0' && c <= '\ufffd');
            boolean nameMiddle =
                c == '-'
                || c == '.'
                || (c >= '0' && c <= '9')
                || c == '\u00b7'
                || (c >= '\u0300' && c <= '\u036f')
                || (c >= '\u203f' && c <= '\u2040')
                || nameStart;
            if (k == 0) {
                if (!nameStart) {
                    c = '_';
                }
            } else {
                if (!nameMiddle) {
                    c = '-';
                }
            }
            sb.append(c);
        }
        return sb.toString();
    }

    /**
     * Searches for a tag in a page and writes the text of the matching
     * marked content.
     *
     * @param tag
     *            the name of the tag
     * @param object
     *            an identifier to find the marked content: a number (the
     *            MCID), an array of identifiers, or a dictionary holding an
     *            MCID entry; anything else is ignored
     * @param page
     *            a page dictionary
     * @throws IOException
     *             if the page content cannot be read or processed
     */
    public void parseTag(String tag, PdfObject object, PdfDictionary page)
            throws IOException {
        // if the identifier is a number, we can extract the content right away
        if (object instanceof PdfNumber) {
            PdfNumber mcid = (PdfNumber) object;
            RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue());
            TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            FilteredTextRenderListener listener = new FilteredTextRenderListener(
                    strategy, filter);
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(
                    listener);
            processor.processContent(PdfReader.getPageContent(page), page
                    .getAsDict(PdfName.RESOURCES));
            out.print(XMLUtil.escapeXML(listener.getResultantText(), true));
        }
        // if the identifier is an array, we call the parseTag method
        // recursively
        else if (object instanceof PdfArray) {
            PdfArray arr = (PdfArray) object;
            int n = arr.size();
            for (int i = 0; i < n; i++) {
                // Resolve indirect references so that entries stored as
                // indirect objects are not silently skipped.
                parseTag(tag, arr.getDirectObject(i), page);
                if (i < n - 1) {
                    out.println();
                }
            }
        }
        // if the identifier is a dictionary, we get the page from the
        // dictionary
        else if (object instanceof PdfDictionary) {
            PdfDictionary mcr = (PdfDictionary) object;
            // The dictionary's own /Pg entry is optional; without it the
            // content lives on the page of the enclosing element.
            PdfDictionary mcrPage = mcr.getAsDict(PdfName.PG);
            parseTag(tag, mcr.getDirectObject(PdfName.MCID),
                    mcrPage != null ? mcrPage : page);
        }
    }
}