All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.lowagie.text.html.simpleparser.HTMLWorker Maven / Gradle / Ivy

Go to download

Itext is a java library to create and manipulate PDFs. This is a fork of version 2.1.7 the last MPL/LGPL version. It's focused basically on mantain compatibility with newer bouncycastle releases and small bugfixes.

There is a newer version: 2.2.2
Show newest version
/*
 * Copyright 2004 Paulo Soares
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *
 * The Original Code is 'iText, a free JAVA-PDF library'.
 *
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
 *
 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 *
 * Contributions by:
 * Lubos Strapko
 * 
 * If you didn't download this code from the following link, you should check if
 * you aren't using an obsolete version:
 * http://www.lowagie.com/iText/
 */

package com.lowagie.text.html.simpleparser;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Stack;
import java.util.StringTokenizer;

import com.lowagie.text.Chunk;
import com.lowagie.text.DocListener;
import com.lowagie.text.DocumentException;
import com.lowagie.text.Element;
import com.lowagie.text.ElementTags;
import com.lowagie.text.ExceptionConverter;
import com.lowagie.text.FontFactoryImp;
import com.lowagie.text.HeaderFooter;
import com.lowagie.text.Image;
import com.lowagie.text.List;
import com.lowagie.text.ListItem;
import com.lowagie.text.Paragraph;
import com.lowagie.text.Phrase;
import com.lowagie.text.Rectangle;
import com.lowagie.text.TextElementArray;
import com.lowagie.text.html.HtmlTags;
import com.lowagie.text.html.Markup;
import com.lowagie.text.pdf.PdfPTable;
import com.lowagie.text.pdf.draw.LineSeparator;
import com.lowagie.text.xml.simpleparser.SimpleXMLDocHandler;
import com.lowagie.text.xml.simpleparser.SimpleXMLParser;

public class HTMLWorker implements SimpleXMLDocHandler, DocListener {

	protected ArrayList objectList;

	protected DocListener document;

	private Paragraph currentParagraph;

	private ChainedProperties cprops = new ChainedProperties();

	private Stack stack = new Stack();

	private boolean pendingTR = false;

	private boolean pendingTD = false;

	private boolean pendingLI = false;
	
	private boolean pendingUL = false;

	private StyleSheet style = new StyleSheet();

	private boolean isPRE = false;

	private Stack tableState = new Stack();

	private boolean skipText = false;

	private HashMap interfaceProps;

	private FactoryProperties factoryProperties = new FactoryProperties();

	/** Creates a new instance of HTMLWorker
	 * @param document A class that implements DocListener
	 * */
	public HTMLWorker(DocListener document) {
		this.document = document;
	}

	public void setStyleSheet(StyleSheet style) {
		this.style = style;
	}

	public StyleSheet getStyleSheet() {
		return style;
	}

	public void setInterfaceProps(HashMap interfaceProps) {
		this.interfaceProps = interfaceProps;
		FontFactoryImp ff = null;
		if (interfaceProps != null)
			ff = (FontFactoryImp) interfaceProps.get("font_factory");
		if (ff != null)
			factoryProperties.setFontImp(ff);
	}

	public HashMap getInterfaceProps() {
		return interfaceProps;
	}

	public void parse(Reader reader) throws IOException {
		SimpleXMLParser.parse(this, null, reader, true);
	}

	public static ArrayList parseToList(Reader reader, StyleSheet style)
			throws IOException {
		return parseToList(reader, style, null);
	}

	public static ArrayList parseToList(Reader reader, StyleSheet style,
			HashMap interfaceProps) throws IOException {
		HTMLWorker worker = new HTMLWorker(null);
		if (style != null)
			worker.style = style;
		worker.document = worker;
		worker.setInterfaceProps(interfaceProps);
		worker.objectList = new ArrayList();
		worker.parse(reader);
		return worker.objectList;
	}

	public void endDocument() {
		try {
			for (int k = 0; k < stack.size(); ++k)
				document.add((Element) stack.elementAt(k));
			if (currentParagraph != null)
				document.add(currentParagraph);
			currentParagraph = null;
		} catch (Exception e) {
			throw new ExceptionConverter(e);
		}
	}

	public void startDocument() {
		HashMap h = new HashMap();
		style.applyStyle("body", h);
		cprops.addToChain("body", h);
	}

	public void startElement(String tag, HashMap h) {
		if (!tagsSupported.containsKey(tag))
			return;
		try {
			 if(!pendingLI && pendingUL && !tag.equals(HtmlTags.LISTITEM)) {
				 endElement(HtmlTags.UNORDEREDLIST);
				 pendingUL = false;
			 }
			style.applyStyle(tag, h);
			String follow = (String) FactoryProperties.followTags.get(tag);
			if (follow != null) {
				HashMap prop = new HashMap();
				prop.put(follow, null);
				cprops.addToChain(follow, prop);
				return;
			}
			FactoryProperties.insertStyle(h, cprops);
			if (tag.equals(HtmlTags.ANCHOR)) {
				cprops.addToChain(tag, h);
				if (currentParagraph == null) {
					currentParagraph = new Paragraph();
				}
				stack.push(currentParagraph);
				currentParagraph = new Paragraph();
				return;
			}
			if (tag.equals(HtmlTags.NEWLINE)) {
				if (currentParagraph == null) {
					currentParagraph = new Paragraph();
				}
				currentParagraph.add(factoryProperties
						.createChunk("\n", cprops));
				return;
			}
			if (tag.equals(HtmlTags.HORIZONTALRULE)) {
				// Attempting to duplicate the behavior seen on Firefox with
				// http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_hr_test
				// where an initial break is only inserted when the preceding element doesn't
				// end with a break, but a trailing break is always inserted.
				boolean addLeadingBreak = true;
				if (currentParagraph == null) {
					currentParagraph = new Paragraph();
					addLeadingBreak = false;
				}
				if (addLeadingBreak) { // Not a new paragraph
					int numChunks = currentParagraph.getChunks().size();
					if (numChunks == 0 ||
							((Chunk)(currentParagraph.getChunks().get(numChunks - 1))).getContent().endsWith("\n"))
						addLeadingBreak = false;
				}
				String align = (String) h.get("align");
				int hrAlign = Element.ALIGN_CENTER;
				if (align != null) {
					if (align.equalsIgnoreCase("left"))
						hrAlign = Element.ALIGN_LEFT; 
					if (align.equalsIgnoreCase("right"))
						hrAlign = Element.ALIGN_RIGHT;
				}
				String width = (String) h.get("width");
				float hrWidth = 1;
				if (width != null) {
					float tmpWidth = Markup.parseLength(width, Markup.DEFAULT_FONT_SIZE);
					if (tmpWidth > 0) hrWidth = tmpWidth;
					if (!width.endsWith("%"))
						hrWidth = 100; // Treat a pixel width as 100% for now.
				}
				String size = (String) h.get("size");
				float hrSize = 1;
				if (size != null) {
					float tmpSize = Markup.parseLength(size, Markup.DEFAULT_FONT_SIZE);
					if (tmpSize > 0)
						hrSize = tmpSize;
				}
				if (addLeadingBreak)
					currentParagraph.add(Chunk.NEWLINE);
				currentParagraph.add(new LineSeparator(hrSize, hrWidth, null, hrAlign, currentParagraph.getLeading()/2));
				currentParagraph.add(Chunk.NEWLINE);
				return;
			}
			if (tag.equals(HtmlTags.CHUNK) || tag.equals(HtmlTags.SPAN)) {
				cprops.addToChain(tag, h);
				return;
			}
			if (tag.equals(HtmlTags.IMAGE)) {
				String src = (String) h.get(ElementTags.SRC);
				if (src == null)
					return;
				cprops.addToChain(tag, h);
				Image img = null;
				if (interfaceProps != null) {
					ImageProvider ip = (ImageProvider) interfaceProps
							.get("img_provider");
					if (ip != null)
						img = ip.getImage(src, h, cprops, document);
					if (img == null) {
						HashMap images = (HashMap) interfaceProps
								.get("img_static");
						if (images != null) {
							Image tim = (Image) images.get(src);
							if (tim != null)
								img = Image.getInstance(tim);
						} else {
							if (!src.startsWith("http")) { // relative src references only
								String baseurl = (String) interfaceProps
										.get("img_baseurl");
								if (baseurl != null) {
									src = baseurl + src;
									img = Image.getInstance(src);
								}
							}
						}
					}
				}
				if (img == null) {
					if (!src.startsWith("http")) {
						String path = cprops.getProperty("image_path");
						if (path == null)
							path = "";
						src = new File(path, src).getPath();
					}
					img = Image.getInstance(src);
				}
				String align = (String) h.get("align");
				String width = (String) h.get("width");
				String height = (String) h.get("height");
				String before = cprops.getProperty("before");
				String after = cprops.getProperty("after");
				if (before != null)
					img.setSpacingBefore(Float.parseFloat(before));
				if (after != null)
					img.setSpacingAfter(Float.parseFloat(after));
				float actualFontSize = Markup.parseLength(cprops
						.getProperty(ElementTags.SIZE),
						Markup.DEFAULT_FONT_SIZE);
				if (actualFontSize <= 0f)
					actualFontSize = Markup.DEFAULT_FONT_SIZE;
				float widthInPoints = Markup.parseLength(width, actualFontSize);
				float heightInPoints = Markup.parseLength(height,
						actualFontSize);
				if (widthInPoints > 0 && heightInPoints > 0) {
					img.scaleAbsolute(widthInPoints, heightInPoints);
				} else if (widthInPoints > 0) {
					heightInPoints = img.getHeight() * widthInPoints
							/ img.getWidth();
					img.scaleAbsolute(widthInPoints, heightInPoints);
				} else if (heightInPoints > 0) {
					widthInPoints = img.getWidth() * heightInPoints
							/ img.getHeight();
					img.scaleAbsolute(widthInPoints, heightInPoints);
				}
				img.setWidthPercentage(0);
				if (align != null) {
					endElement("p");
					int ralign = Image.MIDDLE;
					if (align.equalsIgnoreCase("left"))
						ralign = Image.LEFT;
					else if (align.equalsIgnoreCase("right"))
						ralign = Image.RIGHT;
					img.setAlignment(ralign);
					Img i = null;
					boolean skip = false;
					if (interfaceProps != null) {
						i = (Img) interfaceProps.get("img_interface");
						if (i != null)
							skip = i.process(img, h, cprops, document);
					}
					if (!skip)
						document.add(img);
					cprops.removeChain(tag);
				} else {
					cprops.removeChain(tag);
					if (currentParagraph == null) {
						currentParagraph = FactoryProperties
								.createParagraph(cprops);
					}
					currentParagraph.add(new Chunk(img, 0, 0));
				}
				return;
			}
			endElement("p");
			if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3")
					|| tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) {
				if (!h.containsKey(ElementTags.SIZE)) {
					int v = 7 - Integer.parseInt(tag.substring(1));
					h.put(ElementTags.SIZE, Integer.toString(v));
				}
				cprops.addToChain(tag, h);
				return;
			}
			if (tag.equals(HtmlTags.UNORDEREDLIST)) {
				if (pendingLI)
					endElement(HtmlTags.LISTITEM);
				skipText = true;
				cprops.addToChain(tag, h);
				List list = new List(false);
				try{
					list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue());
				}catch (Exception e) {
					list.setAutoindent(true);
				}
				list.setListSymbol("\u2022");
				stack.push(list);
				return;
			}
			if (tag.equals(HtmlTags.ORDEREDLIST)) {
				if (pendingLI)
					endElement(HtmlTags.LISTITEM);
				skipText = true;
				cprops.addToChain(tag, h);
				List list = new List(true);
				try{
					list.setIndentationLeft(new Float(cprops.getProperty("indent")).floatValue());
				}catch (Exception e) {
					list.setAutoindent(true);
				}
				stack.push(list);
				return;
			}
			if (tag.equals(HtmlTags.LISTITEM)) {
				if (pendingLI)
					endElement(HtmlTags.LISTITEM);
				
				Object list = stack.size() > 0 ? stack.peek() : null;
				if ((!(list instanceof List) && !(list instanceof ListItem)) || list == null) {
					startElement(HtmlTags.UNORDEREDLIST, h);
					pendingUL = true;
				}
				skipText = false;
				pendingLI = true;
				cprops.addToChain(tag, h);
				ListItem item = FactoryProperties.createListItem(cprops);
				stack.push(item);
				return;
			}
			if (tag.equals(HtmlTags.DIV) || tag.equals(HtmlTags.BODY) || tag.equals("p")) {
				cprops.addToChain(tag, h);
				return;
			}
			if (tag.equals(HtmlTags.PRE)) {
				if (!h.containsKey(ElementTags.FACE)) {
					h.put(ElementTags.FACE, "Courier");
				}
				cprops.addToChain(tag, h);
				isPRE = true;
				return;
			}
			if (tag.equals("tr")) {
				if (pendingTR)
					endElement("tr");
				skipText = true;
				pendingTR = true;
				cprops.addToChain("tr", h);
				return;
			}
			if (tag.equals("td") || tag.equals("th")) {
				if (pendingTD)
					endElement(tag);
				skipText = false;
				pendingTD = true;
				cprops.addToChain("td", h);
				stack.push(new IncCell(tag, cprops));
				return;
			}
			if (tag.equals("table")) {
				cprops.addToChain("table", h);
				IncTable table = new IncTable(h);
				stack.push(table);
				tableState.push(new boolean[] { pendingTR, pendingTD });
				pendingTR = pendingTD = false;
				skipText = true;
				return;
			}
		} catch (Exception e) {
			throw new ExceptionConverter(e);
		}
	}

	public void endElement(String tag) {
		if (!tagsSupported.containsKey(tag))
			return;
		try {
			String follow = (String) FactoryProperties.followTags.get(tag);
			if (follow != null) {
				cprops.removeChain(follow);
				return;
			}
			if (tag.equals("font") || tag.equals("span")) {
				cprops.removeChain(tag);
				return;
			}
			if (tag.equals("a")) {
				if (currentParagraph == null) {
					currentParagraph = new Paragraph();
				}
				boolean skip = false;
				if (interfaceProps != null) {
					ALink i = (ALink) interfaceProps.get("alink_interface");
					if (i != null)
						skip = i.process(currentParagraph, cprops);
				}
				if (!skip) {
					String href = cprops.getProperty("href");
					if (href != null) {
						ArrayList chunks = currentParagraph.getChunks();
						int size = chunks.size();
						for (int k = 0; k < size; ++k) {
							Chunk ck = (Chunk) chunks.get(k);
							ck.setAnchor(href);
						}
					}
				}
				Paragraph tmp = (Paragraph) stack.pop();
				Phrase tmp2 = new Phrase();
				tmp2.add(currentParagraph);
				tmp.add(tmp2);
				currentParagraph = tmp;
				cprops.removeChain("a");
				return;
			}
			if (tag.equals("br")) {
				return;
			}
			if (currentParagraph != null) {
				if (stack.empty())
					document.add(currentParagraph);
				else {
					Object obj = stack.pop();
					if (obj instanceof TextElementArray) {
						TextElementArray current = (TextElementArray) obj;
						current.add(currentParagraph);
					}
					stack.push(obj);
				}
			}
			currentParagraph = null;
			if (tag.equals(HtmlTags.UNORDEREDLIST)
					|| tag.equals(HtmlTags.ORDEREDLIST)) {
				if (pendingLI)
					endElement(HtmlTags.LISTITEM);
				skipText = false;
				cprops.removeChain(tag);
				if (stack.empty())
					return;
				Object obj = stack.pop();
				if (!(obj instanceof List)) {
					stack.push(obj);
					return;
				}
				if (stack.empty())
					document.add((Element) obj);
				else
					((TextElementArray) stack.peek()).add(obj);
				return;
			}
			if (tag.equals(HtmlTags.LISTITEM)) {
				pendingLI = false;
				skipText = true;
				cprops.removeChain(tag);
				if (stack.empty())
					return;
				Object obj = stack.pop();
				if (!(obj instanceof ListItem)) {
					stack.push(obj);
					return;
				}
				if (stack.empty()) {
					document.add((Element) obj);
					return;
				}
				Object list = stack.pop();
				if (!(list instanceof List)) {
					stack.push(list);
					return;
				}
				ListItem item = (ListItem) obj;
				((List) list).add(item);
				ArrayList cks = item.getChunks();
				if (!cks.isEmpty()) {
					item.getListSymbol().setFont(((Chunk) cks.get(0)).getFont());
				}
				stack.push(list);
				return;
			}
			if (tag.equals("div") || tag.equals("body")) {
				cprops.removeChain(tag);
				return;
			}
			if (tag.equals(HtmlTags.PRE)) {
				cprops.removeChain(tag);
				isPRE = false;
				return;
			}
			if (tag.equals("p")) {
				cprops.removeChain(tag);
				return;
			}
			if (tag.equals("h1") || tag.equals("h2") || tag.equals("h3")
					|| tag.equals("h4") || tag.equals("h5") || tag.equals("h6")) {
				cprops.removeChain(tag);
				return;
			}
			if (tag.equals("table")) {
				if (pendingTR)
					endElement("tr");
				cprops.removeChain("table");
				IncTable table = (IncTable) stack.pop();
				PdfPTable tb = table.buildTable();
				tb.setSplitRows(true);
				if (stack.empty())
					document.add(tb);
				else
					((TextElementArray) stack.peek()).add(tb);
				boolean state[] = (boolean[]) tableState.pop();
				pendingTR = state[0];
				pendingTD = state[1];
				skipText = false;
				return;
			}
			if (tag.equals("tr")) {
				if (pendingTD)
					endElement("td");
				pendingTR = false;
				cprops.removeChain("tr");
				ArrayList cells = new ArrayList();
				IncTable table = null;
				while (true) {
					Object obj = stack.pop();
					if (obj instanceof IncCell) {
						cells.add(((IncCell) obj).getCell());
					}
					if (obj instanceof IncTable) {
						table = (IncTable) obj;
						break;
					}
				}
				table.addCols(cells);
				table.endRow();
				stack.push(table);
				skipText = true;
				return;
			}
			if (tag.equals("td") || tag.equals("th")) {
				pendingTD = false;
				cprops.removeChain("td");
				skipText = true;
				return;
			}
		} catch (Exception e) {
			throw new ExceptionConverter(e);
		}
	}

	public void text(String str) {
		if (skipText)
			return;
		String content = str;
		if (isPRE) {
			if (currentParagraph == null) {
				currentParagraph = FactoryProperties.createParagraph(cprops);
			}
			Chunk chunk = factoryProperties.createChunk(content, cprops);
			currentParagraph.add(chunk);
			return;
		}
		if (content.trim().length() == 0 && content.indexOf(' ') < 0) {
			return;
		}

		StringBuffer buf = new StringBuffer();
		int len = content.length();
		char character;
		boolean newline = false;
		for (int i = 0; i < len; i++) {
			switch (character = content.charAt(i)) {
			case ' ':
				if (!newline) {
					buf.append(character);
				}
				break;
			case '\n':
				if (i > 0) {
					newline = true;
					buf.append(' ');
				}
				break;
			case '\r':
				break;
			case '\t':
				break;
			default:
				newline = false;
				buf.append(character);
			}
		}
		if (currentParagraph == null) {
			currentParagraph = FactoryProperties.createParagraph(cprops);
		}
		Chunk chunk = factoryProperties.createChunk(buf.toString(), cprops);
		currentParagraph.add(chunk);
	}

	public boolean add(Element element) throws DocumentException {
		objectList.add(element);
		return true;
	}

	public void clearTextWrap() throws DocumentException {
	}

	public void close() {
	}

	public boolean newPage() {
		return true;
	}

	public void open() {
	}

	public void resetFooter() {
	}

	public void resetHeader() {
	}

	public void resetPageCount() {
	}

	public void setFooter(HeaderFooter footer) {
	}

	public void setHeader(HeaderFooter header) {
	}

	public boolean setMarginMirroring(boolean marginMirroring) {
		return false;
	}

	/**
     * @see com.lowagie.text.DocListener#setMarginMirroring(boolean)
	 * @since	2.1.6
	 */
	public boolean setMarginMirroringTopBottom(boolean marginMirroring) {
		return false;
	}

	public boolean setMargins(float marginLeft, float marginRight,
			float marginTop, float marginBottom) {
		return true;
	}

	public void setPageCount(int pageN) {
	}

	public boolean setPageSize(Rectangle pageSize) {
		return true;
	}

	public static final String tagsSupportedString = "ol ul li a pre font span br p div body table td th tr i b u sub sup em strong s strike"
			+ " h1 h2 h3 h4 h5 h6 img hr";

	public static final HashMap tagsSupported = new HashMap();

	static {
		StringTokenizer tok = new StringTokenizer(tagsSupportedString);
		while (tok.hasMoreTokens())
			tagsSupported.put(tok.nextToken(), null);
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy