All downloads are free. Search and download functionality uses the official Maven repository.

nl.siegmann.epublib.utilities.HtmlSplitter Maven / Gradle / Ivy

The newest version!
package nl.siegmann.epublib.utilities;

import java.io.Reader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;

/**
 * Splits up an XHTML document into pieces that are all valid XHTML documents.
 * <p>
 * Every piece starts with the events that precede the body content of the
 * source document (everything up to and including the body start tag),
 * re-opens the elements that were still open at the split point, and ends
 * with the matching close tags followed by the body/html close tags and an
 * end-of-document event.
 * <p>
 * The state of the split in progress is kept in instance fields, so one
 * instance must not be used for several splits at the same time. It can be
 * reused for consecutive splits.
 * 
 * @author paul
 *
 */
public class HtmlSplitter {

	private final XMLEventFactory xmlEventFactory = XMLEventFactory.newInstance();
	private final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
	/** Events up to and including the body start tag; copied to the start of every piece. */
	private List<XMLEvent> headerElements = new ArrayList<XMLEvent>();
	/** Close tags for body and html plus the end-of-document event; appended to every piece. */
	private List<XMLEvent> footerElements = new ArrayList<XMLEvent>();
	/** Summed toString() length of the footer events, reserved when checking the size limit. */
	private int footerCloseTagLength;
	/** Start elements inside the body that are currently open, outermost first. */
	private List<XMLEvent> elementStack = new ArrayList<XMLEvent>();
	/** Serialized form of the current piece; only used to measure its length. */
	private StringWriter currentDoc = new StringWriter();
	/** Events of the piece that is currently being built. */
	private List<XMLEvent> currentXmlEvents = new ArrayList<XMLEvent>();
	/** Writer that serializes the current piece into currentDoc. */
	private XMLEventWriter out;
	private int maxLength = 300000; // 300K, the max length of a chapter of an epub document
	/** Pieces finished so far during the current split. */
	private List<List<XMLEvent>> result = new ArrayList<List<XMLEvent>>();
	/** True once the current piece holds at least one event from the body content. */
	private boolean currentDocHasContent;
		
	/**
	 * Parses the given reader as XML and splits the document into pieces.
	 * 
	 * @param reader source of the XHTML document
	 * @param maxLength requested maximum length of a piece, see {@link #splitHtml(XMLEventReader, int)}
	 * @return the pieces, each one a list of XML events
	 * @throws XMLStreamException if the input cannot be parsed or an event cannot be serialized
	 */
	public List<List<XMLEvent>> splitHtml(Reader reader, int maxLength) throws XMLStreamException {
		// NOTE(review): the input factory is used with its default settings. If the input can
		// come from an untrusted source, consider disabling external entities / DTD support;
		// confirm first that this does not break documents relying on DTD-declared entities.
		XMLEventReader xmlEventReader = XMLInputFactory.newInstance().createXMLEventReader(reader);
		return splitHtml(xmlEventReader, maxLength);
	}
	
	/**
	 * Sums the lengths of the string representations of the given events.
	 */
	private static int calculateTotalTagStringLength(List<XMLEvent> xmlEvents) {
		int result = 0;
		for(XMLEvent xmlEvent: xmlEvents) {
			result += xmlEvent.toString().length();
		}
		return result;
	}
	
	/**
	 * Splits the document delivered by the given event reader into pieces.
	 * <p>
	 * The reader is consumed up to and including the body end tag. A new piece is
	 * started when the serialized length of the current piece, plus the next event
	 * and the footer, would reach 90% of <code>maxLength</code>. A piece always
	 * receives at least one event of body content, so a single event that is larger
	 * than the limit still ends up in a piece of its own.
	 * <p>
	 * If the input contains no body start tag, all events are returned unsplit as
	 * one piece.
	 * 
	 * @param reader event reader positioned at the start of the document
	 * @param maxLength requested maximum length of a piece, in characters
	 * @return a new list with the pieces, each one a list of XML events
	 * @throws XMLStreamException if reading or serializing an event fails
	 */
	public List<List<XMLEvent>> splitHtml(XMLEventReader reader, int maxLength) throws XMLStreamException {
		// Start from a clean slate: a reused instance must neither return the pieces of an
		// earlier split nor modify the list that was handed out before.
		result = new ArrayList<List<XMLEvent>>();
		elementStack = new ArrayList<XMLEvent>();
		headerElements = getHeaderElements(reader);
		footerElements = getFooterElements();
		footerCloseTagLength = calculateTotalTagStringLength(footerElements);
		this.maxLength = (int) (maxLength * 0.9);

		if(! endsWithBodyStart(headerElements)) {
			// No body element: the header scan consumed the complete input, so there is
			// nothing to split and no footer to append.
			result.add(new ArrayList<XMLEvent>(headerElements));
			return result;
		}

		startNewDocument();
		while(reader.hasNext()) {
			XMLEvent xmlEvent = reader.nextEvent();
			if(isBodyEndElement(xmlEvent)) {
				break;
			}
			processXmlEvent(xmlEvent);
		}
		// The last piece is finished the same way as all the others.
		closeCurrentDocument();
		return result;
	}
	
	
	/**
	 * Finishes the current piece: closes all open elements, appends the footer
	 * and adds the piece to the result.
	 */
	private void closeCurrentDocument() throws XMLStreamException {
		closeAllTags(currentXmlEvents);
		currentXmlEvents.addAll(footerElements);
		result.add(currentXmlEvents);
	}

	/**
	 * Starts a new piece that begins with the header events followed by the start
	 * tags of all elements that are currently open.
	 */
	private void startNewDocument() throws XMLStreamException {
		currentDoc = new StringWriter();
		out = xmlOutputFactory.createXMLEventWriter(currentDoc);
		currentXmlEvents = new ArrayList<XMLEvent>(headerElements.size() + elementStack.size());
		for(XMLEvent headerXmlEvent: headerElements) {
			out.add(headerXmlEvent);
			currentXmlEvents.add(headerXmlEvent);
		}
		for(XMLEvent stackXmlEvent: elementStack) {
			out.add(stackXmlEvent);
			currentXmlEvents.add(stackXmlEvent);
		}
		currentDocHasContent = false;
	}

	/**
	 * Adds one body event to the current piece, starting a new piece first if the
	 * event would push the current one to the size limit.
	 */
	private void processXmlEvent(XMLEvent xmlEvent) throws XMLStreamException {
		out.flush();
		// getBuffer() avoids copying the whole serialized piece for every single event
		int projectedLength = currentDoc.getBuffer().length() + xmlEvent.toString().length() + footerCloseTagLength;
		// Only split a piece that already has content, otherwise a header-only piece would be emitted.
		if(currentDocHasContent && projectedLength >= maxLength) {
			closeCurrentDocument();
			startNewDocument();
		}
		updateStack(xmlEvent);
		out.add(xmlEvent);
		currentXmlEvents.add(xmlEvent);
		currentDocHasContent = true;
	}

	/**
	 * Appends end tags for all currently open elements, innermost first.
	 */
	private void closeAllTags(List<XMLEvent> xmlEvents) throws XMLStreamException {
		for(int i = elementStack.size() - 1; i>= 0; i--) {
			XMLEvent xmlEvent = elementStack.get(i);
			XMLEvent xmlEndElementEvent = xmlEventFactory.createEndElement(xmlEvent.asStartElement().getName(), null);
			xmlEvents.add(xmlEndElementEvent);
		}
	}
	
	/**
	 * Keeps track of the open elements: pushes start elements and pops the top
	 * of the stack when its matching end element arrives.
	 */
	private void updateStack(XMLEvent xmlEvent) {
		if(xmlEvent.isStartElement()) {
			elementStack.add(xmlEvent);
		} else if(xmlEvent.isEndElement() && ! elementStack.isEmpty()) {
			int topIndex = elementStack.size() - 1;
			XMLEvent lastEvent = elementStack.get(topIndex);
			if(lastEvent.isStartElement() &&
					xmlEvent.asEndElement().getName().equals(lastEvent.asStartElement().getName())) {
				elementStack.remove(topIndex);
			}
		}
	}

	/**
	 * Reads events up to and including the body start tag. If the input has no
	 * body start tag, all remaining events are returned.
	 */
	private List<XMLEvent> getHeaderElements(XMLEventReader reader) throws XMLStreamException {
		List<XMLEvent> result = new ArrayList<XMLEvent>();
		while(reader.hasNext()) {
			XMLEvent event = reader.nextEvent();
			result.add(event);
			if(isBodyStartElement(event)) {
				// the body start tag is part of the header
				break;
			}
		}
		return result;
	}

	/**
	 * Creates the events that end every piece: body end tag, html end tag and
	 * end of document.
	 */
	private List<XMLEvent> getFooterElements() throws XMLStreamException {
		List<XMLEvent> result = new ArrayList<XMLEvent>();
		result.add(xmlEventFactory.createEndElement("", null, "body"));
		result.add(xmlEventFactory.createEndElement("", null, "html"));
		result.add(xmlEventFactory.createEndDocument());
		return result;
	}

	/**
	 * Whether the last of the given events is the body start tag.
	 */
	private static boolean endsWithBodyStart(List<XMLEvent> xmlEvents) {
		return ! xmlEvents.isEmpty() && isBodyStartElement(xmlEvents.get(xmlEvents.size() - 1));
	}

	private static boolean isBodyStartElement(XMLEvent xmlEvent) {
		return xmlEvent.isStartElement() && xmlEvent.asStartElement().getName().getLocalPart().equals("body");
	}

	private static boolean isBodyEndElement(XMLEvent xmlEvent) {
		return xmlEvent.isEndElement() && xmlEvent.asEndElement().getName().getLocalPart().equals("body");
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy