/*
 * All downloads are FREE. Search and download functionality uses the official Maven repository.
 *
 * nl.siegmann.epublib.chm.HHCParser Maven / Gradle / Ivy
 *
 * The newest version!
 */
package nl.siegmann.epublib.chm;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import nl.siegmann.epublib.domain.Resource;
import nl.siegmann.epublib.domain.Resources;
import nl.siegmann.epublib.domain.TOCReference;
import nl.siegmann.epublib.util.ResourceUtil;

import org.apache.commons.lang.StringUtils;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Parses the windows help index (.hhc) file.
 * 
 * @author paul
 *
 */
public class HHCParser {

	/**
	 * Default encoding name for html input.
	 * NOTE(review): this constant is not referenced by the parsing code in this class;
	 * {@link #parseHhc(InputStream, Resources)} calls the cleaner without an explicit
	 * charset - confirm whether this value should be passed along.
	 */
	public static final String DEFAULT_HTML_INPUT_ENCODING = "Windows-1251";

	/**
	 * Parses a .hhc table of contents into a tree of TOCReferences.
	 * 
	 * The input is cleaned into a DOM document and the first <code>body/ul</code> element is
	 * walked recursively. Resources referenced by the table of contents that are not yet
	 * present in <code>resources</code> are created and added to it.
	 * 
	 * @param hhcFile the .hhc content; this method does not itself close the stream
	 * @param resources the resources to look hrefs up in; may get new resources added
	 * @return the top-level table of contents entries, empty if the document has no <code>body/ul</code> element
	 * @throws IOException declared for failures while reading/cleaning the input
	 * @throws ParserConfigurationException declared for failures while building the DOM document
	 * @throws XPathExpressionException declared for failures while evaluating the xpath expression
	 */
	public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException, XPathExpressionException {
		HtmlCleaner htmlCleaner = new HtmlCleaner();
		CleanerProperties props = htmlCleaner.getProperties();
		TagNode node = htmlCleaner.clean(hhcFile);
		Document hhcDocument = new DomSerializer(props).createDOM(node);
		XPath xpath = XPathFactory.newInstance().newXPath();
		Node ulNode = (Node) xpath.evaluate("body/ul", hhcDocument
				.getDocumentElement(), XPathConstants.NODE);
		if (ulNode == null) {
			// No list in the document: nothing to build a table of contents from.
			return new ArrayList<TOCReference>();
		}
		return processUlNode(ulNode, resources);
	}

	/*
	 * Sometimes the structure is:
	 * <li> <!-- parent element -->
	 * 	<object> ... </object>
	 * 	<ul> ... </ul> <!-- child elements -->
	 * </li>
	 * 
	 * And sometimes:
	 * <li> <!-- parent element -->
	 * 	<object> ... </object>
	 * </li>
	 * <ul> ... </ul> <!-- child elements -->
	 * 
	 * The first form is handled by processLiNode, the second one by processUlNode.
	 */
	private static List<TOCReference> processUlNode(Node ulNode, Resources resources) {
		List<TOCReference> result = new ArrayList<TOCReference>();
		NodeList children = ulNode.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Node node = children.item(i);
			String nodeName = node.getNodeName();
			if ("li".equals(nodeName)) {
				result.addAll(processLiNode(node, resources));
			} else if ("ul".equals(nodeName)) {
				// A ul that follows an li holds the children of that li.
				result = attachChildren(result, processUlNode(node, resources));
			}
		}
		return result;
	}

	/**
	 * Converts an li element into TOCReferences: every object child becomes a
	 * TOCReference, every ul child supplies the children of the preceding one.
	 * 
	 * @param liNode the li element to process
	 * @param resources the resources to look hrefs up in; may get new resources added
	 * @return the TOCReferences found in the li element, possibly empty
	 */
	private static List<TOCReference> processLiNode(Node liNode, Resources resources) {
		List<TOCReference> result = new ArrayList<TOCReference>();
		NodeList children = liNode.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Node node = children.item(i);
			String nodeName = node.getNodeName();
			if ("object".equals(nodeName)) {
				TOCReference section = processObjectNode(node, resources);
				if (section != null) {
					result.add(section);
				}
			} else if ("ul".equals(nodeName)) {
				result = attachChildren(result, processUlNode(node, resources));
			}
		}
		return result;
	}

	/**
	 * Hangs the given children under the last of the given parents.
	 * If there are no parents the children take their place.
	 * 
	 * @param parents the TOCReferences collected so far
	 * @param children the TOCReferences found in a nested ul element
	 * @return the list to continue collecting TOCReferences in
	 */
	private static List<TOCReference> attachChildren(List<TOCReference> parents, List<TOCReference> children) {
		if (parents.isEmpty()) {
			return children;
		}
		parents.get(parents.size() - 1).getChildren().addAll(children);
		return parents;
	}

	/**
	 * Processes a CHM object node into a TOCReference.
	 * If the 'Local' param is missing or empty the href is passed on as-is
	 * (null or blank) when looking up/creating the resource.
	 * 
	 * Example of the expected input:
	 * <pre>
	 * &lt;object&gt;
	 * 	&lt;param name="Name" value="My favorite section"/&gt;
	 * 	&lt;param name="Local" value="section123.html"/&gt;
	 * &lt;/object&gt;
	 * </pre>
	 * 
	 * @param objectNode the object element to process
	 * @param resources the resources to look the href up in; a new resource is added if the href is not found
	 * 
	 * @return A TOCReference if the object has a non-blank param child with name 'Name',
	 * 	null if it has not or if its 'Local' param points to an external http(s) address
	 */
	private static TOCReference processObjectNode(Node objectNode, Resources resources) {
		String name = null;
		String href = null;
		NodeList children = objectNode.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			Node node = children.item(i);
			if (!"param".equals(node.getNodeName())) {
				continue;
			}
			Element param = (Element) node;
			String paramName = param.getAttribute("name");
			if ("Name".equals(paramName)) {
				name = param.getAttribute("value");
			} else if ("Local".equals(paramName)) {
				href = param.getAttribute("value");
			}
		}
		// External links cannot be turned into a resource of the book.
		if (isExternalLink(href) || StringUtils.isBlank(name)) {
			return null;
		}
		Resource resource = resources.getByHref(href);
		if (resource == null) {
			resource = ResourceUtil.createResource(name, href);
			resources.add(resource);
		}
		return new TOCReference(name, resource);
	}

	/**
	 * Whether the given href points to a web address instead of a file inside the CHM.
	 * 
	 * @param href the href to check, may be null
	 * @return true if the href starts with http:// or https://
	 */
	private static boolean isExternalLink(String href) {
		return href != null && (href.startsWith("http://") || href.startsWith("https://"));
	}
}




// Web page footer (not part of the Java source): © 2015 - 2024 Weber Informatics LLC | Privacy Policy