All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nl.siegmann.epublib.chm.ChmParser Maven / Gradle / Ivy

The newest version!
package nl.siegmann.epublib.chm;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import nl.siegmann.epublib.domain.Book;
import nl.siegmann.epublib.domain.MediaType;
import nl.siegmann.epublib.domain.Resource;
import nl.siegmann.epublib.domain.Resources;
import nl.siegmann.epublib.domain.TOCReference;
import nl.siegmann.epublib.domain.TableOfContents;
import nl.siegmann.epublib.service.MediatypeService;
import nl.siegmann.epublib.util.ResourceUtil;

import org.apache.commons.io.IOUtils;
import org.apache.commons.vfs.AllFileSelector;
import org.apache.commons.vfs.FileObject;
import org.apache.commons.vfs.FileSystemException;
import org.apache.commons.vfs.FileType;

/**
 * Reads the files that are extracted from a windows help ('.chm') file and creates a epublib Book out of it.
 * 
 * @author paul
 *
 */
public class ChmParser {

	public static final String DEFAULT_CHM_HTML_INPUT_ENCODING = "windows-1252";
	public static final int MINIMAL_SYSTEM_TITLE_LENGTH = 4;
	
	public static Book parseChm(FileObject chmRootDir) throws XPathExpressionException, IOException, ParserConfigurationException {
		return parseChm(chmRootDir, DEFAULT_CHM_HTML_INPUT_ENCODING);
	}

	public static Book parseChm(FileObject chmRootDir, String inputHtmlEncoding)
			throws IOException, ParserConfigurationException,
			XPathExpressionException {
		Book result = new Book();
		result.getMetadata().addTitle(findTitle(chmRootDir));
		FileObject hhcFileObject = findHhcFileObject(chmRootDir);
		if(hhcFileObject == null) {
			throw new IllegalArgumentException("No index file found in directory " + chmRootDir + ". (Looked for file ending with extension '.hhc'");
		}
		if(inputHtmlEncoding == null) {
			inputHtmlEncoding = DEFAULT_CHM_HTML_INPUT_ENCODING;
		}
		Resources resources = findResources(chmRootDir, inputHtmlEncoding);
		List tocReferences = HHCParser.parseHhc(hhcFileObject.getContent().getInputStream(), resources);
		result.setTableOfContents(new TableOfContents(tocReferences));
		result.setResources(resources);
		result.generateSpineFromTableOfContents();
		return result;
	}
	
	
	/**
	 * Finds in the '#SYSTEM' file the 3rd set of characters that have ascii value >= 32 and >= 126 and is more than 3 characters long.
	 * Assumes that that is then the title of the book.
	 * 
	 * @param chmRootDir
	 * @return Finds in the '#SYSTEM' file the 3rd set of characters that have ascii value >= 32 and >= 126 and is more than 3 characters long.
	 * @throws IOException
	 */
	protected static String findTitle(FileObject chmRootDir) throws IOException {
		FileObject systemFileObject = chmRootDir.resolveFile("#SYSTEM");
		InputStream in = systemFileObject.getContent().getInputStream();
		boolean inText = false;
		int lineCounter = 0;
		StringBuilder line = new StringBuilder();
		for(int c = in.read(); c >= 0; c = in.read()) {
			if(c >= 32 && c <= 126) {
				line.append((char) c);
				inText = true;
			} else {
				if(inText) {
					if(line.length() >= 3) {
						lineCounter++;
						if(lineCounter >= MINIMAL_SYSTEM_TITLE_LENGTH) {
							return line.toString();
						}
					}
					line = new StringBuilder();
				}
				inText = false;
			}
		}
		return "";
	}
	
	private static FileObject findHhcFileObject(FileObject chmRootDir) throws FileSystemException {
		FileObject[] files = chmRootDir.getChildren();
		for(int i = 0; i < files.length; i++) {
			if("hhc".equalsIgnoreCase(files[i].getName().getExtension())) {
				return files[i];
			}
		}
		return null;
	}
	
	
	private static Resources findResources(FileObject rootDir, String inputEncoding) throws IOException {
		Resources result = new Resources();
		FileObject[] allFiles = rootDir.findFiles(new AllFileSelector());
		for(int i = 0; i < allFiles.length; i++) {
			FileObject file = allFiles[i];
			if (file.getType() == FileType.FOLDER) {
				continue;
			}
			MediaType mediaType = MediatypeService.determineMediaType(file.getName().getBaseName()); 
			if(mediaType == null) {
				continue;
			}
			String href = file.getName().toString().substring(rootDir.getName().toString().length() + 1);
			byte[] resourceData = IOUtils.toByteArray(file.getContent().getInputStream());
			if(mediaType == MediatypeService.XHTML && ! nl.siegmann.epublib.Constants.CHARACTER_ENCODING.equalsIgnoreCase(inputEncoding)) {
				resourceData = ResourceUtil.recode(inputEncoding, nl.siegmann.epublib.Constants.CHARACTER_ENCODING, resourceData);
			}
			Resource fileResource = new Resource(null, resourceData, href, mediaType);
			result.add(fileResource);
		}
		return result;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy