
com.github.mertakdut.Reader Maven / Gradle / Ivy
Parses .epub files and provides page-by-page reading.
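A minimal usage sketch, assuming the BookSection accessors (getSectionContent, getSectionTextContent) behave as described in the project README; the file path, page index and page size below are placeholders:
Reader reader = new Reader();
reader.setMaxContentPerSection(1250); // optional: maximum content per page (placeholder value)
reader.setIsIncludingTextContent(true); // optional: also keep a plain-text version of each section
reader.setFullContent("/path/to/book.epub"); // placeholder path; parses metadata and the toc
BookSection bookSection = reader.readSection(0); // may throw ReadingException or OutOfPagesException
String html = bookSection.getSectionContent(); // section content as html (assumed accessor)
String text = bookSection.getSectionTextContent(); // same content without html tags (assumed accessor)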
package com.github.mertakdut;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.github.mertakdut.BaseFindings.XmlItem;
import com.github.mertakdut.exception.OutOfPagesException;
import com.github.mertakdut.exception.ReadingException;
public class Reader {
private Content content;
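// Reads only the epub metadata: container.xml and the .opf package are parsed, the .ncx table of contents is skipped.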
public void setInfoContent(String filePath) throws ReadingException {
this.content = new Content();
this.content.setZipFilePath(filePath);
fillContent(filePath, false);
}
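// Reads the metadata and additionally the .ncx table of contents, so the book can be read section by section.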
public void setFullContent(String filePath) throws ReadingException {
this.content = new Content();
this.content.setZipFilePath(filePath);
fillContent(filePath, true);
}
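// Returns the book section at the given index; the work is delegated to Content.maintainBookSections.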
public BookSection readSection(int index) throws ReadingException, OutOfPagesException {
return content.maintainBookSections(index);
}
// Optionals (static settings shared by all Reader instances)
public void setMaxContentPerSection(int maxContentPerSection) {
Optionals.maxContentPerSection = maxContentPerSection;
}
public void setCssStatus(CssStatus cssStatus) {
Optionals.cssStatus = cssStatus;
}
public void setIsIncludingTextContent(boolean isIncludingTextContent) {
Optionals.isIncludingTextContent = isIncludingTextContent;
}
public void setIsOmittingTitleTag(boolean isOmittingTitleTag) {
Optionals.isOmittingTitleTag = isOmittingTitleTag;
}
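// Opens the epub as a zip archive, records its entry names, parses container.xml (and the .ncx toc when isFullContent is true) and finally merges toc entries with the spine.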
private Content fillContent(String zipFilePath, boolean isFullContent) throws ReadingException {
ZipFile epubFile = null;
try {
try {
epubFile = new ZipFile(zipFilePath);
} catch (Exception e) {
e.printStackTrace();
throw new ReadingException("Error initializing ZipFile: " + e.getMessage());
}
Enumeration<? extends ZipEntry> files = epubFile.entries();
while (files.hasMoreElements()) {
ZipEntry entry = (ZipEntry) files.nextElement();
if (!entry.isDirectory()) {
String entryName = entry.getName();
if (entryName != null) {
content.addEntryName(entryName);
}
}
}
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(false);
factory.setValidating(false);
try {
factory.setFeature("http://xml.org/sax/features/namespaces", false);
factory.setFeature("http://xml.org/sax/features/validation", false);
factory.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
} catch (ParserConfigurationException e) {
e.printStackTrace();
// throw new ReadingException("Error initializing DocumentBuilderFactory: " + e.getMessage());
}
DocumentBuilder docBuilder;
try {
docBuilder = factory.newDocumentBuilder();
} catch (ParserConfigurationException e) {
e.printStackTrace();
throw new ReadingException("DocumentBuilder cannot be created: " + e.getMessage());
}
boolean isContainerXmlFound = false;
boolean isTocXmlFound = false;
for (int i = 0; i < content.getEntryNames().size(); i++) {
if (isContainerXmlFound && (isTocXmlFound || !isFullContent)) {
break;
}
String currentEntryName = content.getEntryNames().get(i);
if (currentEntryName.contains("container.xml")) {
isContainerXmlFound = true;
ZipEntry container = epubFile.getEntry(currentEntryName);
InputStream inputStream;
try {
inputStream = epubFile.getInputStream(container);
} catch (IOException e) {
e.printStackTrace();
throw new ReadingException("IOException while reading " + Constants.FILE_NAME_CONTAINER_XML + " file: " + e.getMessage());
}
Document document = getDocument(docBuilder, inputStream, Constants.FILE_NAME_CONTAINER_XML);
parseContainerXml(docBuilder, document, content, epubFile);
} else if (isFullContent && currentEntryName.contains(".ncx")) {
isTocXmlFound = true;
ZipEntry toc = epubFile.getEntry(currentEntryName);
InputStream inputStream;
try {
inputStream = epubFile.getInputStream(toc);
} catch (IOException e) {
e.printStackTrace();
throw new ReadingException("IOException while reading " + Constants.EXTENSION_NCX + " file: " + e.getMessage());
}
Document document = getDocument(docBuilder, inputStream, Constants.EXTENSION_NCX);
parseTocFile(document, content.getToc());
}
}
if (!isContainerXmlFound) {
throw new ReadingException("container.xml not found.");
}
if (!isTocXmlFound && isFullContent) {
throw new ReadingException("toc.ncx not found.");
}
mergeTocElements();
// Debug
// content.print();
return content;
} finally {
try {
if (epubFile != null) {
epubFile.close();
}
} catch (IOException e) {
e.printStackTrace();
throw new ReadingException("Error closing ZipFile: " + e.getMessage());
}
}
}
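// Reads the .opf full-path declared in container.xml and parses the referenced package document.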
private void parseContainerXml(DocumentBuilder docBuilder, Document document, Content content, ZipFile epubFile) throws ReadingException {
if (document.hasChildNodes()) {
traverseDocumentNodesAndFillContent(document.getChildNodes(), content.getContainer());
}
String opfFilePath = content.getContainer().getFullPathValue();
ZipEntry opfFileEntry = epubFile.getEntry(opfFilePath);
InputStream opfFileInputStream;
try {
opfFileInputStream = epubFile.getInputStream(opfFileEntry);
} catch (IOException e) {
e.printStackTrace();
throw new ReadingException("IO error while reading " + Constants.EXTENSION_OPF + " inputstream: " + e.getMessage());
}
Document packageDocument = getDocument(docBuilder, opfFileInputStream, Constants.EXTENSION_OPF);
parseOpfFile(packageDocument, content.getPackage());
}
private void parseOpfFile(Document document, Package pckage) throws ReadingException {
if (document.hasChildNodes()) {
traverseDocumentNodesAndFillContent(document.getChildNodes(), pckage);
}
}
private void parseTocFile(Document document, Toc toc) throws ReadingException {
if (document.hasChildNodes()) {
traverseDocumentNodesAndFillContent(document.getChildNodes(), toc);
}
}
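// Parses the given input stream into a DOM document and closes the stream.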
private Document getDocument(DocumentBuilder docBuilder, InputStream inputStream, String fileName) throws ReadingException {
Document document;
try {
document = docBuilder.parse(inputStream);
inputStream.close();
return document;
} catch (Exception e) {
e.printStackTrace();
throw new ReadingException("Parse error while parsing " + fileName + " file: " + e.getMessage());
}
}
private void traverseDocumentNodesAndFillContent(NodeList nodeList, BaseFindings findings) throws ReadingException {
for (int i = 0; i < nodeList.getLength(); i++) {
Node tempNode = nodeList.item(i);
// make sure it's element node.
if (tempNode.getNodeType() == Node.ELEMENT_NODE) {
findings.fillContent(tempNode);
if (tempNode.hasChildNodes()) {
// loop again if has child nodes
traverseDocumentNodesAndFillContent(tempNode.getChildNodes(), findings);
}
}
}
}
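// Makes sure every spine item is reachable from the navMap: spine entries whose content source is not already referenced, directly or via an anchor, are inserted as new NavPoints.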
private void mergeTocElements() throws ReadingException {
List<NavPoint> currentNavPoints = new ArrayList<>(content.getToc().getNavMap().getNavPoints());
int navPointIndex = 0; // Holds the index of the last matching navPoint; newly found content is inserted from that position.
int insertedNavPointCount = 0;
for (XmlItem spine : content.getPackage().getSpine().getXmlItemList()) {
Map<String, String> spineAttributes = spine.getAttributes();
String idRef = spineAttributes.get("idref");
for (XmlItem manifest : content.getPackage().getManifest().getXmlItemList()) {
Map<String, String> manifestAttributes = manifest.getAttributes();
String manifestElementId = manifestAttributes.get("id");
if (idRef.equals(manifestElementId)) {
NavPoint navPoint = new NavPoint();
// navPoint.setPlayOrder(currentNavPoints.size() + spineNavPoints.size() + 1); // Is playOrder needed? I think not because we've already sorted the navPoints with playOrder before
// merging.
navPoint.setContentSrc(ContextHelper.encodeToUtf8(ContextHelper.getTextAfterCharacter(manifestAttributes.get("href"), Constants.SLASH)));
boolean duplicateContentSrc = false;
boolean isAnchoredFound = false;
for (int j = 0; j < currentNavPoints.size(); j++) {
NavPoint navPointItem = currentNavPoints.get(j);
if (navPoint.getContentSrc().equals(navPointItem.getContentSrc())) {
duplicateContentSrc = true;
navPointIndex = j;
break;
} else if (!isAnchoredFound && navPoint.getContentSrc().startsWith(navPointItem.getContentSrc()) && navPoint.getContentSrc().replace(navPointItem.getContentSrc(), "").startsWith("%23")) {
isAnchoredFound = true;
navPointIndex = j;
} else if (!isAnchoredFound && navPointItem.getContentSrc().startsWith(navPoint.getContentSrc()) && navPointItem.getContentSrc().replace(navPoint.getContentSrc(), "").startsWith("%23")) {
isAnchoredFound = true;
navPointIndex = j;
}
}
if (!duplicateContentSrc) {
content.getToc().getNavMap().getNavPoints().add(navPointIndex + insertedNavPointCount++, navPoint);
}
}
}
}
}
public Package getInfoPackage() {
return content.getPackage();
}
public Toc getToc() {
return content.getToc();
}
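// Returns the cover image bytes; fails if no content has been set via setInfoContent or setFullContent.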
public byte[] getCoverImage() throws ReadingException {
if (content != null) {
return content.getCoverImage();
}
throw new ReadingException("Content info is not set.");
}
}