All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.modeshape.sequencer.epub.EpubMetadata Maven / Gradle / Ivy

There is a newer version: 5.4.1.Final
Show newest version
/*
 * ModeShape (http://www.modeshape.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.modeshape.sequencer.epub;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.modeshape.common.util.IoUtil;
import org.modeshape.common.xml.SimpleNamespaceContext;
import org.modeshape.sequencer.epub.EpubMetadataProperty.AlternateScript;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Utility for extracting Metadata from EPUB format.
 * <p>
 * An instance wraps a single input stream. Calling {@link #check()} copies the stream to a
 * temporary file, locates the rootfile packages listed in {@code META-INF/container.xml}, and
 * collects the Dublin Core ({@code dc:*}) elements of every rootfile into the lists exposed by
 * the getters. The lists are only ever appended to, so calling {@link #check()} more than once
 * accumulates entries.
 * </p>
 * 
 * @since 5.1
 */
public class EpubMetadata {

    static final String[] MIME_TYPE_STRINGS = { "application/epub+zip" };

    // The XML namespace for the Dublin Core schema.
    static final String DUBLIN_CORE_PREFIX = "dc";
    static final String DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/";

    // Size of the buffer used when draining a zip entry. The declared entry size cannot be
    // used for this because it may be unknown (-1) or zero.
    private static final int COPY_BUFFER_SIZE = 8192;

    // Upper bound for the initial capacity taken from the (untrusted) declared entry size.
    private static final int MAX_INITIAL_CAPACITY = 64 * 1024;

    // Parser features switched off so that documents from an untrusted archive cannot pull in
    // external entities or external DTDs.
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd" };

    private final List<EpubMetadataProperty> title = new ArrayList<>();
    private final List<EpubMetadataProperty> creator = new ArrayList<>();
    private final List<EpubMetadataProperty> contributor = new ArrayList<>();
    private final List<EpubMetadataProperty> language = new ArrayList<>();
    private final List<EpubMetadataProperty> identifier = new ArrayList<>();
    private final List<EpubMetadataProperty> description = new ArrayList<>();
    private final List<EpubMetadataProperty> publisher = new ArrayList<>();
    private final List<EpubMetadataProperty> rights = new ArrayList<>();
    private final List<EpubMetadataProperty> date = new ArrayList<>();

    // NOTE(review): nothing in this class adds to this list, so its element type cannot be
    // determined from this file; it is left raw to stay compatible with existing callers.
    @SuppressWarnings( "rawtypes" )
    private final List alternateScript = new ArrayList<>();

    private final InputStream in;

    /**
     * Create an extractor that reads the EPUB archive from the given stream.
     * 
     * @param inputStream the stream containing the EPUB (zip) content; it is consumed by {@link #check()}
     */
    public EpubMetadata( InputStream inputStream ) {
        this.in = inputStream;
    }

    /**
     * Check that given file is supported by this sequencer.
     * The sequencer first examines the container to get the list of all rootfiles.
     * Then it processes each rootfile and extracts the metadata.
     * 
     * @return always {@code true} when the method completes normally
     * @throws IllegalArgumentException if the container lists no rootfile package
     * @throws Exception if the stream cannot be copied, unzipped or parsed
     */
    public boolean check() throws Exception {
        // we need to create a copy of the file, because the container file
        // might be at the end of the stream.
        File fileCopy = File.createTempFile("modeshape-sequencer-epub", ".tmp");
        try {
            // The target is closed here no matter what IoUtil.write does with it; closing an
            // already closed file stream a second time is harmless.
            try (BufferedOutputStream copyTarget = new BufferedOutputStream(new FileOutputStream(fileCopy))) {
                IoUtil.write(in, copyTarget);
            }

            List<String> rootfiles;
            try (ZipInputStream zipStream = new ZipInputStream(new FileInputStream(fileCopy))) {
                rootfiles = getRootfiles(zipStream);
            }
            if (rootfiles.isEmpty()) {
                throw new IllegalArgumentException("No rootfile package found in given EPUB file.");
            }

            // second pass over the archive: the rootfiles may precede the container file
            try (ZipInputStream zipStream = new ZipInputStream(new FileInputStream(fileCopy))) {
                ZipEntry entry;
                while ((entry = zipStream.getNextEntry()) != null) {
                    if (rootfiles.contains(entry.getName())) {
                        checkRootfile(zipStream, entry);
                    }
                }
            }
            return true;
        } finally {
            // runs on failure as well, so the temporary copy is not left behind
            deleteNowOrOnExit(fileCopy);
        }
    }

    /**
     * Try to delete the file immediately; if that is not possible, schedule deletion on JVM exit.
     */
    private static void deleteNowOrOnExit( File file ) {
        boolean deleted = false;
        try {
            deleted = file.delete();
        } catch (SecurityException ignored) {
            // not allowed to delete right now; fall through to deleteOnExit
        }
        if (!deleted) {
            file.deleteOnExit();
        }
    }

    /**
     * Create a parser factory with external entity and external DTD loading switched off.
     * Features that the underlying parser does not recognise are skipped.
     */
    private static DocumentBuilderFactory newDocumentBuilderFactory( boolean namespaceAware ) {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(namespaceAware);
        for (String feature : EXTERNAL_ENTITY_FEATURES) {
            try {
                factory.setFeature(feature, false);
            } catch (ParserConfigurationException ignored) {
                // best effort: this parser implementation does not know the feature
            }
        }
        return factory;
    }

    /**
     * Return the named attribute of the node, or {@code null} if the node has no such attribute.
     */
    private static Node attribute( Node node,
                                   String name ) {
        return node.getAttributes() == null ? null : node.getAttributes().getNamedItem(name);
    }

    /**
     * Process the EPUB package and extract the metadata fields.
     * 
     * @param zipStream the archive stream, positioned at the start of {@code entry}
     * @param entry the rootfile entry to read
     * @return always {@code true} when the method completes normally
     * @throws Exception if the entry cannot be read or parsed, or a numeric refinement is not a number
     */
    private boolean checkRootfile( ZipInputStream zipStream,
                                   ZipEntry entry ) throws Exception {
        ByteArrayOutputStream content = getZipEntryContent(zipStream, entry);

        DocumentBuilder builder = newDocumentBuilderFactory(true).newDocumentBuilder();
        Document doc = builder.parse(new ByteArrayInputStream(content.toByteArray()));

        List<EpubMetadataProperty> properties = new ArrayList<>();
        // insertion-ordered, so properties with ids are reported in document order
        Map<String, EpubMetadataProperty> propertiesWithId = new LinkedHashMap<>();

        collectDublinCoreProperties(doc, properties, propertiesWithId);
        applyRefinements(doc, propertiesWithId);

        properties.addAll(propertiesWithId.values());
        for (EpubMetadataProperty prop : properties) {
            store(prop);
        }
        return true;
    }

    /**
     * Read every {@code dc:*} element of the document. Elements carrying an {@code id} attribute
     * go into {@code propertiesWithId} (the first element wins for a repeated id); all others are
     * appended to {@code properties}.
     */
    private void collectDublinCoreProperties( Document doc,
                                              List<EpubMetadataProperty> properties,
                                              Map<String, EpubMetadataProperty> propertiesWithId ) throws Exception {
        XPath xpath = XPathFactory.newInstance().newXPath();
        xpath.setNamespaceContext(new SimpleNamespaceContext().setNamespace(DUBLIN_CORE_PREFIX, DUBLIN_CORE_URI));
        XPathExpression expr = xpath.compile("//" + DUBLIN_CORE_PREFIX + ":*");
        NodeList metadata = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);

        for (int i = 0; i < metadata.getLength(); i++) {
            Node node = metadata.item(i);

            EpubMetadataProperty property = new EpubMetadataProperty();
            property.setName(node.getLocalName());
            property.setValue(node.getTextContent());

            Node nodeId = attribute(node, "id");
            if (nodeId == null) {
                properties.add(property);
                continue;
            }
            String id = nodeId.getTextContent();
            if (!propertiesWithId.containsKey(id)) {
                propertiesWithId.put(id, property);
            }
        }
    }

    /**
     * Read the {@code <meta>} elements which can refine the properties and copy their values onto
     * the property whose id is named by the {@code refines} attribute. Elements without a
     * {@code refines} or {@code property} attribute, or pointing at an unknown id, are skipped.
     */
    private void applyRefinements( Document doc,
                                   Map<String, EpubMetadataProperty> propertiesWithId ) {
        NodeList metadataRefines = doc.getElementsByTagName("meta");
        for (int i = 0; i < metadataRefines.getLength(); i++) {
            Node node = metadataRefines.item(i);
            Node refines = attribute(node, "refines");
            if (refines == null) {
                continue;
            }

            // the attribute references the target as "#id"; only a leading '#' is stripped
            String refinesId = refines.getTextContent();
            if (refinesId.startsWith("#")) {
                refinesId = refinesId.substring(1);
            }
            EpubMetadataProperty property = propertiesWithId.get(refinesId);
            if (property == null) {
                continue;
            }

            Node schemeAttribute = attribute(node, "scheme");
            if (schemeAttribute != null) {
                property.setScheme(schemeAttribute.getTextContent());
            }

            Node propertyAttribute = attribute(node, "property");
            if (propertyAttribute == null) {
                // nothing states what is being refined
                continue;
            }

            String text = node.getTextContent();
            switch (propertyAttribute.getTextContent()) {
                case "title-type":
                    property.setTitleType(text);
                    break;
                case "identifier-type":
                    property.setIdentifierType(text);
                    break;
                case "metadata-authority":
                    property.setMetadataAuthority(text);
                    break;
                case "role":
                    property.setRole(text);
                    break;
                case "display-seq":
                    // element text may be padded with whitespace, which parseLong rejects
                    property.setDisplaySeq(Long.parseLong(text.trim()));
                    break;
                case "group-position":
                    property.setGroupPosition(Long.parseLong(text.trim()));
                    break;
                case "file-as":
                    property.setFileAs(text);
                    break;
                case "alternate-script":
                    Node languageAttribute = attribute(node, "xml:lang");
                    if (languageAttribute != null) {
                        property.setAlternateScript(new AlternateScript(text, languageAttribute.getTextContent()));
                    }
                    break;
                default:
                    // other refinements are not recorded
                    break;
            }
        }
    }

    /**
     * Append the property to the list matching its name; properties with any other name are dropped.
     */
    private void store( EpubMetadataProperty prop ) {
        switch (prop.getName()) {
            case "identifier":
                getIdentifier().add(prop);
                break;
            case "title":
                getTitle().add(prop);
                break;
            case "language":
                getLanguage().add(prop);
                break;
            case "contributor":
                getContributor().add(prop);
                break;
            case "creator":
                getCreator().add(prop);
                break;
            case "description":
                getDescription().add(prop);
                break;
            case "publisher":
                getPublisher().add(prop);
                break;
            case "rights":
                getRights().add(prop);
                break;
            case "date":
                getDate().add(prop);
                break;
            default:
                break;
        }
    }

    /**
     * Parse the container file to get the list of all rootfile packages.
     * Only the first entry whose name ends with {@code META-INF/container.xml} is examined.
     * 
     * @param zipStream the archive stream, positioned before its first entry
     * @return the {@code full-path} values of the {@code rootfile} elements; empty if no container
     *         file was found or it lists none
     * @throws Exception if the archive cannot be read or the container file cannot be parsed
     */
    private List<String> getRootfiles( ZipInputStream zipStream ) throws Exception {
        List<String> rootfiles = new ArrayList<>();
        ZipEntry entry;
        while ((entry = zipStream.getNextEntry()) != null) {
            if (!entry.getName().endsWith("META-INF/container.xml")) {
                continue;
            }
            ByteArrayOutputStream content = getZipEntryContent(zipStream, entry);

            // deliberately not namespace aware, so the unprefixed XPath below matches
            DocumentBuilder builder = newDocumentBuilderFactory(false).newDocumentBuilder();
            Document doc = builder.parse(new ByteArrayInputStream(content.toByteArray()));

            XPath xpath = XPathFactory.newInstance().newXPath();
            XPathExpression expr = xpath.compile("/container/rootfiles/rootfile");
            NodeList rootfileNodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);

            for (int i = 0; i < rootfileNodes.getLength(); i++) {
                Node fullPath = attribute(rootfileNodes.item(i), "full-path");
                if (fullPath != null) {
                    rootfiles.add(fullPath.getNodeValue());
                }
            }
            break;
        }
        return rootfiles;
    }

    /**
     * Read the content of the ZipEntry without closing the stream.
     * 
     * @param zipStream the archive stream, positioned at the start of {@code entry}
     * @param entry the entry being read; its declared size is used only as a capacity hint
     * @return a buffer holding all bytes of the entry
     * @throws IOException if reading from the archive fails
     */
    private ByteArrayOutputStream getZipEntryContent(
            ZipInputStream zipStream,
            ZipEntry entry ) throws IOException {
        // The declared size is -1 when unknown and comes from untrusted input, so it only
        // seeds the initial capacity, clamped to a sane range.
        long declaredSize = entry.getSize();
        int initialCapacity = (int) Math.max(32L, Math.min(declaredSize, (long) MAX_INITIAL_CAPACITY));

        ByteArrayOutputStream content = new ByteArrayOutputStream(initialCapacity);
        byte[] buffer = new byte[COPY_BUFFER_SIZE];
        int read;
        while ((read = zipStream.read(buffer, 0, buffer.length)) != -1) {
            content.write(buffer, 0, read);
        }
        return content;
    }

    /**
     * @return the collected {@code dc:title} properties; never null
     */
    public List<EpubMetadataProperty> getTitle() {
        return title;
    }

    /**
     * @return the collected {@code dc:creator} properties; never null
     */
    public List<EpubMetadataProperty> getCreator() {
        return creator;
    }

    /**
     * @return the collected {@code dc:contributor} properties; never null
     */
    public List<EpubMetadataProperty> getContributor() {
        return contributor;
    }

    /**
     * @return the collected {@code dc:language} properties; never null
     */
    public List<EpubMetadataProperty> getLanguage() {
        return language;
    }

    /**
     * @return the collected {@code dc:identifier} properties; never null
     */
    public List<EpubMetadataProperty> getIdentifier() {
        return identifier;
    }

    /**
     * @return the collected {@code dc:description} properties; never null
     */
    public List<EpubMetadataProperty> getDescription() {
        return description;
    }

    /**
     * @return the collected {@code dc:publisher} properties; never null
     */
    public List<EpubMetadataProperty> getPublisher() {
        return publisher;
    }

    /**
     * @return the collected {@code dc:rights} properties; never null
     */
    public List<EpubMetadataProperty> getRights() {
        return rights;
    }

    /**
     * @return the collected {@code dc:date} properties; never null
     */
    public List<EpubMetadataProperty> getDate() {
        return date;
    }

    /**
     * @return the alternate-script list; never null. No code in this class adds to it, so it is
     *         empty unless a caller populates it.
     */
    @SuppressWarnings( "rawtypes" )
    public List getAlternateScript() {
        return alternateScript;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy