org.dspace.content.crosswalk.OREIngestionCrosswalk Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of dspace-api Show documentation
DSpace core data model and service APIs.
There is a newer version: 8.0
/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.content.crosswalk;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.URL;
import java.sql.SQLException;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.logging.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamFormatService;
import org.dspace.content.service.BitstreamService;
import org.dspace.content.service.BundleService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.xpath.XPath;

/**
 * ORE ingestion crosswalk
 * 
 * Processes an Atom-encoded ORE resource map and attempts to interpret it as a DSpace item.
 *
 * @author Alexey Maslov
 */
public class OREIngestionCrosswalk
    implements IngestionCrosswalk {
    /**
     * log4j category
     */
    private static final Logger log = org.apache.logging.log4j.LogManager.getLogger();

    /* Namespaces */
    public static final Namespace ATOM_NS =
        Namespace.getNamespace("atom", "http://www.w3.org/2005/Atom");
    private static final Namespace ORE_ATOM =
        Namespace.getNamespace("oreatom", "http://www.openarchives.org/ore/atom/");
    private static final Namespace ORE_NS =
        Namespace.getNamespace("ore", "http://www.openarchives.org/ore/terms/");
    private static final Namespace RDF_NS =
        Namespace.getNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
    private static final Namespace DCTERMS_NS =
        Namespace.getNamespace("dcterms", "http://purl.org/dc/terms/");
    private static final Namespace DS_NS =
        Namespace.getNamespace("ds", "http://www.dspace.org/objectModel/");


    protected BitstreamService bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
    protected BitstreamFormatService bitstreamFormatService = ContentServiceFactory.getInstance()
                                                                                   .getBitstreamFormatService();
    protected BundleService bundleService = ContentServiceFactory.getInstance().getBundleService();
    protected ItemService itemService = ContentServiceFactory.getInstance().getItemService();


    @Override
    public void ingest(Context context, DSpaceObject dso, List metadata, boolean createMissingMetadataFields)
        throws CrosswalkException, IOException, SQLException, AuthorizeException {

        // If this list contains only the root already, just pass it on
        if (metadata.size() == 1) {
            ingest(context, dso, metadata.get(0), createMissingMetadataFields);
        } else {
            // Otherwise, wrap them up
            Element wrapper = new Element("wrap", metadata.get(0).getNamespace());
            wrapper.addContent(metadata);

            ingest(context, dso, wrapper, createMissingMetadataFields);
        }
    }


    @Override
    public void ingest(Context context, DSpaceObject dso, Element root, boolean createMissingMetadataFields)
        throws CrosswalkException, IOException, SQLException, AuthorizeException {

        Date timeStart = new Date();

        if (dso.getType() != Constants.ITEM) {
            throw new CrosswalkObjectNotSupported("OREIngestionCrosswalk can only crosswalk an Item.");
        }
        Item item = (Item) dso;

        if (root == null) {
            System.err.println("The element received by ingest was null");
            return;
        }

        Document doc = new Document();
        doc.addContent(root.detach());

        XPath xpathLinks;
        List aggregatedResources;
        String entryId;
        try {
            xpathLinks = XPath.newInstance("/atom:entry/atom:link[@rel=\"" + ORE_NS.getURI() + "aggregates" + "\"]");
            xpathLinks.addNamespace(ATOM_NS);
            aggregatedResources = xpathLinks.selectNodes(doc);

            xpathLinks = XPath.newInstance("/atom:entry/atom:link[@rel='alternate']/@href");
            xpathLinks.addNamespace(ATOM_NS);
            entryId = ((Attribute) xpathLinks.selectSingleNode(doc)).getValue();
        } catch (JDOMException e) {
            throw new CrosswalkException("JDOM exception occurred while ingesting the ORE", e);
        }

        // Next for each resource, create a bitstream
        XPath xpathDesc;
        NumberFormat nf = NumberFormat.getInstance();
        nf.setGroupingUsed(false);
        nf.setMinimumIntegerDigits(4);

        for (Element resource : aggregatedResources) {
            String href = resource.getAttributeValue("href");
            log.debug("ORE processing: " + href);

            String bundleName;
            Element desc = null;
            try {
                xpathDesc = XPath.newInstance(
                    "/atom:entry/oreatom:triples/rdf:Description[@rdf:about=\"" + this.encodeForURL(href) + "\"][1]");
                xpathDesc.addNamespace(ATOM_NS);
                xpathDesc.addNamespace(ORE_ATOM);
                xpathDesc.addNamespace(RDF_NS);
                desc = (Element) xpathDesc.selectSingleNode(doc);
            } catch (JDOMException e) {
                log.warn("Could not find description for {}", href, e);
            }

            if (desc != null && desc.getChild("type", RDF_NS).getAttributeValue("resource", RDF_NS)
                                    .equals(DS_NS.getURI() + "DSpaceBitstream")) {
                bundleName = desc.getChildText("description", DCTERMS_NS);
                log.debug("Setting bundle name to: " + bundleName);
            } else {
                log.info("Could not obtain bundle name; using 'ORIGINAL'");
                bundleName = "ORIGINAL";
            }

            // Bundle names are not unique, so we just pick the first one if there's more than one.
            List targetBundles = itemService.getBundles(item, bundleName);
            Bundle targetBundle;

            // if null, create the new bundle and add it in
            if (targetBundles.size() == 0) {
                targetBundle = bundleService.create(context, item, bundleName);
                itemService.addBundle(context, item, targetBundle);
            } else {
                targetBundle = targetBundles.get(0);
            }

            URL ARurl = null;
            InputStream in = null;
            if (href != null) {
                try {
                    // Make sure the url string escapes all the oddball characters
                    String processedURL = encodeForURL(href);
                    // Generate a requeset for the aggregated resource
                    ARurl = new URL(processedURL);
                    in = ARurl.openStream();
                } catch (FileNotFoundException fe) {
                    log.error("The provided URI failed to return a resource: " + href);
                } catch (ConnectException fe) {
                    log.error("The provided URI was invalid: " + href);
                }
            } else {
                throw new CrosswalkException("Entry did not contain link to resource: " + entryId);
            }

            // ingest and update
            if (in != null) {
                Bitstream newBitstream = bitstreamService.create(context, targetBundle, in);

                String bsName = resource.getAttributeValue("title");
                newBitstream.setName(context, bsName);

                // Identify the format
                String mimeString = resource.getAttributeValue("type");
                BitstreamFormat bsFormat = bitstreamFormatService.findByMIMEType(context, mimeString);
                if (bsFormat == null) {
                    bsFormat = bitstreamFormatService.guessFormat(context, newBitstream);
                }
                newBitstream.setFormat(context, bsFormat);
                bitstreamService.update(context, newBitstream);

                bundleService.addBitstream(context, targetBundle, newBitstream);
                bundleService.update(context, targetBundle);
            } else {
                throw new CrosswalkException("Could not retrieve bitstream: " + entryId);
            }

        }
        log.info(
            "OREIngest for Item " + item.getID() + " took: " + (new Date().getTime() - timeStart.getTime()) + "ms.");
    }


    /**
     * Helper method to escape all characters that are not part of the canon set
     *
     * @param sourceString source unescaped string
     */
    private String encodeForURL(String sourceString) {
        Character lowalpha[] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
            'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
            's', 't', 'u', 'v', 'w', 'x', 'y', 'z'};
        Character upalpha[] = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
            'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
            'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'};
        Character digit[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'};
        Character mark[] = {'-', '_', '.', '!', '~', '*', '\'', '(', ')'};

        // reserved
        Character reserved[] = {';', '/', '?', ':', '@', '&', '=', '+', '$', ',', '%', '#'};

        Set URLcharsSet = new HashSet();
        URLcharsSet.addAll(Arrays.asList(lowalpha));
        URLcharsSet.addAll(Arrays.asList(upalpha));
        URLcharsSet.addAll(Arrays.asList(digit));
        URLcharsSet.addAll(Arrays.asList(mark));
        URLcharsSet.addAll(Arrays.asList(reserved));

        StringBuilder processedString = new StringBuilder();
        for (int i = 0; i < sourceString.length(); i++) {
            char ch = sourceString.charAt(i);
            if (URLcharsSet.contains(ch)) {
                processedString.append(ch);
            } else {
                processedString.append("%").append(Integer.toHexString((int) ch));
            }
        }

        return processedString.toString();
    }

}