All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.dspace.content.crosswalk.OREIngestionCrosswalk Maven / Gradle / Ivy

There is a newer version: 8.0
Show newest version
/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.content.crosswalk;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.URL;
import java.sql.SQLException;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.logging.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamFormatService;
import org.dspace.content.service.BitstreamService;
import org.dspace.content.service.BundleService;
import org.dspace.content.service.ItemService;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.xpath.XPath;

/**
 * ORE ingestion crosswalk
 * 

* Processes an Atom-encoded ORE resource map and attempts to interpret it as a DSpace item. * * @author Alexey Maslov */ public class OREIngestionCrosswalk implements IngestionCrosswalk { /** * log4j category */ private static final Logger log = org.apache.logging.log4j.LogManager.getLogger(); /* Namespaces */ public static final Namespace ATOM_NS = Namespace.getNamespace("atom", "http://www.w3.org/2005/Atom"); private static final Namespace ORE_ATOM = Namespace.getNamespace("oreatom", "http://www.openarchives.org/ore/atom/"); private static final Namespace ORE_NS = Namespace.getNamespace("ore", "http://www.openarchives.org/ore/terms/"); private static final Namespace RDF_NS = Namespace.getNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"); private static final Namespace DCTERMS_NS = Namespace.getNamespace("dcterms", "http://purl.org/dc/terms/"); private static final Namespace DS_NS = Namespace.getNamespace("ds", "http://www.dspace.org/objectModel/"); protected BitstreamService bitstreamService = ContentServiceFactory.getInstance().getBitstreamService(); protected BitstreamFormatService bitstreamFormatService = ContentServiceFactory.getInstance() .getBitstreamFormatService(); protected BundleService bundleService = ContentServiceFactory.getInstance().getBundleService(); protected ItemService itemService = ContentServiceFactory.getInstance().getItemService(); @Override public void ingest(Context context, DSpaceObject dso, List metadata, boolean createMissingMetadataFields) throws CrosswalkException, IOException, SQLException, AuthorizeException { // If this list contains only the root already, just pass it on if (metadata.size() == 1) { ingest(context, dso, metadata.get(0), createMissingMetadataFields); } else { // Otherwise, wrap them up Element wrapper = new Element("wrap", metadata.get(0).getNamespace()); wrapper.addContent(metadata); ingest(context, dso, wrapper, createMissingMetadataFields); } } @Override public void ingest(Context context, DSpaceObject dso, Element root, boolean createMissingMetadataFields) throws CrosswalkException, IOException, SQLException, AuthorizeException { Date timeStart = new Date(); if (dso.getType() != Constants.ITEM) { throw new CrosswalkObjectNotSupported("OREIngestionCrosswalk can only crosswalk an Item."); } Item item = (Item) dso; if (root == null) { System.err.println("The element received by ingest was null"); return; } Document doc = new Document(); doc.addContent(root.detach()); XPath xpathLinks; List aggregatedResources; String entryId; try { xpathLinks = XPath.newInstance("/atom:entry/atom:link[@rel=\"" + ORE_NS.getURI() + "aggregates" + "\"]"); xpathLinks.addNamespace(ATOM_NS); aggregatedResources = xpathLinks.selectNodes(doc); xpathLinks = XPath.newInstance("/atom:entry/atom:link[@rel='alternate']/@href"); xpathLinks.addNamespace(ATOM_NS); entryId = ((Attribute) xpathLinks.selectSingleNode(doc)).getValue(); } catch (JDOMException e) { throw new CrosswalkException("JDOM exception occurred while ingesting the ORE", e); } // Next for each resource, create a bitstream XPath xpathDesc; NumberFormat nf = NumberFormat.getInstance(); nf.setGroupingUsed(false); nf.setMinimumIntegerDigits(4); for (Element resource : aggregatedResources) { String href = resource.getAttributeValue("href"); log.debug("ORE processing: " + href); String bundleName; Element desc = null; try { xpathDesc = XPath.newInstance( "/atom:entry/oreatom:triples/rdf:Description[@rdf:about=\"" + this.encodeForURL(href) + "\"][1]"); xpathDesc.addNamespace(ATOM_NS); xpathDesc.addNamespace(ORE_ATOM); xpathDesc.addNamespace(RDF_NS); desc = (Element) xpathDesc.selectSingleNode(doc); } catch (JDOMException e) { log.warn("Could not find description for {}", href, e); } if (desc != null && desc.getChild("type", RDF_NS).getAttributeValue("resource", RDF_NS) .equals(DS_NS.getURI() + "DSpaceBitstream")) { bundleName = desc.getChildText("description", DCTERMS_NS); log.debug("Setting bundle name to: " + bundleName); } else { log.info("Could not obtain bundle name; using 'ORIGINAL'"); bundleName = "ORIGINAL"; } // Bundle names are not unique, so we just pick the first one if there's more than one. List targetBundles = itemService.getBundles(item, bundleName); Bundle targetBundle; // if null, create the new bundle and add it in if (targetBundles.size() == 0) { targetBundle = bundleService.create(context, item, bundleName); itemService.addBundle(context, item, targetBundle); } else { targetBundle = targetBundles.get(0); } URL ARurl = null; InputStream in = null; if (href != null) { try { // Make sure the url string escapes all the oddball characters String processedURL = encodeForURL(href); // Generate a requeset for the aggregated resource ARurl = new URL(processedURL); in = ARurl.openStream(); } catch (FileNotFoundException fe) { log.error("The provided URI failed to return a resource: " + href); } catch (ConnectException fe) { log.error("The provided URI was invalid: " + href); } } else { throw new CrosswalkException("Entry did not contain link to resource: " + entryId); } // ingest and update if (in != null) { Bitstream newBitstream = bitstreamService.create(context, targetBundle, in); String bsName = resource.getAttributeValue("title"); newBitstream.setName(context, bsName); // Identify the format String mimeString = resource.getAttributeValue("type"); BitstreamFormat bsFormat = bitstreamFormatService.findByMIMEType(context, mimeString); if (bsFormat == null) { bsFormat = bitstreamFormatService.guessFormat(context, newBitstream); } newBitstream.setFormat(context, bsFormat); bitstreamService.update(context, newBitstream); bundleService.addBitstream(context, targetBundle, newBitstream); bundleService.update(context, targetBundle); } else { throw new CrosswalkException("Could not retrieve bitstream: " + entryId); } } log.info( "OREIngest for Item " + item.getID() + " took: " + (new Date().getTime() - timeStart.getTime()) + "ms."); } /** * Helper method to escape all characters that are not part of the canon set * * @param sourceString source unescaped string */ private String encodeForURL(String sourceString) { Character lowalpha[] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}; Character upalpha[] = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}; Character digit[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}; Character mark[] = {'-', '_', '.', '!', '~', '*', '\'', '(', ')'}; // reserved Character reserved[] = {';', '/', '?', ':', '@', '&', '=', '+', '$', ',', '%', '#'}; Set URLcharsSet = new HashSet(); URLcharsSet.addAll(Arrays.asList(lowalpha)); URLcharsSet.addAll(Arrays.asList(upalpha)); URLcharsSet.addAll(Arrays.asList(digit)); URLcharsSet.addAll(Arrays.asList(mark)); URLcharsSet.addAll(Arrays.asList(reserved)); StringBuilder processedString = new StringBuilder(); for (int i = 0; i < sourceString.length(); i++) { char ch = sourceString.charAt(i); if (URLcharsSet.contains(ch)) { processedString.append(ch); } else { processedString.append("%").append(Integer.toHexString((int) ch)); } } return processedString.toString(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy