// org.verapdf.METSMetadataExtractor (Maven / Gradle / Ivy)
// The newest version!
package org.verapdf;
import au.edu.apsr.mtk.base.*;
import com.adobe.xmp.XMPException;
import com.adobe.xmp.XMPMeta;
import com.adobe.xmp.XMPMetaFactory;
import com.adobe.xmp.impl.VeraPDFMeta;
import com.adobe.xmp.impl.VeraPDFXMPNode;
import com.adobe.xmp.impl.XMPSchemaRegistryImpl;
import org.verapdf.core.FeatureParsingException;
import org.verapdf.features.AbstractMetadataFeaturesExtractor;
import org.verapdf.features.MetadataFeaturesData;
import org.verapdf.features.tools.FeatureTreeNode;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import javax.xml.bind.JAXBException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
 * Metadata features extractor that converts the XMP metadata of a document
 * into a METS file and reports the location of that file.
 * <p>
 * Every top-level XMP property is classified by {@code METSTypeRegistry} into
 * one METS section (descriptive, rights, technical, source or digital
 * provenance metadata). For each section that received at least one property
 * an XMP packet is embedded into the METS document.
 * <p>
 * The per-section XMP packets are kept in instance fields while a conversion
 * is running and no synchronization is performed, so a single instance must
 * not be used by several threads at the same time.
 *
 * @author Maksim Bezrukov
 */
public class METSMetadataExtractor extends AbstractMetadataFeaturesExtractor {

    /* Indexes of the METS sections inside the working arrays of divideMetas(). */
    private static final int DMD = 0;
    private static final int RIGHTS = 1;
    private static final int TECH = 2;
    private static final int SOURCE = 3;
    private static final int DIGIPROV = 4;
    private static final int SECTION_COUNT = 5;
    /* Marker for a property whose type matches none of the METS sections. */
    private static final int UNCLASSIFIED = -1;

    /* XMP packet of each METS section, or null when the section has no properties. */
    private XMPMeta dmdMeta = null;
    private XMPMeta rightsMeta = null;
    private XMPMeta techMeta = null;
    private XMPMeta sourceMeta = null;
    private XMPMeta digiprovMeta = null;

    /**
     * Converts the XMP metadata stream into a METS file.
     *
     * @param metadataFeaturesData holder of the XMP metadata stream to convert
     * @return on success a list ending with a {@code resultFile} node whose value
     *         is the canonical path of the written METS file (preceded by an
     *         {@code error} node when the configured output folder was unusable
     *         and the temporary folder was used instead); on failure a list with
     *         a single {@code error} node describing the problem
     * @throws IllegalStateException if the error node itself can not be created
     */
    @Override
    public List<FeatureTreeNode> getMetadataFeatures(MetadataFeaturesData metadataFeaturesData) {
        List<FeatureTreeNode> result = new ArrayList<>(2);
        try {
            File outFile = getOutFile(result);
            // try-with-resources: the output file must be flushed and its handle
            // released whether or not the conversion succeeds.
            try (OutputStream out = new FileOutputStream(outFile)) {
                convertXMPToMETS(metadataFeaturesData.getStream(), out);
            }
            result.add(createNode("resultFile", outFile.getCanonicalPath()));
        } catch (XMPException | METSException | IOException | SAXException
                | FeatureParsingException | ParserConfigurationException e) {
            try {
                // A failed conversion is reported on its own: drop anything collected so far.
                result.clear();
                String message = e.getMessage();
                // Some exceptions carry no message; fall back to the exception description.
                result.add(createNode("error", message != null ? message : e.toString()));
            } catch (FeatureParsingException e1) {
                throw new IllegalStateException(e1);
            }
        }
        return result;
    }

    /**
     * Creates a root feature node with the given name and value.
     */
    private static FeatureTreeNode createNode(String name, String value) throws FeatureParsingException {
        FeatureTreeNode node = FeatureTreeNode.createRootNode(name);
        node.setValue(value);
        return node;
    }

    /**
     * Parses the XMP packet from {@code toConvert}, splits it by METS section and
     * writes the resulting METS document to {@code out}. The stream {@code out}
     * is not closed here; that is left to the caller.
     */
    private void convertXMPToMETS(InputStream toConvert, OutputStream out) throws XMPException, METSException, IOException, SAXException, ParserConfigurationException {
        VeraPDFMeta meta = VeraPDFMeta.parse(toConvert);
        divideMetas(meta);
        METSWrapper metsWrapper = new METSWrapper();
        createMETS(metsWrapper);
        //TODO: fix mets creation in such way that validating will not generate any exceptions
        // metsWrapper.validate();
        metsWrapper.write(out);
    }

    /**
     * Fills the METS object of {@code metsWrapper} with one section per non-null
     * XMP packet prepared by {@link #divideMetas(VeraPDFMeta)}. The administrative
     * section is only added when at least one of its four parts is present.
     */
    private void createMETS(METSWrapper metsWrapper) throws METSException, SAXException, ParserConfigurationException, XMPException, IOException {
        METS mets = metsWrapper.getMETSObject();

        if (this.dmdMeta != null) {
            DmdSec dmd = mets.newDmdSec();
            dmd.setID("DMD_ID_1");
            MdWrap dmdWrap = dmd.newMdWrap();
            addXMPTreeToMdWrap(dmdWrap, this.dmdMeta);
            dmd.setMdWrap(dmdWrap);
            mets.addDmdSec(dmd);
        }

        boolean hasAdministrativeMetadata = this.rightsMeta != null || this.techMeta != null
                || this.sourceMeta != null || this.digiprovMeta != null;
        if (!hasAdministrativeMetadata) {
            return;
        }

        AmdSec amd = mets.newAmdSec();
        if (this.rightsMeta != null) {
            RightsMD rightsMD = amd.newRightsMD();
            rightsMD.setID("RIGHTSMD_ID_1");
            MdWrap rightsWrap = rightsMD.newMdWrap();
            addXMPTreeToMdWrap(rightsWrap, this.rightsMeta);
            rightsMD.setMdWrap(rightsWrap);
            amd.addRightsMD(rightsMD);
        }
        if (this.techMeta != null) {
            TechMD techMD = amd.newTechMD();
            techMD.setID("TECHMD_ID_1");
            MdWrap techWrap = techMD.newMdWrap();
            addXMPTreeToMdWrap(techWrap, this.techMeta);
            techMD.setMdWrap(techWrap);
            amd.addTechMD(techMD);
        }
        if (this.sourceMeta != null) {
            SourceMD sourceMD = amd.newSourceMD();
            sourceMD.setID("SOURCEMD_ID_1");
            MdWrap sourceWrap = sourceMD.newMdWrap();
            addXMPTreeToMdWrap(sourceWrap, this.sourceMeta);
            sourceMD.setMdWrap(sourceWrap);
            amd.addSourceMD(sourceMD);
        }
        if (this.digiprovMeta != null) {
            DigiprovMD digiprovMD = amd.newDigiprovMD();
            digiprovMD.setID("DIGIPROVMD_ID_1");
            MdWrap digiprovWrap = digiprovMD.newMdWrap();
            addXMPTreeToMdWrap(digiprovWrap, this.digiprovMeta);
            digiprovMD.setMdWrap(digiprovWrap);
            amd.addDigiprovMD(digiprovMD);
        }
        mets.addAmdSec(amd);
    }

    /**
     * Serializes {@code meta}, re-parses the bytes into a DOM tree and stores the
     * root element as the XML data of {@code wrap}, marked as metadata type
     * {@code OTHER} / {@code XMP}.
     */
    private void addXMPTreeToMdWrap(MdWrap wrap, XMPMeta meta) throws XMPException, ParserConfigurationException, IOException, SAXException {
        ByteArrayOutputStream bstream = new ByteArrayOutputStream();
        XMPMetaFactory.serialize(meta, bstream);
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document doc = builder.parse(new ByteArrayInputStream(bstream.toByteArray()));
        wrap.setXmlData(doc.getDocumentElement());
        wrap.setMDType("OTHER");
        wrap.setOtherMDType("XMP");
    }

    /**
     * Splits {@code meta} into one XMP packet per METS section and stores the
     * packets in the instance fields.
     * <p>
     * Each packet starts as a clone of the initial metadata. A property that is
     * classified into a section is deleted from the packets of all other
     * sections. A property whose type matches no section is left untouched in
     * every packet. Packets of sections that received no property are set to
     * {@code null}.
     */
    private void divideMetas(VeraPDFMeta meta) {
        List<VeraPDFXMPNode> xmpProps = meta.getProperties();
        // Each entry is a {namespace URI, property name} pair.
        List<String[]> properties = new ArrayList<>(xmpProps.size() + 1);
        if (meta.getExtensionSchemasNode() != null) {
            // The extension schemas container is handled like any other property.
            properties.add(new String[]{XMPSchemaRegistryImpl.NS_PDFA_EXTENSION, "schemas"});
        }
        for (VeraPDFXMPNode node : xmpProps) {
            properties.add(new String[]{node.getNamespaceURI(), node.getName()});
        }

        XMPMeta[] sections = new XMPMeta[SECTION_COUNT];
        for (int i = 0; i < SECTION_COUNT; ++i) {
            sections[i] = meta.getCloneOfInitialMeta();
        }
        boolean[] used = new boolean[SECTION_COUNT];

        for (String[] property : properties) {
            String namespaceURI = property[0];
            String name = property[1];
            int owner = sectionIndexFor(namespaceURI, name);
            if (owner == UNCLASSIFIED) {
                continue;
            }
            used[owner] = true;
            // Keep the property only in the packet of the section that owns it.
            for (int i = 0; i < SECTION_COUNT; ++i) {
                if (i != owner) {
                    sections[i].deleteProperty(namespaceURI, name);
                }
            }
        }

        this.dmdMeta = used[DMD] ? sections[DMD] : null;
        this.rightsMeta = used[RIGHTS] ? sections[RIGHTS] : null;
        this.techMeta = used[TECH] ? sections[TECH] : null;
        this.sourceMeta = used[SOURCE] ? sections[SOURCE] : null;
        this.digiprovMeta = used[DIGIPROV] ? sections[DIGIPROV] : null;
    }

    /**
     * Maps the METS type registered for the given property to the index of its
     * section, or {@link #UNCLASSIFIED} when the type matches none of them.
     */
    private static int sectionIndexFor(String namespaceURI, String name) {
        switch (METSTypeRegistry.getTypeForProperty(namespaceURI, name)) {
            case DMD_SEC:
                return DMD;
            case RIGHTS_MD:
                return RIGHTS;
            case TECH_MD:
                return TECH;
            case SOURCE_MD:
                return SOURCE;
            case DIGIPROV_MD:
                return DIGIPROV;
            default:
                return UNCLASSIFIED;
        }
    }

    /**
     * Creates the (empty) output file. The folder is taken from the
     * {@code outFolder} attribute; when the attribute is absent the temporary
     * folder is used. When the attribute is present but is not a directory, an
     * {@code error} node is appended to {@code nodes} and the temporary folder is
     * used as a fallback.
     */
    private File getOutFile(List<FeatureTreeNode> nodes) throws FeatureParsingException, IOException {
        String out = getAttributes().get("outFolder");
        if (out == null) {
            return getOutFileInFolder(getTempFolder());
        }
        File outFolder = new File(out);
        if (outFolder.isDirectory()) {
            return getOutFileInFolder(outFolder);
        }
        nodes.add(createNode("error", "Config file contains out folder path but it doesn't link a directory."));
        return getOutFileInFolder(getTempFolder());
    }

    /**
     * Returns the plugin's folder inside {@code java.io.tmpdir}, attempting to
     * create it when it does not exist yet.
     */
    private File getTempFolder() {
        File tempDir = new File(System.getProperty("java.io.tmpdir"));
        File tempFolder = new File(tempDir, "veraPDFMETSPluginTemp");
        if (!tempFolder.exists()) {
            // The result is deliberately not checked here: if the folder could not
            // be created, creating the output file inside it is expected to fail
            // with an IOException that the caller reports.
            tempFolder.mkdir();
        }
        return tempFolder;
    }

    /**
     * Creates a new uniquely named {@code .xml} file in {@code folder}.
     */
    private File getOutFileInFolder(File folder) throws IOException {
        return File.createTempFile("veraPDF_METS_Plugin_out", ".xml", folder);
    }
}