com.opentext.ia.sdk.sip.ContentAssemblerWithDedupOnHash Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of infoarchive-sdk-core Show documentation
Show all versions of infoarchive-sdk-core Show documentation
A library that makes it quick and easy to create SIPs in InfoArchive
/*
* Copyright (c) 2016-2017 by OpenText Corporation. All Rights Reserved.
*/
package com.opentext.ia.sdk.sip;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import com.opentext.ia.sdk.support.io.EncodedHash;
import com.opentext.ia.sdk.support.io.HashAssembler;
import com.opentext.ia.sdk.support.io.NoHashAssembler;
import com.opentext.ia.sdk.support.io.RepeatableInputStream;
import com.opentext.ia.sdk.support.io.ZipAssembler;
/**
* A ContentAssembler implementation which will perform deduplication based on the hash value of the content, i.e. a
* digital object with a given hash is only included once in the SIP.
*
* NoteImportant to note that this content assembler buffers the bytes that make up the digital object in memory
* so it is not suitable to very large digital objects.
*
* @param The type of domain object to assemble SIPs from
*/
public class ContentAssemblerWithDedupOnHash extends ContentAssemblerDefault {
private final Map, ContentInfo> hashesToContentInfo;
private final HashAssembler noHashAssembler = new NoHashAssembler();
public ContentAssemblerWithDedupOnHash(DigitalObjectsExtraction contentsExtraction,
HashAssembler contentHashAssembler, int estimatedMaxDigitalObjects) {
super(contentsExtraction, contentHashAssembler);
hashesToContentInfo = new HashMap<>(estimatedMaxDigitalObjects);
}
@Override
public void begin(ZipAssembler zip, Counters metrics) {
super.begin(zip, metrics);
hashesToContentInfo.clear();
}
@Override
protected ContentInfo addContent(String ri, DigitalObject digitalObject) throws IOException {
RepeatableInputStream memoryStream = memoryStreamOf(digitalObject);
// First compute hashes
Collection hashes = contentHashFor(memoryStream);
// Check if contentInfo exist of the collection of hashes.
// If yes, skip adding the content and return existing content info.
ContentInfo contentInfo = hashesToContentInfo.get(hashes);
if (contentInfo != null) {
return contentInfo;
}
try (InputStream stream = memoryStream.get()) {
addZipEntry(ri, stream, noHashAssembler);
incMetric(SipMetrics.SIZE_DIGITAL_OBJECTS, getContentHashAssembler().numBytesHashed());
contentInfo = new ContentInfo(ri, hashes);
hashesToContentInfo.put(hashes, contentInfo);
}
return contentInfo;
}
private RepeatableInputStream memoryStreamOf(DigitalObject digitalObject) throws IOException {
try (InputStream raw = digitalObject.get()) {
return new RepeatableInputStream(raw);
}
}
private Collection contentHashFor(RepeatableInputStream memoryStream) throws IOException {
try (InputStream stream = memoryStream.get()) {
return contentHashFor(stream);
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy