All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.opentext.ia.sdk.sip.ContentAssemblerWithDedupOnHash Maven / Gradle / Ivy

There is a newer version: 12.8.4
Show newest version
/*
 * Copyright (c) 2016-2017 by OpenText Corporation. All Rights Reserved.
 */
package com.opentext.ia.sdk.sip;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import com.opentext.ia.sdk.support.io.EncodedHash;
import com.opentext.ia.sdk.support.io.HashAssembler;
import com.opentext.ia.sdk.support.io.NoHashAssembler;
import com.opentext.ia.sdk.support.io.RepeatableInputStream;
import com.opentext.ia.sdk.support.io.ZipAssembler;

/**
 * A ContentAssembler implementation which will perform deduplication based on the hash value of the content, i.e. a
 * digital object with a given hash is only included once in the SIP.
 * 

* NoteImportant to note that this content assembler buffers the bytes that make up the digital object in memory * so it is not suitable to very large digital objects. *

* @param The type of domain object to assemble SIPs from */ public class ContentAssemblerWithDedupOnHash extends ContentAssemblerDefault { private final Map, ContentInfo> hashesToContentInfo; private final HashAssembler noHashAssembler = new NoHashAssembler(); public ContentAssemblerWithDedupOnHash(DigitalObjectsExtraction contentsExtraction, HashAssembler contentHashAssembler, int estimatedMaxDigitalObjects) { super(contentsExtraction, contentHashAssembler); hashesToContentInfo = new HashMap<>(estimatedMaxDigitalObjects); } @Override public void begin(ZipAssembler zip, Counters metrics) { super.begin(zip, metrics); hashesToContentInfo.clear(); } @Override protected ContentInfo addContent(String ri, DigitalObject digitalObject) throws IOException { RepeatableInputStream memoryStream = memoryStreamOf(digitalObject); // First compute hashes Collection hashes = contentHashFor(memoryStream); // Check if contentInfo exist of the collection of hashes. // If yes, skip adding the content and return existing content info. ContentInfo contentInfo = hashesToContentInfo.get(hashes); if (contentInfo != null) { return contentInfo; } try (InputStream stream = memoryStream.get()) { addZipEntry(ri, stream, noHashAssembler); incMetric(SipMetrics.SIZE_DIGITAL_OBJECTS, getContentHashAssembler().numBytesHashed()); contentInfo = new ContentInfo(ri, hashes); hashesToContentInfo.put(hashes, contentInfo); } return contentInfo; } private RepeatableInputStream memoryStreamOf(DigitalObject digitalObject) throws IOException { try (InputStream raw = digitalObject.get()) { return new RepeatableInputStream(raw); } } private Collection contentHashFor(RepeatableInputStream memoryStream) throws IOException { try (InputStream stream = memoryStream.get()) { return contentHashFor(stream); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy