/*
************************************************************************
******************* CANADIAN ASTRONOMY DATA CENTRE *******************
************** CENTRE CANADIEN DE DONNÉES ASTRONOMIQUES **************
*
* (c) 2021. (c) 2021.
* Government of Canada Gouvernement du Canada
* National Research Council Conseil national de recherches
* Ottawa, Canada, K1A 0R6 Ottawa, Canada, K1A 0R6
* All rights reserved Tous droits réservés
*
* NRC disclaims any warranties, Le CNRC dénie toute garantie
* expressed, implied, or énoncée, implicite ou légale,
* statutory, of any kind with de quelque nature que ce
* respect to the software, soit, concernant le logiciel,
* including without limitation y compris sans restriction
* any warranty of merchantability toute garantie de valeur
* or fitness for a particular marchande ou de pertinence
* purpose. NRC shall not be pour un usage particulier.
* liable in any event for any Le CNRC ne pourra en aucun cas
* damages, whether direct or être tenu responsable de tout
* indirect, special or general, dommage, direct ou indirect,
* consequential or incidental, particulier ou général,
* arising from the use of the accessoire ou fortuit, résultant
* software. Neither the name de l'utilisation du logiciel. Ni
* of the National Research le nom du Conseil National de
* Council of Canada nor the Recherches du Canada ni les noms
* names of its contributors may de ses participants ne peuvent
* be used to endorse or promote être utilisés pour approuver ou
* products derived from this promouvoir les produits dérivés
* software without specific prior de ce logiciel sans autorisation
* written permission. préalable et particulière
* par écrit.
*
* This file is part of the Ce fichier fait partie du projet
* OpenCADC project. OpenCADC.
*
* OpenCADC is free software: OpenCADC est un logiciel libre ;
* you can redistribute it and/or vous pouvez le redistribuer ou le
* modify it under the terms of modifier suivant les termes de
* the GNU Affero General Public la “GNU Affero General Public
* License as published by the License” telle que publiée
* Free Software Foundation, par la Free Software Foundation
* either version 3 of the : soit la version 3 de cette
* License, or (at your option) licence, soit (à votre gré)
* any later version. toute version ultérieure.
*
* OpenCADC is distributed in the OpenCADC est distribué
* hope that it will be useful, dans l’espoir qu’il vous
* but WITHOUT ANY WARRANTY; sera utile, mais SANS AUCUNE
* without even the implied GARANTIE : sans même la garantie
* warranty of MERCHANTABILITY implicite de COMMERCIALISABILITÉ
* or FITNESS FOR A PARTICULAR ni d’ADÉQUATION À UN OBJECTIF
* PURPOSE. See the GNU Affero PARTICULIER. Consultez la Licence
* General Public License for Générale Publique GNU Affero
* more details. pour plus de détails.
*
* You should have received Vous devriez avoir reçu une
* a copy of the GNU Affero copie de la Licence Générale
* General Public License along Publique GNU Affero avec
* with OpenCADC. If not, see OpenCADC ; si ce n'est
* <http://www.gnu.org/licenses/>. pas le cas, consultez :
* <http://www.gnu.org/licenses/>.
*
* $Revision: 5 $
*
************************************************************************
*/
package ca.nrc.cadc.caom2.artifactsync;
import ca.nrc.cadc.caom2.Artifact;
import ca.nrc.cadc.caom2.Observation;
import ca.nrc.cadc.caom2.ObservationState;
import ca.nrc.cadc.caom2.Plane;
import ca.nrc.cadc.caom2.access.AccessUtil;
import ca.nrc.cadc.caom2.artifact.ArtifactMetadata;
import ca.nrc.cadc.caom2.artifact.ArtifactStore;
import ca.nrc.cadc.caom2.artifact.StoragePolicy;
import ca.nrc.cadc.caom2.harvester.HarvestResource;
import ca.nrc.cadc.caom2.harvester.state.HarvestSkipURI;
import ca.nrc.cadc.caom2.harvester.state.HarvestSkipURIDAO;
import ca.nrc.cadc.caom2.harvester.state.HarvestState;
import ca.nrc.cadc.caom2.harvester.state.HarvestStateDAO;
import ca.nrc.cadc.caom2.harvester.state.PostgresqlHarvestStateDAO;
import ca.nrc.cadc.caom2.persistence.ObservationDAO;
import ca.nrc.cadc.date.DateUtil;
import ca.nrc.cadc.net.TransientException;
import java.net.URI;
import java.security.PrivilegedExceptionAction;
import java.text.DateFormat;
import java.util.Date;
import java.util.List;
import java.util.ListIterator;
import java.util.UUID;
import javax.lang.model.type.NullType;
import org.apache.log4j.Logger;
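
/**
 * Discovers CAOM2 artifacts for one collection and queues any that are missing from,
 * or inconsistent with, the local ArtifactStore into the HarvestSkipURI table so they
 * can be downloaded later. Progress is tracked in HarvestState so successive runs
 * resume from the last harvested observation.
 *
 * <p>The class implements PrivilegedExceptionAction so it can be executed under a
 * Subject. A minimal usage sketch (the surrounding wiring, i.e. the DAO, the
 * HarvestResource, the ArtifactStore and the Subject, is assumed here and not defined
 * by this class):
 *
 * <pre>
 * ArtifactHarvester harvester =
 *     new ArtifactHarvester(observationDAO, harvestResource, artifactStore, 1000, false);
 * Subject.doAs(subject, harvester); // javax.security.auth.Subject
 * </pre>
 */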
public class ArtifactHarvester implements PrivilegedExceptionAction<NullType>, ShutdownListener {
public static final Integer DEFAULT_BATCH_SIZE = Integer.valueOf(1000);
public static final String STATE_CLASS = Artifact.class.getSimpleName();
public static final String PROPRIETARY = "Proprietary";
private static final Logger log = Logger.getLogger(ArtifactHarvester.class);
private ObservationDAO observationDAO;
private ArtifactStore artifactStore;
private HarvestStateDAO harvestStateDAO;
private HarvestSkipURIDAO harvestSkipURIDAO;
private String collection;
private StoragePolicy storagePolicy;
private int batchSize;
private boolean loop;
private String source;
private Date startDate;
private DateFormat df;
private String caomChecksum;
private String storageChecksum;
private Long caomContentLength;
private long storageContentLength;
private String reason = "None";
private String errorMessage;
// reset each run
long downloadCount = 0;
long updateCount = 0;
long processedCount = 0;
Date start = new Date();
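
/**
 * Create a harvester for a single collection.
 *
 * @param observationDAO source of observations and of the shared DataSource
 * @param harvestResource identifies the harvest source, collection, database and schema
 * @param artifactStore local artifact storage used for checksum and content-length comparison
 * @param batchSize maximum number of observations processed per batch
 * @param loop if true, keep running batches until a query returns less than a full batch
 */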
public ArtifactHarvester(ObservationDAO observationDAO, HarvestResource harvestResource,
ArtifactStore artifactStore, int batchSize, boolean loop) {
this.observationDAO = observationDAO;
this.artifactStore = artifactStore;
this.batchSize = batchSize;
this.loop = loop;
this.source = harvestResource.getIdentifier();
this.collection = harvestResource.getCollection();
this.storagePolicy = artifactStore.getStoragePolicy(collection);
String database = harvestResource.getDatabase();
String schema = harvestResource.getSchema();
this.harvestStateDAO = new PostgresqlHarvestStateDAO(observationDAO.getDataSource(), database, schema);
this.harvestSkipURIDAO = new HarvestSkipURIDAO(observationDAO.getDataSource(), database, schema);
this.startDate = null;
this.df = DateUtil.getDateFormat(DateUtil.ISO_DATE_FORMAT, DateUtil.UTC);
}
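
/**
 * PrivilegedExceptionAction entry point: runs runIt() and, when loop is enabled,
 * repeats until runIt() reports there is no more work (the last query returned
 * less than a full batch).
 */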
@Override
public NullType run() throws Exception {
int loopNum = 1;
boolean stop = false;
do {
if (loop) {
log.info("-- STARTING LOOP #" + loopNum + " --");
}
stop = runIt();
if (loop) {
log.info("-- ENDING LOOP #" + loopNum + " --");
}
loopNum++;
} while (loop && !stop); // continue if work was done
return null;
}
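
/**
 * Process one batch: read the harvest window from HarvestState, list observations
 * modified in that window, and for each artifact decide whether it must be queued
 * in the skip table for download. Returns true when the last query returned less
 * than a full batch, i.e. the harvest window is exhausted for now.
 */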
private Boolean runIt() throws Exception {
this.downloadCount = 0;
this.processedCount = 0;
this.start = new Date();
try {
// Determine the state of the last run
HarvestState state = harvestStateDAO.get(source, STATE_CLASS);
this.startDate = state.curLastModified;
// harvest up to a little in the past because the head of
// the sequence may be volatile
long fiveMinAgo = System.currentTimeMillis() - 5 * 60000L;
Date stopDate = new Date(fiveMinAgo);
if (startDate == null) {
log.info("harvest window: null " + this.df.format(stopDate) + " [" + this.batchSize + "]");
} else {
log.info("harvest window: " + this.df.format(startDate) + " " + this.df.format(stopDate) + " [" + this.batchSize + "]");
}
List<ObservationState> observationStates = this.observationDAO.getObservationList(this.collection, this.startDate,
    stopDate, this.batchSize + 1);
// avoid re-processing the last successful one stored in
// HarvestState (normal case because query: >= startDate)
if (!observationStates.isEmpty()) {
ListIterator<ObservationState> iter = observationStates.listIterator();
ObservationState curBatchLeader = iter.next();
if (curBatchLeader != null) {
if (state.curLastModified != null) {
log.debug("harvesState: " + format(state.curID) + ", " + this.df.format(state.curLastModified));
}
if (curBatchLeader.getMaxLastModified().equals(state.curLastModified)) {
Observation observation = this.observationDAO.get(curBatchLeader.getID());
log.debug("current batch: " + format(observation.getID()) + ", " + this.df.format(curBatchLeader.getMaxLastModified()));
if (state.curID != null && state.curID.equals(observation.getID())) {
iter.remove();
}
}
}
}
log.info("Found: " + observationStates.size());
for (ObservationState observationState : observationStates) {
try {
this.observationDAO.getTransactionManager().startTransaction();
Observation observation = this.observationDAO.get(observationState.getID());
if (observation == null) {
log.debug("Observation no longer exists: " + observationState.getURI());
} else {
// will make progress even on failures
state.curLastModified = observation.getMaxLastModified();
state.curID = observation.getID();
for (Plane plane : observation.getPlanes()) {
for (Artifact artifact : plane.getArtifacts()) {
Date releaseDate = AccessUtil.getReleaseDate(artifact, plane.metaRelease, plane.dataRelease);
if (releaseDate == null) {
// null date means private
log.debug("null release date, skipping");
} else {
logStart(format(state.curID), artifact);
boolean success = true;
boolean added = false;
String message = null;
this.caomChecksum = getMD5Sum(artifact.contentChecksum);
if (this.caomChecksum == null) {
this.caomChecksum = "null";
}
if (artifact.contentLength == null) {
this.caomContentLength = null;
} else {
this.caomContentLength = artifact.contentLength;
}
this.storageContentLength = 0;
this.reason = "None";
this.errorMessage = null;
this.processedCount++;
if (releaseDate.after(start)) {
// release date is in the future: the artifact is still proprietary, defer the download
this.errorMessage = ArtifactHarvester.PROPRIETARY;
}
try {
// by default, do not add to the skip table
boolean correctCopy = true;
// artifact is not in storage if storage policy is 'PUBLIC ONLY' and the artifact is proprietary
if ((StoragePolicy.ALL == storagePolicy) || this.errorMessage == null) {
// correctCopy is false if: checksum mismatch, content length mismatch or not in storage
// "not in storage" includes failing to resolve the artifact URI
correctCopy = checkArtifactInStorage(artifact.getURI());
log.debug("Artifact " + artifact.getURI() + " with MD5 " + artifact
.contentChecksum + " correct copy: " + correctCopy);
}
if ((StoragePolicy.PUBLIC_ONLY == storagePolicy
    && ArtifactHarvester.PROPRIETARY.equals(this.errorMessage)) || !correctCopy) {
HarvestSkipURI skip = harvestSkipURIDAO.get(source, STATE_CLASS, artifact.getURI());
boolean preExisting = (skip != null);
if (skip == null) {
    // not in skip table, add it
    skip = new HarvestSkipURI(source, STATE_CLASS, artifact.getURI(), releaseDate, this.errorMessage);
}
if (ArtifactHarvester.PROPRIETARY.equals(skip.errorMessage)
    || ArtifactHarvester.PROPRIETARY.equals(this.errorMessage)) {
    skip.setTryAfter(releaseDate);
    skip.errorMessage = this.errorMessage;
}
this.harvestSkipURIDAO.put(skip);
this.downloadCount++;
added = true;
if (preExisting) {
    // entry already existed in the skip table: not a new download
    this.downloadCount--;
    if (ArtifactHarvester.PROPRIETARY.equals(this.errorMessage)) {
        this.updateCount++;
        message = this.errorMessage
            + " artifact already exists in skip table, update tryAfter date to release date.";
    } else {
        added = false;
        String msg = "artifact already exists in skip table.";
        if (this.reason.equalsIgnoreCase("None")) {
            this.reason = "Public " + msg;
        } else {
            this.reason = this.reason + " and public " + msg;
        }
    }
}
}
} catch (Exception ex) {
success = false;
message = "Failed to determine if artifact " + artifact.getURI() + " exists: " + ex.getMessage();
log.error(message, ex);
}
logEnd(format(state.curID), artifact, success, added, message);
}
}
}
}
this.harvestStateDAO.put(state);
log.debug("Updated artifact harvest state. Date: " + state.curLastModified);
log.debug("Updated artifact harvest state. ID: " + format(state.curID));
this.observationDAO.getTransactionManager().commitTransaction();
} catch (Throwable t) {
this.observationDAO.getTransactionManager().rollbackTransaction();
throw t;
}
}
return (observationStates.size() < batchSize + 1);
} finally {
logBatchEnd();
}
}
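
/**
 * Extract the hex digest from an md5 checksum URI; any other checksum scheme is rejected.
 */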
private String getMD5Sum(URI checksum) throws UnsupportedOperationException {
if (checksum == null) {
return null;
}
if (checksum.getScheme().equalsIgnoreCase("MD5")) {
return checksum.getSchemeSpecificPart();
} else {
throw new UnsupportedOperationException("Checksum algorithm " + checksum.getScheme() + " not supported.");
}
}
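
/**
 * Compare the CAOM content length with the stored artifact's length. A null or zero
 * CAOM content length is treated as a match; otherwise the lengths must be equal,
 * and a mismatch is recorded in reason/errorMessage.
 */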
private boolean checkContentLength(Long artifactContentLength) {
// no contentLength in a CAOM artifact is considered a match
if (this.caomContentLength == null || this.caomContentLength == 0) {
return true;
} else {
this.storageContentLength = artifactContentLength;
if (this.storageContentLength == this.caomContentLength) {
return true;
} else {
this.reason = "ContentLengths are different";
this.errorMessage = this.reason;
return false;
}
}
}
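
/**
 * Compare the CAOM checksum with the stored artifact's MD5. A missing CAOM checksum
 * is treated as a match (with the reason recorded as "Null checksum"); a mismatch is
 * recorded in reason/errorMessage.
 */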
private boolean checkChecksum(String contentMD5) {
log.debug("Expected MD5: " + this.caomChecksum);
if (this.caomChecksum.equalsIgnoreCase("null")) {
// no checksum in a CAOM artifact is considered a match
this.reason = "Null checksum";
return true;
}
log.debug("Matching artifact with md5 " + contentMD5);
this.storageChecksum = contentMD5;
if (this.caomChecksum.equalsIgnoreCase(contentMD5)) {
return true;
} else {
this.reason = "Checksums are different";
this.errorMessage = this.reason;
return false;
}
}
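
/**
 * Return true when the artifact exists in the ArtifactStore and both its checksum
 * and content length match the CAOM metadata; mismatches are recorded in
 * reason/errorMessage as a side effect.
 */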
private boolean checkArtifactInStorage(URI artifactURI) throws TransientException {
ArtifactMetadata artifactMetadata = this.artifactStore.get(artifactURI);
if (artifactMetadata == null) {
this.reason = "Artifact not in storage";
this.errorMessage = reason;
log.debug("Artifact not in storage URI: " + artifactURI);
return false;
}
if (checkChecksum(artifactMetadata.getChecksum())) {
return checkContentLength(artifactMetadata.contentLength);
} else {
return false;
}
}
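
/** Null-safe UUID formatting for log messages. */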
private String format(UUID id) {
if (id == null) {
return "null";
}
return id.toString();
}
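
/** Null-safe Long formatting for log messages. */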
private String safeToString(Long n) {
if (n == null) {
return "null";
}
return n.toString();
}
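
/** Log a single-line JSON START record for one artifact. */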
private void logStart(String observationID, Artifact artifact) {
StringBuilder startMessage = new StringBuilder();
startMessage.append("START: {");
startMessage.append("\"observationID\":\"").append(observationID).append("\"");
startMessage.append(",");
startMessage.append("\"artifact\":\"").append(artifact.getURI()).append("\"");
startMessage.append(",");
startMessage.append("\"date\":\"").append(this.df.format(new Date())).append("\"");
startMessage.append("}");
log.info(startMessage.toString());
}
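
/** Log a single-line JSON END record with the outcome for one artifact. */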
private void logEnd(String observationID, Artifact artifact, boolean success, boolean added, String message) {
final String caomContentLengthStr = safeToString(this.caomContentLength);
final String storageContentLengthStr = safeToString(this.storageContentLength);
StringBuilder endMessage = new StringBuilder();
endMessage.append("END: {");
endMessage.append("\"observationID\":\"").append(observationID).append("\"");
endMessage.append(",");
endMessage.append("\"artifact\":\"").append(artifact.getURI()).append("\"");
endMessage.append(",");
endMessage.append("\"success\":\"").append(success).append("\"");
endMessage.append(",");
if (message != null && message.contains("update tryAfter date")) {
endMessage.append("\"updated\":\"").append(added).append("\"");
} else {
endMessage.append("\"added\":\"").append(added).append("\"");
}
endMessage.append(",");
endMessage.append("\"reason\":\"").append(this.reason).append("\"");
endMessage.append(",");
endMessage.append("\"caomChecksum\":\"").append(this.caomChecksum).append("\"");
endMessage.append(",");
endMessage.append("\"caomContentLength\":\"").append(caomContentLengthStr).append("\"");
endMessage.append(",");
endMessage.append("\"storageChecksum\":\"").append(this.storageChecksum).append("\"");
endMessage.append(",");
endMessage.append("\"storageContentLength\":\"").append(storageContentLengthStr).append("\"");
endMessage.append(",");
endMessage.append("\"collection\":\"").append(this.collection).append("\"");
if (message != null) {
endMessage.append(",");
endMessage.append("\"message\":\"").append(message).append("\"");
}
endMessage.append(",");
endMessage.append("\"date\":\"").append(df.format(new Date())).append("\"");
endMessage.append("}");
log.info(endMessage.toString());
}
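
/** Log the batch summary (counts and elapsed time) as a single-line JSON record. */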
private void logBatchEnd() {
logBatchEnd("ENDBATCH");
}
private void logBatchEnd(String endString) {
StringBuilder batchMessage = new StringBuilder();
batchMessage.append(endString + ": {");
batchMessage.append("\"total\":\"").append(this.processedCount).append("\"");
batchMessage.append(",");
batchMessage.append("\"added\":\"").append(this.downloadCount).append("\"");
batchMessage.append(",");
batchMessage.append("\"updated\":\"").append(this.updateCount).append("\"");
batchMessage.append(",");
batchMessage.append("\"time\":\"").append(System.currentTimeMillis() - this.start.getTime()).append("\"");
batchMessage.append(",");
batchMessage.append("\"date\":\"").append(this.df.format(this.start)).append("\"");
batchMessage.append("}");
log.info(batchMessage.toString());
}
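
/** Called on shutdown: emit the batch summary under the ENDDISCOVER tag. */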
@Override
public void shutdown() {
logBatchEnd("ENDDISCOVER");
}
}