// de.dnb.oai.harvester.Harvester Maven / Gradle / Ivy
/**********************************************************************
* Class Harvester
*
* Copyright (c) 2005-2012, German National Library / Deutsche Nationalbibliothek
* Adickesallee 1, D-60322 Frankfurt am Main, Federal Republic of Germany
*
* This program is free software.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Kadir Karaca Kocer -- German National Library
*
**********************************************************************/
package de.dnb.oai.harvester;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.TimeZone;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlOptions;
import org.openarchives.oai.x20.*;
import org.springframework.context.ApplicationContext;
import de.dnb.oai.harvester.task.OaiTask;
import de.dnb.oai.harvester.task.OaiTaskDao;
import de.dnb.oai.harvester.task.Request;
import de.dnb.stm.handler.export.ExportHandler;
import de.dnb.stm.handler.export.ExportHandlerDao;
import de.dnb.stm.handler.export.ProcessorExportException;
import de.dnb.stm.manager.ManagerConstants;
import de.dnb.stm.processor.AbstractProcessor;
import de.dnb.stm.processor.ProcessorException;
import de.dnb.stm.task.TaskConstants;
import de.dnb.stm.task.TaskStopRequest;
import de.dnb.stm.task.TaskStopRequestDao;
import de.dnb.oai.harvester.util.EasySSLProtocolSocketFactory;
/** *******************************************************************
* A Harvester is a Runnable executing a defined OaiTask object.
*
* @author Kadir Karaca Kocer, German National Library
* @version 20081210
* @see de.dnb.oai.harvester.task.OaiTask
* @see de.dnb.oai.harvester.task.Request
* @see <a href="http://www.openarchives.org/pmh/">OAI-PMH Specification</a>
* @see <a href="http://www.w3.org/TR/html40/appendix/notes.html">W3C HTML 4.0 appendix notes</a>
* @since 10.09.2005
**********************************************************************/
/* ********************************************************************
* CHANGELOG:
* 2012.09.26 Refactored ApplicationContextUtil
* 2009.02.20 Corrected the UTC Timezone problem
* 2008.12.05 Bugfixes, Kocer
* 2008.11.20 modified to use OAIHarvester Class, Kocer
* Refactored 15.07.2008 by Kadir Karaca Kocer, German National Library
* Created on 10.09.2005 by Alexander Slotta, German National Library
**********************************************************************/
public class Harvester extends AbstractProcessor{
private static final Log LOGGER = LogFactory.getLog(Harvester.class);
// the connections to OAI-Server
private final static MultiThreadedHttpConnectionManager CONNECTION_MANAGER = new MultiThreadedHttpConnectionManager();
private final static HttpClient HTTP_CLIENT = new HttpClient(CONNECTION_MANAGER);
private String resumptionToken;
// We want to know how many records there are and how many of them were saved successfully.
// This is a bit tricky because the completeListSize attribute in an OAI-PMH answer is NOT mandatory!
private int completeListSize;
// so we count the records if the server does not use this tag.
// at the end we compare those two values and take the higher one
private int totalRecords; //
private int savedRecords; // how many records did we already save?
private String msgPrefix; // prefix string for logger
private boolean hasResumptionToken; // we have to know if there is a resumption token
private long taskId;
private OaiTask task;
private OaiTaskDao taskDao;
private TaskStopRequestDao taskStopRequestDao;
private ExportHandlerDao exportHandlerDao;
/**
 * Creates a new Harvester.
 *
 * The constructor only reports the creation on debug level; the DAOs and the
 * HTTP settings are set up separately in {@link #init(ApplicationContext)}.
 *
 * @see java.lang.Runnable
 * @see org.apache.commons.httpclient.MultiThreadedHttpConnectionManager
 * @author Kadir Karaca Kocer, German National Library
 */
public Harvester() {
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("[Harvester] A new Harvester is born!");
    }
}
/**
 * Looks up the DAOs this harvester works with and configures the shared
 * HTTP connection manager.
 *
 * Also registers a socket factory for "https" that accepts self-signed
 * certificates; the registration goes through the static
 * {@code Protocol.registerProtocol} call.
 *
 * @param context ApplicationContext that provides the beans "taskDao",
 *                "taskStopRequestDao" and "exportHandlerDao"
 */
@Override
public void init(ApplicationContext context) {
    // resolve the collaborating DAOs by their bean names
    this.taskDao = (OaiTaskDao) context.getBean("taskDao");
    this.taskStopRequestDao = (TaskStopRequestDao) context.getBean("taskStopRequestDao");
    this.exportHandlerDao = (ExportHandlerDao) context.getBean("exportHandlerDao");

    // ...
    // TODO: set all the parameters and define them in Constants.java
    HttpConnectionManagerParams managerParams = new HttpConnectionManagerParams();
    managerParams.setStaleCheckingEnabled(true);
    managerParams.setSoTimeout(0);              // no timeout
    managerParams.setConnectionTimeout(900000); // 15 minutes
    managerParams.setDefaultMaxConnectionsPerHost(5);
    managerParams.setMaxTotalConnections(50);
    CONNECTION_MANAGER.setParams(managerParams);

    // Ignore self-signed certificates in HTTPS, as we cannot register them for all sources
    ProtocolSocketFactory easySslFactory = (ProtocolSocketFactory) new EasySSLProtocolSocketFactory();
    Protocol.registerProtocol("https", new Protocol("https", easySslFactory, 443));

    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("[Harvester] Harvester initialised.");
    }
}
/**
 * Returns the task this harvester is working on.
 *
 * The field is assigned in innerLoop() from the result of the task DAO lookup,
 * so it is null before the first run or when that lookup returned nothing.
 *
 * @return the current OaiTask, possibly null
 * @author Kadir Karaca Kocer, German National Library
 */
@Override
public OaiTask getTask() {
return this.task;
}
/**
 * Returns the id of the task this harvester executes.
 *
 * @return the task id as last set via setTaskId(long); 0 if it was never set
 * @author Kadir Karaca Kocer, German National Library
 */
@Override
public long getTaskId() {
return this.taskId;
}
/**
 * Stores the id of the task this harvester has to execute.
 *
 * The id is used by innerLoop() to load the task from the task DAO.
 *
 * @param task_id The taskId to set.
 * @author Kadir Karaca Kocer, German National Library
 */
// The OaiTask id is machine generated, but at server initialisation it has to
// be set from the ids stored in the database (Hibernate).
@Override
public void setTaskId(long task_id) {
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("Setting task id to " + task_id);
    }
    this.taskId = task_id;
}
/**
 * Entry point of the harvester: executes one run of the task and logs its end.
 *
 * @see java.lang.Thread#run()
 * @author Kadir Karaca Kocer, German National Library
 */
@Override
public void run() {
    innerLoop();
    // innerLoop() returns before it builds msgPrefix when the task cannot be
    // loaded; fall back to the task id so the message never starts with "null".
    String prefix = (this.msgPrefix != null) ? this.msgPrefix : "[OaiTask " + this.taskId + "] ";
    LOGGER.info(prefix + "Harvester - End.");
}
/**
 * Performs one complete run of the task referenced by {@code taskId}.
 *
 * Steps: load the task, quit if it is already marked as running, harvest until
 * no resumption token is left or the status leaves RUNNING, map every failure
 * to a task status, then archive the request, reschedule repeating tasks,
 * delete pending stop requests and persist the task. A task paused by the user
 * is not harvested: a repeating task only gets a new start date, a one-time
 * task gets the status TIMEDOUT.
 */
private void innerLoop() {
    LOGGER.debug("InnerLoop: TaskDao hash code: " + this.taskDao.hashCode());
    // get the OaiTask
    this.task = (OaiTask) this.taskDao.getTask(Long.valueOf(this.taskId));
    if (this.task == null) {
        LOGGER.error("ERROR! Harvester.innerLoop(): task can not be NULL! TaskId: " + this.taskId);
        return;
    }
    this.msgPrefix = "[OaiTask " + this.task.getTaskId() + "] ";
    // First check if the Harvester is already running.
    // This can be the case if the previous run of the harvester could not complete
    // its harvesting activity yet or the software is running on a cluster.
    if (this.task.getStatus() == TaskConstants.TASK_STATUS_RUNNING) {
        // yes, it is running already -> log and quit.
        LOGGER.warn(this.msgPrefix + "Harvester is running already. Quiting.");
        return;
    }
    LOGGER.info(this.msgPrefix + "Harvester - Start.");
    // if the user did not pause this OaiTask, start processing
    if (this.task.getStatus() != TaskConstants.TASK_STATUS_PAUSED) {
        // mark the task as running and persist that immediately
        this.task.setStatus(TaskConstants.TASK_STATUS_RUNNING);
        this.task.getRequest().setStartedAt(Calendar.getInstance(ManagerConstants.LOCALE).getTime());
        this.taskDao.saveTask(this.task);
        // initialize the per-run state
        this.completeListSize = 0;
        this.totalRecords = 0;
        this.savedRecords = 0;
        this.resumptionToken = null;
        this.hasResumptionToken = false;
        // the real loop
        try {
            do {
                harvestRecords();
            } while (this.resumptionToken != null && this.task.getStatus() == TaskConstants.TASK_STATUS_RUNNING);
            // if you are here --> no exception was thrown while harvesting
            // the server announced more records than we counted: only reported, not corrected
            if (this.completeListSize > this.totalRecords) {
                LOGGER.error(this.msgPrefix + "ERROR: CompleteListSize > TotalRecords!");
            }
            // do not save the server response if the user does not want it
            if (!this.task.isSaveResponse()) {
                this.task.getRequest().setServerResponse(null);
                LOGGER.debug(this.msgPrefix + "Server Response erased due to user preference.");
            }
            // A run stopped by the user keeps its status; every other run that gets here counts as successful.
            if (this.task.getStatus() != TaskConstants.TASK_STATUS_STOPPEDBYUSER) {
                this.task.setStatus(TaskConstants.TASK_STATUS_SUCCESSFULL);
                this.task.getRepository().setLastKnownGood(Calendar.getInstance(ManagerConstants.LOCALE).getTime());
                this.task.getRepository().setNumberOfErrors(0);
            }
        } catch (HttpException e) {
            // error during connecting the OAI server
            serializeException(e);
            this.task.setStatus(TaskConstants.TASK_STATUS_HTTP_ERROR);
        } catch (HarvesterHttpException e) {
            // OAI server response is NOT 200! (SC_OK)
            serializeException(e);
            this.task.setStatus(TaskConstants.TASK_STATUS_RESPONSE_NOT_200);
        } catch (XmlException e) {
            serializeException(e);
            this.task.setStatus(TaskConstants.TASK_STATUS_INVALID_OAI_RESPONSE);
        } catch (ProcessorException e) {
            serializeException(e);
            this.task.setStatus(TaskConstants.TASK_STATUS_METADATA_VALIDATION_ERROR);
        } catch (IOException e) {
            serializeException(e);
            this.task.setStatus(TaskConstants.TASK_STATUS_IO_ERROR);
        } catch (ProcessorExportException e) {
            serializeException(e);
            this.task.setStatus(TaskConstants.TASK_STATUS_EXPORT_ERROR);
        }
        // unexpected exceptions must be treated as well, so these last two act as catch-all
        catch (RuntimeException e) {
            serializeException(e);
            this.task.setStatus(TaskConstants.TASK_STATUS_RUNTIME_EXCEPTION);
        } catch (Exception e) {
            serializeException(e);
            this.task.setStatus(TaskConstants.TASK_STATUS_GENERAL_EXCEPTION);
        } finally {
            if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "We are in 'finally' of Harvester.innerLoop()!");
            // write the results
            this.task.getRequest().setTotalRecords(this.totalRecords);
            LOGGER.info(this.msgPrefix + Message.msg31(this.totalRecords));
            // write the timestamp of termination
            this.task.getRequest().setFinishedAt(Calendar.getInstance(ManagerConstants.LOCALE).getTime());
            // save the next Request in a temporary variable. NULL if not a repeating task!!
            Request tmpRequest = this.taskDao.getNextRequest(this.task);
            // move this already terminated request into the list of processed requests
            this.task.getProcessedRequests().add(this.task.getRequest());
            // update the actual request if not NULL.
            if (tmpRequest != null) this.task.setRequest(tmpRequest);
            // check if this task repeats
            if (this.task.getRunInterval() > 0) {
                // yes. calculate the new startDate; this also replaces whatever status the run ended with
                this.task.setStartDate(nextStartDate());
                this.task.setStatus(TaskConstants.TASK_STATUS_WAITING);
            }
            clearStopRequests();
        }
    } else {
        // the user paused this task manually. Look if it is a one time task.
        if (this.task.getRunInterval() > 0) {
            // it repeats. do not process, just log and compute new start date
            LOGGER.info(this.msgPrefix + "OaiTask is paused by user. Skipping without processing.");
            this.task.setStartDate(nextStartDate());
        } else {
            // it was a one time task and was paused. now it is too late.
            LOGGER.warn(this.msgPrefix + "One-time task was paused by user. Cancelling.");
            this.task.setStatus(TaskConstants.TASK_STATUS_TIMEDOUT);
        }
    }
    // save the new state and finish
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Calling TaskDao.saveTask()");
    this.taskDao.saveTask(this.task);
}

/**
 * Computes the start date of the next run: the current start date plus the
 * run interval multiplied by 1000.
 */
private java.util.Date nextStartDate() {
    long startMillis = this.task.getStartDate().getTime();
    long intervalMillis = ((long) this.task.getRunInterval()) * 1000;
    return new java.util.Date(startMillis + intervalMillis);
}

/**
 * Deletes every stop request stored for this task. saveRecords() aborts as
 * soon as it finds one, so left-over requests must not survive the run.
 */
private void clearStopRequests() {
    // a single query is enough: iterating an empty result simply does nothing
    for (TaskStopRequest tsr : this.taskStopRequestDao.findTaskStopRequestsByTaskId(Long.valueOf(this.taskId))) {
        this.taskStopRequestDao.deleteTaskStopRequest(tsr);
    }
}
/**
 * Executes one OAI request (ListRecords or GetRecord) and processes its response.
 *
 * Builds the request URL, fetches and stores the server response, parses it
 * into an {@code OAIPMHDocument}, optionally validates it, records the
 * response date of the first response and then either evaluates the OAI errors
 * or hands the document to the verb-specific processing method.
 *
 * @throws HttpException as propagated from the HTTP request
 * @throws ProcessorException if validation fails, the server reports an OAI error other than
 *         noRecordsMatch, or the verb is neither GetRecord nor ListRecords
 * @throws IOException as propagated from the HTTP request or from saving records
 * @throws HarvesterHttpException if the HTTP status code is not 200
 * @throws XmlException if the response is missing, shorter than 60 characters or cannot be parsed
 * @throws ProcessorExportException as propagated from saving the records
 * @throws Exception any other failure propagated from the HTTP request
 */
private void harvestRecords() throws HttpException, ProcessorException, IOException, HarvesterHttpException, XmlException, ProcessorExportException, Exception {
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.harvestRecords(): start");
    String requestUrl = createRequestUrl();
    getAndSaveResponse(requestUrl);
    // The token was consumed by the request above; processListRecordsDocument()
    // sets it again when the response carries a new one.
    this.resumptionToken = null;
    OAIPMHDocument oaiDocument = null;
    String response = this.getTask().getRequest().getServerResponse();
    if (response == null) throw new XmlException("Server response can not be NULL!");
    if (response.length() < 60) throw new XmlException("Server response incomplete:\n" + response);
    // added 2010-10-11 to correct missing xmlns:xsi definitions and compact the XML. Karaca
    XmlOptions xmlOptions = new XmlOptions();
    xmlOptions.setSaveAggressiveNamespaces();
    //xmlOptions.setCharacterEncoding("UTF-8");
    xmlOptions.setLoadStripWhitespace();
    xmlOptions.setLoadStripComments();
    //xmlOptions.setValidateOnSet();
    try {
        /*
         * Convert the XML document into an XMLBeans object (binding). If it
         * is not correct XML, an error is thrown here (not well-formed).
         */
        if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester -> parsing server response");
        oaiDocument = OAIPMHDocument.Factory.parse(response, xmlOptions);
    } catch (Exception e) {
        // every parse failure is reported as XmlException, the cause is preserved
        throw new XmlException(Message.msg16(), e);
    }
    if (oaiDocument == null) throw new XmlException("OAI Document can not be NULL!");
    // Check if the user wishes to validate this XML document
    if (this.task.isValidateOaiResponse()) {
        // yes. validate it against the defined XML-Schema
        if (!validateResponse(oaiDocument, this.task.isValidateRecords())) {
            // NOT valid. Throw exception!
            throw new ProcessorException(Message.msg12());
        }
    }
    // Save ResponseDate in Request if not already done in first request
    if (this.getTask().getRequest().getResponseDate() == null) {
        this.getTask().getRequest().setResponseDate(oaiDocument.getOAIPMH().getResponseDate().getTime());
    }
    // The element type has to be declared, otherwise getCode()/getStringValue() are not accessible.
    List<OAIPMHerrorType> errors = oaiDocument.getOAIPMH().getErrorList();
    if ((errors != null) && (!errors.isEmpty())) {
        if (!errors.get(0).getCode().equals(OAIPMHerrorcodeType.NO_RECORDS_MATCH)) {
            StringBuilder errorMsg = new StringBuilder();
            errorMsg.append("OAI Harvester Request: ").append(requestUrl).append("\n OAI Provider Response: ");
            for (OAIPMHerrorType error : errors) {
                errorMsg.append(error.getStringValue());
            }
            throw new ProcessorException(Message.msg13(errorMsg.toString()));
        }
        // no records match. This is NOT an error. Set 0 in attribute.
        this.task.getRequest().setTotalRecords(0);
    } else {
        // No errors: process the response
        String verb = this.task.getVerb();
        if (verb.equalsIgnoreCase("GetRecord")) {
            processGetRecordDocument(oaiDocument);
        } else if (verb.equalsIgnoreCase("ListRecords")) {
            processListRecordsDocument(oaiDocument);
        } else {
            throw new ProcessorException("Unsupported OAI-Verb: '" + verb + "'");
        }
    }
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.harvestRecords(): end");
}
/**
 * Builds the request URL that is sent to the OAI server.
 *
 * For GetRecord the identifier, set and metadataPrefix of the task are
 * appended when present. For every other verb either the (URL-encoded)
 * resumption token is appended, or the from/until dates, the set and the
 * metadataPrefix. For repeating tasks the until date is set to the current
 * time, or to one day earlier when the repository has no fine granularity.
 *
 * Side effects: may update the from/until dates of the request, replaces the
 * resumptionToken field by its trimmed and encoded form and sets
 * hasResumptionToken when a token is used.
 *
 * @return Request as string.
 * @throws UnsupportedEncodingException if UTF-8 is not supported by the runtime
 * @throws ProcessorException if the task has no repository
 * @see java.net.URLEncoder
 * @see <a href="http://www.openarchives.org/pmh/">OAI-PMH</a>
 * @see <a href="http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">W3C note on non-ASCII characters in URIs</a>
 */
private String createRequestUrl() throws UnsupportedEncodingException, ProcessorException {
    Request request = this.task.getRequest();
    String oai_verb = this.task.getVerb();
    // paranoia: check if repository is ok. The database may be corrupt ...
    if (this.task.getRepository() == null) throw new ProcessorException(this.msgPrefix + "Repository can not be NULL!");
    // repository ok. get the granularity and URL of the server
    boolean granularity = this.task.getRepository().isHasFineGranularity();
    StringBuilder url = new StringBuilder();
    url.append(this.task.getRepository().getBaseUrl()).append("?verb=").append(oai_verb);
    // NOTE(review): identifier, set and metadataPrefix are appended without URL encoding — confirm
    // that the stored values never contain characters that need escaping.
    if (oai_verb.equalsIgnoreCase("GetRecord")) {
        if (this.task.getIdentifier() != null && this.task.getIdentifier().length() != 0) {
            url.append("&identifier=").append(this.task.getIdentifier());
        }
        if (this.task.getOai_Set() != null && this.task.getOai_Set().length() != 0) {
            url.append("&set=").append(this.task.getOai_Set());
        }
        if (this.task.getMetadataPrefix() != null && this.task.getMetadataPrefix().length() != 0) {
            url.append("&metadataPrefix=").append(this.task.getMetadataPrefix());
        }
    } else {
        if (this.resumptionToken != null && this.resumptionToken.length() != 0) {
            // some servers send resumption tokens containing characters that are not allowed in a URL!
            // encode them using java.net.URLEncoder. Encoding UTF-8 is the W3C recommendation.
            this.resumptionToken = this.resumptionToken.trim();
            this.resumptionToken = java.net.URLEncoder.encode(this.resumptionToken, "UTF-8");
            url.append("&resumptionToken=").append(this.resumptionToken);
            // there is a resumption token. Set the flag.
            this.hasResumptionToken = true;
        } else {
            // for repeatable tasks untilDate will always be the actual timestamp
            if (this.task.getRunInterval() > 0) {
                if (granularity) {
                    request.setUntilDate(Calendar.getInstance(ManagerConstants.LOCALE).getTime());
                } else {
                    // minus 1 day!
                    Calendar calendar = Calendar.getInstance(ManagerConstants.LOCALE);
                    calendar.add(Calendar.DAY_OF_MONTH, -1);
                    request.setUntilDate(calendar.getTime());
                }
            }
            if (request.getFromDate() != null) {
                // The until date is optional for one-time tasks (see the null check below),
                // so compare only when it exists; Date.after(null) would throw.
                if (request.getUntilDate() != null && request.getFromDate().after(request.getUntilDate())) {
                    // correct the dates. should not occur!
                    LOGGER.warn("FROM Date is greater than UNTIL Date! Please check the parameters!");
                    request.setFromDate(request.getUntilDate());
                }
                url.append("&from=").append(convertToOaiDate(request.getFromDate(), granularity));
            }
            // for the others the timestamp given by the user
            if (request.getUntilDate() != null) {
                url.append("&until=").append(convertToOaiDate(request.getUntilDate(), granularity));
            }
            // add the OAI-Set if any
            if (this.task.getOai_Set() != null && this.task.getOai_Set().length() != 0) {
                url.append("&set=").append(this.task.getOai_Set());
            }
            if (this.task.getMetadataPrefix() != null && this.task.getMetadataPrefix().length() != 0) {
                url.append("&metadataPrefix=").append(this.task.getMetadataPrefix());
            } else {
                // ERROR! MetadataPrefix is mandatory! Only logged; the request is still built.
                LOGGER.error(this.msgPrefix + "ERROR: MetadataPrefix is Mandatory");
            }
        }
    }
    return url.toString();
}
/**
 * Sends the request to the OAI-PMH server and saves the response in the
 * Request object of this task.
 *
 * From Apache Documentation:
 * The HTTP GET method is defined in section 9.3 of RFC2616: The GET method
 * means retrieve whatever information (in the form of an entity) is
 * identified by the Request-URI. ... GetMethods will follow redirect
 * requests from the http server by default. This behaviour can be disabled
 * by calling setFollowRedirects(false).
 *
 * The request URL is remembered in the Request object only while no
 * resumption token has been used. The response body is read inside the
 * try block and the connection is always released in the finally block.
 *
 * @param requestUrl Complete request as a string
 * @throws HttpException as thrown by the HTTP client
 * @throws IOException on transport errors, including socket timeouts
 * @throws HarvesterHttpException if the HTTP status code is not 200
 * @throws Exception any other failure of the HTTP client, logged and rethrown
 * @see DefaultHttpMethodRetryHandler
 * @see HttpClientParams
 * @see GetMethod
 * @author Kadir Karaca Kocer, German National Library
 */
private void getAndSaveResponse(String requestUrl) throws IOException, HttpException, HarvesterHttpException, RuntimeException, Exception {
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.getAndSaveResponse(): start");
    DefaultHttpMethodRetryHandler retryHandler =
        new DefaultHttpMethodRetryHandler(this.task.getRepository().getMaxNumberOfRetries(), true);
    // save the original sent request as string for debug purposes
    if (!this.hasResumptionToken) {
        // but only if it is the first time without resumption token
        this.task.getRequest().setRequestAsString(requestUrl);
    }
    // HTTP Method & its parameters
    GetMethod getMethod = new GetMethod(requestUrl);
    getMethod.getParams().setUriCharset("UTF-8");
    getMethod.getParams().setSoTimeout(this.task.getRepository().getSocketTimeout());
    // HTTP Client & its parameters
    HttpClientParams params = new HttpClientParams();
    if (this.task.getRepository().isStrictHTTP()) {
        params.makeStrict();
    } else {
        params.makeLenient();
    }
    params.setSoTimeout(this.task.getRepository().getSocketTimeout());
    // The handler must be registered under the key HttpClient actually reads;
    // the plain literal "RETRY_HANDLER" is not that key.
    params.setParameter(org.apache.commons.httpclient.params.HttpMethodParams.RETRY_HANDLER, retryHandler);
    int statusCode = 0;
    String response = "";
    try {
        // NOTE(review): HTTP_CLIENT is a static instance, so these parameters are
        // applied to it as a whole — confirm this is safe with concurrent harvesters.
        HTTP_CLIENT.setParams(params);
        if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester -> sending request:\n" + requestUrl);
        statusCode = HTTP_CLIENT.executeMethod(getMethod);
        if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "HTTP Status Code is: " + statusCode);
        // Read the body here, not in finally: a failure while reading must not
        // replace the original exception or prevent the connection release.
        response = getMethod.getResponseBodyAsString();
        if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Response received. Closing the connection.");
    }
    catch (java.net.SocketTimeoutException e) {
        LOGGER.error(this.msgPrefix + "Harvester.getAndSaveResponse(): java.net.SocketTimeoutException Exception!: " + e, e);
        throw e;
    }
    catch (IOException e) {
        LOGGER.error(this.msgPrefix + "Harvester.getAndSaveResponse(): IOException!: " + e, e);
        throw e;
    }
    catch (RuntimeException e) {
        LOGGER.error(this.msgPrefix + "Harvester.getAndSaveResponse(): Runtime Exception!: " + e, e);
        throw e;
    }
    catch (Exception e) {
        LOGGER.error(this.msgPrefix + "Harvester.getAndSaveResponse(): Exception!: " + e, e);
        throw e;
    }
    finally {
        // always hand the connection back to the connection manager
        getMethod.releaseConnection();
    }
    if (statusCode != HttpStatus.SC_OK) {
        if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "HarvesterHTTPException: HTTP status code is not 200!");
        throw new HarvesterHttpException(Message.msg11(this.task.getRepository().getBaseUrl()));
    }
    // save server response for each request for further processing
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "HTTP Status Code is OK. Saving response.");
    this.task.getRequest().setServerResponse(response);
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.getAndSaveResponse(): end");
}
/**
 * Gets the records from a ListRecords document and saves them.
 *
 * If the response carries a resumption token element, its non-empty text is
 * stored for the next request and completeListSize is taken from it (0 when
 * the attribute is absent). The number of records in the document is added
 * to totalRecords before the records are saved.
 *
 * @param oaiDocument Document to process.
 * @throws ProcessorException if the ListRecords element, a declared resumption token or the
 *         record list is null, or as propagated from saving
 * @throws ProcessorExportException as propagated from saving the records
 * @throws IOException as propagated from saving the records
 * @since 20.06.2008
 */
private void processListRecordsDocument(OAIPMHDocument oaiDocument) throws IOException, ProcessorExportException, ProcessorException {
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.processListRecordsDocument(): start");
    ListRecordsType listRecords = oaiDocument.getOAIPMH().getListRecords();
    if (listRecords == null) throw new ProcessorException("oaiDocument.getOAIPMH().getListRecords() returned NULL.");
    // check if a resumption token exists
    if (listRecords.isSetResumptionToken()) {
        // yes. check if it is valid
        if (listRecords.getResumptionToken() == null) throw new ProcessorException("getResumptionToken() returned NULL.");
        // read & save it; test for null BEFORE trimming so a missing text cannot cause a NullPointerException
        String tokenText = listRecords.getResumptionToken().getStringValue();
        String tmpToken = (tokenText != null) ? tokenText.trim() : null;
        if ((tmpToken != null) && (tmpToken.length() > 0)) {
            this.resumptionToken = tmpToken;
        } else {
            LOGGER.info(this.msgPrefix + "INFO: Resumption Token is NULL or empty!");
        }
        // try to determine the total number of records.
        // first check: the attribute is NOT mandatory!
        if (listRecords.getResumptionToken().getCompleteListSize() != null) {
            this.completeListSize = listRecords.getResumptionToken().getCompleteListSize().intValue();
            if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "CompleteListSize = " + this.completeListSize);
        } else {
            this.completeListSize = 0;
            if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "CompleteListSize is not defined, resetting it to 0");
        }
    }
    List<RecordType> records = listRecords.getRecordList();
    if (records == null) throw new ProcessorException("listRecords.getRecordArray() returned NULL.");
    this.totalRecords = this.totalRecords + records.size();
    // save the records
    saveRecords(records);
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.processListRecordsDocument(): end");
}
/**
 * Gets the record from a GetRecord document and saves it.
 *
 * Returns silently when the document contains no record or a nil record.
 * Otherwise the record counter is increased by one, written to the request
 * and the record is passed to saveRecords().
 *
 * @param oaiDocument Document to process.
 * @throws IOException as propagated from saving the record
 * @throws ProcessorExportException as propagated from saving the record
 * @throws ProcessorException if the document has no GetRecord element, or as propagated from saving
 * @since 20.06.2008
 */
private void processGetRecordDocument(OAIPMHDocument oaiDocument) throws IOException, ProcessorExportException, ProcessorException {
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.processGetRecordsDocument(): start");
    // same contract as processListRecordsDocument(): a missing verb element is a processing error
    if (oaiDocument.getOAIPMH().getGetRecord() == null) {
        throw new ProcessorException("oaiDocument.getOAIPMH().getGetRecord() returned NULL.");
    }
    // get the record
    RecordType rt = oaiDocument.getOAIPMH().getGetRecord().getRecord();
    // return if no record found
    if (rt == null) return;
    if (rt.isNil()) return;
    // count the record; "x = x++" would leave the counter unchanged
    this.totalRecords++;
    this.task.getRequest().setTotalRecords(this.totalRecords);
    List<RecordType> records = new ArrayList<RecordType>();
    records.add(rt);
    saveRecords(records);
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.processGetRecordsDocument(): end");
}
/**
 * Saves the harvested records through the export handler of the task.
 *
 * Every record for which the handler reports success increases savedRecords.
 * After each record the stop requests of the task are checked; if one exists,
 * the task status is set to STOPPEDBYUSER and the remaining records are
 * skipped. The number of saved records is written to the request.
 *
 * @param records the records to save
 * @throws IOException declared for callers; presumably raised by the export handler — confirm
 * @throws ProcessorExportException declared for callers; presumably raised by the export handler — confirm
 * @throws ProcessorException if no export handler is found for the task
 */
private void saveRecords(List<RecordType> records) throws IOException, ProcessorExportException, ProcessorException {
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.saveRecords(): start");
    Request request = this.task.getRequest();
    // get the data receiver
    //TODO: shall we get it each time? What about getting only once?
    ExportHandler out = this.exportHandlerDao.getExportHandlerById(this.task.getExportHandlerId());
    if (out == null) {
        LOGGER.error("ERROR: Harvester.saveRecords() -- ExportHandler is NULL!");
        // problems with data receiver -> end.
        throw new ProcessorException(this.msgPrefix + Message.msg15());
    }
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "saveRecords: ExportHandler initialised");
    for (RecordType recordType : records) {
        // the current time in milliseconds is passed along as a string
        if (out.save(this, recordType, Long.toString(System.currentTimeMillis()))) {
            this.savedRecords++;
        }
        // Check if a stop request was created by the user while running; if so, stop processing!
        if (!this.taskStopRequestDao.findTaskStopRequestsByTaskId(Long.valueOf(this.taskId)).isEmpty()) {
            LOGGER.info(this.msgPrefix + "saveRecords: Processing was cancelled by demand of User!");
            this.task.setStatus(TaskConstants.TASK_STATUS_STOPPEDBYUSER);
            break;
        }
    }
    request.setRecordsHarvested(this.savedRecords);
    LOGGER.info(this.msgPrefix + "saveRecords: Records saved.");
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.saveRecords(): end " + this.savedRecords + " records saved.");
}
/**
 * Placeholder for the validation of the returned XML.
 *
 * Currently NO validation is performed: the method returns true for every
 * input and ignores both parameters. The intended implementation is kept
 * below as commented-out code until the open TODO is resolved.
 *
 * @param document the parsed OAI-PMH response (currently unused)
 * @param isValidateRecords whether the contained records are to be validated too (currently unused)
 * @return always TRUE at the moment
 * @see org.openarchives.oai.x20.OAIPMHDocument
 * @see org.apache.xmlbeans.XmlOptions
 */
@SuppressWarnings("static-method")
private boolean validateResponse(OAIPMHDocument document, boolean isValidateRecords) {
return true;
//TODO: find a way to validate with "strict"!
// Disabled draft of the real validation, kept for reference:
// if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.validateResponse(): start");
// boolean retVal;
// ArrayList validationErrors = new ArrayList();
// XmlOptions validationOptions = new XmlOptions();
//
// validationOptions.setErrorListener(validationErrors);
//
// if (!isValidateRecords) {
// validationOptions.setValidateTreatLaxAsSkip();
// }
//
// retVal = document.validate(validationOptions);
// if (!retVal) {
// String message = "XML validation error:\n";
// for (org.apache.xmlbeans.XmlValidationError error: validationErrors) {
// message = message + "\t" + error.toString();
// }
// LOGGER.warn(message);
// }
// if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.validateResponse(): end. Result = " + retVal);
//
// return retVal;
}
/**
 * Converts a java.util.Date into a date string for OAI-PMH requests.
 * Please note that all timestamps in OAI-PMH must be expressed in UTC!
 *
 * The formatter is created with {@code Locale.ROOT} so that the result does
 * not depend on the default locale of the JVM (calendar system, digits).
 *
 * @param date Date to convert
 * @param isComplexFormat TRUE to format with ManagerConstants.COMPLEX_DATE_PATTERN,
 *        FALSE to format with ManagerConstants.SIMPLE_DATE_PATTERN
 * @return String representation in UTC of the given date
 * @see java.util.TimeZone
 */
private static String convertToOaiDate(java.util.Date date, boolean isComplexFormat) {
    String pattern = isComplexFormat
        ? ManagerConstants.COMPLEX_DATE_PATTERN
        : ManagerConstants.SIMPLE_DATE_PATTERN;
    // a new formatter per call: SimpleDateFormat instances are not thread-safe
    java.text.SimpleDateFormat formatter = new java.text.SimpleDateFormat(pattern, java.util.Locale.ROOT);
    formatter.setTimeZone(TimeZone.getTimeZone("UTC"));
    return formatter.format(date);
}
/**
 * Creates a string representation of the given exception, stores it in the
 * current request, increases the error counter of the repository and logs
 * the text on error level.
 *
 * The text consists of the exception message (or a placeholder when the
 * message is null or shorter than two characters), the stack trace, the
 * current resumption token, the number of saved records and a UTC timestamp.
 *
 * @param e The Exception to serialise.
 * @author Kadir Karaca Kocer, German National Library
 */
private void serializeException(Exception e) {
    if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "An Exception occured! Serializing ... ");
    String message = e.getMessage();
    StringBuilder text = new StringBuilder();
    if ((message != null) && (message.length() > 1)) {
        text.append(message).append("\n");
    } else {
        text.append("No message for this exception defined.\n");
    }
    for (StackTraceElement element : e.getStackTrace()) {
        text.append("\n").append(element.toString());
    }
    text.append("\n\nActual value of the Resumption Token is:\n").append(this.resumptionToken);
    text.append("\nRecords saved by previous resumption tokens: ").append(this.savedRecords);
    text.append("\nError timestamp (UTC): ").append(convertToOaiDate(new java.util.Date(), true));
    String exceptionText = text.toString();
    this.task.getRequest().setExceptionMessage(exceptionText);
    // createRequestUrl() reports a missing repository by throwing an exception that
    // ends up here, so the repository must not be dereferenced unconditionally.
    if (this.task.getRepository() != null) {
        this.task.getRepository().increaseNumberOfErrors();
    } else {
        LOGGER.warn(this.msgPrefix + "Repository is NULL, number of errors can not be increased.");
    }
    LOGGER.error(this.msgPrefix + exceptionText);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy