All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.dnb.oai.harvester.Harvester Maven / Gradle / Ivy

/**********************************************************************
 * Class Harvester
 *  
 * Copyright (c) 2005-2012, German National Library / Deutsche Nationalbibliothek
 * Adickesallee 1, D-60322 Frankfurt am Main, Federal Republic of Germany 
 *
 * This program is free software.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Kadir Karaca Kocer -- German National Library
 * 
 **********************************************************************/

package de.dnb.oai.harvester;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.TimeZone;

import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlOptions;

import org.openarchives.oai.x20.*;
import org.springframework.context.ApplicationContext;

import de.dnb.oai.harvester.task.OaiTask;
import de.dnb.oai.harvester.task.OaiTaskDao;
import de.dnb.oai.harvester.task.Request;

import de.dnb.stm.handler.export.ExportHandler;
import de.dnb.stm.handler.export.ExportHandlerDao;
import de.dnb.stm.handler.export.ProcessorExportException;
import de.dnb.stm.manager.ManagerConstants;
import de.dnb.stm.processor.AbstractProcessor;
import de.dnb.stm.processor.ProcessorException;
import de.dnb.stm.task.TaskConstants;
import de.dnb.stm.task.TaskStopRequest;
import de.dnb.stm.task.TaskStopRequestDao;

import de.dnb.oai.harvester.util.EasySSLProtocolSocketFactory;

/** *******************************************************************
 * A Harvester is a Runnable executing a defined OaiTask object.
 * 
 * @author Kadir Karaca Kocer, German National Library
 * @version 20081210
 * @see de.dnb.oai.harvester.task.OaiTask
 * @see de.dnb.oai.harvester.task.Request
 * OAI-PMH Specification {@link "http://www.openarchives.org/pmh/"} 
 * W3C {@link "http://www.w3.org/TR/html40/appendix/notes.html"}
 * @since 10.09.2005
 **********************************************************************/

/* ********************************************************************
 * CHANGELOG:
 * 2012.09.26 Refactored ApplicationContextUtil
 * 2009.02.20 Corrected the UTC Timezone problem
 * 2008.12.05 Bugfixes, Kocer
 * 2008.11.20 modified to use OAIHarvester Class, Kocer 
 * Refactored 15.07.2008 by Kadir Karaca Kocer, German National Library
 * Created on 10.09.2005 by Alexander Slotta, German National Library
 **********************************************************************/

public class Harvester extends AbstractProcessor{
    private static final Log LOGGER = LogFactory.getLog(Harvester.class);
	// the connections to OAI-Server
	private final static MultiThreadedHttpConnectionManager CONNECTION_MANAGER = new MultiThreadedHttpConnectionManager();
	private final static HttpClient HTTP_CLIENT = new HttpClient(CONNECTION_MANAGER);

	private String resumptionToken;
	// we want to know how many records are there and how many of them are successfully saved.
	// this is a bit tricky because the CompleteListSize tag in an OAI-PMH answer is NOT mandatory!
	private int completeListSize;
	// so we count the records if the server does not use this tag.
	// at the end we compare those two values and take the higher one
	private int totalRecords; //
	private int savedRecords; // how many records did we already save?
	private String msgPrefix; // prefix string for logger
	private boolean hasResumptionToken; // we have to know if there is a resumption token

	private long taskId;
	private OaiTask task;
	
	private OaiTaskDao taskDao;
	private TaskStopRequestDao taskStopRequestDao;
	private ExportHandlerDao exportHandlerDao;

	/**
	 * Constructor of class Harvester.
	 * 
	 * @see java.lang.Runnable
	 * @see org.apache.commons.httpclient.MultiThreadedHttpConnectionManager
	 * @author Kadir Karaca Kocer, German National Library
	 */
	public Harvester() {
		if (LOGGER.isDebugEnabled()) LOGGER.debug("[Harvester] A new Harvester is born!");
	}

	/**
	 * Looks up the DAOs in the given context and configures the shared HTTP
	 * connection manager.
	 * <p>
	 * The connection manager is a static member, so the parameters set here
	 * replace whatever parameters it had before. The "https" protocol is
	 * re-registered with {@link EasySSLProtocolSocketFactory}
	 * (NOTE(review): per the inline comment this is meant to accept
	 * self-signed certificates - confirm against that class).
	 *
	 * @param context ApplicationContext providing the beans "taskDao",
	 *                "taskStopRequestDao" and "exportHandlerDao"
	 */
	@Override
	public void init(ApplicationContext context) {
		// collaborators
		this.taskDao = (OaiTaskDao) context.getBean("taskDao");
		this.taskStopRequestDao = (TaskStopRequestDao) context.getBean("taskStopRequestDao");
		this.exportHandlerDao = (ExportHandlerDao) context.getBean("exportHandlerDao");

		// connection pool settings
		// TODO: set all the parameters and define them in Constants.java
		final HttpConnectionManagerParams poolSettings = new HttpConnectionManagerParams();
		poolSettings.setMaxTotalConnections(50);
		poolSettings.setConnectionTimeout(900000); // 15 Minutes
		poolSettings.setDefaultMaxConnectionsPerHost(5);
		poolSettings.setSoTimeout(0); // no timeout
		poolSettings.setStaleCheckingEnabled(true);
		CONNECTION_MANAGER.setParams(poolSettings);

		//Ignore self-signed certificates in HTTPS, as we cannot register them for all sources
		final ProtocolSocketFactory lenientSslFactory = (ProtocolSocketFactory) new EasySSLProtocolSocketFactory();
		Protocol.registerProtocol("https", new Protocol("https", lenientSslFactory, 443));

		if (LOGGER.isDebugEnabled()) {
			LOGGER.debug("[Harvester] Harvester initialised.");
		}
	}
	/**
	 * Returns the task this harvester is working on.
	 *
	 * @return the task stored in this harvester; {@code null} as long as
	 *         no task has been loaded
	 * @author Kadir Karaca Kocer, German National Library
	 */
	@Override
	public OaiTask getTask() {
		return task;
	}

	/**
	 * Returns the id of the task this harvester executes.
	 *
	 * @return the task id as set via {@link #setTaskId(long)}
	 * @author Kadir Karaca Kocer, German National Library
	 */
	@Override
	public long getTaskId() {
		return taskId;
	}

	/**
	 * Stores the id of the task this harvester has to execute.
	 * <p>
	 * The OaiTask id is machine generated, but it has to be set at server
	 * initialisation from the ids in Database/Hibernate.
	 *
	 * @param task_id The taskId to set.
	 * @author Kadir Karaca Kocer, German National Library
	 */
	@Override
	public void setTaskId(long task_id) {
		if (LOGGER.isDebugEnabled()) {
			LOGGER.debug("Setting task id to " + task_id);
		}
		taskId = task_id;
	}

	/**
	 * Entry point of the harvester: performs one complete harvesting run
	 * and logs its end.
	 *
	 * @see java.lang.Thread#run()
	 * @author Kadir Karaca Kocer, German National Library
	 */
	@Override
	public void run() {
		this.innerLoop();
		// msgPrefix is assigned inside innerLoop(), so build the message afterwards
		final String farewell = this.msgPrefix + "Harvester - End.";
		LOGGER.info(farewell);
	}

	/**
	 * Loads the task identified by {@code taskId}, harvests it and saves the outcome.
	 * <ol>
	 * <li>Fetches the task from the DAO. Returns without saving if the task does
	 * not exist or is already in status "running".</li>
	 * <li>If the task is not paused: marks it as running, calls
	 * {@code harvestRecords()} until no resumption token is left (or the status
	 * is no longer "running"), maps every exception to a task status and finally
	 * closes the current request.</li>
	 * <li>If the task is paused: a repeating task only gets a new start date,
	 * a one-time task is set to "timed out".</li>
	 * <li>Saves the task.</li>
	 * </ol>
	 */
	private void innerLoop() {
		LOGGER.debug("InnerLoop: TaskDao hash code: " + this.taskDao.hashCode());
		// get the OaiTask
		this.task = (OaiTask) this.taskDao.getTask(Long.valueOf(this.taskId));
		if (this.task == null) {
			LOGGER.error("ERROR! Harvester.innerLoop(): task can not be NULL! TaskId: " + this.taskId);
			return;
		}

		this.msgPrefix = "[OaiTask " + this.task.getTaskId() + "] ";
		// First check if the Harvester is already running.
		// This can be the case if the previous run of harvester could not complete
		// its harvesting activity yet or the software is running on a Cluster
		if (this.task.getStatus() == TaskConstants.TASK_STATUS_RUNNING) {
			// yes its running already -> Log and quit.
			LOGGER.warn(this.msgPrefix + "Harvester is running already. Quiting.");
			return;
		}

		LOGGER.info(this.msgPrefix + "Harvester - Start.");
		if (this.task.getStatus() != TaskConstants.TASK_STATUS_PAUSED) {
			// the user did not pause this OaiTask: start processing
			this.task.setStatus(TaskConstants.TASK_STATUS_RUNNING);
			this.task.getRequest().setStartedAt(Calendar.getInstance(ManagerConstants.LOCALE).getTime());
			this.taskDao.saveTask(this.task);
			// initialize the variables
			this.completeListSize = 0;
			this.totalRecords = 0;
			this.savedRecords = 0;
			this.resumptionToken = null;
			this.hasResumptionToken = false;
			// the real loop
			try {
				do {
					harvestRecords();
				} while (this.resumptionToken != null && this.task.getStatus() == TaskConstants.TASK_STATUS_RUNNING);

				// if you are here --> no exception was thrown while harvesting
				if (this.completeListSize > this.totalRecords) {
					// the server announced more records than we counted
					LOGGER.error(this.msgPrefix + "ERROR: CompleteListSize > TotalRecords!");
				}
				// do not save the server response if the user does not want it
				if (!this.task.isSaveResponse()) {
					this.task.getRequest().setServerResponse(null);
					LOGGER.debug(this.msgPrefix + "Server Response erased due to user preference.");
				}
				// Unless the user stopped the task everything went ok;
				// a "stopped by user" status must be kept as it is.
				if (this.task.getStatus() != TaskConstants.TASK_STATUS_STOPPEDBYUSER) {
					this.task.setStatus(TaskConstants.TASK_STATUS_SUCCESSFULL);
					this.task.getRepository().setLastKnownGood(Calendar.getInstance(ManagerConstants.LOCALE).getTime());
					this.task.getRepository().setNumberOfErrors(0);
				}
			} catch (HttpException e) {
				// error during connecting the OAI server
				serializeException(e);
				this.task.setStatus(TaskConstants.TASK_STATUS_HTTP_ERROR);
			} catch (HarvesterHttpException e) {
				// OAI server response is NOT 200! (SC_OK)
				serializeException(e);
				this.task.setStatus(TaskConstants.TASK_STATUS_RESPONSE_NOT_200);
			} catch (XmlException e) {
				serializeException(e);
				this.task.setStatus(TaskConstants.TASK_STATUS_INVALID_OAI_RESPONSE);
			} catch (ProcessorException e) {
				serializeException(e);
				this.task.setStatus(TaskConstants.TASK_STATUS_METADATA_VALIDATION_ERROR);
			} catch (IOException e) {
				serializeException(e);
				this.task.setStatus(TaskConstants.TASK_STATUS_IO_ERROR);
			} catch (ProcessorExportException e) {
				serializeException(e);
				this.task.setStatus(TaskConstants.TASK_STATUS_EXPORT_ERROR);
			}
			// there were unexpected exceptions, which must be treated! So these last two should work.
			catch (RuntimeException e) {
				serializeException(e);
				this.task.setStatus(TaskConstants.TASK_STATUS_RUNTIME_EXCEPTION);
			} catch (Exception e) {
				serializeException(e);
				this.task.setStatus(TaskConstants.TASK_STATUS_GENERAL_EXCEPTION);
			} finally {
				if (LOGGER.isDebugEnabled()) {
					LOGGER.debug(this.msgPrefix + "We are in 'finally' of Harvester.innerLoop()!");
				}
				closeCurrentRequest();
			}
		} else if (this.task.getRunInterval() > 0) {
			// the user paused this repeating task manually:
			// do not process, just log and compute new start date
			LOGGER.info(this.msgPrefix + "OaiTask is paused by user. Skipping without processing.");
			postponeStartDate();
		} else {
			// it was a one time task and was paused. now its too late.
			LOGGER.warn(this.msgPrefix + "One-time task was paused by user. Cancelling.");
			this.task.setStatus(TaskConstants.TASK_STATUS_TIMEDOUT);
		}
		// save the new state and finish
		if (LOGGER.isDebugEnabled()) {
			LOGGER.debug(this.msgPrefix + "Calling TaskDao.saveTask()");
		}
		this.taskDao.saveTask(this.task);
	}

	/**
	 * Writes the results into the current request, moves it to the processed
	 * requests, re-schedules a repeating task and removes pending stop requests.
	 */
	private void closeCurrentRequest() {
		// write the results
		this.task.getRequest().setTotalRecords(this.totalRecords);
		LOGGER.info(this.msgPrefix + Message.msg31(this.totalRecords));
		// write the timestamp of termination
		this.task.getRequest().setFinishedAt(Calendar.getInstance(ManagerConstants.LOCALE).getTime());
		// fetch the next Request before the current one is archived. NULL if not a repeating task!!
		Request nextRequest = this.taskDao.getNextRequest(this.task);
		// move this already terminated request into the list of processed requests
		this.task.getProcessedRequests().add(this.task.getRequest());
		// update the actual request if not NULL.
		if (nextRequest != null) {
			this.task.setRequest(nextRequest);
		}
		// a repeating task gets a new start date and waits for its next run
		if (this.task.getRunInterval() > 0) {
			postponeStartDate();
			this.task.setStatus(TaskConstants.TASK_STATUS_WAITING);
		}
		clearStopRequests();
	}

	/** Moves the start date of the task forward by one run interval (interval is multiplied by 1000). */
	private void postponeStartDate() {
		long currentStart = this.task.getStartDate().getTime();
		long interval = ((long) this.task.getRunInterval()) * 1000;
		this.task.setStartDate(new java.util.Date(currentStart + interval));
	}

	/** Deletes all stop requests stored for this task; the DAO is queried only once. */
	private void clearStopRequests() {
		for (TaskStopRequest stopRequest : this.taskStopRequestDao.findTaskStopRequestsByTaskId(Long.valueOf(this.taskId))) {
			this.taskStopRequestDao.deleteTaskStopRequest(stopRequest);
		}
	}

	/**
	 * Executes one OAI request (ListRecords or GetRecord) and processes its response.
	 * <p>
	 * Builds the request URL, fetches the response, parses it into an
	 * {@link OAIPMHDocument}, optionally validates it, stores the response
	 * date of the first response and then either evaluates the OAI errors
	 * or hands the document over to the verb specific processing method.
	 * The field {@code resumptionToken} is reset to {@code null} before
	 * processing the response.
	 *
	 * @throws HttpException propagated from sending the request
	 * @throws ProcessorException if validation fails, the server reports an OAI error other
	 *         than "noRecordsMatch", or the verb is not supported
	 * @throws IOException propagated from sending the request or saving records
	 * @throws HarvesterHttpException propagated from sending the request
	 * @throws XmlException if the response is missing, shorter than 60 characters or can not be parsed
	 * @throws ProcessorExportException propagated from saving records
	 * @throws Exception any other failure
	 */
	private void harvestRecords() throws HttpException, ProcessorException, IOException, HarvesterHttpException, XmlException, ProcessorExportException, Exception {
		if (LOGGER.isDebugEnabled()) {
			LOGGER.debug(this.msgPrefix + "Harvester.harvestRecords(): start");
		}

		String requestUrl = createRequestUrl();
		getAndSaveResponse(requestUrl);

		this.resumptionToken = null;
		OAIPMHDocument oaiDocument = null;
		String response = this.getTask().getRequest().getServerResponse();

		if (response == null) {
			throw new XmlException("Server response can not be NULL!");
		}
		if (response.length() < 60) {
			throw new XmlException("Server response incomplete:\n" + response);
		}

		//added 2010-10-11 to correct missing xmlns:xsi definitions and compact the XML. Karaca
		XmlOptions xmlOptions = new XmlOptions();
		xmlOptions.setSaveAggressiveNamespaces();
		//xmlOptions.setCharacterEncoding("UTF-8");
		xmlOptions.setLoadStripWhitespace();
		xmlOptions.setLoadStripComments();
		//xmlOptions.setValidateOnSet();
		try {
			/*
			 * Convert the XML document into an XMLBeans object (binding).
			 * If it is not correct XML an error is thrown here (not well-formed).
			 */
			if (LOGGER.isDebugEnabled()) {
				LOGGER.debug(this.msgPrefix + "Harvester -> parsing server response");
			}
			oaiDocument = OAIPMHDocument.Factory.parse(response, xmlOptions);
		} catch (Exception e) {
			throw new XmlException(Message.msg16(), e);
		}

		if (oaiDocument == null) {
			throw new XmlException("OAI Document can not be NULL!");
		}

		// Validate against the XML-Schema only if the user wishes so
		if (this.task.isValidateOaiResponse() && !validateResponse(oaiDocument, this.task.isValidateRecords())) {
			// NOT valid. Throw exception!
			throw new ProcessorException(Message.msg12());
		}

		//Save ResponseDate in Request if not already done in first request
		if (this.getTask().getRequest().getResponseDate() == null) {
			this.getTask().getRequest().setResponseDate(oaiDocument.getOAIPMH().getResponseDate().getTime());
		}

		// getErrorArray() is deprecated, so the list variant is used
		List<OAIPMHerrorType> errors = oaiDocument.getOAIPMH().getErrorList();
		if ((errors != null) && (!errors.isEmpty())) {
			// constant first: an error element without code must not cause a NullPointerException
			if (!OAIPMHerrorcodeType.NO_RECORDS_MATCH.equals(errors.get(0).getCode())) {
				StringBuilder errorMsg = new StringBuilder("OAI Harvester Request: ");
				errorMsg.append(requestUrl).append("\n OAI Provider Response: ");
				for (OAIPMHerrorType error : errors) {
					errorMsg.append(error.getStringValue());
				}
				throw new ProcessorException(Message.msg13(errorMsg.toString()));
			}
			// no records match. This is NOT an error. Set 0 in attribute.
			this.task.getRequest().setTotalRecords(0);
		} else {
			// No errors: process the response
			String verb = this.task.getVerb();
			if (verb.equalsIgnoreCase("GetRecord")) {
				processGetRecordDocument(oaiDocument);
			} else if (verb.equalsIgnoreCase("ListRecords")) {
				processListRecordsDocument(oaiDocument);
			} else {
				throw new ProcessorException("Unsupported OAI-Verb: '" + verb + "'");
			}
		}
		if (LOGGER.isDebugEnabled()) {
			LOGGER.debug(this.msgPrefix + "Harvester.harvestRecords(): end");
		}
	}


	/**
	 * Generates the request URL to send to the OAI server.
	 * <p>
	 * For "GetRecord" the parameters identifier, set and metadataPrefix are
	 * appended when they are set. For every other verb either the resumption
	 * token (trimmed and URL-encoded as UTF-8, also written back to the field)
	 * or the parameters from, until, set and metadataPrefix are appended.
	 * For repeating tasks the until date of the request is overwritten with
	 * the current time (minus one day when the repository has no fine
	 * granularity). A from date later than the until date is replaced by the
	 * until date; this correction is skipped when no until date is set.
	 *
	 * @return Request as string.
	 * @throws UnsupportedEncodingException if the resumption token can not be encoded
	 * @throws ProcessorException if the task has no repository
	 * @see java.net.URLEncoder
	 * {@link "http://www.openarchives.org/pmh/"}
	 * {@link "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars"}
	 */
	private String createRequestUrl() throws UnsupportedEncodingException, ProcessorException {
		Request request = this.task.getRequest();
		String oaiVerb = this.task.getVerb();

		//paranoia: check if repository is ok. Maybe the database is corrupt ...
		if (this.task.getRepository() == null) {
			throw new ProcessorException(this.msgPrefix + "Repository can not be NULL!");
		}
		//repository ok. get the granularity and URL of the server
		boolean fineGranularity = this.task.getRepository().isHasFineGranularity();
		StringBuilder url = new StringBuilder();
		url.append(this.task.getRepository().getBaseUrl()).append("?verb=").append(oaiVerb);

		if (oaiVerb.equalsIgnoreCase("GetRecord")) {
			appendParameter(url, "identifier", this.task.getIdentifier());
			appendParameter(url, "set", this.task.getOai_Set());
			appendParameter(url, "metadataPrefix", this.task.getMetadataPrefix());
			return url.toString();
		}

		if (this.resumptionToken != null && this.resumptionToken.length() != 0) {
			// some servers send Resumption Tokens containing characters that are not allowed according to RFC!
			// encode them using java.net.URLEncoder. Encoding UTF-8 is W3C recommendation.
			this.resumptionToken = this.resumptionToken.trim();
			this.resumptionToken = java.net.URLEncoder.encode(this.resumptionToken, "UTF-8");
			url.append("&resumptionToken=").append(this.resumptionToken);
			// there is a resumption token. Set the Flag.
			this.hasResumptionToken = true;
			return url.toString();
		}

		// for repeatable tasks untilDate will always be the actual timestamp
		if (this.task.getRunInterval() > 0) {
			Calendar until = Calendar.getInstance(ManagerConstants.LOCALE);
			if (!fineGranularity) {
				//minus 1 day!
				until.add(Calendar.DAY_OF_MONTH, -1);
			}
			request.setUntilDate(until.getTime());
		}

		if (request.getFromDate() != null) {
			// a one-time task may have no until date: Date.after(null) would throw
			if (request.getUntilDate() != null && request.getFromDate().after(request.getUntilDate())) {
				//correct the dates. should not occur!
				LOGGER.warn("FROM Date is greater than UNTIL Date! Please check the parameters!");
				request.setFromDate(request.getUntilDate());
			}
			url.append("&from=").append(convertToOaiDate(request.getFromDate(), fineGranularity));
		}
		// for the others the timestamp given by the user
		if (request.getUntilDate() != null) {
			url.append("&until=").append(convertToOaiDate(request.getUntilDate(), fineGranularity));
		}
		// add the OAI-Set if any
		appendParameter(url, "set", this.task.getOai_Set());

		if (!appendParameter(url, "metadataPrefix", this.task.getMetadataPrefix())) {
			// ERROR! MetadataPrefix is mandatory!
			LOGGER.error(this.msgPrefix + "ERROR: MetadataPrefix is Mandatory");
		}

		return url.toString();
	}

	/**
	 * Appends {@code &name=value} to the URL when the value is neither null nor empty.
	 *
	 * @return true if the parameter was appended
	 */
	private static boolean appendParameter(StringBuilder url, String name, String value) {
		if (value == null || value.length() == 0) {
			return false;
		}
		url.append('&').append(name).append('=').append(value);
		return true;
	}


	/**
	 * Sends the request to the OAI-PMH server and saves the response in the
	 * Request object of this task.
* * From Apache Documentation:
* The HTTP GET method is defined in section 9.3 of RFC2616: The GET method * means retrieve whatever information (in the form of an entity) is * identified by the Request-URI. ... GetMethods will follow redirect * requests from the http server by default. This behavour can be disabled * by calling setFollowRedirects(false). * * @param requestUrl Complete request as a string * @throws HttpException * @throws IOException * @throws HarvesterHttpException * @see DefaultHttpMethodRetryHandler * @see HttpClientParams * @see GetMethod * @author Kadir Karaca Kocer, German National Library */ private void getAndSaveResponse(String requestUrl) throws IOException, HttpException, HarvesterHttpException, RuntimeException, Exception { if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.getAndSaveResponse(): start"); DefaultHttpMethodRetryHandler retryHandler = new DefaultHttpMethodRetryHandler(this.task.getRepository().getMaxNumberOfRetries(), true); // save the original sent request as string for debug purposes if (!this.hasResumptionToken) { // but only if its the first time without resumption token this.task.getRequest().setRequestAsString(requestUrl); } // HTTP Method & its parameters GetMethod getMethod = new GetMethod(requestUrl); getMethod.getParams().setUriCharset("UTF-8"); getMethod.getParams().setSoTimeout(this.task.getRepository().getSocketTimeout()); // HTTP Client & its parameters HttpClientParams params = new HttpClientParams(); if (this.task.getRepository().isStrictHTTP()) { params.makeStrict(); } else { params.makeLenient(); } params.setSoTimeout(this.task.getRepository().getSocketTimeout()); params.setParameter("RETRY_HANDLER", retryHandler); int statusCode = 0; String response = ""; try { HTTP_CLIENT.setParams(params); if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester -> sending request:\n" + requestUrl); statusCode = HTTP_CLIENT.executeMethod(getMethod); if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "HTTP 
Status Code is: " + statusCode); } catch (java.net.SocketTimeoutException e) { LOGGER.error(this.msgPrefix + "Harvester.getAndSaveResponse(): java.net.SocketTimeoutException Exception!: " + e, e); throw e; } catch (IOException e) { LOGGER.error(this.msgPrefix + "Harvester.getAndSaveResponse(): IOException!: " + e, e); throw e; } catch (RuntimeException e) { LOGGER.error(this.msgPrefix + "Harvester.getAndSaveResponse(): Runtime Exception!: " + e, e); throw e; } catch (Exception e) { LOGGER.error(this.msgPrefix + "Harvester.getAndSaveResponse(): Exception!: " + e, e); throw e; } finally { response = getMethod.getResponseBodyAsString(); if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Response received. Closing the connection."); getMethod.releaseConnection(); } if (statusCode != HttpStatus.SC_OK) { if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "HarvesterHTTPException: HTTP status code is not 200!"); throw new HarvesterHttpException(Message.msg11(this.task.getRepository().getBaseUrl())); } // save server response for each request for further processing if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "HTTP Status Code is OK. Saving response."); this.task.getRequest().setServerResponse(response); if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.getAndSaveResponse(): end"); } /** * Gets Records from ListRecords-Document and saves them. * * @param oaiDocument Document to process. 
* @throws ProcessorException * @throws ProcessorExportException * @throws IOException * @since 20.06.2008 */ private void processListRecordsDocument(OAIPMHDocument oaiDocument) throws IOException, ProcessorExportException, ProcessorException { if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.processListRecordsDocument(): start"); ListRecordsType listRecords = oaiDocument.getOAIPMH().getListRecords(); if (listRecords == null) throw new ProcessorException("oaiDocument.getOAIPMH().getListRecords() returned NULL."); // check if a resumption token exists if (listRecords.isSetResumptionToken()) { // yes. check if its valid if (listRecords.getResumptionToken() == null) throw new ProcessorException("getResumptionToken() returned NULL."); //read & save it String tmpToken = listRecords.getResumptionToken().getStringValue().trim(); if ((tmpToken != null) && (tmpToken.length() > 0)) { this.resumptionToken = tmpToken; } else { LOGGER.info(this.msgPrefix + "INFO: Resumption Token is NULL or empty!"); } // try to determine the total number of records. // first check: the XML-Tag is NOT mandatory! 
if (listRecords.getResumptionToken().getCompleteListSize() != null) { this.completeListSize = listRecords.getResumptionToken().getCompleteListSize().intValue(); if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "CompleteListSize = " + this.completeListSize); } else { this.completeListSize = 0; if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "CompleteListSize is not defined, resetting it to 0"); } } //RecordType[] records = listRecords.getRecordArray(); List records = listRecords.getRecordList(); if (records == null) throw new ProcessorException("listRecords.getRecordArray() returned NULL."); //this.totalRecords = this.totalRecords + records.length; this.totalRecords = this.totalRecords + records.size(); // save the records saveRecords(records); if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.processListRecordsDocument(): end"); } /** * Gets Records from GetRecord-Document and saves them. * * @param oaiDocument Document to process. * @throws IOException * @throws ProcessorExportException * @throws ProcessorException * @since 20.06.2008 */ private void processGetRecordDocument(OAIPMHDocument oaiDocument) throws IOException, ProcessorExportException, ProcessorException { if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.processGetRecordsDocument(): start"); //get the record RecordType rt = oaiDocument.getOAIPMH().getGetRecord().getRecord(); //return if no record found if (rt == null) return; if (rt.isNil()) return; // save the total number of records this.totalRecords = this.totalRecords++; this.task.getRequest().setTotalRecords(this.totalRecords); ArrayList records = new ArrayList();//{ oaiDocument.getOAIPMH().getGetRecord().getRecord() }; records.add(rt); saveRecords(records); if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.processGetRecordsDocument(): end"); } /** * Saves the harvested records. 
* * @param records * @throws IOException * @throws ProcessorExportException * @throws ProcessorException */ private void saveRecords(List records) throws IOException, ProcessorExportException, ProcessorException { if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.saveRecords(): start"); Request request = this.task.getRequest(); // get the data receiver //TODO: shall we get it each time? What about getting only once? ExportHandler out = this.exportHandlerDao.getExportHandlerById(this.task.getExportHandlerId()); if (out != null) { if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "saveRecords: ExportHandler initialised"); for (int i = 0; i < records.size(); i++) { RecordType recordType = records.get(i); if(out.save(this, recordType, new Long(System.currentTimeMillis()).toString())){ this.savedRecords++; } //Check if a TaskCancellationRequest was created by user while running, stop processing! if(!this.taskStopRequestDao.findTaskStopRequestsByTaskId(new Long(this.taskId)).isEmpty()){ LOGGER.info(this.msgPrefix + "saveRecords: Processing was cancelled by demand of User!"); this.task.setStatus(TaskConstants.TASK_STATUS_STOPPEDBYUSER); break; } } request.setRecordsHarvested(this.savedRecords); LOGGER.info(this.msgPrefix + "saveRecords: Records saved."); } else { LOGGER.error("ERROR: Harvester.saveRecords() -- ExportHandler is NULL!"); // problems with data receiver -> end. throw new ProcessorException(this.msgPrefix + Message.msg15()); } if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.saveRecords(): end " + this.savedRecords + " records saved."); } /** * Validiert the returned XML. * * @param document * @param isValidateRecords * @return TRUE if XML is valid. FALSE otherweise. 
* @see org.openarchives.oai.x20.OAIPMHDocument * @see org.apache.xmlbeans.XmlOptions */ @SuppressWarnings("static-method") private boolean validateResponse(OAIPMHDocument document, boolean isValidateRecords) { return true; //TODO: find a way to validate with "strict"! // if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.validateResponse(): start"); // boolean retVal; // ArrayList validationErrors = new ArrayList(); // XmlOptions validationOptions = new XmlOptions(); // // validationOptions.setErrorListener(validationErrors); // // if (!isValidateRecords) { // validationOptions.setValidateTreatLaxAsSkip(); // } // // retVal = document.validate(validationOptions); // if (!retVal) { // String message = "XML validation error:\n"; // for (org.apache.xmlbeans.XmlValidationError error: validationErrors) { // message = message + "\t" + error.toString(); // } // LOGGER.warn(message); // } // if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "Harvester.validateResponse(): end. Result = " + retVal); // // return retVal; } /** * Converts a java.util.Date into OAI-PMH valid Date-String. * Please note that all timestamps in OAI-PMH must be expressed in UTC! * * @param date Date to convert * @param isComplexFormat Boolean that indicates if the result day or second granularity has. * @return String representation in UTC of given Date-Time. 
* @see java.util.TimeZone */ private static String convertToOaiDate(java.util.Date date, boolean isComplexFormat) { String retVal; if (isComplexFormat) { java.text.SimpleDateFormat complexDate = new java.text.SimpleDateFormat(ManagerConstants.COMPLEX_DATE_PATTERN); complexDate.setTimeZone(TimeZone.getTimeZone("UTC")); retVal = complexDate.format(date); } else { java.text.SimpleDateFormat simpleDate = new java.text.SimpleDateFormat(ManagerConstants.SIMPLE_DATE_PATTERN); simpleDate.setTimeZone(TimeZone.getTimeZone("UTC")); retVal = simpleDate.format(date); } //logger.debug(this.msgPrefix + this.msgPrefix + "Harvester.convertToOaiDate(): " + retVal); return retVal; } /** * Creates a string representation of given Exception, increases the number * of errors and saves this information in corresponding objects. Those will * be saved persistent via Hibernate if the user wishes. * * @param e The Exception to serialise. * @author Kadir Karaca Kocer, German National Library */ private void serializeException(Exception e) { if (LOGGER.isDebugEnabled()) LOGGER.debug(this.msgPrefix + "An Exception occured! Serializing ... "); String message = e.getMessage(); String exceptionText = ""; if ((message != null) && (message.length() > 1)) { exceptionText = message + "\n"; } else { exceptionText = "No message for this exception defined.\n"; } StackTraceElement[] ste = e.getStackTrace(); for (int i = 0; i < ste.length; i++) { exceptionText = exceptionText + "\n" + ste[i].toString(); } exceptionText = exceptionText + "\n\nActual value of the Resumption Token is:\n" + this.resumptionToken; exceptionText = exceptionText + "\nRecords saved by previous resumption tokens: " + this.savedRecords; exceptionText = exceptionText + "\nError timestamp (UTC): " + convertToOaiDate(new java.util.Date(), true); this.task.getRequest().setExceptionMessage(exceptionText); this.task.getRepository().increaseNumberOfErrors(); LOGGER.error(this.msgPrefix + exceptionText); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy