/* $Id: WorkerThread.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.system;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.core.util.URLEncoder;
import java.util.*;
import java.io.*;
import java.lang.reflect.*;
import java.text.SimpleDateFormat;
/** This class represents a worker thread. Hundreds of these threads are instantiated in order to
* perform crawling and extraction.
*/
public class WorkerThread extends Thread
{
public static final String _rcsid = "@(#)$Id: WorkerThread.java 988245 2010-08-23 18:39:35Z kwright $";
// Local data
/** Thread id */
protected final String id;
/** This is a reference to the static main document queue */
protected final DocumentQueue documentQueue;
/** Worker thread pool reset manager */
protected final WorkerResetManager resetManager;
/** Queue tracker */
protected final QueueTracker queueTracker;
/** Process ID */
protected final String processID;
private static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss,SSS", Locale.ROOT);
/** Constructor.
*@param id is the worker thread id.
*/
public WorkerThread(String id, DocumentQueue documentQueue, WorkerResetManager resetManager, QueueTracker queueTracker, String processID)
throws ManifoldCFException
{
super();
this.id = id;
this.documentQueue = documentQueue;
this.resetManager = resetManager;
this.queueTracker = queueTracker;
this.processID = processID;
setName("Worker thread '"+id+"'");
setDaemon(true);
}
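// Illustrative sketch only (the real startup lives elsewhere in the agents framework); the shared
// documentQueue, resetManager, queueTracker, and processID objects are assumed to come from that
// framework, and numWorkerThreads is a placeholder for the configured thread count:
//
//   for (int i = 0; i < numWorkerThreads; i++)
//   {
//     WorkerThread thread = new WorkerThread(Integer.toString(i),documentQueue,resetManager,queueTracker,processID);
//     thread.start();
//   }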
public void run()
{
// Register this thread in the worker reset manager
resetManager.registerMe();
try
{
// Create a thread context object.
IThreadContext threadContext = ThreadContextFactory.make();
IIncrementalIngester ingester = IncrementalIngesterFactory.make(threadContext);
IJobManager jobManager = JobManagerFactory.make(threadContext);
IBinManager binManager = BinManagerFactory.make(threadContext);
IRepositoryConnectionManager connMgr = RepositoryConnectionManagerFactory.make(threadContext);
ITransformationConnectionManager transformationConnectionManager = TransformationConnectionManagerFactory.make(threadContext);
IOutputConnectionManager outputConnectionManager = OutputConnectionManagerFactory.make(threadContext);
IReprioritizationTracker rt = ReprioritizationTrackerFactory.make(threadContext);
IRepositoryConnectorPool repositoryConnectorPool = RepositoryConnectorPoolFactory.make(threadContext);
// This is the set of documents that we will either be marking as complete, or requeued, depending on the kind of crawl.
List<QueuedDocument> finishList = new ArrayList<QueuedDocument>();
// This is where we accumulate the document QueuedDocuments to be deleted from the job queue.
List<QueuedDocument> deleteList = new ArrayList<QueuedDocument>();
// This is where we accumulate documents that need to be placed in the HOPCOUNTREMOVED
// state
List<QueuedDocument> hopcountremoveList = new ArrayList<QueuedDocument>();
// This is where we accumulate documents that need to be rescanned
List<QueuedDocument> rescanList = new ArrayList<QueuedDocument>();
// This is where we store document ID strings of documents that need to be noted as having
// been checked.
List<String> ingesterCheckList = new ArrayList<String>();
// Service interruption thrown with "abort on fail".
ManifoldCFException abortOnFail = null;
// Loop
while (true)
{
// Do another try/catch around everything in the loop
try
{
if (Thread.currentThread().isInterrupted())
throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
// Before we begin, conditionally reset
resetManager.waitForReset(threadContext);
// Once we pull something off the queue, we MUST make sure that
// we update its status, even if there is an exception!!!
// See if there is anything on the queue for me
QueuedDocumentSet qds = documentQueue.getDocument(queueTracker);
if (qds == null)
// It's a reset, so recycle
continue;
try
{
// System.out.println("Got a document set");
if (Thread.currentThread().isInterrupted())
throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
// First of all: find out if the job for these documents has been aborted, paused, etc.
// If so, we requeue the documents immediately.
IJobDescription job = qds.getJobDescription();
Long jobID = job.getID();
if (jobManager.checkJobActive(jobID) == false)
// Recycle; let these documents be requeued and go get the next set.
continue;
if (Logging.threads.isDebugEnabled())
{
StringBuilder sb = new StringBuilder();
for (int z = 0; z < qds.getCount(); z++)
{
sb.append(qds.getDocument(z).getDocumentDescription().getID()).append(" ");
}
Logging.threads.debug("Worker thread processing documents: "+sb);
}
// Build a basic pipeline specification right off; we need it whenever
// we interact with Incremental Ingester.
IPipelineConnections pipelineConnections = new PipelineConnections(new PipelineSpecificationBasic(job),transformationConnectionManager,outputConnectionManager);
String lastIndexedOutputConnectionName = ingester.getLastIndexedOutputConnectionName(pipelineConnections);
// Universal job data we'll need later
String connectionName = job.getConnectionName();
Specification spec = job.getSpecification();
int jobType = job.getType();
IRepositoryConnection connection = qds.getConnection();
OutputActivity ingestLogger = new OutputActivity(connectionName,connMgr);
// The flow through this section of the code is as follows.
// (1) We start with a list of documents
// (2) We attempt to do various things to these documents
// (3) Based on what happens, and what errors we get, we progressively move documents out of the main list
// and into secondary lists that will be all treated in the same way
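// Once processing completes, the secondary lists are handled as follows:
// - rescanList: documents reset for another attempt (e.g. connector or implementation unavailable)
// - requeueList: documents retried later because of a service interruption
// - deleteList: documents removed from the index and deleted from the job queue
// - hopcountremoveList: documents removed from the index and marked as hopcount-removed
// - finishList: documents completed (or rescheduled, for continuous jobs)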
// First, initialize the active document set to contain everything.
List<QueuedDocument> activeDocuments = new ArrayList<QueuedDocument>(qds.getCount());
for (int i = 0; i < qds.getCount(); i++)
{
QueuedDocument qd = qds.getDocument(i);
activeDocuments.add(qd);
}
// Clear out all of our disposition lists
finishList.clear();
deleteList.clear();
ingesterCheckList.clear();
hopcountremoveList.clear();
rescanList.clear(); // jobManager.resetDocument(dd,0L,IJobManager.ACTION_RESCAN,-1L,-1);
abortOnFail = null;
// Keep track of the starting processing time, for statistics calculation
long processingStartTime = System.currentTimeMillis();
// Log these documents in the overlap calculator
qds.beginProcessing(queueTracker);
try
{
long currentTime = System.currentTimeMillis();
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Worker thread starting document count is "+Integer.toString(activeDocuments.size()));
// Get the legal link types. This is needed for later hopcount checking.
String[] legalLinkTypes = null;
if (activeDocuments.size() > 0)
{
legalLinkTypes = RepositoryConnectorFactory.getRelationshipTypes(threadContext,connection.getClassName());
// If this came back null, it means that there is no underlying implementation available, so treat this like a kind of service interruption.
if (legalLinkTypes == null)
{
// Failure here puts all remaining documents into rescan list
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Moving "+makeListString(activeDocuments)+" to rescanList");
moveList(activeDocuments,rescanList);
}
}
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Post-relationship document count is "+Integer.toString(activeDocuments.size()));
// Do the hopcount checks, if any. This will iteratively reduce the viable list of
// document identifiers in need of having their versions fetched.
if (legalLinkTypes != null && activeDocuments.size() > 0)
{
// Set up the current ID array
String[] currentDocIDHashArray = new String[activeDocuments.size()];
for (int i = 0; i < currentDocIDHashArray.length; i++)
{
currentDocIDHashArray[i] = activeDocuments.get(i).getDocumentDescription().getDocumentIdentifierHash();
}
Map<String,Long> filterMap = job.getHopCountFilters();
Iterator<String> filterIter = filterMap.keySet().iterator();
// Array to accumulate hopcount results for all link types
boolean[] overallResults = new boolean[currentDocIDHashArray.length];
for (int i = 0; i < overallResults.length; i++)
{
overallResults[i] = true;
}
// Calculate the hopcount result for each link type, and fold it in.
while (filterIter.hasNext())
{
String linkType = (String)filterIter.next();
int maxHop = (int)((Long)filterMap.get(linkType)).longValue();
boolean[] results = jobManager.findHopCounts(job.getID(),legalLinkTypes,currentDocIDHashArray,linkType,
maxHop,job.getHopcountMode());
for (int i = 0; i < results.length; i++)
{
overallResults[i] = overallResults[i] && results[i];
}
}
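// At this point overallResults[i] is true only if document i passed the hopcount limit for every link type.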
// Move all documents to the appropriate list
List<QueuedDocument> newActiveSet = new ArrayList<QueuedDocument>(activeDocuments.size());
for (int i = 0; i < overallResults.length; i++)
{
if (overallResults[i] == false)
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+activeDocuments.get(i).getDocumentDescription().getID()+" to hopcountremovelist");
hopcountremoveList.add(activeDocuments.get(i));
}
else
{
newActiveSet.add(activeDocuments.get(i));
}
}
activeDocuments = newActiveSet;
}
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Post-hopcount pruned document count is "+Integer.toString(activeDocuments.size()));
// From here on down we need a connector instance, so get one.
IRepositoryConnector connector = null;
if (activeDocuments.size() > 0 || hopcountremoveList.size() > 0)
{
connector = repositoryConnectorPool.grab(connection);
// If we wind up with a null here, it means that a document got queued for a connector which is now gone.
// Basically, what we want to do in that case is to treat this kind of like a service interruption - the document
// must be requeued for immediate reprocessing. When the rest of the world figures out that the job that owns this
// document is in fact unable to function, we'll stop getting such documents handed to us, because the state of the
// job will be changed.
if (connector == null)
{
// Failure here puts all remaining documents into rescan list
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Moving "+makeListString(activeDocuments)+" to rescanList");
moveList(activeDocuments,rescanList);
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Moving "+makeListString(hopcountremoveList)+" to rescanList");
moveList(hopcountremoveList,rescanList);
}
}
if (connector != null)
{
// Open try/finally block to free the connector instance no matter what
try
{
// Check for interruption before we start fetching
if (Thread.currentThread().isInterrupted())
throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
// We need first to assemble an IPipelineSpecificationWithVersions object for each document we're going to process.
// We put this in a map so it can be looked up by document identifier.
// Create a full PipelineSpecification, including description strings. (This is per-job still, but can throw ServiceInterruptions, so we do it in here.)
IPipelineSpecification pipelineSpecification;
try
{
pipelineSpecification = new PipelineSpecification(pipelineConnections,job,ingester);
}
catch (ServiceInterruption e)
{
// Handle service interruption from pipeline
if (!e.jobInactiveAbort())
Logging.jobs.warn("Service interruption reported for job "+
job.getID()+" connection '"+job.getConnectionName()+"': "+
e.getMessage());
// All documents get requeued, because we never got far enough to make distinctions. All we have to decide
// is whether to requeue or abort.
List<QueuedDocument> requeueList = new ArrayList<QueuedDocument>();
for (QueuedDocument qd : activeDocuments)
{
DocumentDescription dd = qd.getDocumentDescription();
// Check for hard failure. But no hard failure is possible if it's a job inactive abort.
if (!e.jobInactiveAbort() && (dd.getFailTime() != -1L && dd.getFailTime() < e.getRetryTime() ||
dd.getFailRetryCount() == 0))
{
// Treat this as a hard failure.
if (e.isAbortOnFail())
{
rescanList.add(qd);
abortOnFail = new ManifoldCFException("Repeated service interruptions - failure processing document"+((e.getCause()!=null)?": "+e.getCause().getMessage():""),e.getCause());
}
else
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to requeueList because of output service interruption");
requeueList.add(qd);
}
}
else
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to requeueList because of output service interruption");
requeueList.add(qd);
}
}
requeueDocuments(jobManager,requeueList,e.getRetryTime(),e.getFailTime(),
e.getFailRetryCount());
Logging.threads.debug(" Clearing active documents list due to output service interruption");
activeDocuments.clear();
pipelineSpecification = null;
}
if (activeDocuments.size() > 0)
{
// **** New worker thread code starts here!!! ****
IExistingVersions existingVersions = new ExistingVersions(lastIndexedOutputConnectionName,activeDocuments);
String aclAuthority = connection.getACLAuthority();
if (aclAuthority == null)
aclAuthority = "";
boolean isDefaultAuthority = (aclAuthority.length() == 0);
// Build the processActivity object
Map<String,QueuedDocument> previousDocuments = new HashMap<String,QueuedDocument>();
String[] documentIDs = new String[activeDocuments.size()];
int k = 0;
for (QueuedDocument qd : activeDocuments)
{
previousDocuments.put(qd.getDocumentDescription().getDocumentIdentifierHash(),qd);
documentIDs[k++] = qd.getDocumentDescription().getDocumentIdentifier();
}
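// previousDocuments maps document identifier hash -> QueuedDocument so prior ingest status can be
// looked up quickly; documentIDs is the batch of identifiers handed to the connector below.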
ProcessActivity activity = new ProcessActivity(job.getID(),processID,
rt,jobManager,ingester,
connectionName,pipelineSpecification,
previousDocuments,
currentTime,
job.getExpiration(),
job.getInterval(),
job.getMaxInterval(),
job.getHopcountMode(),
connection,connector,connMgr,legalLinkTypes,ingestLogger);
try
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Worker thread about to process "+makeListString(activeDocuments));
// Now, process in bulk -- catching and handling ServiceInterruptions
ServiceInterruption serviceInterruption = null;
try
{
connector.processDocuments(documentIDs,existingVersions,job.getSpecification(),activity,jobType,isDefaultAuthority);
// Now do everything that the connector might have done if we were not doing it for it.
// Right now, that's just getting rid of untouched components.
for (QueuedDocument qd : activeDocuments)
{
String documentIdentifier = qd.getDocumentDescription().getDocumentIdentifier();
if (!activity.wasDocumentAborted(documentIdentifier) && !activity.wasDocumentDeleted(documentIdentifier))
{
String documentIdentifierHash = qd.getDocumentDescription().getDocumentIdentifierHash();
// In order to be able to loop over all the components that the incremental ingester knows about, we need to know
// what the FIRST output is.
DocumentIngestStatusSet set = qd.getLastIngestedStatus(ingester.getFirstIndexedOutputConnectionName(pipelineConnections));
if (set != null)
{
Iterator<String> componentHashes = set.componentIterator();
while (componentHashes.hasNext())
{
String componentHash = componentHashes.next();
// Check whether we've indexed or not
if (!activity.wasDocumentComponentTouched(documentIdentifier,
componentHash))
{
// This component must be removed.
ingester.documentRemove(
pipelineConnections,
connectionName,documentIdentifierHash,componentHash,
ingestLogger);
}
}
}
}
}
// Done with connector functionality!
}
catch (ServiceInterruption e)
{
serviceInterruption = e;
if (!e.jobInactiveAbort())
Logging.jobs.warn("Service interruption reported for job "+
job.getID()+" connection '"+job.getConnectionName()+"': "+
e.getMessage());
}
// Flush remaining references into the database!
activity.flush();
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Worker thread done processing "+Integer.toString(documentIDs.length)+" documents");
// Either way, handle the documents we were supposed to process. But if there was a service interruption,
// and the disposition of the document was unclear, then the document will need to be requeued instead of handled normally.
List<QueuedDocument> requeueList = new ArrayList<QueuedDocument>();
for (QueuedDocument qd : activeDocuments)
{
// If this document was aborted, then treat it specially.
if (activity.wasDocumentAborted(qd.getDocumentDescription().getDocumentIdentifier()))
{
// Special treatment for aborted documents.
// We ignore the returned version string completely, since it's presumed that processing was not completed for this doc.
// We want to give up immediately on this one, and just requeue it for immediate reprocessing (pending its prereqs being all met).
// Add to the finish list, so it gets requeued. Because the document is already marked as aborted, this should be enough to cause an
// unconditional requeue.
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to finishList");
finishList.add(qd);
}
else if (activity.wasDocumentDeleted(qd.getDocumentDescription().getDocumentIdentifier()))
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to deleteList");
deleteList.add(qd);
}
else if (serviceInterruption != null)
{
// Service interruption has precedence over unchanged, because we might have been interrupted while scanning the document
// for references
DocumentDescription dd = qd.getDocumentDescription();
// Check for hard failure. But no hard failure is possible if it's a job inactive abort.
if (!serviceInterruption.jobInactiveAbort() && (dd.getFailTime() != -1L && dd.getFailTime() < serviceInterruption.getRetryTime() ||
dd.getFailRetryCount() == 0))
{
// Treat this as a hard failure.
if (serviceInterruption.isAbortOnFail())
{
// Make sure that the job aborts.
abortOnFail = new ManifoldCFException("Repeated service interruptions - failure processing document"+((serviceInterruption.getCause()!=null)?": "+serviceInterruption.getCause().getMessage():""),serviceInterruption.getCause());
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to rescanList due to service interruption");
rescanList.add(qd);
}
else
{
// Skip the document, rather than failing.
// We want this particular document to be not included in the
// reprocessing. Therefore, we do the same thing as we would
// if we got back a null version.
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to deleteList due to service interruption");
deleteList.add(qd);
}
}
else
{
// Not a hard failure. Requeue.
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to requeueList");
requeueList.add(qd);
}
}
else
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to finishList");
finishList.add(qd);
}
// Note whether the document was untouched; if so, update it
if (!activity.wasDocumentTouched(qd.getDocumentDescription().getDocumentIdentifier()))
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Adding "+qd.getDocumentDescription().getID()+" to ingesterCheckList");
ingesterCheckList.add(qd.getDocumentDescription().getDocumentIdentifierHash());
}
}
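// Every active document has now been routed to exactly one of finishList, deleteList, rescanList,
// or requeueList; documents the connector left untouched were also noted in ingesterCheckList.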
if (serviceInterruption != null)
{
// Requeue the documents we've identified as needing to be repeated
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Requeuing documents "+makeListString(requeueList));
requeueDocuments(jobManager,requeueList,serviceInterruption.getRetryTime(),serviceInterruption.getFailTime(),
serviceInterruption.getFailRetryCount());
}
// Note the documents that have been checked but not reingested. This should happen BEFORE we need
// the statistics (which are calculated during the finishlist step below)
if (ingesterCheckList.size() > 0)
{
String[] checkClasses = new String[ingesterCheckList.size()];
String[] checkIDs = new String[ingesterCheckList.size()];
for (int i = 0; i < checkIDs.length; i++)
{
checkClasses[i] = connectionName;
checkIDs[i] = ingesterCheckList.get(i);
}
// This method should exercise reasonable intelligence. If the document has never been indexed, it should detect that
// and stop. Otherwise, it should update the statistics accordingly.
ingester.documentCheckMultiple(pipelineConnections,checkClasses,checkIDs,currentTime);
}
// Process the finish list!
if (finishList.size() > 0)
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Finishing documents "+makeListString(finishList));
// "Finish" the documents (removing unneeded carrydown info, and compute hopcounts).
// This can ONLY be done on fully-completed documents; everything else should be left in a dangling
// state (which we know is OK because it will be fixed the next time the document is attempted).
String[] documentIDHashes = new String[finishList.size()];
k = 0;
for (QueuedDocument qd : finishList)
{
documentIDHashes[k++] = qd.getDocumentDescription().getDocumentIdentifierHash();
}
DocumentDescription[] requeueCandidates = jobManager.finishDocuments(job.getID(),legalLinkTypes,documentIDHashes,job.getHopcountMode());
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Requeueing documents due to carrydown "+makeListString(requeueCandidates));
ManifoldCF.requeueDocumentsDueToCarrydown(jobManager,requeueCandidates,connector,connection,rt,currentTime);
// In both job types, we have to go through the finishList to figure out what to do with the documents.
// In the case of a document that was aborted, we must requeue it for immediate reprocessing in BOTH job types.
switch (job.getType())
{
case IJobDescription.TYPE_CONTINUOUS:
{
// We need to populate timeArray
String[] timeIDClasses = new String[finishList.size()];
String[] timeIDHashes = new String[finishList.size()];
for (int i = 0; i < timeIDHashes.length; i++)
{
QueuedDocument qd = (QueuedDocument)finishList.get(i);
DocumentDescription dd = qd.getDocumentDescription();
String documentIDHash = dd.getDocumentIdentifierHash();
timeIDClasses[i] = connectionName;
timeIDHashes[i] = documentIDHash;
}
long[] timeArray = ingester.getDocumentUpdateIntervalMultiple(pipelineConnections,timeIDClasses,timeIDHashes);
Long[] recheckTimeArray = new Long[timeArray.length];
int[] actionArray = new int[timeArray.length];
DocumentDescription[] recrawlDocs = new DocumentDescription[finishList.size()];
for (int i = 0; i < finishList.size(); i++)
{
QueuedDocument qd = finishList.get(i);
recrawlDocs[i] = qd.getDocumentDescription();
String documentID = recrawlDocs[i].getDocumentIdentifier();
// If aborted due to sequencing issue, then requeue for reprocessing immediately, ignoring everything else.
boolean wasAborted = activity.wasDocumentAborted(documentID);
if (wasAborted)
{
// Requeue for immediate reprocessing
if (Logging.scheduling.isDebugEnabled())
Logging.scheduling.debug("Document '"+documentID+"' will be RESCANNED as soon as prerequisites are met");
actionArray[i] = IJobManager.ACTION_RESCAN;
recheckTimeArray[i] = new Long(0L); // Must not use null; that means 'never'.
}
else
{
// Calculate the next time to run, or time to expire.
// For run time, the formula is to calculate the running avg interval between changes,
// add an additional interval (which comes from the job description),
// and add that to the current time.
// One caveat: we really want to calculate the interval from the last
// time change was detected, but this is not implemented yet.
long timeAmt = timeArray[i];
// null value indicates never to schedule
Long recrawlTime = activity.calculateDocumentRescheduleTime(currentTime,timeAmt,documentID);
Long expireTime = activity.calculateDocumentExpireTime(currentTime,documentID);
// Merge the two times together. We decide on the action based on the action with the lowest time.
if (expireTime == null || (recrawlTime != null && recrawlTime.longValue() < expireTime.longValue()))
{
if (Logging.scheduling.isDebugEnabled())
Logging.scheduling.debug("Document '"+documentID+"' will be RESCANNED at "+recrawlTime.toString());
recheckTimeArray[i] = recrawlTime;
actionArray[i] = IJobManager.ACTION_RESCAN;
}
else if (recrawlTime == null || (expireTime != null && recrawlTime.longValue() > expireTime.longValue()))
{
if (Logging.scheduling.isDebugEnabled())
Logging.scheduling.debug("Document '"+documentID+"' will be REMOVED at "+expireTime.toString());
recheckTimeArray[i] = expireTime;
actionArray[i] = IJobManager.ACTION_REMOVE;
}
else
{
// If the two times conflict, the default action is to rescan
if (Logging.scheduling.isDebugEnabled() && recrawlTime != null)
Logging.scheduling.debug("Document '"+documentID+"' will be RESCANNED at "+recrawlTime.toString());
recheckTimeArray[i] = recrawlTime;
actionArray[i] = IJobManager.ACTION_RESCAN;
}
}
}
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Requeuing "+makeListString(recrawlDocs));
jobManager.requeueDocumentMultiple(recrawlDocs,recheckTimeArray,actionArray);
}
break;
case IJobDescription.TYPE_SPECIFIED:
{
// Separate the ones we actually finished from the ones we need to requeue because they were aborted
List<DocumentDescription> completedList = new ArrayList<DocumentDescription>();
List<DocumentDescription> abortedList = new ArrayList<DocumentDescription>();
for (QueuedDocument qd : finishList)
{
DocumentDescription dd = qd.getDocumentDescription();
if (activity.wasDocumentAborted(dd.getDocumentIdentifier()))
{
// The document was aborted, so put it into the abortedList
abortedList.add(dd);
}
else
{
// The document was completed.
completedList.add(dd);
}
}
// Requeue the ones that must be repeated
if (abortedList.size() > 0)
{
DocumentDescription[] docDescriptions = new DocumentDescription[abortedList.size()];
Long[] recheckTimeArray = new Long[docDescriptions.length];
int[] actionArray = new int[docDescriptions.length];
for (int i = 0; i < docDescriptions.length; i++)
{
docDescriptions[i] = abortedList.get(i);
recheckTimeArray[i] = new Long(0L);
actionArray[i] = IJobManager.ACTION_RESCAN;
}
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Requeuing "+makeListString(docDescriptions));
jobManager.requeueDocumentMultiple(docDescriptions,recheckTimeArray,actionArray);
}
// Mark the ones completed that were actually completed.
if (completedList.size() > 0)
{
DocumentDescription[] docDescriptions = new DocumentDescription[completedList.size()];
for (int i = 0; i < docDescriptions.length; i++)
{
docDescriptions[i] = completedList.get(i);
}
if (Logging.threads.isDebugEnabled())
Logging.threads.debug(" Marking completed "+makeListString(docDescriptions));
jobManager.markDocumentCompletedMultiple(docDescriptions);
}
}
break;
default:
throw new ManifoldCFException("Unexpected value for job type: '"+Integer.toString(job.getType())+"'");
}
// Finally, if we're still alive, mark everything we finished as "processed".
for (QueuedDocument qd : finishList)
{
qd.setProcessed();
}
}
}
finally
{
// Make sure we don't leave any dangling carrydown files
activity.discard();
}
// Successful processing of the set
// We count 'get version' time in the average, so even if we decide not to process a doc
// it still counts.
long elapsedTime = System.currentTimeMillis() - processingStartTime;
if (Logging.scheduling.isDebugEnabled())
Logging.scheduling.debug("Worker thread for connection "+connectionName+" took "+new Long(elapsedTime).toString()+"ms to handle "+Integer.toString(qds.getCount())+" documents");
queueTracker.noteConnectionPerformance(qds.getCount(),connectionName,elapsedTime);
}
// Now, handle the delete list
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Deleting "+makeListString(deleteList));
processDeleteLists(pipelineConnections,connector,connection,jobManager,
deleteList,ingester,
job.getID(),legalLinkTypes,ingestLogger,job.getHopcountMode(),rt,currentTime);
// Handle hopcount removal
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Hopcount removal "+makeListString(hopcountremoveList));
processHopcountRemovalLists(pipelineConnections,connector,connection,jobManager,
hopcountremoveList,ingester,
job.getID(),legalLinkTypes,ingestLogger,job.getHopcountMode(),rt,currentTime);
}
finally
{
repositoryConnectorPool.release(connection,connector);
}
}
// Handle rescanning
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Rescanning documents "+makeListString(rescanList));
for (QueuedDocument qd : rescanList)
{
jobManager.resetDocument(qd.getDocumentDescription(),0L,IJobManager.ACTION_RESCAN,-1L,-1);
qd.setProcessed();
}
}
finally
{
// Note termination of processing of these documents in the overlap calculator
qds.endProcessing(queueTracker);
}
if (abortOnFail != null)
throw abortOnFail;
}
catch (ManifoldCFException e)
{
if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
break;
if (e.getErrorCode() == ManifoldCFException.DATABASE_CONNECTION_ERROR)
throw e;
if (jobManager.errorAbort(qds.getJobDescription().getID(),e.getMessage()))
{
// We eat the exception if there was already one recorded.
// An exception occurred in the processing of a set of documents.
// Shut the corresponding job down, with an appropriate error
Logging.threads.error("Exception tossed: "+e.getMessage(),e);
}
}
finally
{
// Go through qds and requeue any that aren't closed out in one way or another. This allows the job
// to be aborted; no dangling entries are left around.
for (int i = 0; i < qds.getCount(); i++)
{
QueuedDocument qd = qds.getDocument(i);
if (!qd.wasProcessed())
{
jobManager.resetDocument(qd.getDocumentDescription(),0L,IJobManager.ACTION_RESCAN,-1L,-1);
}
}
}
}
catch (ManifoldCFException e)
{
if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
break;
if (e.getErrorCode() == ManifoldCFException.DATABASE_CONNECTION_ERROR)
{
// Note the failure, which will cause a reset to occur
resetManager.noteEvent();
Logging.threads.error("Worker thread aborting and restarting due to database connection reset: "+e.getMessage(),e);
try
{
// Give the database a chance to catch up/wake up
ManifoldCF.sleep(10000L);
}
catch (InterruptedException se)
{
break;
}
continue;
}
// An exception occurred in the cleanup from another error.
// Log the error (but that's all we can do)
Logging.threads.error("Exception tossed: "+e.getMessage(),e);
}
catch (InterruptedException e)
{
// We're supposed to quit
break;
}
catch (OutOfMemoryError e)
{
System.err.println(sdf.format(new Date()) + " agents process ran out of memory - shutting down");
e.printStackTrace(System.err);
ManifoldCF.systemExit(-200);
}
catch (Throwable e)
{
// A more severe error - but stay alive
Logging.threads.fatal("Error tossed: "+e.getMessage(),e);
}
}
}
catch (Throwable e)
{
// Severe error on initialization
System.err.println("agents process could not start - shutting down");
Logging.threads.fatal("WorkerThread "+id+" initialization error tossed: "+e.getMessage(),e);
ManifoldCF.systemExit(-300);
}
}
/** Compare two sorted collection names lists.
*/
protected static boolean compareArrays(String[] array1, String[] array2)
{
if (array1.length != array2.length)
return false;
int i = 0;
while (i < array1.length)
{
if (!array1[i].equals(array2[i]))
return false;
i++;
}
return true;
}
protected static String makeListString(List<QueuedDocument> sourceList)
{
StringBuilder sb = new StringBuilder("{");
for (QueuedDocument qd : sourceList)
{
sb.append(qd.getDocumentDescription().getID()).append(" ");
}
sb.append("}");
return sb.toString();
}
protected static String makeListString(DocumentDescription[] sourceList)
{
StringBuilder sb = new StringBuilder("{");
for (DocumentDescription dd : sourceList)
{
sb.append(dd.getID()).append(" ");
}
sb.append("}");
return sb.toString();
}
protected static void moveList(List<QueuedDocument> sourceList, List<QueuedDocument> targetList)
{
for (int i = 0; i < sourceList.size(); i++)
{
targetList.add(sourceList.get(i));
}
sourceList.clear();
}
/** Mark specified documents as 'hopcount removed', and remove them from the
* index. Documents in this state are presumed to have:
* (a) nothing in the index
* (b) no intrinsic links for which they are the origin
* In order to guarantee this situation, this method must be capable of doing much
* of what the deletion method must do. Specifically, it should be capable of deleting
* documents from the index should they be already present.
*/
protected static void processHopcountRemovalLists(IPipelineConnections pipelineConnections,
IRepositoryConnector connector,
IRepositoryConnection connection, IJobManager jobManager,
List<QueuedDocument> hopcountremoveList,
IIncrementalIngester ingester,
Long jobID, String[] legalLinkTypes, OutputActivity ingestLogger,
int hopcountMethod, IReprioritizationTracker rt, long currentTime)
throws ManifoldCFException
{
// Remove from index
hopcountremoveList = removeFromIndex(pipelineConnections,connection.getName(),jobManager,hopcountremoveList,ingester,ingestLogger);
// Mark as 'hopcountremoved' in the job queue
processJobQueueHopcountRemovals(hopcountremoveList,connector,connection,
jobManager,jobID,legalLinkTypes,hopcountMethod,rt,currentTime);
}
/** Clear specified documents out of the job queue and out of the index.
*@param pipelineConnections is the basic pipeline specification for this job.
*@param jobManager is the job manager.
*@param deleteList is a list of QueuedDocument objects to clean out.
*@param ingester is the handle to the incremental ingestion API control object.
*/
protected static void processDeleteLists(IPipelineConnections pipelineConnections,
IRepositoryConnector connector,
IRepositoryConnection connection, IJobManager jobManager,
List<QueuedDocument> deleteList,
IIncrementalIngester ingester,
Long jobID, String[] legalLinkTypes, OutputActivity ingestLogger,
int hopcountMethod, IReprioritizationTracker rt, long currentTime)
throws ManifoldCFException
{
// Remove from index
deleteList = removeFromIndex(pipelineConnections,connection.getName(),jobManager,deleteList,ingester,ingestLogger);
// Delete from the job queue
processJobQueueDeletions(deleteList,connector,connection,
jobManager,jobID,legalLinkTypes,hopcountMethod,rt,currentTime);
}
/** Remove a specified set of documents from the index.
*@return the list of documents whose state needs to be updated in jobqueue.
*/
protected static List<QueuedDocument> removeFromIndex(IPipelineConnections pipelineConnections,
String connectionName, IJobManager jobManager, List<QueuedDocument> deleteList,
IIncrementalIngester ingester, OutputActivity ingestLogger)
throws ManifoldCFException
{
List<String> ingesterDeleteList = new ArrayList<String>(deleteList.size());
for (int i = 0; i < deleteList.size(); i++)
{
QueuedDocument qd = deleteList.get(i);
// See if we need to delete from index
if (qd.anyLastIngestedRecords())
{
// Queue up to issue deletion
ingesterDeleteList.add(qd.getDocumentDescription().getDocumentIdentifierHash());
}
}
// First, do the ingester delete list. This guarantees that if the ingestion system is down, this operation will be handled atomically.
if (ingesterDeleteList.size() > 0)
{
String[] deleteClasses = new String[ingesterDeleteList.size()];
String[] deleteIDs = new String[ingesterDeleteList.size()];
for (int i = 0; i < ingesterDeleteList.size(); i++)
{
deleteClasses[i] = connectionName;
deleteIDs[i] = ingesterDeleteList.get(i);
}
// Try to delete the documents via the output connection.
try
{
ingester.documentDeleteMultiple(pipelineConnections,deleteClasses,deleteIDs,ingestLogger);
}
catch (ServiceInterruption e)
{
// It looks like the output connection is not currently functioning, so we need to requeue instead of deleting
// those documents that could not be removed.
List<QueuedDocument> newDeleteList = new ArrayList<QueuedDocument>();
List<QueuedDocument> newRequeueList = new ArrayList<QueuedDocument>();
Set<String> ingesterSet = new HashSet<String>();
for (int j = 0 ; j < ingesterDeleteList.size() ; j++)
{
String id = ingesterDeleteList.get(j);
ingesterSet.add(id);
}
for (int j = 0 ; j < deleteList.size() ; j++)
{
QueuedDocument qd = deleteList.get(j);
DocumentDescription dd = qd.getDocumentDescription();
String documentIdentifierHash = dd.getDocumentIdentifierHash();
if (ingesterSet.contains(documentIdentifierHash))
newRequeueList.add(qd);
else
newDeleteList.add(qd);
}
// Requeue those that are supposed to be requeued
requeueDocuments(jobManager,newRequeueList,e.getRetryTime(),e.getFailTime(),
e.getFailRetryCount());
// Process the ones that are just new job queue changes
deleteList = newDeleteList;
}
}
return deleteList;
}
/** Process job queue deletions. Either the indexer has already been updated, or it is not necessary to update it.
*/
protected static void processJobQueueDeletions(List<QueuedDocument> jobmanagerDeleteList,
IRepositoryConnector connector, IRepositoryConnection connection, IJobManager jobManager,
Long jobID, String[] legalLinkTypes, int hopcountMethod, IReprioritizationTracker rt, long currentTime)
throws ManifoldCFException
{
// Now, do the document queue cleanup for deletions.
if (jobmanagerDeleteList.size() > 0)
{
DocumentDescription[] deleteDescriptions = new DocumentDescription[jobmanagerDeleteList.size()];
for (int i = 0; i < deleteDescriptions.length; i++)
{
QueuedDocument qd = jobmanagerDeleteList.get(i);
deleteDescriptions[i] = qd.getDocumentDescription();
}
// Do the actual work.
DocumentDescription[] requeueCandidates = jobManager.markDocumentDeletedMultiple(jobID,legalLinkTypes,deleteDescriptions,hopcountMethod);
// Requeue those documents that had carrydown data modifications
ManifoldCF.requeueDocumentsDueToCarrydown(jobManager,
requeueCandidates,connector,connection,rt,currentTime);
// Mark all these as done
for (int i = 0; i < jobmanagerDeleteList.size(); i++)
{
QueuedDocument qd = jobmanagerDeleteList.get(i);
qd.setProcessed();
}
}
}
/** Process job queue hopcount removals. All indexer updates have already taken place.
*/
protected static void processJobQueueHopcountRemovals(List<QueuedDocument> jobmanagerRemovalList,
IRepositoryConnector connector, IRepositoryConnection connection, IJobManager jobManager,
Long jobID, String[] legalLinkTypes, int hopcountMethod, IReprioritizationTracker rt, long currentTime)
throws ManifoldCFException
{
// Now, do the document queue cleanup for hopcount removals.
if (jobmanagerRemovalList.size() > 0)
{
DocumentDescription[] removalDescriptions = new DocumentDescription[jobmanagerRemovalList.size()];
for (int i = 0; i < removalDescriptions.length; i++)
{
QueuedDocument qd = jobmanagerRemovalList.get(i);
removalDescriptions[i] = qd.getDocumentDescription();
}
// Do the actual work.
DocumentDescription[] requeueCandidates = jobManager.markDocumentHopcountRemovalMultiple(jobID,legalLinkTypes,removalDescriptions,hopcountMethod);
// Requeue those documents that had carrydown data modifications
ManifoldCF.requeueDocumentsDueToCarrydown(jobManager,
requeueCandidates,connector,connection,rt,currentTime);
// Mark all these as done
for (QueuedDocument qd : jobmanagerRemovalList)
{
qd.setProcessed();
}
}
}
/** Requeue documents after a service interruption was detected.
*@param jobManager is the job manager object.
*@param requeueList is a list of QueuedDocument objects describing what needs to be requeued.
*@param retryTime is the time that the first retry ought to be scheduled for.
*@param failTime is the time beyond which retries lead to hard failure.
*@param failCount is the number of retries allowed until hard failure.
*/
protected static void requeueDocuments(IJobManager jobManager, List<QueuedDocument> requeueList, long retryTime, long failTime, int failCount)
throws ManifoldCFException
{
if (requeueList.size() > 0)
{
DocumentDescription[] requeueDocs = new DocumentDescription[requeueList.size()];
for (int i = 0; i < requeueDocs.length; i++)
{
QueuedDocument qd = requeueList.get(i);
DocumentDescription dd = qd.getDocumentDescription();
requeueDocs[i] = dd;
}
jobManager.resetDocumentMultiple(requeueDocs,retryTime,IJobManager.ACTION_RESCAN,failTime,failCount);
for (QueuedDocument qd : requeueList)
{
qd.setProcessed();
}
}
}
/** The maximum number of adds that happen in a single transaction */
protected static final int MAX_ADDS_IN_TRANSACTION = 20;
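// References accumulate in addDocumentReference() and are flushed via processDocumentReferences() once this many are pending.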
// Nested classes
/** Process activity class wraps access to the ingester and job queue.
*/
protected static class ProcessActivity implements IProcessActivity
{
// Member variables
protected final Long jobID;
protected final String processID;
protected final IJobManager jobManager;
protected final IIncrementalIngester ingester;
protected final String connectionName;
protected final IPipelineSpecification pipelineSpecification;
protected final Map<String,QueuedDocument> previousDocuments;
protected final long currentTime;
protected final Long expireInterval;
protected final Long recrawlInterval;
protected final Long maxInterval;
protected final int hopcountMode;
protected final IRepositoryConnection connection;
protected final IRepositoryConnector connector;
protected final IRepositoryConnectionManager connMgr;
protected final String[] legalLinkTypes;
protected final OutputActivity ingestLogger;
protected final IReprioritizationTracker rt;
// We submit references in bulk, because that's way more efficient.
protected final Map<DocumentReference,DocumentReference> referenceList = new HashMap<DocumentReference,DocumentReference>();
// Keep track of lower and upper reschedule bounds separately. Contains a Long and is keyed by a document identifier.
protected final Map<String,Long> lowerRescheduleBounds = new HashMap<String,Long>();
protected final Map<String,Long> upperRescheduleBounds = new HashMap<String,Long>();
protected final Map<String,Long> lowerExpireBounds = new HashMap<String,Long>();
protected final Map<String,Long> upperExpireBounds = new HashMap<String,Long>();
// Origination times
protected final Map<String,Long> originationTimes = new HashMap<String,Long>();
// Whether the document was aborted or not
protected final Set<String> abortSet = new HashSet<String>();
// Whether the document was touched or not
protected final Set<String> touchedSet = new HashSet<String>();
// Whether document was deleted
protected final Set<String> documentDeletedSet = new HashSet<String>();
// Whether a component was touched or not, keyed by document identifier.
// If there's an entry here, then it means that *all* components for the document are to be retained.
protected final Set<String> allComponentsSet = new HashSet<String>();
// This does not include the primary document. The map is keyed by document identifier; each set contains component identifier hashes.
protected final Map<String,Set<String>> touchedComponentSet = new HashMap<String,Set<String>>();
// This represents primary documents.
protected final Set<String> touchedPrimarySet = new HashSet<String>();
/** Constructor.
*@param jobManager is the job manager
*@param ingester is the ingester
*/
public ProcessActivity(Long jobID, String processID,
IReprioritizationTracker rt, IJobManager jobManager,
IIncrementalIngester ingester,
String connectionName,
IPipelineSpecification pipelineSpecification,
Map<String,QueuedDocument> previousDocuments,
long currentTime,
Long expireInterval,
Long recrawlInterval,
Long maxInterval,
int hopcountMode,
IRepositoryConnection connection, IRepositoryConnector connector,
IRepositoryConnectionManager connMgr, String[] legalLinkTypes, OutputActivity ingestLogger)
{
this.jobID = jobID;
this.processID = processID;
this.rt = rt;
this.jobManager = jobManager;
this.ingester = ingester;
this.connectionName = connectionName;
this.pipelineSpecification = pipelineSpecification;
this.previousDocuments = previousDocuments;
this.currentTime = currentTime;
this.expireInterval = expireInterval;
this.recrawlInterval = recrawlInterval;
this.maxInterval = maxInterval;
this.hopcountMode = hopcountMode;
this.connection = connection;
this.connector = connector;
this.connMgr = connMgr;
this.legalLinkTypes = legalLinkTypes;
this.ingestLogger = ingestLogger;
}
/** Clean up any dangling information, before abandoning this process activity object */
public void discard()
throws ManifoldCFException
{
for (DocumentReference dr : referenceList.keySet())
{
dr.discard();
}
referenceList.clear();
}
/** Check whether a document (and its version string) was touched or not.
*/
public boolean wasDocumentTouched(String documentIdentifier)
{
return touchedSet.contains(documentIdentifier);
}
/** Check whether a document component was touched or not.
*/
public boolean wasDocumentComponentTouched(String documentIdentifier,
String componentIdentifierHash)
{
if (allComponentsSet.contains(documentIdentifier))
return true;
Set<String> components = touchedComponentSet.get(documentIdentifier);
if (components == null)
return false;
return components.contains(componentIdentifierHash);
}
/** Check whether document was deleted or not.
*/
public boolean wasDocumentDeleted(String documentIdentifier)
{
return documentDeletedSet.contains(documentIdentifier);
}
/** Check whether a document was aborted or not.
*/
public boolean wasDocumentAborted(String documentIdentifier)
{
return abortSet.contains(documentIdentifier);
}
/** Check if a document needs to be reindexed, based on a computed version string.
* Call this method to determine whether reindexing is necessary. Pass in a newly-computed version
* string. This method will return "true" if the document needs to be re-indexed.
*@param documentIdentifier is the document identifier.
*@param newVersionString is the newly-computed version string.
*@return true if the document needs to be reindexed.
*/
@Override
public boolean checkDocumentNeedsReindexing(String documentIdentifier,
String newVersionString)
throws ManifoldCFException
{
return checkDocumentNeedsReindexing(documentIdentifier,null,newVersionString);
}
/** Check if a document needs to be reindexed, based on a computed version string.
* Call this method to determine whether reindexing is necessary. Pass in a newly-computed version
* string. This method will return "true" if the document needs to be re-indexed.
*@param documentIdentifier is the document identifier.
*@param componentIdentifier is the component document identifier, if any.
*@param newVersionString is the newly-computed version string.
*@return true if the document needs to be reindexed.
*/
@Override
public boolean checkDocumentNeedsReindexing(String documentIdentifier,
String componentIdentifier,
String newVersionString)
throws ManifoldCFException
{
String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
IPipelineSpecificationWithVersions spec = computePipelineSpecificationWithVersions(documentIdentifierHash,componentIdentifierHash,documentIdentifier);
return ingester.checkFetchDocument(spec,newVersionString,connection.getACLAuthority());
}
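// Typical connector-side usage (sketch; "activities" is the IProcessActivity handle passed to
// processDocuments(), and the identifier and version string are connector-specific):
//
//   String versionString = ...; // computed from repository metadata
//   if (activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
//   {
//     // fetch the content and call ingestDocumentWithException(...)
//   }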
/** Add a document description to the current job's queue.
*@param localIdentifier is the local document identifier to add (for the connector that
* fetched the document).
*@param parentIdentifier is the document identifier that is considered to be the "parent"
* of this identifier. May be null, if no hopcount filtering desired for this kind of relationship.
*@param relationshipType is the string describing the kind of relationship described by this
* reference. This must be one of the strings returned by the IRepositoryConnector method
* "getRelationshipTypes()". May be null.
*@param dataNames is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
*@param dataValues are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.
* The type of each object must either be a String, or a CharacterInput.
*@param originationTime is the time, in ms since epoch, that the document originated. Pass null if none or unknown.
*@param prereqEventNames are the names of the prerequisite events which this document requires prior to processing. Pass null if none.
*/
@Override
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType,
String[] dataNames, Object[][] dataValues, Long originationTime, String[] prereqEventNames)
throws ManifoldCFException
{
String localIdentifierHash = ManifoldCF.hash(localIdentifier);
String parentIdentifierHash = null;
if (parentIdentifier != null && parentIdentifier.length() > 0)
parentIdentifierHash = ManifoldCF.hash(parentIdentifier);
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Adding document reference, from "+((parentIdentifier==null)?"no parent":"'"+parentIdentifier+"'")
+" to '"+localIdentifier+"', relationship type "+((relationshipType==null)?"null":"'"+relationshipType+"'")
+", with "+((dataNames==null)?"no":Integer.toString(dataNames.length))+" data types, origination time="+((originationTime==null)?"unknown":originationTime.toString()));
//Long expireInterval = job.getExpiration();
if (expireInterval != null)
{
// We should not queue documents that have already expired; it wastes time
// So, calculate when this document will expire
long currentTime = System.currentTimeMillis();
long expireTime;
if (originationTime == null)
expireTime = currentTime + expireInterval.longValue();
else
expireTime = originationTime.longValue() + expireInterval.longValue();
if (expireTime <= currentTime)
{
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Not adding document reference for '"+localIdentifier+"', since it has already expired");
return;
}
}
if (referenceList.size() == MAX_ADDS_IN_TRANSACTION)
{
// Output what we've got, and reset
processDocumentReferences();
}
DocumentReference dr = new DocumentReference(localIdentifierHash,localIdentifier,new DocumentBin(parentIdentifierHash,relationshipType));
DocumentReference existingDr = referenceList.get(dr);
if (existingDr == null)
{
referenceList.put(dr,dr);
existingDr = dr;
}
// We can't just keep a reference to the passed-in data values, because if these are files the caller will delete them upon the return of this method. So, for all data values we keep,
// make a local copy, and remove the file pointer from the caller's copy. It then becomes the responsibility of the ProcessActivity object to clean up these items when it is discarded.
Object[][] savedDataValues;
if (dataValues != null)
{
savedDataValues = new Object[dataValues.length][];
int q = 0;
while (q < savedDataValues.length)
{
Object[] innerArray = dataValues[q];
if (innerArray != null)
{
savedDataValues[q] = new Object[innerArray.length];
int z = 0;
while (z < innerArray.length)
{
Object innerValue = innerArray[z];
if (innerValue != null)
{
if (innerValue instanceof CharacterInput)
(savedDataValues[q])[z] = ((CharacterInput)innerValue).transfer();
else
(savedDataValues[q])[z] = innerValue;
}
else
(savedDataValues[q])[z] = null;
z++;
}
}
else
savedDataValues[q] = null;
q++;
}
}
else
savedDataValues = null;
existingDr.addData(dataNames,savedDataValues);
existingDr.addPrerequisiteEvents(prereqEventNames);
}
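// Typical connector-side usage (sketch; identifiers, relationship type, and carrydown names are illustrative):
//
//   activities.addDocumentReference(childIdentifier,parentIdentifier,"child",
//     new String[]{"parentFolder"},new Object[][]{{folderName}},null,null);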
/** Add a document description to the current job's queue.
*@param localIdentifier is the local document identifier to add (for the connector that
* fetched the document).
*@param parentIdentifier is the document identifier that is considered to be the "parent"
* of this identifier. May be null, if no hopcount filtering desired for this kind of relationship.
*@param relationshipType is the string describing the kind of relationship described by this
* reference. This must be one of the strings returned by the IRepositoryConnector method
* "getRelationshipTypes()". May be null.
*@param dataNames is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
*@param dataValues are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.
*@param originationTime is the time, in ms since epoch, that the document originated. Pass null if none or unknown.
*/
@Override
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType,
String[] dataNames, Object[][] dataValues, Long originationTime)
throws ManifoldCFException
{
addDocumentReference(localIdentifier,parentIdentifier,relationshipType,dataNames,dataValues,originationTime,null);
}
/** Add a document description to the current job's queue.
*@param localIdentifier is the local document identifier to add (for the connector that
* fetched the document).
*@param parentIdentifier is the document identifier that is considered to be the "parent"
* of this identifier. May be null, if no hopcount filtering desired for this kind of relationship.
*@param relationshipType is the string describing the kind of relationship described by this
* reference. This must be one of the strings returned by the IRepositoryConnector method
* "getRelationshipTypes()". May be null.
*@param dataNames is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
*@param dataValues are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.
*/
@Override
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType,
String[] dataNames, Object[][] dataValues)
throws ManifoldCFException
{
addDocumentReference(localIdentifier,parentIdentifier,relationshipType,dataNames,dataValues,null);
}
/** Add a document description to the current job's queue.
*@param localIdentifier is the local document identifier to add (for the connector that
* fetched the document).
*@param parentIdentifier is the document identifier that is considered to be the "parent"
* of this identifier. May be null, if no hopcount filtering desired for this kind of relationship.
*@param relationshipType is the string describing the kind of relationship described by this
* reference. This must be one of the strings returned by the IRepositoryConnector method
* "getRelationshipTypes()". May be null.
*/
@Override
public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType)
throws ManifoldCFException
{
addDocumentReference(localIdentifier,parentIdentifier,relationshipType,null,null);
}
/** Add a document description to the current job's queue. This method is equivalent to
* addDocumentReference(localIdentifier,null,null).
*@param localIdentifier is the local document identifier to add (for the connector that
* fetched the document).
*/
@Override
public void addDocumentReference(String localIdentifier)
throws ManifoldCFException
{
addDocumentReference(localIdentifier,null,null,null,null);
}
/** Retrieve data passed from parents to a specified child document.
*@param localIdentifier is the document identifier of the document we want the recorded data for.
*@param dataName is the name of the data items to retrieve.
*@return an array containing the unique data values passed from ALL parents. Note that these are in no particular order, and there will not be any duplicates.
*/
@Override
public String[] retrieveParentData(String localIdentifier, String dataName)
throws ManifoldCFException
{
return jobManager.retrieveParentData(jobID,ManifoldCF.hash(localIdentifier),dataName);
}
/** Retrieve data passed from parents to a specified child document.
*@param localIdentifier is the document identifier of the document we want the recorded data for.
*@param dataName is the name of the data items to retrieve.
*@return an array containing the unique data values passed from ALL parents. Note that these are in no particular order, and there will not be any duplicates.
*/
@Override
public CharacterInput[] retrieveParentDataAsFiles(String localIdentifier, String dataName)
throws ManifoldCFException
{
return jobManager.retrieveParentDataAsFiles(jobID,ManifoldCF.hash(localIdentifier),dataName);
}
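// Carrydown round trip (sketch; data name is illustrative): a parent that passed the data name
// "parentFolder" via addDocumentReference() makes those values available to the child through:
//
//   String[] folders = activities.retrieveParentData(childIdentifier,"parentFolder");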
/** Record a document version, but don't ingest it.
*@param documentIdentifier is the document identifier.
*@param version is the document version.
*/
@Override
public void recordDocument(String documentIdentifier, String version)
throws ManifoldCFException
{
recordDocument(documentIdentifier,null,version);
}
/** Record a document version, WITHOUT reindexing it, or removing it. (Other
* documents with the same URL, however, will still be removed.) This is
* useful if the version string changes but the document contents are known not
* to have changed.
*@param documentIdentifier is the document identifier.
*@param componentIdentifier is the component document identifier, if any.
*@param version is the document version.
*/
@Override
public void recordDocument(String documentIdentifier,
String componentIdentifier,
String version)
throws ManifoldCFException
{
String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
checkMultipleDispositions(documentIdentifier,componentIdentifier,componentIdentifierHash);
ingester.documentRecord(
pipelineSpecification,
connectionName,documentIdentifierHash,componentIdentifierHash,
version,currentTime);
touchedSet.add(documentIdentifier);
touchComponentSet(documentIdentifier,componentIdentifierHash);
}
/** Ingest the current document.
*@param documentIdentifier is the document's local identifier.
*@param version is the version of the document, as reported by the getDocumentVersions() method of the
* corresponding repository connector.
*@param documentURI is the URI to use to retrieve this document from the search interface (and is
* also the unique key in the index).
*@param data is the document data. The data is closed after ingestion is complete.
*@throws IOException only when data stream reading fails.
*/
@Override
public void ingestDocumentWithException(String documentIdentifier, String version, String documentURI, RepositoryDocument data)
throws ManifoldCFException, ServiceInterruption, IOException
{
ingestDocumentWithException(documentIdentifier,null,version,documentURI,data);
}
/** Ingest the current document.
*@param documentIdentifier is the document's identifier.
*@param componentIdentifier is the component document identifier, if any.
*@param version is the version of the document, as reported by the getDocumentVersions() method of the
* corresponding repository connector.
*@param documentURI is the URI to use to retrieve this document from the search interface (and is
* also the unique key in the index).
*@param data is the document data. The data is closed after ingestion is complete.
*@throws IOException only when data stream reading fails.
*/
@Override
public void ingestDocumentWithException(String documentIdentifier,
String componentIdentifier,
String version, String documentURI, RepositoryDocument data)
throws ManifoldCFException, ServiceInterruption, IOException
{
// We should not get called here if versions agree, unless the repository
// connector cannot distinguish between versions - in which case it must
// always ingest (essentially)
String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
checkMultipleDispositions(documentIdentifier,componentIdentifier,componentIdentifierHash);
// This method currently signals whether the document is accepted or rejected permanently.
// Permanent rejection should involve leaving the document in the queue (since it probably
// will be rediscovered), and noting its version (so that we don't try doing anything with it
// again). But the documentIngest method will already take care of the latter, so we really need to
// do nothing different based on the results of the documentIngest invocation.
// The only time it would be great to do something different would be if the document's version
// indicates that it should always be refetched. But I have no way to describe this situation
// in the database at the moment.
ingester.documentIngest(
computePipelineSpecificationWithVersions(documentIdentifierHash,componentIdentifierHash,documentIdentifier),
connectionName,documentIdentifierHash,componentIdentifierHash,
version,
connection.getACLAuthority(),
data,currentTime,
documentURI,
ingestLogger);
touchedSet.add(documentIdentifier);
touchComponentSet(documentIdentifier,componentIdentifierHash);
}
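// Illustrative usage sketch (assumption, not original source): how a repository connector might
// hand a fetched document to this activity. The RepositoryDocument setters used below (setBinary,
// setMimeType, addField) reflect the usual agents API but are assumed here; the stream, length,
// URI, version, and metadata values are hypothetical.
//
//   RepositoryDocument rd = new RepositoryDocument();
//   rd.setBinary(contentStream, contentLength);
//   rd.setMimeType("text/html");
//   rd.addField("title", documentTitle);
//   activities.ingestDocumentWithException(documentIdentifier, versionString, documentURI, rd);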
/** Remove the specified document from the search engine index, while keeping track of the version information
* for it (to reduce churn).
*@param documentIdentifier is the document's local identifier.
*@param version is the version string to be recorded for the document.
*/
@Override
public void noDocument(String documentIdentifier, String version)
throws ManifoldCFException, ServiceInterruption
{
noDocument(documentIdentifier,null,version);
}
/** Remove the specified document from the search engine index, and update the
* recorded version information for the document.
*@param documentIdentifier is the document's local identifier.
*@param componentIdentifier is the component document identifier, if any.
*@param version is the version string to be recorded for the document.
*/
@Override
public void noDocument(String documentIdentifier,
String componentIdentifier,
String version)
throws ManifoldCFException, ServiceInterruption
{
// Special interpretation for empty version string; treat as if the document doesn't exist
// (by ignoring it and allowing it to be deleted later)
String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
checkMultipleDispositions(documentIdentifier,componentIdentifier,componentIdentifierHash);
ingester.documentNoData(
computePipelineSpecificationWithVersions(documentIdentifierHash,componentIdentifierHash,documentIdentifier),
connectionName,documentIdentifierHash,componentIdentifierHash,
version,
connection.getACLAuthority(),
currentTime,
ingestLogger);
touchedSet.add(documentIdentifier);
touchComponentSet(documentIdentifier,componentIdentifierHash);
}
/** Remove the specified document primary component permanently from the search engine index,
* and from the status table. Use this method when your document has components and no longer
* has a primary document, but will not have a primary document again for the foreseeable
* future. This is a rare situation.
*@param documentIdentifier is the document's identifier.
*/
@Override
public void removeDocument(String documentIdentifier)
throws ManifoldCFException, ServiceInterruption
{
checkMultipleDispositions(documentIdentifier,null,null);
String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
ingester.documentRemove(
pipelineSpecification,
connectionName,documentIdentifierHash,null,
ingestLogger);
// Note that we touched it, so it won't get checked
touchedSet.add(documentIdentifier);
touchComponentSet(documentIdentifier,null);
}
/** Retain existing document component. Use this method to signal that an already-existing
* document component does not need to be reindexed. The default behavior is to remove
* components that are not mentioned during processing.
*@param documentIdentifier is the document's identifier.
*@param componentIdentifier is the component document identifier, which cannot be null.
*/
@Override
public void retainDocument(String documentIdentifier,
String componentIdentifier)
throws ManifoldCFException
{
String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
checkMultipleDispositions(documentIdentifier,componentIdentifier,componentIdentifierHash);
touchComponentSet(documentIdentifier,componentIdentifierHash);
}
/** Retain all existing document components of a primary document. Use this method to signal that
* no document components need to be reindexed. The default behavior is to remove
* components that are not mentioned during processing.
*@param documentIdentifier is the document's identifier.
*/
@Override
public void retainAllComponentDocument(String documentIdentifier)
throws ManifoldCFException
{
checkAllComponentsMultipleDispositions(documentIdentifier);
touchAllComponentsSet(documentIdentifier);
}
/** Delete the specified document from the search engine index, and from the status table. This
* method does NOT keep track of version
* information for the document and thus can lead to "churn", whereby the same document is queued, processed,
* and removed on subsequent crawls. It is therefore preferable to use noDocument() instead,
* in any case where the same decision will need to be made over and over.
*@param documentIdentifier is the document's identifier.
*/
@Override
public void deleteDocument(String documentIdentifier)
throws ManifoldCFException
{
documentDeletedSet.add(documentIdentifier);
}
/** Override the schedule for the next time a document is crawled.
* Calling this method allows you to set an upper recrawl bound, lower recrawl bound, upper expire bound, lower expire bound,
* or a combination of these, on a specific document. This method is only effective if the job is a continuous one, and if the
* identifier you pass in is being processed.
*@param localIdentifier is the document's local identifier.
*@param lowerRecrawlBoundTime is the time in ms since epoch that the reschedule time should not fall BELOW, or null if none.
*@param upperRecrawlBoundTime is the time in ms since epoch that the reschedule time should not rise ABOVE, or null if none.
*@param lowerExpireBoundTime is the time in ms since epoch that the expire time should not fall BELOW, or null if none.
*@param upperExpireBoundTime is the time in ms since epoch that the expire time should not rise ABOVE, or null if none.
*/
@Override
public void setDocumentScheduleBounds(String localIdentifier,
Long lowerRecrawlBoundTime, Long upperRecrawlBoundTime,
Long lowerExpireBoundTime, Long upperExpireBoundTime)
throws ManifoldCFException
{
if (lowerRecrawlBoundTime != null)
lowerRescheduleBounds.put(localIdentifier,lowerRecrawlBoundTime);
else
lowerRescheduleBounds.remove(localIdentifier);
if (upperRecrawlBoundTime != null)
upperRescheduleBounds.put(localIdentifier,upperRecrawlBoundTime);
else
upperRescheduleBounds.remove(localIdentifier);
if (lowerExpireBoundTime != null)
lowerExpireBounds.put(localIdentifier,lowerExpireBoundTime);
else
lowerExpireBounds.remove(localIdentifier);
if (upperExpireBoundTime != null)
upperExpireBounds.put(localIdentifier,upperExpireBoundTime);
else
upperExpireBounds.remove(localIdentifier);
}
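// Illustrative usage sketch (assumption, not original source): clamping the next recrawl of a
// document in a continuous job to between one hour and ninety minutes from now. Passing null
// for a bound clears it; the "activities" handle and the intervals are hypothetical.
//
//   long now = System.currentTimeMillis();
//   activities.setDocumentScheduleBounds(documentIdentifier,
//     new Long(now + 60L * 60000L),   // do not recrawl sooner than an hour from now
//     new Long(now + 90L * 60000L),   // but no later than ninety minutes from now
//     null, null);                    // leave expiration bounds untouched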
/** Override a document's origination time.
* Use this method to signal the framework that a document's origination time is something other than the first time it was crawled.
*@param localIdentifier is the document's local identifier.
*@param originationTime is the document's origination time, or null if unknown.
*/
@Override
public void setDocumentOriginationTime(String localIdentifier,
Long originationTime)
throws ManifoldCFException
{
if (originationTime == null)
originationTimes.remove(localIdentifier);
else
originationTimes.put(localIdentifier,originationTime);
}
/** Find a document's lower rescheduling time bound, if any */
public Long getDocumentRescheduleLowerBoundTime(String localIdentifier)
{
return lowerRescheduleBounds.get(localIdentifier);
}
/** Find a document's upper rescheduling time bound, if any */
public Long getDocumentRescheduleUpperBoundTime(String localIdentifier)
{
return upperRescheduleBounds.get(localIdentifier);
}
/** Find a document's lower expiration time bound, if any */
public Long getDocumentExpirationLowerBoundTime(String localIdentifier)
{
return lowerExpireBounds.get(localIdentifier);
}
/** Find a document's upper expiration time bound, if any */
public Long getDocumentExpirationUpperBoundTime(String localIdentifier)
{
return upperExpireBounds.get(localIdentifier);
}
/** Get a document's origination time */
public Long getDocumentOriginationTime(String localIdentifier)
{
return originationTimes.get(localIdentifier);
}
/** Calculate the time at which a document should next be rescanned, based on the job's recrawl
* interval and any per-document bounds set via setDocumentScheduleBounds(). Returns null if no
* rescan time could be determined.
*/
public Long calculateDocumentRescheduleTime(long currentTime, long timeAmt, String localIdentifier)
{
Long recrawlTime = null;
//Long recrawlInterval = job.getInterval();
if (recrawlInterval != null)
{
//Long maxInterval = job.getMaxInterval();
long actualInterval = recrawlInterval.longValue() + timeAmt;
if (maxInterval != null && actualInterval > maxInterval.longValue())
actualInterval = maxInterval.longValue();
recrawlTime = new Long(currentTime + actualInterval);
}
if (Logging.scheduling.isDebugEnabled())
Logging.scheduling.debug("Default rescan time for document '"+localIdentifier+"' is "+((recrawlTime==null)?"NEVER":recrawlTime.toString()));
Long lowerBound = getDocumentRescheduleLowerBoundTime(localIdentifier);
if (lowerBound != null)
{
if (recrawlTime == null || recrawlTime.longValue() < lowerBound.longValue())
{
recrawlTime = lowerBound;
if (Logging.scheduling.isDebugEnabled())
Logging.scheduling.debug(" Rescan time overridden for document '"+localIdentifier+"' due to lower bound; new value is "+recrawlTime.toString());
}
}
Long upperBound = getDocumentRescheduleUpperBoundTime(localIdentifier);
if (upperBound != null)
{
if (recrawlTime == null || recrawlTime.longValue() > upperBound.longValue())
{
recrawlTime = upperBound;
if (Logging.scheduling.isDebugEnabled())
Logging.scheduling.debug(" Rescan time overridden for document '"+localIdentifier+"' due to upper bound; new value is "+recrawlTime.toString());
}
}
return recrawlTime;
}
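// Worked example (added for clarity, not original source): with a job recrawl interval of one
// hour (3600000 ms), a timeAmt of 0, and no maximum interval, the default rescan time is
// currentTime + 3600000. If a connector set a lower recrawl bound of currentTime + 7200000 via
// setDocumentScheduleBounds(), the lower-bound branch above raises the result to that value;
// an upper bound below the default would likewise pull the result down to the upper bound.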
public Long calculateDocumentExpireTime(long currentTime, String localIdentifier)
{
// For expire time, we take the document's origination time, plus the expiration interval (which comes from the job).
Long originationTime = getDocumentOriginationTime(localIdentifier);
if (originationTime == null)
originationTime = new Long(currentTime);
//Long expireInterval = job.getExpiration();
Long expireTime = null;
if (expireInterval != null)
expireTime = new Long(originationTime.longValue() + expireInterval.longValue());
Long lowerBound = getDocumentExpirationLowerBoundTime(localIdentifier);
if (lowerBound != null)
{
if (expireTime == null || expireTime.longValue() < lowerBound.longValue())
expireTime = lowerBound;
}
Long upperBound = getDocumentExpirationUpperBoundTime(localIdentifier);
if (upperBound != null)
{
if (expireTime == null || expireTime.longValue() > upperBound.longValue())
expireTime = upperBound;
}
return expireTime;
}
/** Reset the recorded times */
public void resetTimes()
{
lowerRescheduleBounds.clear();
upperRescheduleBounds.clear();
lowerExpireBounds.clear();
upperExpireBounds.clear();
}
/** Record time-stamped information about the activity of the connector.
*@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970). Every
* activity has an associated time; the startTime field records when the activity began. A null value
* indicates that the start time and the finishing time are the same.
*@param activityType is a string which is fully interpretable only in the context of the connector involved, which is
* used to categorize what kind of activity is being recorded. For example, a web connector might record a
* "fetch document" activity. Cannot be null.
*@param dataSize is the number of bytes of data involved in the activity, or null if not applicable.
*@param entityIdentifier is a (possibly long) string which identifies the object involved in the history record.
* The interpretation of this field will differ from connector to connector. May be null.
*@param resultCode contains a terse description of the result of the activity. The description is limited in
* size to 255 characters, and can be interpreted only in the context of the current connector. May be null.
*@param resultDescription is a (possibly long) human-readable string which adds detail, if required, to the result
* described in the resultCode field. This field is not meant to be queried on. May be null.
*@param childIdentifiers is a set of child entity identifiers associated with this activity. May be null.
*/
@Override
public void recordActivity(Long startTime, String activityType, Long dataSize,
String entityIdentifier, String resultCode, String resultDescription, String[] childIdentifiers)
throws ManifoldCFException
{
connMgr.recordHistory(connection.getName(),startTime,activityType,dataSize,entityIdentifier,resultCode,
resultDescription,childIdentifiers);
}
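// Illustrative usage sketch (assumption, not original source): recording a fetch in the
// connection history. The activity type, byte count, and result code below are hypothetical;
// per the javadoc above, the result code is limited to 255 characters.
//
//   long fetchStart = System.currentTimeMillis();
//   // ... perform the fetch ...
//   activities.recordActivity(new Long(fetchStart), "fetch document",
//     new Long(bytesRead), documentURI, "OK", null, null);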
/** Flush the outstanding references into the database.
*/
public void flush()
throws ManifoldCFException
{
processDocumentReferences();
}
/** Process outstanding document references, in batch.
*/
protected void processDocumentReferences()
throws ManifoldCFException
{
if (referenceList.size() == 0)
return;
// We have to segregate the references by link type and parent.
Map<DocumentBin,List<DocumentReference>> linkBins = new HashMap<DocumentBin,List<DocumentReference>>();
for (DocumentReference dr : referenceList.keySet())
{
DocumentBin key = dr.getKey();
List<DocumentReference> set = linkBins.get(key);
if (set == null)
{
set = new ArrayList<DocumentReference>();
linkBins.put(key,set);
}
set.add(dr);
}
// Now, go through link types.
for (DocumentBin db : linkBins.keySet())
{
List<DocumentReference> set = linkBins.get(db);
String[] docidHashes = new String[set.size()];
String[] docids = new String[set.size()];
IPriorityCalculator[] priorities = new IPriorityCalculator[set.size()];
String[][] dataNames = new String[docids.length][];
Object[][][] dataValues = new Object[docids.length][][];
String[][] eventNames = new String[docids.length][];
long currentTime = System.currentTimeMillis();
double currentMinimumDepth = rt.getMinimumDepth();
rt.clearPreloadRequests();
for (int j = 0; j < docidHashes.length; j++)
{
DocumentReference dr = set.get(j);
docidHashes[j] = dr.getLocalIdentifierHash();
docids[j] = dr.getLocalIdentifier();
dataNames[j] = dr.getDataNames();
dataValues[j] = dr.getDataValues();
eventNames[j] = dr.getPrerequisiteEventNames();
// Calculate desired document priority based on current queuetracker status.
String[] bins = ManifoldCF.calculateBins(connector,dr.getLocalIdentifier());
PriorityCalculator p = new PriorityCalculator(rt,currentMinimumDepth,connection,bins,dr.getLocalIdentifier());
priorities[j] = p;
p.makePreloadRequest();
}
rt.preloadBinValues();
jobManager.addDocuments(processID,
jobID,legalLinkTypes,docidHashes,docids,db.getParentIdentifierHash(),db.getLinkType(),hopcountMode,
dataNames,dataValues,priorities,eventNames);
rt.clearPreloadedValues();
}
discard();
}
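// Note (added for clarity, not original source): references are flushed in batches, one batch per
// (parent identifier hash, link type) bin, so that each jobManager.addDocuments() call carries a
// single hopcount relationship. Document priorities are computed against the reprioritization
// tracker's current minimum depth, with bin values preloaded before the batch is handed off.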
/** Check whether current job is still active.
* This method is provided to allow an individual connector that needs to wait on some long-term condition to give up waiting due to the job
* itself being aborted. If the connector should abort, this method will raise a properly-formed ServiceInterruption, which if thrown to the
* caller, will signal that the current processing activity remains incomplete and must be retried when the job is resumed.
*/
@Override
public void checkJobStillActive()
throws ManifoldCFException, ServiceInterruption
{
if (jobManager.checkJobActive(jobID) == false)
throw new ServiceInterruption("Job no longer active",System.currentTimeMillis());
}
/** Begin an event sequence.
* This method should be called by a connector when a sequencing event should enter the "pending" state. If the event is already in that state,
* this method will return false, otherwise true. The connector has the responsibility of appropriately managing sequencing given the response
* status.
*@param eventName is the event name.
*@return false if the event is already in the "pending" state.
*/
@Override
public boolean beginEventSequence(String eventName)
throws ManifoldCFException
{
return jobManager.beginEventSequence(processID,eventName);
}
/** Complete an event sequence.
* This method should be called to signal that an event is no longer in the "pending" state. This can mean that the prerequisite processing is
* completed, but it can also mean that prerequisite processing was aborted or cannot be completed.
* Note well: This method should not be called unless the connector is CERTAIN that an event is in progress, and that the current thread has
* the sole right to complete it. Otherwise, race conditions can develop which would be difficult to diagnose.
*@param eventName is the event name.
*/
@Override
public void completeEventSequence(String eventName)
throws ManifoldCFException
{
jobManager.completeEventSequence(eventName);
}
/** Abort processing a document (for sequencing reasons).
* This method should be called in order to cause the specified document to be requeued for later processing. While this is similar in some respects
* to the semantics of a ServiceInterruption, it is applicable to only one document at a time, and also does not specify any delay period, since it is
* presumed that the reason for the requeue is because of sequencing issues synchronized around an underlying event.
*@param localIdentifier is the document identifier to requeue
*/
@Override
public void retryDocumentProcessing(String localIdentifier)
throws ManifoldCFException
{
// Accumulate aborts
abortSet.add(localIdentifier);
}
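// Illustrative usage sketch (assumption, not original source): the sequencing pattern these three
// methods support. A connector that must perform a one-time prerequisite step (for example,
// establishing a shared session) would gate it on a named event; the event name and the
// performLogin() helper below are hypothetical.
//
//   if (activities.beginEventSequence("myconnector:login"))
//   {
//     try
//     {
//       performLogin();                                     // hypothetical prerequisite step
//     }
//     finally
//     {
//       activities.completeEventSequence("myconnector:login");
//     }
//   }
//   else
//   {
//     // Another thread owns the event; requeue this document and try again later.
//     activities.retryDocumentProcessing(documentIdentifier);
//   }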
/** Detect if a date is indexable or not. This method is used by participating repository connectors
* to filter out documents with unindexable dates before they are passed down the output pipeline.
*@param date is the date of the document; may be null.
*@return true if a document with that date is indexable by the downstream pipeline.
*/
public boolean checkDateIndexable(Date date)
throws ManifoldCFException, ServiceInterruption
{
return ingester.checkDateIndexable(
pipelineSpecification,date,
ingestLogger);
}
/** Check whether a mime type is indexable by the currently specified output connector.
*@param mimeType is the mime type to check, not including any character set specification.
*@return true if the mime type is indexable.
*/
@Override
public boolean checkMimeTypeIndexable(String mimeType)
throws ManifoldCFException, ServiceInterruption
{
return ingester.checkMimeTypeIndexable(
pipelineSpecification,mimeType,
ingestLogger);
}
/** Check whether a document is indexable by the currently specified output connector.
*@param localFile is the local copy of the file to check.
*@return true if the document is indexable.
*/
@Override
public boolean checkDocumentIndexable(File localFile)
throws ManifoldCFException, ServiceInterruption
{
return ingester.checkDocumentIndexable(
pipelineSpecification,localFile,
ingestLogger);
}
/** Check whether a document of a specified length is indexable by the currently specified output connector.
*@param length is the length to check.
*@return true if the document is indexable.
*/
@Override
public boolean checkLengthIndexable(long length)
throws ManifoldCFException, ServiceInterruption
{
return ingester.checkLengthIndexable(
pipelineSpecification,length,
ingestLogger);
}
/** Pre-determine whether a document's URL is indexable by this connector. This method is used by participating repository connectors
* to help filter out documents that are not worth indexing.
*@param url is the URL of the document.
*@return true if the file is indexable.
*/
@Override
public boolean checkURLIndexable(String url)
throws ManifoldCFException, ServiceInterruption
{
return ingester.checkURLIndexable(
pipelineSpecification,url,
ingestLogger);
}
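// Illustrative usage sketch (assumption, not original source): using the check* methods above to
// avoid fetching documents the output pipeline would reject anyway. The length, mime type, URI,
// and version values are hypothetical.
//
//   if (!activities.checkLengthIndexable(reportedLength) ||
//     !activities.checkMimeTypeIndexable(reportedMimeType) ||
//     !activities.checkURLIndexable(documentURI))
//   {
//     // Record the decision without fetching, so the version is remembered and churn is avoided.
//     activities.noDocument(documentIdentifier, versionString);
//   }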
/** Create a global string from a simple string.
*@param simpleString is the simple string.
*@return a global string.
*/
@Override
public String createGlobalString(String simpleString)
{
return ManifoldCF.createGlobalString(simpleString);
}
/** Create a connection-specific string from a simple string.
*@param simpleString is the simple string.
*@return a connection-specific string.
*/
@Override
public String createConnectionSpecificString(String simpleString)
{
return ManifoldCF.createConnectionSpecificString(connection.getName(),simpleString);
}
/** Create a job-based string from a simple string.
*@param simpleString is the simple string.
*@return a job-specific string.
*/
@Override
public String createJobSpecificString(String simpleString)
{
return ManifoldCF.createJobSpecificString(jobID,simpleString);
}
protected void checkAllComponentsMultipleDispositions(String documentIdentifier)
{
if (abortSet.contains(documentIdentifier))
throw new IllegalStateException("Multiple document dispositions not allowed: Abort cannot be combined with component disposition; document '"+documentIdentifier+"'");
if (documentDeletedSet.contains(documentIdentifier))
throw new IllegalStateException("Multiple document dispositions not allowed: Document delete cannot be combined with component disposition; document '"+documentIdentifier+"'");
Set<String> components = touchedComponentSet.get(documentIdentifier);
if (components != null && components.size() > 0)
throw new IllegalStateException("Multiple document dispositions not allowed: Retain all components cannot be combined with individual component disposition; document '"+documentIdentifier+"'");
}
protected void checkMultipleDispositions(String documentIdentifier, String componentIdentifier, String componentIdentifierHash)
{
if (abortSet.contains(documentIdentifier))
throw new IllegalStateException("Multiple document dispositions not allowed: Abort cannot be combined with component disposition; document '"+documentIdentifier+"'");
if (documentDeletedSet.contains(documentIdentifier))
throw new IllegalStateException("Multiple document dispositions not allowed: Document delete cannot be combined with component disposition; document '"+documentIdentifier+"'");
if (componentIdentifierHash == null)
{
// Primary
if (touchedPrimarySet.contains(documentIdentifier))
throw new IllegalStateException("Multiple document primary component dispositions not allowed: document '"+documentIdentifier+"'");
}
else
{
if (allComponentsSet.contains(documentIdentifier))
throw new IllegalStateException("Multiple document component dispositions not allowed: document '"+documentIdentifier+"', component '"+componentIdentifier+"'");
Set<String> components = touchedComponentSet.get(documentIdentifier);
if (components != null && components.contains(componentIdentifierHash))
throw new IllegalStateException("Multiple document component dispositions not allowed: document '"+documentIdentifier+"', component '"+componentIdentifier+"'");
}
}
protected void touchAllComponentsSet(String documentIdentifier)
{
allComponentsSet.add(documentIdentifier);
}
protected void touchComponentSet(String documentIdentifier, String componentIdentifierHash)
{
if (componentIdentifierHash == null)
{
touchedPrimarySet.add(documentIdentifier);
return;
}
Set<String> components = touchedComponentSet.get(documentIdentifier);
if (components == null)
{
components = new HashSet<String>();
touchedComponentSet.put(documentIdentifier,components);
}
components.add(componentIdentifierHash);
}
protected IPipelineSpecificationWithVersions computePipelineSpecificationWithVersions(String documentIdentifierHash,
String componentIdentifierHash,
String documentIdentifier)
{
QueuedDocument qd = previousDocuments.get(documentIdentifierHash);
if (qd == null)
throw new IllegalArgumentException("Unrecognized document identifier: '"+documentIdentifier+"'");
return new PipelineSpecificationWithVersions(pipelineSpecification,qd,componentIdentifierHash);
}
}
protected static String computeComponentIDHash(String componentIdentifier)
throws ManifoldCFException
{
if (componentIdentifier != null)
return ManifoldCF.hash(componentIdentifier);
else
return null;
}
/** DocumentBin class */
protected static class DocumentBin
{
protected String linkType;
protected String parentIdentifierHash;
public DocumentBin(String parentIdentifierHash, String linkType)
{
this.parentIdentifierHash = parentIdentifierHash;
this.linkType = linkType;
}
public String getParentIdentifierHash()
{
return parentIdentifierHash;
}
public String getLinkType()
{
return linkType;
}
public int hashCode()
{
return ((linkType==null)?0:linkType.hashCode()) + ((parentIdentifierHash==null)?0:parentIdentifierHash.hashCode());
}
public boolean equals(Object o)
{
if (!(o instanceof DocumentBin))
return false;
DocumentBin db = (DocumentBin)o;
if (linkType == null || db.linkType == null)
{
if (linkType != db.linkType)
return false;
}
else
{
if (!linkType.equals(db.linkType))
return false;
}
if (parentIdentifierHash == null || db.parentIdentifierHash == null)
{
if (parentIdentifierHash != db.parentIdentifierHash)
return false;
}
else
{
if (!parentIdentifierHash.equals(db.parentIdentifierHash))
return false;
}
return true;
}
}
/** Class describing document reference.
* Note: If the same document reference occurs multiple times, the data names and values should AGGREGATE, rather than the newer one replacing the older.
* Similar treatment will occur for prerequisites, although that's unlikely to be used.
*/
protected static class DocumentReference
{
protected String localIdentifierHash;
protected String localIdentifier;
protected DocumentBin db;
/** This hashmap is keyed by data name and has a list of the corresponding data values as its value */
protected HashMap data = new HashMap();
/** This hashmap contains the prerequisite event names */
protected HashMap prereqEvents = new HashMap();
public DocumentReference(String localIdentifierHash, String localIdentifier, DocumentBin db)
{
this.localIdentifierHash = localIdentifierHash;
this.localIdentifier = localIdentifier;
this.db = db;
}
/** Close all object data references. This should be called whenever a DocumentReference object is abandoned. */
public void discard()
throws ManifoldCFException
{
Iterator iter = data.keySet().iterator();
while (iter.hasNext())
{
String dataName = (String)iter.next();
ArrayList list = (ArrayList)data.get(dataName);
int i = 0;
while (i < list.size())
{
Object o = (Object)list.get(i++);
if (o instanceof CharacterInput)
((CharacterInput)o).discard();
}
}
}
public void addData(String[] dataNames, Object[][] dataValues)
{
if (dataNames == null || dataValues == null)
return;
int i = 0;
while (i < dataNames.length)
{
addData(dataNames[i],dataValues[i]);
i++;
}
}
public void addData(String dataName, Object[] dataValues)
{
if (dataName == null || dataValues == null)
return;
int i = 0;
while (i < dataValues.length)
{
addData(dataName,dataValues[i++]);
}
}
public void addData(String dataName, Object dataValue)
{
if (dataName == null)
return;
ArrayList valueMap = (ArrayList)data.get(dataName);
if (valueMap == null)
{
valueMap = new ArrayList();
data.put(dataName,valueMap);
}
// Without the hash value, it's impossible to keep track of value uniqueness in this layer. So, I've removed any attempts to do so; jobManager.addDocuments()
// will have to do that job instead.
valueMap.add(dataValue);
}
public void addPrerequisiteEvents(String[] eventNames)
{
if (eventNames == null)
return;
int i = 0;
while (i < eventNames.length)
{
addPrerequisiteEvent(eventNames[i++]);
}
}
public void addPrerequisiteEvent(String eventName)
{
prereqEvents.put(eventName,eventName);
}
public DocumentBin getKey()
{
return db;
}
public String getLocalIdentifierHash()
{
return localIdentifierHash;
}
public String getLocalIdentifier()
{
return localIdentifier;
}
public String[] getPrerequisiteEventNames()
{
String[] rval = new String[prereqEvents.size()];
int i = 0;
Iterator iter = prereqEvents.keySet().iterator();
while (iter.hasNext())
{
rval[i++] = (String)iter.next();
}
return rval;
}
public String[] getDataNames()
{
String[] rval = new String[data.size()];
int i = 0;
Iterator iter = data.keySet().iterator();
while (iter.hasNext())
{
String dataName = (String)iter.next();
rval[i++] = dataName;
}
return rval;
}
public Object[][] getDataValues()
{
// Presumably the values will correspond with the names ONLY if no changes have occurred to the hash table.
Object[][] rval = new Object[data.size()][];
int i = 0;
Iterator iter = data.keySet().iterator();
while (iter.hasNext())
{
String dataName = (String)iter.next();
ArrayList values = (ArrayList)data.get(dataName);
Object[] valueArray = new Object[values.size()];
rval[i] = valueArray;
int j = 0;
while (j < valueArray.length)
{
valueArray[j] = values.get(j);
j++;
}
i++;
}
return rval;
}
public boolean equals(Object o)
{
if (!(o instanceof DocumentReference))
return false;
DocumentReference other = (DocumentReference)o;
if (!other.localIdentifierHash.equals(localIdentifierHash))
return false;
return other.db.equals(db);
}
public int hashCode()
{
return localIdentifierHash.hashCode() + db.hashCode();
}
}
/** Class that represents a decision to process a document.
*/
protected static class DocumentToProcess
{
protected QueuedDocument document;
protected boolean scanOnly;
/** Construct.
*@param document is the document to process.
*@param scanOnly is true if the document should be scanned, but not ingested.
*/
public DocumentToProcess(QueuedDocument document, boolean scanOnly)
{
this.document = document;
this.scanOnly = scanOnly;
}
/** Get the document.
*@return the document.
*/
public QueuedDocument getDocument()
{
return document;
}
/** Get the 'scan only' flag.
*@return true if only scan should be attempted.
*/
public boolean getScanOnly()
{
return scanOnly;
}
}
/** The check activity class */
protected static class CheckActivity implements IOutputCheckActivity
{
public CheckActivity()
{
}
/** Detect if a date is acceptable downstream or not. This method is used to determine whether it makes sense to fetch a document
* in the first place.
*@param date is the document's date
*@return true if the document with that date can be accepted by the downstream connection.
*/
@Override
public boolean checkDateIndexable(Date date)
throws ManifoldCFException, ServiceInterruption
{
return false;
}
/** Detect if a mime type is acceptable downstream or not. This method is used to determine whether it makes sense to fetch a document
* in the first place.
*@param mimeType is the mime type of the document.
*@return true if the mime type can be accepted by the downstream connection.
*/
@Override
public boolean checkMimeTypeIndexable(String mimeType)
throws ManifoldCFException, ServiceInterruption
{
return false;
}
/** Pre-determine whether a document (passed here as a File object) is acceptable downstream. This method is
* used to determine whether a document needs to be actually transferred. This hook is provided mainly to support
* search engines that only handle a small set of accepted file types.
*@param localFile is the local file to check.
*@return true if the file is acceptable by the downstream connection.
*/
@Override
public boolean checkDocumentIndexable(File localFile)
throws ManifoldCFException, ServiceInterruption
{
return false;
}
/** Pre-determine whether a document's length is acceptable downstream. This method is used
* to determine whether to fetch a document in the first place.
*@param length is the length of the document.
*@return true if the file is acceptable by the downstream connection.
*/
@Override
public boolean checkLengthIndexable(long length)
throws ManifoldCFException, ServiceInterruption
{
return false;
}
/** Pre-determine whether a document's URL is acceptable downstream. This method is used
* to help filter out documents that cannot be indexed in advance.
*@param url is the URL of the document.
*@return true if the file is acceptable by the downstream connection.
*/
@Override
public boolean checkURLIndexable(String url)
throws ManifoldCFException, ServiceInterruption
{
return false;
}
}
/** The implementation of the IExistingVersions interface.
*/
protected static class ExistingVersions implements IExistingVersions
{
protected final Map<String,QueuedDocument> map;
protected final String lastOutputConnectionName;
public ExistingVersions(String lastOutputConnectionName, List<QueuedDocument> list)
{
this.lastOutputConnectionName = lastOutputConnectionName;
this.map = new HashMap<String,QueuedDocument>();
for (QueuedDocument qd : list)
{
map.put(qd.getDocumentDescription().getDocumentIdentifier(),qd);
}
}
/** Retrieve an existing version string given a document identifier.
*@param documentIdentifier is the document identifier.
*@return the document version string, or null if the document was never previously indexed.
*/
@Override
public String getIndexedVersionString(String documentIdentifier)
throws ManifoldCFException
{
return getIndexedVersionString(documentIdentifier,null);
}
/** Retrieve a component existing version string given a document identifier.
*@param documentIdentifier is the document identifier.
*@param componentIdentifier is the component identifier, if any.
*@return the document version string, or null if the document component was never previously indexed.
*/
@Override
public String getIndexedVersionString(String documentIdentifier, String componentIdentifier)
throws ManifoldCFException
{
QueuedDocument qd = map.get(documentIdentifier);
DocumentIngestStatusSet status = qd.getLastIngestedStatus(lastOutputConnectionName);
if (status == null)
return null;
String componentIdentifierHash;
if (componentIdentifier == null)
componentIdentifierHash = null;
else
componentIdentifierHash = ManifoldCF.hash(componentIdentifier);
DocumentIngestStatus s = status.getComponent(componentIdentifierHash);
if (s == null)
return null;
return s.getDocumentVersion();
}
}
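// Illustrative usage sketch (assumption, not original source): how a connector typically uses
// IExistingVersions inside processDocuments() to skip unchanged documents. The "statuses" handle
// and the computeVersionString() helper below are hypothetical.
//
//   String previousVersion = statuses.getIndexedVersionString(documentIdentifier);
//   String newVersion = computeVersionString(documentIdentifier);   // hypothetical helper
//   if (previousVersion != null && previousVersion.equals(newVersion))
//   {
//     // Already indexed at this version; nothing needs to be re-sent downstream.
//   }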
/** The ingest logger class */
protected static class OutputActivity extends CheckActivity implements IOutputActivity
{
// Connection name
protected final String connectionName;
// Connection manager
protected final IRepositoryConnectionManager connMgr;
/** Constructor */
public OutputActivity(String connectionName, IRepositoryConnectionManager connMgr)
{
this.connectionName = connectionName;
this.connMgr = connMgr;
}
/** Record time-stamped information about the activity of the output connector.
*@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970). Every
* activity has an associated time; the startTime field records when the activity began. A null value
* indicates that the start time and the finishing time are the same.
*@param activityType is a string which is fully interpretable only in the context of the connector involved, which is
* used to categorize what kind of activity is being recorded. For example, a web connector might record a
* "fetch document" activity. Cannot be null.
*@param dataSize is the number of bytes of data involved in the activity, or null if not applicable.
*@param entityURI is a (possibly long) string which identifies the object involved in the history record.
* The interpretation of this field will differ from connector to connector. May be null.
*@param resultCode contains a terse description of the result of the activity. The description is limited in
* size to 255 characters, and can be interpreted only in the context of the current connector. May be null.
*@param resultDescription is a (possibly long) human-readable string which adds detail, if required, to the result
* described in the resultCode field. This field is not meant to be queried on. May be null.
*/
@Override
public void recordActivity(Long startTime, String activityType, Long dataSize,
String entityURI, String resultCode, String resultDescription)
throws ManifoldCFException
{
connMgr.recordHistory(connectionName,startTime,activityType,dataSize,entityURI,resultCode,
resultDescription,null);
}
/** Qualify an access token appropriately, to match access tokens as returned by mod_aa. This method
* includes the authority name with the access token, if any, so that each authority may establish its own token space.
*@param authorityNameString is the name of the authority to use to qualify the access token.
*@param accessToken is the raw, repository access token.
*@return the properly qualified access token.
*/
@Override
public String qualifyAccessToken(String authorityNameString, String accessToken)
throws ManifoldCFException
{
if (authorityNameString == null)
return URLEncoder.encode(accessToken);
else
return URLEncoder.encode(authorityNameString) + ":" + URLEncoder.encode(accessToken);
}
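// Worked example (added for clarity, not original source): a token "engineering" governed by an
// authority named "myAD" is qualified as "myAD:engineering" (both parts URL-encoded), while a
// token with no governing authority is emitted as just the encoded token. The "activity" handle
// and token values are hypothetical.
//
//   String qualified = activity.qualifyAccessToken("myAD", "engineering");
//   // qualified.equals("myAD:engineering") under these assumptions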
/** Send a document via the pipeline to the next output connection.
*@param documentURI is the document's URI.
*@param document is the document data to be processed (handed to the output data store).
*@return the document status (accepted or permanently rejected); return codes are listed in IPipelineConnector.
*@throws IOException only if there's an IO error reading the data from the document.
*/
@Override
public int sendDocument(String documentURI, RepositoryDocument document)
throws ManifoldCFException, ServiceInterruption, IOException
{
// No downstream connection at output connection level.
return IPipelineConnector.DOCUMENTSTATUS_REJECTED;
}
/** Send NO document via the pipeline to the next output connection. This is equivalent
* to sending an empty document placeholder.
*/
@Override
public void noDocument()
throws ManifoldCFException, ServiceInterruption
{
}
}
}