// org.apache.manifoldcf.crawler.system.SeedingActivity Maven / Gradle / Ivy

// Go to download
/* $Id: SeedingActivity.java 988245 2010-08-23 18:39:35Z kwright $ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.system;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import java.util.*;
import java.lang.reflect.*;

/** Implementation of the services the framework offers to a connector while it is
* seeding a job.
*
* Seed identifiers handed to this object are buffered in fixed-size arrays and passed on to
* the job manager in batches of at most MAX_COUNT entries; doneSeeding() flushes whatever is
* still buffered.  This class performs no synchronization of its own.
*/
public class SeedingActivity implements ISeedingActivity
{
  public static final String _rcsid = "@(#)$Id: SeedingActivity.java 988245 2010-08-23 18:39:35Z kwright $";

  // Upper bound on the number of documents handed to the job manager in a single call.
  protected static final int MAX_COUNT = 100;

  // Collaborators and job parameters, all supplied through the constructor.
  protected final String processID;
  protected final String connectionName;
  protected final IRepositoryConnectionManager connManager;
  protected final IJobManager jobManager;
  protected final IReprioritizationTracker rt;
  protected final IRepositoryConnection connection;
  protected final IRepositoryConnector connector;
  protected final Long jobID;
  protected final String[] legalLinkTypes;
  protected final boolean overrideSchedule;
  protected final int hopcountMethod;
  
  // Buffer of seeds waiting to be queued.  The three arrays are parallel: slot i holds the
  // hash, the identifier and the prerequisite event names of the same document.  Only the
  // first documentCount slots are meaningful.
  protected final String[] documentHashList = new String[MAX_COUNT];
  protected final String[] documentList = new String[MAX_COUNT];
  protected final String[][] documentPrereqList = new String[MAX_COUNT][];
  protected int documentCount = 0;
  // Buffer of hashes of seeds that are recorded but not queued.  Only the first
  // remainingDocumentCount slots are meaningful.
  protected final String[] remainingDocumentHashList = new String[MAX_COUNT];
  protected int remainingDocumentCount = 0;

  /** Constructor.
  * Stores every argument in the field of the same name; no other work is done.
  */
  public SeedingActivity(String connectionName, IRepositoryConnectionManager connManager,
    IJobManager jobManager,
    IReprioritizationTracker rt, IRepositoryConnection connection, IRepositoryConnector connector,
    Long jobID, String[] legalLinkTypes, boolean overrideSchedule, int hopcountMethod, String processID)
  {
    this.processID = processID;
    this.connectionName = connectionName;
    this.connManager = connManager;
    this.jobManager = jobManager;
    this.rt = rt;
    this.connection = connection;
    this.connector = connector;
    this.jobID = jobID;
    this.legalLinkTypes = legalLinkTypes;
    this.overrideSchedule = overrideSchedule;
    this.hopcountMethod = hopcountMethod;
  }

  /** Record a "seed" document identifier, together with its prerequisite events.
  * The identifier is buffered; once MAX_COUNT identifiers have accumulated, the next call
  * first prioritizes and writes the buffered batch to the job manager.  Whatever is still
  * buffered when seeding ends is written by doneSeeding().
  *
  * Passing more identifiers than strictly necessary is acceptable; it only affects the
  * overall amount of work required.
  *
  *@param documentIdentifier is the identifier of the document to add to the "pending" queue.
  *@param prereqEventNames is the list of prerequisite events required for this document, or null if none.
  */
  public void addSeedDocument(String documentIdentifier, String[] prereqEventNames)
    throws ManifoldCFException
  {
    if (documentCount == MAX_COUNT)
    {
      // The buffer is completely full, so the backing arrays can be handed over as they are.
      writeSeedDocuments(documentHashList,documentList,documentPrereqList);
      documentCount = 0;
    }
    // Hash first: if hashing fails, nothing has been stored and the count is unchanged.
    documentHashList[documentCount] = ManifoldCF.hash(documentIdentifier);
    documentList[documentCount] = documentIdentifier;
    // A null prerequisite list is stored as null, meaning "no prerequisites".
    documentPrereqList[documentCount] = prereqEventNames;
    documentCount++;
  }

  /** Record a "seed" document identifier that has no prerequisite events.
  * Equivalent to calling addSeedDocument(documentIdentifier,null).
  *
  *@param documentIdentifier is the identifier of the document to add to the "pending" queue.
  */
  public void addSeedDocument(String documentIdentifier)
    throws ManifoldCFException
  {
    addSeedDocument(documentIdentifier,null);
  }

  /** Record a document identifier that should be considered part of the seeds, but does not
  * need to be queued for processing at this time.  (This is used to keep the hopcount tables
  * up to date.)  Identifiers that were also passed to addSeedDocument() may be passed here as
  * well.
  *
  * Only the hash of the identifier is buffered; once MAX_COUNT hashes have accumulated, the
  * next call first flushes the buffered batch to the job manager.
  *
  *@param documentIdentifier is the identifier of the document to consider as a seed, but not to put in the
  * "pending" queue.
  */
  public void addUnqueuedSeedDocument(String documentIdentifier)
    throws ManifoldCFException
  {
    if (remainingDocumentCount == MAX_COUNT)
    {
      // The buffer is completely full, so the backing array can be handed over as it is.
      jobManager.addRemainingDocumentsInitial(processID,jobID,legalLinkTypes,remainingDocumentHashList,hopcountMethod);
      remainingDocumentCount = 0;
    }
    // Compute the hash before touching the counter.  Writing
    // "list[count++] = hash(...)" would advance the counter before the hash is evaluated,
    // leaving an unwritten slot counted as valid if hashing throws.
    final String documentHash = ManifoldCF.hash(documentIdentifier);
    remainingDocumentHashList[remainingDocumentCount] = documentHash;
    remainingDocumentCount++;
  }

  /** Finish a seeding pass.
  * Flushes any buffered seeds and any buffered unqueued seeds to the job manager, and then
  * signals the job manager that seeding is done.
  *
  *@param isPartial is passed through unchanged to the job manager's doneDocumentsInitial() call.
  */
  public void doneSeeding(boolean isPartial)
    throws ManifoldCFException
  {
    if (documentCount > 0)
    {
      // The buffer is only partly filled; hand over arrays trimmed to the valid prefix.
      writeSeedDocuments(
        Arrays.copyOf(documentHashList,documentCount),
        Arrays.copyOf(documentList,documentCount),
        Arrays.copyOf(documentPrereqList,documentCount));
      documentCount = 0;
    }
    if (remainingDocumentCount > 0)
    {
      final String[] remainingHashes = Arrays.copyOf(remainingDocumentHashList,remainingDocumentCount);
      jobManager.addRemainingDocumentsInitial(processID,jobID,legalLinkTypes,remainingHashes,hopcountMethod);
      remainingDocumentCount = 0;
    }

    // Need to signal JobManager that seeding is done.
    jobManager.doneDocumentsInitial(jobID,legalLinkTypes,isPartial,hopcountMethod);
  }

  /** Record time-stamped information about the activity of the connector.
  * The record is written through the connection manager, under this object's connection name.
  *
  *@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970).  Every
  *       activity has an associated time; the startTime field records when the activity began.  A null value
  *       indicates that the start time and the finishing time are the same.
  *@param activityType is a string which is fully interpretable only in the context of the connector involved, which is
  *       used to categorize what kind of activity is being recorded.  For example, a web connector might record a
  *       "fetch document" activity.  Cannot be null.
  *@param dataSize is the number of bytes of data involved in the activity, or null if not applicable.
  *@param entityIdentifier is a (possibly long) string which identifies the object involved in the history record.
  *       The interpretation of this field will differ from connector to connector.  May be null.
  *@param resultCode contains a terse description of the result of the activity.  The description is limited in
  *       size to 255 characters, and can be interpreted only in the context of the current connector.  May be null.
  *@param resultDescription is a (possibly long) human-readable string which adds detail, if required, to the result
  *       described in the resultCode field.  This field is not meant to be queried on.  May be null.
  *@param childIdentifiers is a set of child entity identifiers associated with this activity.  May be null.
  */
  public void recordActivity(Long startTime, String activityType, Long dataSize,
    String entityIdentifier, String resultCode, String resultDescription, String[] childIdentifiers)
    throws ManifoldCFException
  {
    connManager.recordHistory(connectionName,startTime,activityType,dataSize,entityIdentifier,resultCode,
      resultDescription,childIdentifiers);
  }

  /** Calculate priorities for the specified documents and write them to the job manager.
  * The three arrays are parallel and must have the same length.
  *
  *@param docIDHashes are the hashes of the document identifiers.
  *@param docIDs are the document identifiers.
  *@param prereqEventNames are the prerequisite event names for each document; an entry may be null.
  */
  protected void writeSeedDocuments(String[] docIDHashes, String[] docIDs, String[][] prereqEventNames)
    throws ManifoldCFException
  {
    final int batchSize = docIDHashes.length;
    final IPriorityCalculator[] docPriorities = new IPriorityCalculator[batchSize];

    // Start from a clean set of preload requests, let every calculator register its
    // request, and only then preload the bin values for the whole batch.
    rt.clearPreloadRequests();

    for (int i = 0 ; i < batchSize ; i++)
    {
      final String docID = docIDs[i];
      // The connector decides which bins the document belongs to.
      final String[] bins = connector.getBinNames(docID);
      final PriorityCalculator calculator = new PriorityCalculator(rt,connection,bins,docID);
      docPriorities[i] = calculator;
      calculator.makePreloadRequest();
    }

    rt.preloadBinValues();

    jobManager.addDocumentsInitial(processID,
      jobID,legalLinkTypes,docIDHashes,docIDs,overrideSchedule,hopcountMethod,
      docPriorities,prereqEventNames);
  }

  /** Check whether current job is still active.
  * This method is provided to allow an individual connector that needs to wait on some long-term condition to give up waiting due to the job
  * itself being aborted.  If the job manager reports that the job is not active, a ServiceInterruption carrying the message
  * "Job no longer active" and the current time is thrown; otherwise the method returns normally.
  */
  public void checkJobStillActive()
    throws ManifoldCFException, ServiceInterruption
  {
    if (!jobManager.checkJobActive(jobID))
      throw new ServiceInterruption("Job no longer active",System.currentTimeMillis());
  }

  /** Create a global string from a simple string.
  *@param simpleString is the simple string.
  *@return a global string.
  */
  public String createGlobalString(String simpleString)
  {
    return ManifoldCF.createGlobalString(simpleString);
  }

  /** Create a connection-specific string from a simple string.
  * The name of this object's repository connection is used to qualify the string.
  *@param simpleString is the simple string.
  *@return a connection-specific string.
  */
  public String createConnectionSpecificString(String simpleString)
  {
    return ManifoldCF.createConnectionSpecificString(connection.getName(),simpleString);
  }

  /** Create a job-based string from a simple string.
  * This object's job ID is used to qualify the string.
  *@param simpleString is the simple string.
  *@return a job-specific string.
  */
  public String createJobSpecificString(String simpleString)
  {
    return ManifoldCF.createJobSpecificString(jobID,simpleString);
  }

}