All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.manifoldcf.crawler.interfaces.IProcessActivity Maven / Gradle / Ivy

/* $Id: IProcessActivity.java 988245 2010-08-23 18:39:35Z kwright $ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.interfaces;

import java.io.*;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;

/**
 * Abstraction of the activities available to a connector's processDocuments() method.
 *
 * The expected processing flow for a document is roughly as follows:
 * (1) processDocuments() on the connector is invoked with the set of documents to process.
 * (2) For each document in that set, the connector computes a version string, as part of
 *     deciding whether the document really needs to be refetched.
 * (3) Every processed document ends up with one of these dispositions:
 *   (a) The document no longer exists: deleteDocument() is called for it.
 *   (b) The document is (re)indexed: ingestDocumentWithException() is called for it.
 *   (c) The document is unchanged and nothing needs updating: no call is needed for it.
 *   (d) The document is unchanged BUT its version string must be updated: recordDocument()
 *       is called for it.
 *   (e) The document cannot be indexed BUT still exists in the repository: noDocument()
 *       is called for it.
 *   (f) A service interruption occurred: ServiceInterruption is thrown.
 * (4) checkDocumentNeedsReindexing() is available to give an opinion on whether a document
 *     needs to be reindexed.
 */
public interface IProcessActivity extends IHistoryActivity, IEventActivity, IAbortActivity, IFingerprintActivity, ICarrydownActivity
{
  /** Revision-control identification string for this file. */
  String _rcsid = "@(#)$Id: IProcessActivity.java 988245 2010-08-23 18:39:35Z kwright $";

  /**
   * Decide, from a computed version string, whether a document has to be reindexed.
   * Pass in the newly-computed version string; the result is true when the document
   * needs to be re-indexed.
   *
   * @param documentIdentifier the document identifier.
   * @param newVersionString the newly-computed version string.
   * @return true if the document needs to be reindexed.
   */
  boolean checkDocumentNeedsReindexing(String documentIdentifier, String newVersionString)
    throws ManifoldCFException;

  /**
   * Decide, from a computed version string, whether a document has to be reindexed.
   * Pass in the newly-computed version string; the result is true when the document
   * needs to be re-indexed.
   *
   * @param documentIdentifier the document identifier.
   * @param componentIdentifier the component document identifier, if any.
   * @param newVersionString the newly-computed version string.
   * @return true if the document needs to be reindexed.
   */
  boolean checkDocumentNeedsReindexing(String documentIdentifier, String componentIdentifier, String newVersionString)
    throws ManifoldCFException;

  /**
   * Queue a document description on the current job.
   *
   * @param documentIdentifier the local document identifier to add (for the connector that
   *        fetched the document).
   * @param parentIdentifier the document identifier considered to be the "parent" of this
   *        identifier.  May be null if no hopcount filtering is desired for this kind of
   *        relationship.  MUST be present in the case of carrydown information.
   * @param relationshipType the string describing the kind of relationship this reference
   *        represents.  Must be one of the strings returned by the IRepositoryConnector
   *        method "getRelationshipTypes()".  May be null.
   * @param dataNames the list of carry-down data from the parent to the child.  May be null.
   *        Each name is limited to 255 characters!
   * @param dataValues the values corresponding to the names in dataNames.  May be null only
   *        if dataNames is null.  Each object must be either a String or a CharacterInput.
   * @param originationTime the time, in ms since epoch, that the document originated.
   *        Pass null if none or unknown.
   * @param prereqEventNames the names of the prerequisite events this document requires
   *        prior to processing.  Pass null if none.
   */
  void addDocumentReference(String documentIdentifier, String parentIdentifier, String relationshipType,
    String[] dataNames, Object[][] dataValues, Long originationTime, String[] prereqEventNames)
    throws ManifoldCFException;

  /**
   * Queue a document description on the current job.
   *
   * @param documentIdentifier the document identifier to add (for the connector that
   *        fetched the document).
   * @param parentIdentifier the document identifier considered to be the "parent" of this
   *        identifier.  May be null if no hopcount filtering is desired for this kind of
   *        relationship.  MUST be present in the case of carrydown information.
   * @param relationshipType the string describing the kind of relationship this reference
   *        represents.  Must be one of the strings returned by the IRepositoryConnector
   *        method "getRelationshipTypes()".  May be null.
   * @param dataNames the list of carry-down data from the parent to the child.  May be null.
   *        Each name is limited to 255 characters!
   * @param dataValues the values corresponding to the names in dataNames.  May be null only
   *        if dataNames is null.  Each object must be either a String or a CharacterInput.
   * @param originationTime the time, in ms since epoch, that the document originated.
   *        Pass null if none or unknown.
   */
  void addDocumentReference(String documentIdentifier, String parentIdentifier, String relationshipType,
    String[] dataNames, Object[][] dataValues, Long originationTime)
    throws ManifoldCFException;

  /**
   * Queue a document description on the current job.
   *
   * @param documentIdentifier the document identifier to add (for the connector that
   *        fetched the document).
   * @param parentIdentifier the document identifier considered to be the "parent" of this
   *        identifier.  May be null if no hopcount filtering is desired for this kind of
   *        relationship.  MUST be present in the case of carrydown information.
   * @param relationshipType the string describing the kind of relationship this reference
   *        represents.  Must be one of the strings returned by the IRepositoryConnector
   *        method "getRelationshipTypes()".  May be null.
   * @param dataNames the list of carry-down data from the parent to the child.  May be null.
   *        Each name is limited to 255 characters!
   * @param dataValues the values corresponding to the names in dataNames.  May be null only
   *        if dataNames is null.  Each object must be either a String or a CharacterInput.
   */
  void addDocumentReference(String documentIdentifier, String parentIdentifier, String relationshipType,
    String[] dataNames, Object[][] dataValues)
    throws ManifoldCFException;

  /**
   * Queue a document description on the current job.
   *
   * @param documentIdentifier the document identifier to add (for the connector that
   *        fetched the document).
   * @param parentIdentifier the document identifier considered to be the "parent" of this
   *        identifier.  May be null if no hopcount filtering is desired for this kind of
   *        relationship.
   * @param relationshipType the string describing the kind of relationship this reference
   *        represents.  Must be one of the strings returned by the IRepositoryConnector
   *        method "getRelationshipTypes()".  May be null.
   */
  void addDocumentReference(String documentIdentifier, String parentIdentifier, String relationshipType)
    throws ManifoldCFException;

  /**
   * Queue a document description on the current job.  Equivalent to
   * addDocumentReference(documentIdentifier,null,null).
   *
   * @param documentIdentifier the document identifier to add (for the connector that
   *        fetched the document).
   */
  void addDocumentReference(String documentIdentifier)
    throws ManifoldCFException;

  /**
   * Ingest the current document.
   *
   * @param documentIdentifier the document's identifier.
   * @param version the version of the document, as reported by the getDocumentVersions()
   *        method of the corresponding repository connector.  An empty version string
   *        signals that there is no calculable document version string, and that the
   *        document should always be indexed.
   * @param documentURI the URI used to retrieve this document from the search interface
   *        (it is also the unique key in the index).
   * @param data the document data.  The data is closed after ingestion is complete.
   * @throws IOException only when data stream reading fails.
   */
  void ingestDocumentWithException(String documentIdentifier, String version, String documentURI,
    RepositoryDocument data)
    throws ManifoldCFException, ServiceInterruption, IOException;

  /**
   * Ingest the current document.
   *
   * @param documentIdentifier the document's identifier.
   * @param componentIdentifier the component document identifier, if any.
   * @param version the version of the document, as reported by the getDocumentVersions()
   *        method of the corresponding repository connector.
   * @param documentURI the URI used to retrieve this document from the search interface
   *        (it is also the unique key in the index).
   * @param data the document data.  The data is closed after ingestion is complete.
   * @throws IOException only when data stream reading fails.
   */
  void ingestDocumentWithException(String documentIdentifier, String componentIdentifier,
    String version, String documentURI, RepositoryDocument data)
    throws ManifoldCFException, ServiceInterruption, IOException;

  /**
   * Remove the given document from the search engine index, and update the version
   * information recorded for it.
   *
   * @param documentIdentifier the document's local identifier.
   * @param version the version string to be recorded for the document.
   */
  void noDocument(String documentIdentifier, String version)
    throws ManifoldCFException, ServiceInterruption;

  /**
   * Remove the given document from the search engine index, and update the version
   * information recorded for it.
   *
   * @param documentIdentifier the document's local identifier.
   * @param componentIdentifier the component document identifier, if any.
   * @param version the version string to be recorded for the document.
   */
  void noDocument(String documentIdentifier, String componentIdentifier, String version)
    throws ManifoldCFException, ServiceInterruption;

  /**
   * Permanently remove the primary component of the given document from the search engine
   * index and from the status table.  Use this when your document has components and now
   * also has a primary document, but will not have a primary document again for the
   * foreseeable future.  This is a rare situation.
   *
   * @param documentIdentifier the document's identifier.
   */
  void removeDocument(String documentIdentifier)
    throws ManifoldCFException, ServiceInterruption;

  /**
   * Retain an existing document component.  Call this to signal that an already-existing
   * document component does not need to be reindexed.  By default, components that are
   * not mentioned during processing are removed.
   *
   * @param documentIdentifier the document's identifier.
   * @param componentIdentifier the component document identifier, which cannot be null.
   */
  void retainDocument(String documentIdentifier, String componentIdentifier)
    throws ManifoldCFException;

  /**
   * Retain every existing document component of a primary document.  Call this to signal
   * that no document components need to be reindexed.  By default, components that are
   * not mentioned during processing are removed.
   *
   * @param documentIdentifier the document's identifier.
   */
  void retainAllComponentDocument(String documentIdentifier)
    throws ManifoldCFException;

  /**
   * Record a document version WITHOUT reindexing or removing the document.  (Other
   * documents with the same URL, however, will still be removed.)  Useful when the
   * version string changes but the document contents are known not to have changed.
   *
   * @param documentIdentifier the document identifier.
   * @param version the document version.
   */
  void recordDocument(String documentIdentifier, String version)
    throws ManifoldCFException;

  /**
   * Record a document version WITHOUT reindexing or removing the document.  (Other
   * documents with the same URL, however, will still be removed.)  Useful when the
   * version string changes but the document contents are known not to have changed.
   *
   * @param documentIdentifier the document identifier.
   * @param componentIdentifier the component document identifier, if any.
   * @param version the document version.
   */
  void recordDocument(String documentIdentifier, String componentIdentifier, String version)
    throws ManifoldCFException;

  /**
   * Permanently delete the given document, along with all its components, from the search
   * engine index and from the status table.
   * No document version information is kept for the document, which can lead to "churn":
   * the same document being queued, processed, and removed on subsequent crawls.  Prefer
   * noDocument() in any case where the same decision will need to be made over and over.
   *
   * @param documentIdentifier the document's identifier.
   */
  void deleteDocument(String documentIdentifier)
    throws ManifoldCFException;

  /**
   * Override the schedule for the next time a document is crawled.
   * This lets you set an upper recrawl bound, lower recrawl bound, upper expire bound,
   * lower expire bound, or any combination of these, on a specific document.  It is only
   * effective if the job is a continuous one, and if the identifier passed in is being
   * processed.
   *
   * @param documentIdentifier the document's identifier.
   * @param lowerRecrawlBoundTime the time in ms since epoch that the reschedule time should
   *        not fall BELOW, or null if none.
   * @param upperRecrawlBoundTime the time in ms since epoch that the reschedule time should
   *        not rise ABOVE, or null if none.
   * @param lowerExpireBoundTime the time in ms since epoch that the expire time should not
   *        fall BELOW, or null if none.
   * @param upperExpireBoundTime the time in ms since epoch that the expire time should not
   *        rise ABOVE, or null if none.
   */
  void setDocumentScheduleBounds(String documentIdentifier, Long lowerRecrawlBoundTime,
    Long upperRecrawlBoundTime, Long lowerExpireBoundTime, Long upperExpireBoundTime)
    throws ManifoldCFException;

  /**
   * Override a document's origination time.
   * Call this to tell the framework that a document's origination time is something other
   * than the first time it was crawled.
   *
   * @param documentIdentifier the document's identifier.
   * @param originationTime the document's origination time, or null if unknown.
   */
  void setDocumentOriginationTime(String documentIdentifier, Long originationTime)
    throws ManifoldCFException;

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy