// org.apache.manifoldcf.crawler.interfaces.IProcessActivity Maven / Gradle / Ivy
/* $Id: IProcessActivity.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.interfaces;
import java.io.*;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
/**
 * Abstraction of the activities available to a connector's processDocuments() method.
 *
 * The expected processing flow for a document is roughly:
 *
 * (1) The connector's processDocuments() method is called with a set of documents to process.
 * (2) The connector computes a version string for each document in the set, as part of deciding
 *     whether the document really needs to be refetched.
 * (3) Each processed document ends up with one of several dispositions:
 *     (a) The document does not exist (anymore): deleteDocument() is called for it.
 *     (b) The document is (re)indexed: ingestDocumentWithException() is called for it.
 *     (c) The document is unchanged and no updates are needed: nothing has to be called for it.
 *     (d) The document is unchanged BUT its version string must be updated: recordDocument()
 *         is called for it.
 *     (e) The document is unindexable BUT still exists in the repository: noDocument()
 *         is called for it.
 *     (f) There was a service interruption: ServiceInterruption is thrown.
 * (4) To help decide whether a document needs reindexing, checkDocumentNeedsReindexing()
 *     is available to give an opinion on the matter.
 */
public interface IProcessActivity
  extends IHistoryActivity, IEventActivity, IAbortActivity, IFingerprintActivity, ICarrydownActivity
{
  // Interface members are implicitly public (and fields implicitly static final),
  // so the redundant modifiers are omitted throughout.
  String _rcsid = "@(#)$Id: IProcessActivity.java 988245 2010-08-23 18:39:35Z kwright $";

  /**
   * Decide whether a document must be reindexed, given a freshly computed version string.
   *
   * @param documentIdentifier is the document identifier.
   * @param newVersionString is the newly-computed version string.
   * @return true if the document needs to be reindexed.
   */
  boolean checkDocumentNeedsReindexing(
    String documentIdentifier,
    String newVersionString)
    throws ManifoldCFException;

  /**
   * Decide whether a document (or one of its components) must be reindexed, given a freshly
   * computed version string.
   *
   * @param documentIdentifier is the document identifier.
   * @param componentIdentifier is the component document identifier, if any.
   * @param newVersionString is the newly-computed version string.
   * @return true if the document needs to be reindexed.
   */
  boolean checkDocumentNeedsReindexing(
    String documentIdentifier,
    String componentIdentifier,
    String newVersionString)
    throws ManifoldCFException;

  /**
   * Add a document description to the current job's queue, with carry-down data, an
   * origination time, and prerequisite events.
   *
   * @param documentIdentifier is the local document identifier to add (for the connector that
   *   fetched the document).
   * @param parentIdentifier is the identifier of the document considered to be the "parent" of
   *   this identifier. May be null if no hopcount filtering is desired for this kind of
   *   relationship. MUST be present when carrydown information is supplied.
   * @param relationshipType is the string describing the kind of relationship this reference
   *   represents. It must be one of the strings returned by the IRepositoryConnector method
   *   "getRelationshipTypes()". May be null.
   * @param dataNames is the list of carry-down data names passed from the parent to the child.
   *   May be null. Each name is limited to 255 characters!
   * @param dataValues are the values corresponding to the names in dataNames. May be null only
   *   if dataNames is null. Each object must be either a String or a CharacterInput.
   * @param originationTime is the time, in ms since epoch, that the document originated.
   *   Pass null if none or unknown.
   * @param prereqEventNames are the names of the prerequisite events this document requires
   *   prior to processing. Pass null if none.
   */
  void addDocumentReference(
    String documentIdentifier,
    String parentIdentifier,
    String relationshipType,
    String[] dataNames,
    Object[][] dataValues,
    Long originationTime,
    String[] prereqEventNames)
    throws ManifoldCFException;

  /**
   * Add a document description to the current job's queue, with carry-down data and an
   * origination time.
   *
   * @param documentIdentifier is the document identifier to add (for the connector that
   *   fetched the document).
   * @param parentIdentifier is the identifier of the document considered to be the "parent" of
   *   this identifier. May be null if no hopcount filtering is desired for this kind of
   *   relationship. MUST be present when carrydown information is supplied.
   * @param relationshipType is the string describing the kind of relationship this reference
   *   represents. It must be one of the strings returned by the IRepositoryConnector method
   *   "getRelationshipTypes()". May be null.
   * @param dataNames is the list of carry-down data names passed from the parent to the child.
   *   May be null. Each name is limited to 255 characters!
   * @param dataValues are the values corresponding to the names in dataNames. May be null only
   *   if dataNames is null. Each object must be either a String or a CharacterInput.
   * @param originationTime is the time, in ms since epoch, that the document originated.
   *   Pass null if none or unknown.
   */
  void addDocumentReference(
    String documentIdentifier,
    String parentIdentifier,
    String relationshipType,
    String[] dataNames,
    Object[][] dataValues,
    Long originationTime)
    throws ManifoldCFException;

  /**
   * Add a document description to the current job's queue, with carry-down data.
   *
   * @param documentIdentifier is the document identifier to add (for the connector that
   *   fetched the document).
   * @param parentIdentifier is the identifier of the document considered to be the "parent" of
   *   this identifier. May be null if no hopcount filtering is desired for this kind of
   *   relationship. MUST be present when carrydown information is supplied.
   * @param relationshipType is the string describing the kind of relationship this reference
   *   represents. It must be one of the strings returned by the IRepositoryConnector method
   *   "getRelationshipTypes()". May be null.
   * @param dataNames is the list of carry-down data names passed from the parent to the child.
   *   May be null. Each name is limited to 255 characters!
   * @param dataValues are the values corresponding to the names in dataNames. May be null only
   *   if dataNames is null. Each object must be either a String or a CharacterInput.
   */
  void addDocumentReference(
    String documentIdentifier,
    String parentIdentifier,
    String relationshipType,
    String[] dataNames,
    Object[][] dataValues)
    throws ManifoldCFException;

  /**
   * Add a document description to the current job's queue, naming its parent and relationship.
   *
   * @param documentIdentifier is the document identifier to add (for the connector that
   *   fetched the document).
   * @param parentIdentifier is the identifier of the document considered to be the "parent" of
   *   this identifier. May be null if no hopcount filtering is desired for this kind of
   *   relationship.
   * @param relationshipType is the string describing the kind of relationship this reference
   *   represents. It must be one of the strings returned by the IRepositoryConnector method
   *   "getRelationshipTypes()". May be null.
   */
  void addDocumentReference(
    String documentIdentifier,
    String parentIdentifier,
    String relationshipType)
    throws ManifoldCFException;

  /**
   * Add a document description to the current job's queue. Equivalent to
   * addDocumentReference(documentIdentifier,null,null).
   *
   * @param documentIdentifier is the document identifier to add (for the connector that
   *   fetched the document).
   */
  void addDocumentReference(String documentIdentifier)
    throws ManifoldCFException;

  /**
   * Ingest the current document.
   *
   * @param documentIdentifier is the document's identifier.
   * @param version is the version of the document, as reported by the getDocumentVersions()
   *   method of the corresponding repository connector. An empty version string signals that
   *   no document version string can be calculated, and that the document should always be
   *   indexed.
   * @param documentURI is the URI used to retrieve this document from the search interface
   *   (it is also the unique key in the index).
   * @param data is the document data. The data is closed after ingestion is complete.
   * @throws IOException only when data stream reading fails.
   */
  void ingestDocumentWithException(
    String documentIdentifier,
    String version,
    String documentURI,
    RepositoryDocument data)
    throws ManifoldCFException, ServiceInterruption, IOException;

  /**
   * Ingest the current document, optionally addressing one of its components.
   *
   * @param documentIdentifier is the document's identifier.
   * @param componentIdentifier is the component document identifier, if any.
   * @param version is the version of the document, as reported by the getDocumentVersions()
   *   method of the corresponding repository connector.
   * @param documentURI is the URI used to retrieve this document from the search interface
   *   (it is also the unique key in the index).
   * @param data is the document data. The data is closed after ingestion is complete.
   * @throws IOException only when data stream reading fails.
   */
  void ingestDocumentWithException(
    String documentIdentifier,
    String componentIdentifier,
    String version,
    String documentURI,
    RepositoryDocument data)
    throws ManifoldCFException, ServiceInterruption, IOException;

  /**
   * Remove the specified document from the search engine index, and update the version
   * information recorded for it.
   *
   * @param documentIdentifier is the document's local identifier.
   * @param version is the version string to be recorded for the document.
   */
  void noDocument(
    String documentIdentifier,
    String version)
    throws ManifoldCFException, ServiceInterruption;

  /**
   * Remove the specified document (or component) from the search engine index, and update the
   * version information recorded for it.
   *
   * @param documentIdentifier is the document's local identifier.
   * @param componentIdentifier is the component document identifier, if any.
   * @param version is the version string to be recorded for the document.
   */
  void noDocument(
    String documentIdentifier,
    String componentIdentifier,
    String version)
    throws ManifoldCFException, ServiceInterruption;

  /**
   * Permanently remove the primary component of the specified document from the search engine
   * index and from the status table. Intended for a document that has components and whose
   * primary document is not expected to exist again for the foreseeable future. This is a
   * rare situation.
   *
   * @param documentIdentifier is the document's identifier.
   */
  void removeDocument(String documentIdentifier)
    throws ManifoldCFException, ServiceInterruption;

  /**
   * Retain an existing document component. Call this to signal that an already-existing
   * document component does not need to be reindexed; by default, components that are not
   * mentioned during processing are removed.
   *
   * @param documentIdentifier is the document's identifier.
   * @param componentIdentifier is the component document identifier, which cannot be null.
   */
  void retainDocument(
    String documentIdentifier,
    String componentIdentifier)
    throws ManifoldCFException;

  /**
   * Retain all existing document components of a primary document. Call this to signal that
   * no document components need to be reindexed; by default, components that are not
   * mentioned during processing are removed.
   *
   * @param documentIdentifier is the document's identifier.
   */
  void retainAllComponentDocument(String documentIdentifier)
    throws ManifoldCFException;

  /**
   * Record a document version WITHOUT reindexing or removing the document. (Other documents
   * with the same URL, however, will still be removed.) Useful when the version string
   * changes but the document contents are known not to have changed.
   *
   * @param documentIdentifier is the document identifier.
   * @param version is the document version.
   */
  void recordDocument(
    String documentIdentifier,
    String version)
    throws ManifoldCFException;

  /**
   * Record a document (or component) version WITHOUT reindexing or removing the document.
   * (Other documents with the same URL, however, will still be removed.) Useful when the
   * version string changes but the document contents are known not to have changed.
   *
   * @param documentIdentifier is the document identifier.
   * @param componentIdentifier is the component document identifier, if any.
   * @param version is the document version.
   */
  void recordDocument(
    String documentIdentifier,
    String componentIdentifier,
    String version)
    throws ManifoldCFException;

  /**
   * Permanently delete the specified document, along with all its components, from the search
   * engine index and from the status table.
   *
   * This method does NOT keep any version information for the document, and can therefore
   * lead to "churn", where the same document is queued, processed, and removed on subsequent
   * crawls. Prefer noDocument() wherever the same decision would otherwise have to be made
   * over and over.
   *
   * @param documentIdentifier is the document's identifier.
   */
  void deleteDocument(String documentIdentifier)
    throws ManifoldCFException;

  /**
   * Override the schedule for the next time a document is crawled.
   *
   * Allows an upper recrawl bound, lower recrawl bound, upper expire bound, lower expire
   * bound, or any combination of these to be set on a specific document. Only effective if
   * the job is a continuous one and the identifier passed in is currently being processed.
   *
   * @param documentIdentifier is the document's identifier.
   * @param lowerRecrawlBoundTime is the time in ms since epoch that the reschedule time should
   *   not fall BELOW, or null if none.
   * @param upperRecrawlBoundTime is the time in ms since epoch that the reschedule time should
   *   not rise ABOVE, or null if none.
   * @param lowerExpireBoundTime is the time in ms since epoch that the expire time should not
   *   fall BELOW, or null if none.
   * @param upperExpireBoundTime is the time in ms since epoch that the expire time should not
   *   rise ABOVE, or null if none.
   */
  void setDocumentScheduleBounds(
    String documentIdentifier,
    Long lowerRecrawlBoundTime,
    Long upperRecrawlBoundTime,
    Long lowerExpireBoundTime,
    Long upperExpireBoundTime)
    throws ManifoldCFException;

  /**
   * Override a document's origination time. Call this to tell the framework that the
   * document's origination time is something other than the first time it was crawled.
   *
   * @param documentIdentifier is the document's identifier.
   * @param originationTime is the document's origination time, or null if unknown.
   */
  void setDocumentOriginationTime(
    String documentIdentifier,
    Long originationTime)
    throws ManifoldCFException;
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy