All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.manifoldcf.crawler.interfaces.QueueTracker Maven / Gradle / Ivy

/* $Id: QueueTracker.java 988245 2010-08-23 18:39:35Z kwright $ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.interfaces;

import org.apache.manifoldcf.core.interfaces.*;
import java.io.*;
import java.util.*;
import java.util.regex.*;

/** This class attempts to provide document priorities in order to achieve as much balance as possible between documents having different bins.
* A document's priority assignment takes place at the time the document is added to the queue, and will be recalculated when a job is aborted, or
* when the crawler daemon is started.  The document priorities are strictly obeyed when documents are chosen from the queue and handed to
* worker threads; higher-priority documents always have precedence, except due to deliberate priority adjustment specified by the job priority.
*
* The priority values themselves are logarithmic: 0.0 is the highest, and the larger the number, the lower the priority.
*
* The calculation of each document priority handed out by this module is based on:
*
* - number of documents having a given bin (tracked)
* - performance of a connection (gathered through statistics)
* - throttling that applies to each document bin
*
*
* The queuing prioritization model hooks into the document lifecycle in the following places:
* (1) When a document is added to the queue (and thus when its priority is handed out)
* (2) When documents that were *supposed* to be added to the queue turned out to already be there and already have an established priority,
*     (in which case the priority that was handed out before is returned to the pool for reuse)
* (3) When a document is pulled from the database queue (which sets the current highest priority level that should not be exceeded in step (1))
*
* The assignment prioritization model is largely independent of the queuing prioritization model, and is used to select among documents that have
* been marked "active" as they are handed to worker threads.  These events cause information to be logged:
* (1) When a document is handed to a worker thread
* (2) When the worker thread completes the document
*
*/
public class QueueTracker
{
  public static final String _rcsid = "@(#)$Id: QueueTracker.java 988245 2010-08-23 18:39:35Z kwright $";

  /** Factor by which bins are reduced */
  protected final static double binReductionFactor = 1.0;

  /** These are the accumulated performance averages for all connections etc. */
  protected final PerformanceStatistics performanceStatistics = new PerformanceStatistics();

  /** These are the bin counts for tracking the documents that are on
  * the active queue, but are not being processed yet.  Keyed by bin name; all access
  * in this class is performed while synchronized on the map itself. */
  protected final Map<String,BinCount> queuedBinCounts = new HashMap<String,BinCount>();

  /** These are the bin counts for active threads.  Keyed by bin name; all access
  * in this class is performed while synchronized on the map itself. */
  protected final Map<String,BinCount> activeBinCounts = new HashMap<String,BinCount>();

  /** Constructor */
  public QueueTracker()
  {
  }

  /** Add an access record to the queue tracker.  This happens when a document
  * is added to the in-memory queue, and allows us to keep track of that particular event so
  * we can schedule in a way that meets our distribution goals.
  *@param binNames are the set of bins, as returned from the connector in question, for
  * the document that is being queued.  These bins are considered global in nature.
  */
  public void addRecord(String[] binNames)
  {
    for (String binName : binNames)
    {
      incrementBin(queuedBinCounts,binName);
    }
  }

  /** Note the time required to successfully complete a set of documents.  This allows this module to keep track of
  * the performance characteristics of each individual connection, so distribution across connections can be balanced
  * properly.
  *@param docCount is the number of documents that were completed.
  *@param connectionName is the name of the connection the documents belong to.
  *@param elapsedTime is the time it took to complete the documents.
  */
  public void noteConnectionPerformance(int docCount, String connectionName, long elapsedTime)
  {
    performanceStatistics.noteDocumentsCompleted(connectionName,docCount,elapsedTime);
  }

  /** Obtain the current performance statistics object.
  *@return the statistics object owned by this tracker (the same instance on every call, not a copy).
  */
  public PerformanceStatistics getCurrentStatistics()
  {
    return performanceStatistics;
  }

  /** Note that we are beginning processing for a document with a particular set of bins.
  * This method is called when a worker thread starts work on a set of documents.
  *@param binNames are the bins of the document whose processing is starting.
  */
  public void beginProcessing(String[] binNames)
  {
    // Effectively, we are moving the document from one status to another, so we adjust the bin counts of the source and
    // the target both.
    for (String binName : binNames)
    {
      // Decrement queued bin count for this bin (a bin that was never recorded is ignored).
      decrementBin(queuedBinCounts,binName);

      // Increment active bin count for this bin.
      incrementBin(activeBinCounts,binName);
    }
  }


  /** Note that we have completed processing of a document with a given set of bins.
  * This method gets called when a Worker Thread has finished with a document.
  *@param binNames are the bins of the document whose processing has finished.
  */
  public void endProcessing(String[] binNames)
  {
    // Remove the document from the active queue, by decrementing the corresponding active bin counts.
    for (String binName : binNames)
    {
      decrementBin(activeBinCounts,binName);
    }
  }

  /** Calculate an assignment rating for a set of bins based on what's currently in use.
  * This rating is used to help determine which documents returned from a queueing query actually get made "active",
  * and which ones are skipped for the moment.
  *
  * The rating for each bin is 1 divided by one plus the active thread count for that bin.  The higher the
  * rating, the better.  The ratings are combined by multiplying the rating for each bin by that for
  * every other bin, and then taking the nth root (where n is the number of bins) to normalize for
  * the number of bins.  The calculation is done in log space, and the result is returned in log form,
  * so the value is always less than or equal to 0.0.
  *@param binNames are the bins of the document being rated.
  *@param connection is the repository connection for the document.  It is not currently consulted
  * by this calculation.
  *@return the log of the combined rating; 0.0 (the best possible rating) when there are no bins.
  */
  public double calculateAssignmentRating(String[] binNames, IRepositoryConnection connection)
  {
    // With no bins the product of ratings is the empty product (1.0), whose log is 0.0.
    // Handling this explicitly avoids computing 0.0/0.0, which would yield NaN.
    if (binNames.length == 0)
      return 0.0;

    // Work in log space
    double ratingLog = 0.0;
    for (String binName : binNames)
    {
      int count = 0;
      synchronized (activeBinCounts)
      {
        BinCount value = activeBinCounts.get(binName);
        if (value != null)
          count = value.getValue();
      }
      // rating *= (1.0 / (1.0 + (double)count))
      ratingLog -= Math.log(1.0 + (double)count);
    }

    // Take the nth root of the bin rating, and leave it in log form
    return ratingLog/(double)binNames.length;
  }

  /** Increment the count for a bin, creating the entry if it does not exist yet.
  *@param counts is the bin count map to update; it is also the lock that guards the update.
  *@param binName is the bin whose count should go up by one.
  */
  private static void incrementBin(Map<String,BinCount> counts, String binName)
  {
    synchronized (counts)
    {
      BinCount value = counts.get(binName);
      if (value == null)
      {
        value = new BinCount();
        counts.put(binName,value);
      }
      value.increment();
    }
  }

  /** Decrement the count for a bin, removing the entry when its count reaches zero.
  * Bins that have no entry are left alone.
  *@param counts is the bin count map to update; it is also the lock that guards the update.
  *@param binName is the bin whose count should go down by one.
  */
  private static void decrementBin(Map<String,BinCount> counts, String binName)
  {
    synchronized (counts)
    {
      BinCount value = counts.get(binName);
      if (value != null && value.decrement())
        counts.remove(binName);
    }
  }


  /** This is the class which allows a mutable integer count value to be saved in the bincount table.
  * It performs no synchronization of its own; QueueTracker only touches instances while holding
  * the lock of the map that contains them.
  */
  protected static class BinCount
  {
    /** The count */
    protected int count = 0;

    /** Create */
    public BinCount()
    {
    }

    /** Create an independent copy of this counter.
    *@return a new counter holding the same value.
    */
    public BinCount duplicate()
    {
      BinCount rval = new BinCount();
      rval.count = this.count;
      return rval;
    }

    /** Increment the counter */
    public void increment()
    {
      count++;
    }

    /** Decrement the counter, returning true if empty.
    *@return true if the count is exactly zero after the decrement.
    */
    public boolean decrement()
    {
      count--;
      return count == 0;
    }

    /** Set the counter value */
    public void setValue(int count)
    {
      this.count = count;
    }

    /** Get the counter value */
    public int getValue()
    {
      return count;
    }
  }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy