org.onebusaway.gtfs_merge.strategies.AbstractEntityMergeStrategy Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of onebusaway-gtfs-merge Show documentation
A tool for merging GTFS feeds.
The newest version!
/**
 * Copyright (C) 2012 Google, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onebusaway.gtfs_merge.strategies;

import org.onebusaway.gtfs_merge.GtfsMergeContext;
import org.onebusaway.gtfs_merge.strategies.scoring.DuplicateScoringSupport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Abstract base class that defines methods and properties common to all entity
 * merge strategies, regardless of entity type.
 *
 * <p>
 * It holds the duplicate-detection configuration (an optional explicit
 * {@link EDuplicateDetectionStrategy}, the scoring thresholds used during
 * auto-detection, and the duplicate logging policy) and provides
 * {@link #determineDuplicateDetectionStrategy(GtfsMergeContext)}, which
 * resolves the strategy to apply for a given merge context.
 *
 * @author bdferris
 * @see AbstractSingleEntityMergeStrategy
 * @see AbstractCollectionEntityMergeStrategy
 */
public abstract class AbstractEntityMergeStrategy implements
    EntityMergeStrategy {

  private static final Logger _log = LoggerFactory.getLogger(AbstractEntityMergeStrategy.class);

  /**
   * By default, we don't specify a default duplicate detection strategy, but
   * instead attempt auto-detection of the best strategy. When the auto-detected
   * strategy is not appropriate, it can be manually overridden by setting this
   * value. A {@code null} value means "auto-detect".
   */
  protected EDuplicateDetectionStrategy _duplicateDetectionStrategy = null;

  /**
   * When auto-detecting the best duplicate detection strategy to use, defines
   * the scoring threshold to use when considering if two entity id sets have
   * enough ids in common to consider using identifier-based duplicate
   * detection. Note that we aren't yet comparing entities with the same id to
   * see if they seem similar at this point, just the raw number of identifiers
   * in common between two sets. The intuition is that if two entity sets have
   * very few identifiers in common, the odds are low that identity-based
   * duplicate detection should be used.
   *
   * <p>
   * An id overlap score will be between 0.0 and 1.0, where 0.0 indicates
   * absolutely no overlap and 1.0 indicates that the two id sets are the same.
   * If the score is below the specified threshold, identifier-based duplicate
   * detection will not be considered.
   *
   * <p>
   * See
   * {@link DuplicateScoringSupport#scoreElementOverlap(java.util.Collection, java.util.Collection)}
   * for an example scoring method.
   */
  protected double _minElementsInCommonScoreForAutoDetect = 0.5;

  /**
   * When auto-detecting the best duplicate detection strategy to use, the
   * different {@link EDuplicateDetectionStrategy} values will produce a set of
   * candidate duplicates, for which we score their overlap on a scale from 0.0
   * to 1.0, where 0.0 indicates that none of the entities seem to match and 1.0
   * indicates that they are exact duplicates. We define a minimum overlap score
   * threshold that must be met for a particular duplicate detection strategy to
   * be applied to the source and target feeds at large.
   */
  protected double _minElementsDuplicateScoreForAutoDetect = 0.5;

  /**
   * This threshold is similar to
   * {@link #_minElementsDuplicateScoreForAutoDetect} except that it is used
   * only for auto-detecting fuzzy matches and only for producing a candidate
   * set of fuzzy matches to score to determine if auto-detection should be
   * used.
   *
   * <p>
   * TODO(bdferris): I'll admit that I'm having a hard time remembering why I
   * wanted a separate threshold for determining the set of candidate fuzzy
   * matches. It might make sense to remove this at some point. I think the idea
   * might have been to be more lenient when determining if we should use
   * fuzzy-duplicate-detection in the first place, but be more strict when it
   * comes to actual duplicate detection.
   */
  protected double _minElementDuplicateScoreForFuzzyMatch = 0.5;

  /**
   * What should happen when we detect a duplicate entity? Defaults to
   * {@link ELogDuplicatesStrategy#NONE}.
   */
  protected ELogDuplicatesStrategy _logDuplicatesStrategy = ELogDuplicatesStrategy.NONE;

  /**
   * Set a duplicate detection strategy. By default, we attempt to auto-detect
   * an appropriate strategy.
   *
   * @param duplicateDetectionStrategy the strategy to always use, or
   *          {@code null} to fall back to auto-detection
   */
  public void setDuplicateDetectionStrategy(
      EDuplicateDetectionStrategy duplicateDetectionStrategy) {
    _duplicateDetectionStrategy = duplicateDetectionStrategy;
  }

  /**
   * Set the policy describing what should happen when a duplicate entity is
   * detected.
   *
   * @param logDuplicatesStrategy the duplicate logging policy to store on this
   *          merge strategy
   */
  public void setLogDuplicatesStrategy(
      ELogDuplicatesStrategy logDuplicatesStrategy) {
    _logDuplicatesStrategy = logDuplicatesStrategy;
  }

  /**
   * Determines the best {@link EDuplicateDetectionStrategy} to use for the
   * current entity type and merge context. If a specific duplicate detection
   * strategy has already been specified with
   * {@link #setDuplicateDetectionStrategy(EDuplicateDetectionStrategy)}, it
   * will always be returned. If not, the strategy previously resolved for the
   * supplied context is reused when present; otherwise
   * {@link #pickBestDuplicateDetectionStrategy(GtfsMergeContext)} is invoked,
   * its result is logged at INFO level and stored back on the context.
   * Auto-detecting the best duplicate detection strategy may be an expensive
   * operation, which is why the result is cached on the context.
   *
   * @param context the merge context that is consulted for, and updated with,
   *          the resolved strategy
   * @return the duplicate detection strategy to use for the supplied context
   */
  protected EDuplicateDetectionStrategy determineDuplicateDetectionStrategy(
      GtfsMergeContext context) {
    // An explicitly configured strategy always wins over auto-detection.
    if (_duplicateDetectionStrategy != null) {
      return _duplicateDetectionStrategy;
    }
    EDuplicateDetectionStrategy resolvedDuplicateDetectionStrategy = context.getResolvedDuplicateDetectionStrategy();
    if (resolvedDuplicateDetectionStrategy == null) {
      // Nothing cached on the context yet: run auto-detection once and
      // remember the outcome for subsequent calls with this context.
      resolvedDuplicateDetectionStrategy = pickBestDuplicateDetectionStrategy(context);
      // Parameterized logging renders the same message as before but avoids
      // building the string when INFO logging is disabled.
      _log.info("best duplicate detection strategy for {} = {}",
          getDescription(), resolvedDuplicateDetectionStrategy);
      context.setResolvedDuplicateDetectionStrategy(resolvedDuplicateDetectionStrategy);
    }
    return resolvedDuplicateDetectionStrategy;
  }

  /**
   * Determines the best {@link EDuplicateDetectionStrategy} to use for merging
   * entities from the current source feed into the merged output feed.
   * Sub-classes are required to provide the most appropriate strategy for
   * merging their particular entity type.
   *
   * @param context the merge context describing the merge in progress
   * @return the duplicate detection strategy chosen by the sub-class
   */
  protected abstract EDuplicateDetectionStrategy pickBestDuplicateDetectionStrategy(
      GtfsMergeContext context);

  /**
   * Describes this merge strategy; the value is used in log output.
   *
   * @return a string description of the current entity merge strategy,
   *         typically identifying the entity-type to be merged
   */
  protected abstract String getDescription();
}