org.onebusaway.gtfs_merge.strategies.AbstractIdentifiableSingleEntityMergeStrategy Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of onebusaway-gtfs-merge Show documentation
A tool for merging GTFS feeds.
The newest version!
/**
 * Copyright (C) 2012 Google, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onebusaway.gtfs_merge.strategies;

import java.io.Serializable;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.onebusaway.collections.MappingLibrary;
import org.onebusaway.collections.Max;
import org.onebusaway.gtfs.model.AgencyAndId;
import org.onebusaway.gtfs.model.IdentityBean;
import org.onebusaway.gtfs.services.GtfsMutableRelationalDao;
import org.onebusaway.gtfs.services.GtfsRelationalDao;
import org.onebusaway.gtfs_merge.GtfsMergeContext;
import org.onebusaway.gtfs_merge.strategies.scoring.AndDuplicateScoringStrategy;
import org.onebusaway.gtfs_merge.strategies.scoring.DuplicateScoringSupport;

/**
 * Abstract base class that defines common methods and properties for merging
 * single GTFS entities with explicit identifiers in a GTFS feed.
 * 
 * @author bdferris
 * 
 * @param  the type of the GTFS entity that this merge strategy handles
 */
public abstract class AbstractIdentifiableSingleEntityMergeStrategy>
    extends AbstractSingleEntityMergeStrategy {

  /**
   * When comparing entities between two feeds to see if they are duplicates, we
   * use the specified scoring strategy to score the amount of duplication
   * between the two entities. Sub-classes can add rules to this scoring
   * strategy specific to their entity type to guide duplication scoring.
   */
  protected AndDuplicateScoringStrategy _duplicateScoringStrategy = new AndDuplicateScoringStrategy();

  public AbstractIdentifiableSingleEntityMergeStrategy(Class entityType) {
    super(entityType);
  }

  @Override
  protected EDuplicateDetectionStrategy pickBestDuplicateDetectionStrategy(
      GtfsMergeContext context) {

    /**
     * If there are currently no elements to be duplicated, then return the NONE
     * strategy.
     */
    GtfsRelationalDao source = context.getSource();
    GtfsMutableRelationalDao target = context.getTarget();
    if (target.getAllEntitiesForType(_entityType).isEmpty()
        || source.getAllEntitiesForType(_entityType).isEmpty()) {
      return EDuplicateDetectionStrategy.NONE;
    }

    if (hasLikelyIdentifierOverlap(context)) {
      return EDuplicateDetectionStrategy.IDENTITY;
    } else if (hasLikelyFuzzyOverlap(context)) {
      return EDuplicateDetectionStrategy.FUZZY;
    } else {
      return EDuplicateDetectionStrategy.NONE;
    }
  }

  /**
   * Determines if the entities sharing the same ids in the source and target
   * feeds appear to be similar enough to indicate that
   * {@link EDuplicateDetectionStrategy#IDENTITY} duplicate detection can be
   * used.
   * 
   * @param context
   * @return true if identity duplicate detection seems appropriate
   */
  @SuppressWarnings("unchecked")
  private boolean hasLikelyIdentifierOverlap(GtfsMergeContext context) {
    GtfsRelationalDao source = context.getSource();
    GtfsMutableRelationalDao target = context.getTarget();
    Collection targetEntities = (Collection) target.getAllEntitiesForType(_entityType);
    Collection sourceEntities = (Collection) source.getAllEntitiesForType(_entityType);

    Map sourceById = MappingLibrary.mapToValue(targetEntities,
        "id");
    Map targetById = MappingLibrary.mapToValue(sourceEntities,
        "id");

    /**
     * First we check to make sure that the two feeds have enough identifiers in
     * common to suggest that identity-based duplicate detection should be used.
     */
    Set commonIds = new HashSet();
    double elementOvelapScore = DuplicateScoringSupport.scoreElementOverlap(
        sourceById.keySet(), targetById.keySet(), commonIds);
    if (commonIds.isEmpty()
        || elementOvelapScore < _minElementsInCommonScoreForAutoDetect) {
      return false;
    }

    /**
     * Now we score entities with the same identifier to see how well they
     * actually match.
     */
    double totalScore = 0.0;
    for (Serializable id : commonIds) {
      T targetEntity = sourceById.get(id);
      T sourceEntity = targetById.get(id);
      totalScore += _duplicateScoringStrategy.score(context, sourceEntity,
          targetEntity);
    }
    totalScore /= commonIds.size();

    /**
     * If the score is high enough, identity-based duplication detection should
     * be used.
     */
    return totalScore > _minElementsDuplicateScoreForAutoDetect;
  }

  /**
   * Determines if the set entities in the source and target feeds appear to be
   * similar enough when performing fuzzy matching to indicate that
   * {@link EDuplicateDetectionStrategy#FUZZY} duplicate detection can be used.
   * 
   * @param context
   * @return true if fuzzy duplicate detection seems appropriate
   */
  @SuppressWarnings("unchecked")
  private boolean hasLikelyFuzzyOverlap(GtfsMergeContext context) {

    GtfsRelationalDao source = context.getSource();
    GtfsMutableRelationalDao target = context.getTarget();

    /**
     * TODO: Fuzzy matching is expensive. Do we really want to compare all of
     * the entities? Or would a sufficiently-large subset do the trick? Can any
     * of this be cached for the actual duplicate detection later on?
     */
    Collection targetEntities = (Collection) target.getAllEntitiesForType(_entityType);
    Collection sourceEntities = (Collection) source.getAllEntitiesForType(_entityType);

    double duplicateElements = 0;
    double totalScore = 0.0;

    /**
     * First we determine a rough set of potentially overlapping entities based
     * on a fuzzy match.
     */
    Set remainingSourceEntities = new HashSet(sourceEntities);
    for (T targetEntity : targetEntities) {
      Max best = new Max();
      for (T sourceEntity : remainingSourceEntities) {
        double score = _duplicateScoringStrategy.score(context, sourceEntity,
            targetEntity);
        if (score < _minElementDuplicateScoreForFuzzyMatch) {
          continue;
        }
        best.add(score, sourceEntity);
      }

      if (best.getMaxElement() != null) {
        duplicateElements++;
        totalScore += best.getMaxValue();
        remainingSourceEntities.remove(best.getMaxElement());
      }
    }

    /**
     * There needs to be sufficient overlap between the two feeds for us to
     * consider using fuzzy duplicate detection in the first place.
     */
    double elementsInCommon = (duplicateElements / targetEntities.size() + duplicateElements
        / sourceEntities.size()) / 2;
    if (elementsInCommon < _minElementsInCommonScoreForAutoDetect) {
      return false;
    }

    /**
     * If there is sufficient overlap, only use fuzzy detection if the entities
     * themselves match well.
     */
    totalScore /= duplicateElements;
    return totalScore > _minElementsDuplicateScoreForAutoDetect;
  }

  @Override
  protected IdentityBean getIdentityDuplicate(GtfsMergeContext context,
      IdentityBean entity) {
    String rawId = getRawId(entity.getId());
    return (IdentityBean) context.getEntityForRawId(rawId);
  }

  @SuppressWarnings("unchecked")
  @Override
  protected IdentityBean getFuzzyDuplicate(GtfsMergeContext context,
      IdentityBean entity) {
    GtfsMutableRelationalDao targetDao = context.getTarget();
    Collection targets = (Collection) targetDao.getAllEntitiesForType(_entityType);
    if (targets.isEmpty()) {
      return null;
    }
    Max best = new Max();
    for (T target : targets) {
      /**
       * If we just added the target entity as part of the current feed, do not
       * attempt a fuzzy match against it.
       */
      String targetRawId = getRawId(target.getId());
      if (context.isEntityJustAddedWithRawId(targetRawId)) {
        continue;
      }
      double score = _duplicateScoringStrategy.score(context, (T) entity,
          target);
      best.add(score, target);
    }
    if (best.getMaxValue() < _minElementsDuplicateScoreForAutoDetect) {
      return null;
    }
    return (IdentityBean) best.getMaxElement();
  }

  /**
   * Saves the specified entity to the merged output feed. If the raw id of the
   * entity duplicates an existing entity in the output feed, its id will be
   * renamed.
   * 
   * @param context
   * @param entity
   */
  @Override
  protected void save(GtfsMergeContext context, IdentityBean entity) {
    String rawId = getRawId(entity.getId());
    /**
     * If an element if the same id has already been saved, we need to rename
     * this one.
     */
    if (context.getEntityForRawId(rawId) != null) {
      rename(context, entity);
      rawId = getRawId(entity.getId());
    }
    context.putEntityWithRawId(rawId, entity);
    super.save(context, entity);
  }

  /**
   * Converts the entity identifier into a raw GTFS identifier string. This is
   * what we actually use for identity duplicate detection.
   * 
   * @param id
   * @return the raw GTFS id
   */
  private String getRawId(Object id) {
    if (id instanceof String) {
      return (String) id;
    } else if (id instanceof AgencyAndId) {
      return ((AgencyAndId) id).getId();
    }
    throw new UnsupportedOperationException(
        "cannot generate raw key for type: " + id.getClass());
  }

  /**
   * Rename the id of the specified identity to avoid an raw GTFS identifier
   * collision in the merged output feed.
   * 
   * @param context
   * @param entity
   */
  @SuppressWarnings("unchecked")
  protected void rename(GtfsMergeContext context, IdentityBean entity) {
    Object id = entity.getId();
    if (id != null && id instanceof AgencyAndId) {
      IdentityBean bean = (IdentityBean) entity;
      AgencyAndId agencyAndId = bean.getId();
      agencyAndId = MergeSupport.renameAgencyAndId(context, agencyAndId);
      bean.setId(agencyAndId);
    }
  }
}