/**
 * Copyright (C) 2012 Google, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onebusaway.gtfs_merge.strategies;

import java.io.Serializable;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.onebusaway.csv_entities.exceptions.CsvException;
import org.onebusaway.gtfs.model.AgencyAndId;
import org.onebusaway.gtfs.model.ServiceCalendar;
import org.onebusaway.gtfs.model.ServiceCalendarDate;
import org.onebusaway.gtfs.model.ShapePoint;
import org.onebusaway.gtfs.services.GtfsRelationalDao;
import org.onebusaway.gtfs_merge.GtfsMergeContext;
import org.onebusaway.gtfs_merge.strategies.scoring.DuplicateScoringSupport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Abstract base class that defines common methods and properties for merging
 * collection-like GTFS entities. Collection-like entities are entity types
 * where a collection of entries is identified by a common identifier. That
 * includes entities like {@link ShapePoint} entries in shapes.txt, where one
 * {@code shapeId} identifies a series of shape points. It also includes entries
 * like {@link ServiceCalendar} and {@link ServiceCalendarDate} entries from
 * calendar.txt and calendar_dates.txt, where one {@code service_id} potentially
 * covers multiple calendar entries.
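 * 
 * For illustration, here is a minimal sketch of what "collection-like" means
 * in code (the id, sequence, and coordinate values are hypothetical):
 * 
 * <pre>{@code
 * // Two shape points that belong to the same shapes.txt collection,
 * // identified by a single shape id:
 * AgencyAndId shapeId = new AgencyAndId("agency", "shape_42");
 * 
 * ShapePoint first = new ShapePoint();
 * first.setShapeId(shapeId);
 * first.setSequence(1);
 * first.setLat(47.60);
 * first.setLon(-122.33);
 * 
 * ShapePoint second = new ShapePoint();
 * second.setShapeId(shapeId);
 * second.setSequence(2);
 * second.setLat(47.61);
 * second.setLon(-122.33);
 * 
 * // A merge strategy treats {first, second, ...} as one entity collection
 * // keyed by shapeId, rather than as independent entities.
 * }</pre>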
 * 
 * @author bdferris
 * 
 * @param <KEY> the type for the id object class that is used to uniquely
 *          identify a collection entity
 */
public abstract class AbstractCollectionEntityMergeStrategy<KEY extends Serializable>
    extends AbstractEntityMergeStrategy {

  private static final Logger _log = LoggerFactory.getLogger(AbstractCollectionEntityMergeStrategy.class);

  /**
   * Returned by {@link #getDescription()}
   */
  private final String _keyDescription;

  public AbstractCollectionEntityMergeStrategy(String keyDescription) {
    _keyDescription = keyDescription;
  }

  @Override
  public void merge(GtfsMergeContext context) {
    for (KEY key : getKeys(context.getSource())) {
      processKey(context, key);
    }
  }

  /**
   * Processes the entity collection identified by the specified key, looking
   * for a duplicate collection in the merged output feed and taking the
   * appropriate action to merge the collection's entities into that feed.
   * 
   * @param context
   * @param key the identifier of the current entity collection to process
   */
  private void processKey(GtfsMergeContext context, KEY key) {
    KEY duplicate = getDuplicate(context, key);
    if (duplicate != null) {
      logDuplicateKey(key);
      if (!duplicate.equals(key)) {
        renameKey(context, key, duplicate);
      }
      return;
    }
    String rawKey = getRawKey(key);

    /**
     * If we've already saved elements using this key previously, we need to
     * rename this key to avoid duplication.
     */
    if (context.getEntityForRawId(rawKey) != null) {
      KEY newKey = getRenamedKey(context, key);
      renameKey(context, key, newKey);
      key = newKey;
      rawKey = getRawKey(key);
      MergeSupport.clearCaches(context.getSource());
    }

    context.putEntityWithRawId(rawKey, key);
    saveElementsForKey(context, key);
  }

  /**
   * An entity-specific method to determine the set of unique identifiers used
   * by collection entities in the specified GTFS feed.
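   * 
   * For example, a shape-point oriented subclass might implement this roughly
   * as follows (a sketch only; it assumes the DAO exposes an accessor that
   * returns all shape ids):
   * 
   * <pre>{@code
   * protected Collection<AgencyAndId> getKeys(GtfsRelationalDao dao) {
   *   // Each shape id identifies one collection of ShapePoint entries.
   *   return dao.getAllShapeIds();
   * }
   * }</pre>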
   * 
   * @param dao
   * @return the set of unique identifiers
   */
  protected abstract Collection<KEY> getKeys(GtfsRelationalDao dao);

  /**
   * Determines if the entity collection with the specified id overlaps with an
   * entity collection already in the merged output feed. If a duplicate is
   * found, the id of the already-present entity collection is returned. If no
   * duplicate is found, returns null.
   * 
   * @param context
   * @param key
   * @return the id of an existing, duplicate entity collection in the output
   *         feed, or null if none exists
   */
  private KEY getDuplicate(GtfsMergeContext context, KEY key) {
    EDuplicateDetectionStrategy duplicateDetectionStrategy = determineDuplicateDetectionStrategy(context);
    switch (duplicateDetectionStrategy) {
      case IDENTITY:
        return getIdentityDuplicate(context, key);
      case FUZZY:
        return getFuzzyDuplicate(context, key);
      case NONE:
        return null;
      default:
        throw new IllegalStateException(
            "unknown duplicate detection strategy: "
                + duplicateDetectionStrategy);
    }
  }

  @Override
  protected EDuplicateDetectionStrategy pickBestDuplicateDetectionStrategy(
      GtfsMergeContext context) {

    Collection<KEY> targetKeys = getKeys(context.getTarget());
    Collection<KEY> sourceKeys = getKeys(context.getSource());

    /**
     * If there are no entities, then we can't have identifier overlap.
     */
    if (targetKeys.isEmpty() || sourceKeys.isEmpty()) {
      return EDuplicateDetectionStrategy.NONE;
    }

    if (hasLikelyIdentifierOverlap(context, sourceKeys, targetKeys)) {
      return EDuplicateDetectionStrategy.IDENTITY;
    } else if (hasLikelyFuzzyOverlap(context, sourceKeys, targetKeys)) {
      return EDuplicateDetectionStrategy.FUZZY;
    } else {
      return EDuplicateDetectionStrategy.NONE;
    }
  }

  /**
   * Determines if the two sets of collection identifiers have enough overlap
   * between entities with the same id to indicate that
   * {@link EDuplicateDetectionStrategy#IDENTITY} duplicate detection can be
   * used.
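   * 
   * For example, if 8 of 10 source keys also appear in the target feed and the
   * entity collections sharing those keys average a high per-key duplicate
   * score, identity detection is selected, assuming both values clear their
   * respective auto-detect thresholds.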
   * 
   * @param context
   * @param sourceKeys
   * @param targetKeys
   * @return true if identity duplicate detection seems appropriate
   */
  private boolean hasLikelyIdentifierOverlap(GtfsMergeContext context,
      Collection<KEY> sourceKeys, Collection<KEY> targetKeys) {

    /**
     * There needs to be a reasonable number of overlapping identifiers in the
     * first place for us to consider using identifier-based duplicate
     * detection.
     */
    Set<KEY> commonKeys = new HashSet<KEY>();
    double elementOverlapScore = DuplicateScoringSupport.scoreElementOverlap(
        sourceKeys, targetKeys, commonKeys);
    if (commonKeys.isEmpty()
        || elementOverlapScore < _minElementsInCommonScoreForAutoDetect) {
      return false;
    }

    /**
     * We score each entity pair with a common key between the two feeds to
     * determine if entities with the same key really are duplicates.
     */
    double totalScore = 0.0;
    for (KEY key : commonKeys) {
      totalScore += scoreDuplicateKey(context, key);
    }
    totalScore /= commonKeys.size();
    return totalScore > _minElementsDuplicateScoreForAutoDetect;
  }

  /**
   * Given an id identifying an entity collection in both the source input feed
   * and the merged output feed, produce a score between 0.0 and 1.0 identifying
   * how likely it is that the two entity collections are one and the same,
   * where 0.0 means they have nothing in common and 1.0 means they are exactly
   * the same.
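   * 
   * A deliberately simplified sketch of one possible subclass implementation,
   * which scores two shape-point collections by comparing their sizes (the
   * actual subclasses may score quite differently):
   * 
   * <pre>{@code
   * protected double scoreDuplicateKey(GtfsMergeContext context, AgencyAndId key) {
   *   List<ShapePoint> source = context.getSource().getShapePointsForShapeId(key);
   *   List<ShapePoint> target = context.getTarget().getShapePointsForShapeId(key);
   *   if (source.isEmpty() || target.isEmpty()) {
   *     return 0.0;
   *   }
   *   // Identical point counts score 1.0; very different counts score near 0.0.
   *   double min = Math.min(source.size(), target.size());
   *   double max = Math.max(source.size(), target.size());
   *   return min / max;
   * }
   * }</pre>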
   * 
   * @param context
   * @param key
   * @return a duplicate-likelihood score between 0.0 and 1.0
   */
  protected abstract double scoreDuplicateKey(GtfsMergeContext context, KEY key);

  /**
   * Determines if the collection entities in the source input feed and the
   * target merged output feed appear to have fuzzy duplicates. Fuzzy duplicate
   * detection is not currently implemented for collection entities, so this
   * always returns false.
   * 
   * @param context
   * @param sourceKeys
   * @param targetKeys
   * @return true if the two feeds appear to have fuzzy duplicates
   */
  private boolean hasLikelyFuzzyOverlap(GtfsMergeContext context,
      Collection<KEY> sourceKeys, Collection<KEY> targetKeys) {
    return false;
  }

  /**
   * Find the id of an existing entity collection in the merged output feed with
   * the specified id. Returns null if no identifier-based duplicate exists.
   * 
   * Why don't we just do an identifier equality check? In the case of
   * identifiers like {@link AgencyAndId}, two ids might have different agency
   * ids when their raw GTFS ids are the same. Thus the "equal" identifier may
   * not actually be equal in the strict Java sense.
   * 
   * @param context
   * @param key
   * @return the id of the identifier-based duplicate entity collection, or null
   *         if not found
   */
  @SuppressWarnings("unchecked")
  private KEY getIdentityDuplicate(GtfsMergeContext context, KEY key) {
    String rawKey = getRawKey(key);
    return (KEY) context.getEntityForRawId(rawKey);
  }

  /**
   * Fuzzy duplicate detection is not currently implemented for collection
   * entities, so this always returns null.
   */
  private KEY getFuzzyDuplicate(GtfsMergeContext context, KEY key) {
    return null;
  }

  /**
   * Converts the entity collection identifier into a raw GTFS identifier
   * string. This is what we actually use for identity duplicate detection.
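   * 
   * For example, an {@link AgencyAndId} of ("agency_a", "shape_42") and one of
   * ("agency_b", "shape_42") both map to the raw key "shape_42", so they will
   * be treated as identity duplicates even though the full ids differ.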
   * 
   * @param key
   * @return the raw GTFS identifier string for the specified key
   */
  protected String getRawKey(KEY key) {
    if (key instanceof AgencyAndId) {
      return ((AgencyAndId) key).getId();
    }
    throw new UnsupportedOperationException(
        "cannot generate raw key for type: " + key.getClass());
  }

  private void logDuplicateKey(KEY key) {
    switch (_logDuplicatesStrategy) {
      case NONE:
        break;
      case WARNING:
        _log.warn("duplicate key: type=" + _keyDescription + " key=" + key);
        break;
      case ERROR:
        throw new CsvException("duplicate key: type=" + _keyDescription
            + " key=" + key);
    }
  }

  /**
   * If we detect that an entity collection in the source input feed duplicates
   * an entity collection in the merged output feed, we rename all references to
   * the old id in the source feed to use the id of the entity in the merged
   * feed. That way, when comparing entities in the source feed that referenced
   * the original entity collection against entities in the target feed that
   * reference the duplicate collection, both sets of entities will now appear
   * to reference the same thing. This can be useful for similarity detection.
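   * 
   * A rough sketch of what a service-id oriented subclass might do here (an
   * illustration only; real subclasses may need to update additional entity
   * types that reference the id):
   * 
   * <pre>{@code
   * protected void renameKey(GtfsMergeContext context, AgencyAndId oldId,
   *     AgencyAndId newId) {
   *   GtfsRelationalDao source = context.getSource();
   *   // Point every source-feed trip that referenced the old service id at
   *   // the new id instead.
   *   for (Trip trip : source.getAllTrips()) {
   *     if (oldId.equals(trip.getServiceId())) {
   *       trip.setServiceId(newId);
   *     }
   *   }
   * }
   * }</pre>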
   * 
   * @param context
   * @param oldId the original id in the source input feed
   * @param newId the new id, which replaces the old in the source input feed
   */
  protected abstract void renameKey(GtfsMergeContext context, KEY oldId,
      KEY newId);

  /**
   * Writes the specified entity collection to the merged output feed.
   * 
   * @param context
   * @param key the identifier for the entity collection to save
   */
  protected abstract void saveElementsForKey(GtfsMergeContext context, KEY key);

  /**
   * Renames the specified identifier to make it unique in the merged output
   * feed. Useful when two entity collections share the same identifier but
   * are not actually duplicates.
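   * 
   * For example, with a context prefix of "b-", a String key "WEEKDAY" would
   * be renamed to "b-WEEKDAY" (the prefix and key values here are
   * hypothetical).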
   * 
   * @param context
   * @param key
   * @return a renamed version of the key, unique in the merged output feed
   */
  @SuppressWarnings("unchecked")
  private KEY getRenamedKey(GtfsMergeContext context, KEY key) {
    if (key instanceof String) {
      return (KEY) (context.getPrefix() + key);
    } else if (key instanceof AgencyAndId) {
      return (KEY) MergeSupport.renameAgencyAndId(context, (AgencyAndId) key);
    }
    throw new UnsupportedOperationException("unknown key type: "
        + key.getClass());
  }

  @Override
  protected String getDescription() {
    return _keyDescription;
  }
}