/**
 * Copyright (C) 2012 Google, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onebusaway.gtfs_merge.strategies;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.onebusaway.collections.MappingLibrary;
import org.onebusaway.collections.Max;
import org.onebusaway.gtfs.model.AgencyAndId;
import org.onebusaway.gtfs.model.IdentityBean;
import org.onebusaway.gtfs.services.GtfsMutableRelationalDao;
import org.onebusaway.gtfs.services.GtfsRelationalDao;
import org.onebusaway.gtfs_merge.GtfsMergeContext;
import org.onebusaway.gtfs_merge.strategies.scoring.AndDuplicateScoringStrategy;
import org.onebusaway.gtfs_merge.strategies.scoring.DuplicateScoringSupport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Abstract base class that defines common methods and properties for merging
 * single GTFS entities with explicit identifiers in a GTFS feed.
 * 
 * @author bdferris
 * 
 * @param <T> the type of the GTFS entity that this merge strategy handles
 */
public abstract class AbstractIdentifiableSingleEntityMergeStrategy<T extends IdentityBean<?>>
    extends AbstractSingleEntityMergeStrategy<T> {
  private static Logger _log = LoggerFactory.getLogger(AbstractIdentifiableSingleEntityMergeStrategy.class);
  /**
   * When comparing entities between two feeds to see if they are duplicates, we
   * use the specified scoring strategy to score the amount of duplication
   * between the two entities. Sub-classes can add rules to this scoring
   * strategy specific to their entity type to guide duplication scoring.
   */
  protected AndDuplicateScoringStrategy<T> _duplicateScoringStrategy = new AndDuplicateScoringStrategy<T>();
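  /*
   * Illustrative sketch (not a class from this file): a subclass merging Route
   * entities might register extra scoring rules in its constructor, roughly:
   *
   *   public RouteLikeMergeStrategy() {
   *     super(Route.class);
   *     _duplicateScoringStrategy.addPropertyMatch("shortName");
   *     _duplicateScoringStrategy.addPropertyMatch("longName");
   *   }
   *
   * (addPropertyMatch is assumed from the AndDuplicateScoringStrategy scoring
   * package; check that class for the exact rule-registration API.)
   */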

  public AbstractIdentifiableSingleEntityMergeStrategy(Class<T> entityType) {
    super(entityType);
  }

  @Override
  protected EDuplicateDetectionStrategy pickBestDuplicateDetectionStrategy(
      GtfsMergeContext context) {

    /**
     * If either feed currently has no entities of this type, there is nothing
     * to de-duplicate, so return the NONE strategy.
     */
    GtfsRelationalDao source = context.getSource();
    GtfsMutableRelationalDao target = context.getTarget();
    if (target.getAllEntitiesForType(_entityType).isEmpty()
        || source.getAllEntitiesForType(_entityType).isEmpty()) {
      return EDuplicateDetectionStrategy.NONE;
    }

    if (hasLikelyIdentifierOverlap(context)) {
      return EDuplicateDetectionStrategy.IDENTITY;
    } else if (hasLikelyFuzzyOverlap(context)) {
      return EDuplicateDetectionStrategy.FUZZY;
    } else {
      return EDuplicateDetectionStrategy.NONE;
    }
  }
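
  /*
   * As a concrete illustration of the cascade above: two consecutive service
   * periods published by the same agency typically share most entity ids, so
   * IDENTITY wins; feeds for the same system from different publishers may
   * share no ids yet still describe the same stops and routes, in which case
   * FUZZY is tried; unrelated feeds fall through to NONE.
   */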

  /**
   * Determines if the entities sharing the same ids in the source and target
   * feeds appear to be similar enough to indicate that
   * {@link EDuplicateDetectionStrategy#IDENTITY} duplicate detection can be
   * used.
   * 
   * @param context
   * @return true if identity duplicate detection seems appropriate
   */
  @SuppressWarnings("unchecked")
  private boolean hasLikelyIdentifierOverlap(GtfsMergeContext context) {
    GtfsRelationalDao source = context.getSource();
    GtfsMutableRelationalDao target = context.getTarget();
    Collection<T> targetEntities = (Collection<T>) target.getAllEntitiesForType(_entityType);
    Collection<T> sourceEntities = (Collection<T>) source.getAllEntitiesForType(_entityType);

    Map<Serializable, T> sourceById = MappingLibrary.mapToValue(sourceEntities,
        "id");
    Map<Serializable, T> targetById = MappingLibrary.mapToValue(targetEntities,
        "id");

    /**
     * First we check to make sure that the two feeds have enough identifiers in
     * common to suggest that identity-based duplicate detection should be used.
     */
    Set<Serializable> commonIds = new HashSet<Serializable>();
    double elementOverlapScore = DuplicateScoringSupport.scoreElementOverlap(
        sourceById.keySet(), targetById.keySet(), commonIds);
    if (commonIds.isEmpty()
        || elementOverlapScore < _minElementsInCommonScoreForAutoDetect) {
      return false;
    }

    /**
     * Now we score entities with the same identifier to see how well they
     * actually match.
     */
    double totalScore = 0.0;
    for (Serializable id : commonIds) {
      T sourceEntity = sourceById.get(id);
      T targetEntity = targetById.get(id);
      totalScore += _duplicateScoringStrategy.score(context, sourceEntity,
          targetEntity);
    }
    totalScore /= commonIds.size();

    /**
     * If the score is high enough, identity-based duplication detection should
     * be used.
     */
    return totalScore > _minElementsDuplicateScoreForAutoDetect;
  }
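
  /*
   * Worked example (illustrative numbers): if the source feed has 100 stop
   * ids, the target has 90, and 80 ids appear in both, the element-overlap
   * score is roughly (80/100 + 80/90) / 2 ~= 0.84 -- assuming
   * DuplicateScoringSupport.scoreElementOverlap averages the two overlap
   * ratios, mirroring the elementsInCommon computation in
   * hasLikelyFuzzyOverlap below. If the 80 shared-id pairs then average 0.9
   * under _duplicateScoringStrategy, both auto-detect thresholds are
   * typically cleared and IDENTITY detection is selected.
   */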

  /**
 * Determines if the sets of entities in the source and target feeds appear
 * to be similar enough under fuzzy matching to indicate that
   * {@link EDuplicateDetectionStrategy#FUZZY} duplicate detection can be used.
   * 
   * @param context
   * @return true if fuzzy duplicate detection seems appropriate
   */
  @SuppressWarnings("unchecked")
  private boolean hasLikelyFuzzyOverlap(GtfsMergeContext context) {

    GtfsRelationalDao source = context.getSource();
    GtfsMutableRelationalDao target = context.getTarget();

    /**
     * TODO: Fuzzy matching is expensive. Do we really want to compare all of
     * the entities? Or would a sufficiently-large subset do the trick? Can any
     * of this be cached for the actual duplicate detection later on?
     */
    Collection<T> targetEntities = (Collection<T>) target.getAllEntitiesForType(_entityType);
    Collection<T> sourceEntities = (Collection<T>) source.getAllEntitiesForType(_entityType);

    double duplicateElements = 0;
    double totalScore = 0.0;

    /**
     * First we determine a rough set of potentially overlapping entities based
     * on a fuzzy match.
     * 
     * We break up the list of searches and spread it across available CPUs.
     */
    int cpus = Runtime.getRuntime().availableProcessors();
    int start = 0;
    int increment = targetEntities.size() / cpus;
    int end = increment;
    ExecutorService executorService = Executors.newFixedThreadPool(cpus);
    List<Result> results = new ArrayList<Result>(cpus);
    if (end < 10) {
      // no need to partition if the set is small
      Set<T> remainingSourceEntities = new HashSet<T>(sourceEntities);
      Result result = new Result();
      results.add(result);
      executorService.submit(new ScoringTask<T>(context, _duplicateScoringStrategy, targetEntities, remainingSourceEntities, 0, targetEntities.size(), _minElementsInCommonScoreForAutoDetect, result));
    } else {
      for (int i = 0; i < cpus; i++) {
        Collection<T> t_targetEntities = (Collection<T>) target.getAllEntitiesForType(_entityType);
        Collection<T> t_sourceEntities = (Collection<T>) source.getAllEntitiesForType(_entityType);
        Set<T> t_remainingSourceEntities = new HashSet<T>(t_sourceEntities);

        // the last chunk absorbs any remainder left by the integer division
        if (i == cpus - 1) {
          end = targetEntities.size();
        }
        Result result = new Result();
        results.add(result);
        executorService.submit(new ScoringTask<T>(context, _duplicateScoringStrategy, t_targetEntities, t_remainingSourceEntities, start, end, _minElementsInCommonScoreForAutoDetect, result));
        start = end;
        end = end + increment;
      }
    }
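    /*
     * Partitioning example (illustrative numbers): with 103 target entities
     * and 4 CPUs, increment = 25 and the tasks cover the index ranges
     * [0, 25), [25, 50), [50, 75) and [75, 103), the last chunk absorbing
     * the remainder, so every target entity is scored exactly once.
     */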
    try {
      // give the executor a chance to run
      Thread.sleep(1 * 1000);
    } catch (InterruptedException e1) {
      executorService.shutdownNow();
      Thread.currentThread().interrupt();
      return false;
    }
    
    int i = 0;
    for (Result result : results) {
      while (!result.isDone()) {
        try {
          _log.info("waiting on thread[" + i + "] at " + (int)(result.getPercentComplete() * 100) + "% complete (" + _entityType + ")");
          Thread.sleep(30 * 1000);
        } catch (InterruptedException e) {
          executorService.shutdownNow();
          Thread.currentThread().interrupt();
          return false;
        }
      }
      duplicateElements += result.getDuplicateElements();
      totalScore += result.getTotalScore();
      i++;
      // we no longer remove the best match to avoid concurrency issues
    }

    executorService.shutdown();

    /**
     * There needs to be sufficient overlap between the two feeds for us to
     * consider using fuzzy duplicate detection in the first place.
     */
    double elementsInCommon = (duplicateElements / targetEntities.size() + duplicateElements
        / sourceEntities.size()) / 2;
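    /*
     * E.g. (illustrative numbers): 40 fuzzy duplicates against 100 target
     * entities and 160 source entities gives (40/100 + 40/160) / 2 = 0.325
     * elements in common.
     */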
    if (elementsInCommon < _minElementsInCommonScoreForAutoDetect) {
      return false;
    }

    /**
     * If there is sufficient overlap, only use fuzzy detection if the entities
     * themselves match well.
     */
    totalScore /= duplicateElements;
    return totalScore > _minElementsDuplicateScoreForAutoDetect;
  }

  @Override
  protected IdentityBean<?> getIdentityDuplicate(GtfsMergeContext context,
      IdentityBean<?> entity) {
    String rawId = getRawId(entity.getId());
    return (IdentityBean<?>) context.getEntityForRawId(rawId);
  }

  @SuppressWarnings("unchecked")
  @Override
  protected IdentityBean<?> getFuzzyDuplicate(GtfsMergeContext context,
      IdentityBean<?> entity) {
    GtfsMutableRelationalDao targetDao = context.getTarget();
    Collection<T> targets = (Collection<T>) targetDao.getAllEntitiesForType(_entityType);
    if (targets.isEmpty()) {
      return null;
    }
    Max<T> best = new Max<T>();
    for (T target : targets) {
      /**
       * If we just added the target entity as part of the current feed, do not
       * attempt a fuzzy match against it.
       */
      String targetRawId = getRawId(target.getId());
      if (context.isEntityJustAddedWithRawId(targetRawId)) {
        continue;
      }
      double score = _duplicateScoringStrategy.score(context, (T) entity,
          target);
      best.add(score, target);
    }
    if (best.getMaxValue() < _minElementsDuplicateScoreForAutoDetect) {
      return null;
    }
    return (IdentityBean<?>) best.getMaxElement();
  }
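
  /*
   * E.g. (illustrative numbers): if the candidate targets score 0.3, 0.82 and
   * 0.6 against the incoming entity and the auto-detect threshold is 0.5, the
   * 0.82 candidate is returned; if no candidate reaches the threshold, null
   * signals that the entity should be treated as new.
   */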

  /**
   * Saves the specified entity to the merged output feed. If the raw id of the
   * entity duplicates an existing entity in the output feed, its id will be
   * renamed.
   * 
   * @param context
   * @param entity
   */
  @Override
  protected void save(GtfsMergeContext context, IdentityBean<?> entity) {
    String rawId = getRawId(entity.getId());
    /**
     * If an element with the same id has already been saved, we need to
     * rename this one.
     */
    if (context.getEntityForRawId(rawId) != null) {
      rename(context, entity);
      rawId = getRawId(entity.getId());
    }
    context.putEntityWithRawId(rawId, entity);
    super.save(context, entity);
  }
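
  /*
   * Illustrative flow: if the incoming entity's raw id "stop42" is already
   * registered in the merge context, rename(...) below gives the entity a
   * fresh id first, and the entity is then registered and saved under the new
   * raw id, so the merged feed never contains two entities with the same key.
   */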

  /**
   * Converts the entity identifier into a raw GTFS identifier string. This is
   * what we actually use for identity duplicate detection.
   * 
   * @param id
   * @return the raw GTFS id
   */
  private String getRawId(Object id) {
    if (id instanceof String) {
      return (String) id;
    } else if (id instanceof AgencyAndId) {
      return ((AgencyAndId) id).getId();
    }
    throw new UnsupportedOperationException(
        "cannot generate raw key for type: " + id.getClass());
  }
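
  /*
   * E.g. getRawId(new AgencyAndId("1", "stop42")) returns "stop42". The
   * agency qualifier is dropped so that ids are compared the way they appear
   * in the raw GTFS files, where ids are not agency-qualified.
   */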

  /**
   * Rename the id of the specified entity to avoid a raw GTFS identifier
   * collision in the merged output feed.
   * 
   * @param context
   * @param entity
   */
  @SuppressWarnings("unchecked")
  protected void rename(GtfsMergeContext context, IdentityBean<?> entity) {
    Object id = entity.getId();
    if (id instanceof AgencyAndId) {
      IdentityBean<AgencyAndId> bean = (IdentityBean<AgencyAndId>) entity;
      AgencyAndId agencyAndId = bean.getId();
      AgencyAndId newAgencyAndId;
      if (this.getDuplicateRenamingStrategy() == EDuplicateRenamingStrategy.AGENCY) {
        newAgencyAndId = MergeSupport.renameAgencyAndId(agencyAndId.getAgencyId()+"-", agencyAndId);
        _log.debug(agencyAndId.toString() + " renamed(1) to " + newAgencyAndId);
      } else {
        newAgencyAndId = MergeSupport.renameAgencyAndId(context, agencyAndId);
        _log.debug(agencyAndId.toString() + " renamed(2) to " + newAgencyAndId);
      }
      
      bean.setId(newAgencyAndId);
    }
  }
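
  /*
   * Illustrative effect (assuming MergeSupport.renameAgencyAndId prepends the
   * given prefix to the raw id part): under the AGENCY strategy an id like
   * 1_route5 becomes roughly 1_1-route5, while the context-based variant
   * applies a feed-specific prefix instead; either way only the raw id part
   * changes, never the agency id.
   */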
  
  private static class Result {
    private double duplicateElements = 0.0;
    private double totalScore = 0.0;
    // read by the waiting thread while a scoring task runs, so volatile for
    // visibility; the write to done also publishes the score fields set before it
    private volatile boolean done = false;
    private volatile double percentComplete = 0.0;
    public Result() {
    }
    public double getDuplicateElements() {
      return duplicateElements;
    }
    public void setDuplicateElements(double duplicateElements) {
      this.duplicateElements = duplicateElements;
    }
    public double getTotalScore() {
      return totalScore;
    }
    public void setTotalScore(double totalScore) {
      this.totalScore = totalScore;
    }
    public void setDone() {
      done = true;
    }
    public boolean isDone() {
      return done;
    }
    public double getPercentComplete() {
      return percentComplete;
    }
    public void setPercentComplete(double percentComplete) {
      this.percentComplete = percentComplete;
    }
  }
  
  public static class ScoringTask<T extends IdentityBean<?>> implements Runnable {
    private GtfsMergeContext context;
    protected AndDuplicateScoringStrategy<T> duplicateScoringStrategy;
    private Collection<T> targetEntities;
    private Collection<T> remainingSourceEntities;
    private int start;
    private int end;
    private double min;
    private Result result;

    public Result getResult() {
      return result;
    }
    
    public ScoringTask(GtfsMergeContext context,
        AndDuplicateScoringStrategy<T> duplicateScoringStrategy,
        Collection<T> targetEntities,
        Collection<T> remainingSourceEntities,
        int start,
        int end,
        double min, Result result) {
      this.context = context;
      this.duplicateScoringStrategy = duplicateScoringStrategy;
      this.targetEntities = targetEntities;
      this.remainingSourceEntities = remainingSourceEntities;
      this.start = start;
      this.end = end;
      this.min = min;
      this.result = result;
    }

    @Override
    public void run() {
      try {
        score(context, duplicateScoringStrategy, targetEntities, remainingSourceEntities, start, end, min, result);
      } catch (Throwable t) {
        _log.error("scoring thread broke:", t);
      } finally {
        result.setDone();
      }
    }
    
    private void score(GtfsMergeContext context,
        AndDuplicateScoringStrategy<T> duplicateScoringStrategy,
        Collection<T> targetEntities,
        Collection<T> remainingSourceEntities,
        int start,
        int end,
        double min,
        Result result) {
      double duplicateElements = 0;
      double totalScore = 0;
      Iterator<T> iterator = targetEntities.iterator();
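      // advance the iterator to the first entity of this task's [start, end) chunk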
      for (int i=0; i < start; i++) {
        iterator.next();
      }
      for (int i = start; i < end; i++) {
        if (i % 20 == 0) {
          double percent = ((double) (i - start)) / (end - start);
          result.setPercentComplete(percent);
        }
        T targetEntity = iterator.next();
        Max<T> best = new Max<T>();
        for (T sourceEntity : remainingSourceEntities) {
          double score = duplicateScoringStrategy.score(context, sourceEntity,
              targetEntity);
          if (score < min) {
            continue;
          }
          best.add(score, sourceEntity);
        }

        if (best.getMaxElement() != null) {
          duplicateElements++;
          totalScore += best.getMaxValue();
        }
      }
      result.setDuplicateElements(duplicateElements);
      result.setTotalScore(totalScore);
    }

  }
}