package com.conveyal.gtfs.loader;
import com.conveyal.gtfs.model.Calendar;
import com.conveyal.gtfs.model.CalendarDate;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import org.apache.commons.dbutils.DbUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.time.format.DateTimeFormatter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import static com.conveyal.gtfs.loader.JdbcGtfsLoader.createFeedRegistryIfNotExists;
import static com.conveyal.gtfs.loader.JdbcGtfsLoader.createSchema;
import static com.conveyal.gtfs.util.Util.randomIdString;
/**
* This class takes a feedId that represents a feed already in the database and creates a copy of the entire feed.
* All tables except for the derived error and service tables are copied over (the derived pattern and pattern stop
* tables ARE copied over).
*
* This copy functionality is intended to make the feed editable and so the resulting feed
* tables are somewhat modified from their original read-only source. For instance, the ID column has been modified
* so that it is an auto-incrementing serial integer, changing the meaning of the column from csv_line (for feeds
* loaded from GTFS) to a unique identifier used to reference entities in an API.
*/
public class JdbcGtfsSnapshotter {
private static final Logger LOG = LoggerFactory.getLogger(JdbcGtfsSnapshotter.class);
private final DataSource dataSource;
/**
* Whether to normalize stop_times#stop_sequence values on snapshot (or leave them intact).
*
* TODO: if more options are added in the future, this should be folded into a SnapshotOptions
* object.
*/
private final boolean normalizeStopTimes;
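// For example, with normalization enabled, stop_sequence values 5, 10, 20 on a trip become 0, 1, 2.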
// These fields will be filled in once feed snapshot begins.
private Connection connection;
private String tablePrefix;
// The reference feed ID (namespace) to copy.
private final String feedIdToSnapshot;
/**
* @param feedId namespace (schema) to snapshot. If null, a blank snapshot will be created.
* @param dataSource the JDBC data source with database connection details
* @param normalizeStopTimes whether to normalize stop_sequence values to be zero-based and incrementing
* (if false, the original values are kept intact)
*/
public JdbcGtfsSnapshotter(String feedId, DataSource dataSource, boolean normalizeStopTimes) {
this.feedIdToSnapshot = feedId;
this.dataSource = dataSource;
this.normalizeStopTimes = normalizeStopTimes;
}
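// A minimal usage sketch (how the DataSource is obtained and the "abcd1234" source namespace are
// hypothetical, not prescribed by this class):
//
//   DataSource dataSource = ...; // e.g. a PostgreSQL connection pool
//   JdbcGtfsSnapshotter snapshotter = new JdbcGtfsSnapshotter("abcd1234", dataSource, true);
//   SnapshotResult result = snapshotter.copyTables();
//   // result.uniqueIdentifier holds the namespace of the newly created snapshot.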
/**
* Copy primary entity tables as well as Pattern and PatternStops tables.
*/
public SnapshotResult copyTables() {
// This result object will be returned to the caller to summarize the feed and report any critical errors.
SnapshotResult result = new SnapshotResult();
try {
long startTime = System.currentTimeMillis();
// We get a single connection object and share it across several different methods.
// This ensures that actions taken in one method are visible to all subsequent SQL statements.
// If we create a schema or table on one connection, then access it in a separate connection, we have no
// guarantee that it exists when the accessing statement is executed.
connection = dataSource.getConnection();
// Generate a unique prefix that will identify this feed.
this.tablePrefix = randomIdString();
result.uniqueIdentifier = tablePrefix;
// Create entry in snapshots table.
registerSnapshot();
// Include the dot separator in the table prefix.
// This allows everything to work even when there's no prefix.
this.tablePrefix += ".";
// Copy each table in turn
// FIXME: NO non-fatal exception errors are being captured during copy operations.
result.agency = copy(Table.AGENCY, true);
result.calendar = copy(Table.CALENDAR, true);
result.calendarDates = copy(Table.CALENDAR_DATES, true);
result.fareAttributes = copy(Table.FARE_ATTRIBUTES, true);
result.fareRules = copy(Table.FARE_RULES, true);
result.feedInfo = copy(Table.FEED_INFO, true);
result.frequencies = copy(Table.FREQUENCIES, true);
result.routes = copy(Table.ROUTES, true);
// FIXME: Find some place to store errors encountered on copy for patterns and pattern stops.
copy(Table.PATTERNS, true);
copy(Table.PATTERN_STOP, true);
// See method comments for why different logic is needed for this table.
result.scheduleExceptions = createScheduleExceptionsTable();
result.shapes = copy(Table.SHAPES, true);
result.stops = copy(Table.STOPS, true);
// TODO: Should we defer index creation on stop times?
// Copying all tables for STIF w/ stop times idx = 156 sec; w/o = 28 sec
// Other feeds w/ stop times AC Transit = 3 sec; Brooklyn bus =
result.stopTimes = copy(Table.STOP_TIMES, true);
result.transfers = copy(Table.TRANSFERS, true);
result.trips = copy(Table.TRIPS, true);
result.completionTime = System.currentTimeMillis();
result.loadTimeMillis = result.completionTime - startTime;
LOG.info("Copying tables took {} sec", (result.loadTimeMillis) / 1000);
} catch (Exception ex) {
// Note: Exceptions that occur during individual table loads are separately caught and stored in
// TableLoadResult.
LOG.error("Exception while creating snapshot: {}", ex.toString());
ex.printStackTrace();
result.fatalException = ex.toString();
} finally {
if (connection != null) DbUtils.closeQuietly(connection);
}
return result;
}
/**
* This is the main table copy method that wraps a call to Table#createSqlTableFrom and creates indexes for
* the table.
*/
private TableLoadResult copy (Table table, boolean createIndexes) {
// This object will be returned to the caller to summarize the contents of the table and any errors.
// FIXME: Should there be a separate TableSnapshotResult? Load result is empty except for fatal exception.
TableLoadResult tableLoadResult = new TableLoadResult();
try {
// FIXME this is confusing, we only create a new table object so we can call a couple of methods on it,
// all of which just need a list of fields.
Table targetTable = new Table(tablePrefix + table.name, table.entityClass, table.required, table.fields);
boolean success;
if (feedIdToSnapshot == null) {
// If there is no feedId to snapshot (i.e., we're making an empty snapshot), simply create the table.
success = targetTable.createSqlTable(connection, true);
} else {
// Otherwise, use the createTableFrom method to copy the data from the original.
String fromTableName = String.format("%s.%s", feedIdToSnapshot, table.name);
LOG.info("Copying table {} to {}", fromTableName, targetTable.name);
success = targetTable.createSqlTableFrom(connection, fromTableName, normalizeStopTimes);
}
// Only create indexes if table creation was successful.
if (success && createIndexes) {
addEditorSpecificFields(connection, tablePrefix, table);
// Use spec table to create indexes. See createIndexes method for more info on why.
table.createIndexes(connection, tablePrefix);
// Populate default values for editor fields, including normalization of stop time stop sequences.
populateDefaultEditorValues(connection, tablePrefix, table);
}
LOG.info("Committing transaction...");
connection.commit();
LOG.info("Done.");
} catch (Exception ex) {
tableLoadResult.fatalException = ex.toString();
LOG.error("Error: ", ex);
try {
connection.rollback();
} catch (SQLException e) {
e.printStackTrace();
}
}
return tableLoadResult;
}
/**
* Special logic is needed for creating the schedule_exceptions table.
*
* gtfs-lib has some more advanced data types in addition to those available in the calendar_dates.txt file of the
* GTFS specification. The schedule_exceptions table is the source of truth for exceptions to regular schedules.
* When exporting to a GTFS, the calendar_dates table is completely ignored if the schedule_exceptions table exists.
*
* When creating a snapshot, if the schedule_exceptions table doesn't currently exist, it is assumed that the feed
* being copied has just been imported and the additional data backing schedule_exceptions has not been generated yet.
* If the schedule_exceptions table does already exist, it is simply copied over.
*
* If the calendar table does not exist in the feed being copied from, the imported feed probably did not have a
* calendar.txt file. In that case, the schedule exceptions must be generated from the calendar_dates table, and the
* calendar table must also be populated with dummy entries so that the data can be exported to GTFS given the
* current logic in JdbcGtfsExporter: JdbcGtfsExporter will only export calendar_dates that have a corresponding
* entry in the calendar table and a service span that applies to a schedule_exception. Furthermore, the dummy
* calendar entries are currently needed for the downstream projects datatools-server/datatools-ui to work properly.
*/
private TableLoadResult createScheduleExceptionsTable() {
// check to see if the schedule_exceptions table exists
boolean scheduleExceptionsTableExists = tableExists(feedIdToSnapshot, "schedule_exceptions");
String scheduleExceptionsTableName = tablePrefix + "schedule_exceptions";
if (scheduleExceptionsTableExists) {
// schedule_exceptions table already exists in namespace being copied from. Therefore, we simply copy it.
return copy(Table.SCHEDULE_EXCEPTIONS, true);
} else {
// schedule_exceptions does not exist. Therefore, we generate schedule_exceptions from the calendar_dates.
TableLoadResult tableLoadResult = new TableLoadResult();
try {
Table.SCHEDULE_EXCEPTIONS.createSqlTable(
connection,
tablePrefix.replace(".", ""),
true
);
String sql = String.format(
"insert into %s (name, dates, exemplar, added_service, removed_service) values (?, ?, ?, ?, ?)",
scheduleExceptionsTableName
);
PreparedStatement scheduleExceptionsStatement = connection.prepareStatement(sql);
final BatchTracker scheduleExceptionsTracker = new BatchTracker(
"schedule_exceptions",
scheduleExceptionsStatement
);
JDBCTableReader<CalendarDate> calendarDatesReader = new JDBCTableReader<>(
Table.CALENDAR_DATES,
dataSource,
feedIdToSnapshot + ".",
EntityPopulator.CALENDAR_DATE
);
Iterable<CalendarDate> calendarDates = calendarDatesReader.getAll();
// Keep track of calendars by service_id in case we need to add dummy calendar entries.
Map<String, Calendar> dummyCalendarsByServiceId = new HashMap<>();
// Iterate through calendar dates to build maps from each date to the service_ids added or removed on that date.
Multimap<String, String> removedServiceForDate = HashMultimap.create();
Multimap<String, String> addedServiceForDate = HashMultimap.create();
for (CalendarDate calendarDate : calendarDates) {
// Skip any null dates
if (calendarDate.date == null) {
LOG.warn("Encountered calendar date record with null value for date field. Skipping.");
continue;
}
String date = calendarDate.date.format(DateTimeFormatter.BASIC_ISO_DATE);
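// BASIC_ISO_DATE renders dates without separators, e.g. 2023-01-15 becomes "20230115".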
if (calendarDate.exception_type == 1) {
addedServiceForDate.put(date, calendarDate.service_id);
// Create (if needed) and extend the date range of the dummy calendar entry that will be inserted
// if the source feed lacks a calendar.txt file.
Calendar calendar = dummyCalendarsByServiceId.getOrDefault(calendarDate.service_id, new Calendar());
calendar.service_id = calendarDate.service_id;
if (calendar.start_date == null || calendar.start_date.isAfter(calendarDate.date)) {
calendar.start_date = calendarDate.date;
}
if (calendar.end_date == null || calendar.end_date.isBefore(calendarDate.date)) {
calendar.end_date = calendarDate.date;
}
dummyCalendarsByServiceId.put(calendarDate.service_id, calendar);
} else {
removedServiceForDate.put(date, calendarDate.service_id);
}
}
// Iterate through dates with added or removed service and add to database.
// For usability and simplicity of code, don't attempt to find all dates with similar
// added and removed services, but simply create an entry for each found date.
for (String date : Sets.union(removedServiceForDate.keySet(), addedServiceForDate.keySet())) {
scheduleExceptionsStatement.setString(1, date);
String[] dates = {date};
scheduleExceptionsStatement.setArray(2, connection.createArrayOf("text", dates));
scheduleExceptionsStatement.setInt(3, 9); // FIXME use better static type
scheduleExceptionsStatement.setArray(
4,
connection.createArrayOf("text", addedServiceForDate.get(date).toArray())
);
scheduleExceptionsStatement.setArray(
5,
connection.createArrayOf("text", removedServiceForDate.get(date).toArray())
);
scheduleExceptionsTracker.addBatch();
}
scheduleExceptionsTracker.executeRemaining();
// Fetch all entries in the calendar table to build the set of service_ids that already exist in the
// calendar table.
JDBCTableReader<Calendar> calendarReader = new JDBCTableReader<>(
Table.CALENDAR,
dataSource,
feedIdToSnapshot + ".",
EntityPopulator.CALENDAR
);
Set<String> calendarServiceIds = new HashSet<>();
for (Calendar calendar : calendarReader.getAll()) {
calendarServiceIds.add(calendar.service_id);
}
// For service_ids that only existed in the calendar_dates table, insert auto-generated, "blank"
// (no days of week specified) calendar entries.
sql = String.format(
"insert into %s (service_id, description, start_date, end_date, " +
"monday, tuesday, wednesday, thursday, friday, saturday, sunday)" +
"values (?, ?, ?, ?, 0, 0, 0, 0, 0, 0, 0)",
tablePrefix + "calendar"
);
PreparedStatement calendarStatement = connection.prepareStatement(sql);
final BatchTracker calendarsTracker = new BatchTracker(
"calendar",
calendarStatement
);
for (Calendar dummyCalendar : dummyCalendarsByServiceId.values()) {
if (calendarServiceIds.contains(dummyCalendar.service_id)) {
// This service_id already exists in the calendar table. No need to create auto-generated entry.
continue;
}
calendarStatement.setString(1, dummyCalendar.service_id);
calendarStatement.setString(
2,
String.format("%s (auto-generated)", dummyCalendar.service_id)
);
calendarStatement.setString(
3,
dummyCalendar.start_date.format(DateTimeFormatter.BASIC_ISO_DATE)
);
calendarStatement.setString(
4,
dummyCalendar.end_date.format(DateTimeFormatter.BASIC_ISO_DATE)
);
calendarsTracker.addBatch();
}
calendarsTracker.executeRemaining();
connection.commit();
} catch (Exception e) {
tableLoadResult.fatalException = e.toString();
LOG.error("Error creating schedule Exceptions: ", e);
e.printStackTrace();
try {
connection.rollback();
} catch (SQLException ex) {
ex.printStackTrace();
}
}
LOG.info("done creating schedule exceptions");
return tableLoadResult;
}
}
/**
* Helper method to determine if a table exists within a namespace.
*/
private boolean tableExists(String namespace, String tableName) {
// Preempt SQL check with null check of either namespace or table name.
if (namespace == null || tableName == null) return false;
try {
// This statement is postgres-specific.
PreparedStatement tableExistsStatement = connection.prepareStatement(
"SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = ? AND table_name = ?)"
);
tableExistsStatement.setString(1, namespace);
tableExistsStatement.setString(2, tableName);
ResultSet resultSet = tableExistsStatement.executeQuery();
resultSet.next();
return resultSet.getBoolean(1);
} catch (SQLException e) {
e.printStackTrace();
return false;
}
}
/**
* Add columns for any required, optional, or editor fields that don't already exist as columns on the table.
* This method contains a SQL statement that requires PostgreSQL 9.6+.
*/
private void addEditorSpecificFields(Connection connection, String tablePrefix, Table table) throws SQLException {
LOG.info("Adding any missing columns for {}", tablePrefix + table.name);
Statement statement = connection.createStatement();
for (Field field : table.editorFields()) {
// The following statement requires PostgreSQL 9.6+.
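// For example, for the routes table this might emit something like the following (the prefix and exact
// column type are illustrative): ALTER TABLE abcd1234.routes ADD COLUMN IF NOT EXISTS status integer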
String addColumnSql = String.format("ALTER TABLE %s ADD COLUMN IF NOT EXISTS %s %s",
tablePrefix + table.name,
field.name,
field.getSqlTypeName());
LOG.info(addColumnSql);
statement.execute(addColumnSql);
}
}
/**
* Populates editor-specific fields added during GTFS-to-snapshot operation with default values. This method also
* "normalizes" the stop sequences for stop times to zero-based, incremented integers. NOTE: stop time normalization
* can take a significant amount of time (for large feeds up to 5 minutes) if the update is large.
*/
private void populateDefaultEditorValues(Connection connection, String tablePrefix, Table table) throws SQLException {
Statement statement = connection.createStatement();
if (Table.ROUTES.name.equals(table.name)) {
// Set default values for route status and publicly visible to "Approved" and "Public", respectively.
// This prevents unexpected results when users attempt to export a GTFS feed from the editor and no
// routes are exported due to undefined values for status and publicly visible.
String updateStatusSql = String.format(
"update %sroutes set status = 2, publicly_visible = 1 where status is NULL AND publicly_visible is NULL",
tablePrefix);
int updatedRoutes = statement.executeUpdate(updateStatusSql);
LOG.info("Updated status for {} routes", updatedRoutes);
}
if (Table.CALENDAR.name.equals(table.name)) {
// Set default values for description field. Basically constructs a description from the days of the week
// for which the calendar is active.
LOG.info("Updating calendar descriptions");
String[] daysOfWeek = new String[]{"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"};
String concatenatedDaysOfWeek = String.join(", ",
Arrays.stream(daysOfWeek)
.map(d -> String.format(
"case %s when 1 then '%s' else '' end",
d,
// Capitalize the first letter and keep only the first two, converting e.g. "monday" -> "Mo".
d.substring(0, 1).toUpperCase() + d.substring(1, 2))).toArray(String[]::new));
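// Because concat() joins its arguments without separators, a Monday-through-Friday calendar ends up
// with the description "MoTuWeThFr".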
String updateOtherSql = String.format(
"update %scalendar set description = concat(%s) where description is NULL",
tablePrefix,
concatenatedDaysOfWeek);
LOG.info(updateOtherSql);
int calendarsUpdated = statement.executeUpdate(updateOtherSql);
LOG.info("Updated description for {} calendars", calendarsUpdated);
}
if (Table.TRIPS.name.equals(table.name)) {
// Update use_frequency field for patterns. This sets all patterns that have a frequency trip to use
// frequencies. NOTE: This is performed after copying the TRIPS table rather than after PATTERNS because
// all three tables (patterns, trips, and frequencies) must exist for the operation to succeed.
// TODO: How should this handle patterns that have both timetable- and frequency-based trips?
// NOTE: The below substitution uses argument indexing. All values "%1$s" reference the first argument
// supplied (i.e., tablePrefix).
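// For example, with a tablePrefix of "abcd1234." (illustrative), "%1$spatterns" expands to "abcd1234.patterns".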
String updatePatternsSql = String.format(
"update %1$spatterns set use_frequency = 1 " +
"from (select distinct %1$strips.pattern_id from %1$strips, %1$sfrequencies where %1$sfrequencies.trip_id = %1$strips.trip_id) freq " +
"where freq.pattern_id = %1$spatterns.pattern_id",
tablePrefix);
LOG.info(updatePatternsSql);
int patternsUpdated = statement.executeUpdate(updatePatternsSql);
LOG.info("Updated use_frequency for {} patterns", patternsUpdated);
}
// TODO: Add simple conversion from calendar_dates to schedule_exceptions if no exceptions exist? See
// https://github.com/catalogueglobal/datatools-server/issues/80
}
/**
* Add a line to the list of loaded feeds to record the snapshot and which feed the snapshot replicates.
*/
private void registerSnapshot () {
try {
// We cannot simply insert into the feeds table because if we are creating an empty snapshot (to create/edit
// a GTFS feed from scratch), the feed registry table will not exist.
// TODO copy over feed_id and feed_version from source namespace?
// TODO: Record total snapshot processing time?
createFeedRegistryIfNotExists(connection);
createSchema(connection, tablePrefix);
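// The five nulls below skip feed-file metadata (checksums, feed_id/feed_version, filename) that only
// applies to feeds loaded from GTFS files; the remaining parameters record the new namespace, the creation
// time, the namespace this snapshot copies, and a false deleted flag (column roles assumed from the
// registry table created by createFeedRegistryIfNotExists).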
PreparedStatement insertStatement = connection.prepareStatement(
"insert into feeds values (?, null, null, null, null, null, current_timestamp, ?, false)");
insertStatement.setString(1, tablePrefix);
insertStatement.setString(2, feedIdToSnapshot);
insertStatement.execute();
connection.commit();
LOG.info("Created new snapshot namespace: {}", insertStatement);
} catch (Exception ex) {
LOG.error("Exception while registering snapshot namespace in feeds table", ex);
DbUtils.closeQuietly(connection);
}
}
}