com.conveyal.gtfs.loader.JdbcGtfsLoader Maven / Gradle / Ivy
package com.conveyal.gtfs.loader;
import com.conveyal.gtfs.error.NewGTFSError;
import com.conveyal.gtfs.error.NewGTFSErrorType;
import com.conveyal.gtfs.error.SQLErrorStorage;
import com.conveyal.gtfs.storage.StorageException;
import com.csvreader.CsvReader;
import com.google.common.hash.HashCode;
import com.google.common.hash.Hashing;
import com.google.common.io.Files;
import org.apache.commons.dbutils.DbUtils;
import org.postgresql.copy.CopyManager;
import org.postgresql.core.BaseConnection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.sql.DataSource;
import java.io.*;
import java.sql.*;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import static com.conveyal.gtfs.error.NewGTFSErrorType.*;
import static com.conveyal.gtfs.model.Entity.human;
import static com.conveyal.gtfs.util.Util.randomIdString;
/**
* This class loads CSV tables from zipped GTFS into an SQL relational database management system with a JDBC driver.
* By comparing the GTFS specification for a table with the headers present in the GTFS CSV, it dynamically builds up
* table definitions and SQL statements to interact with those tables. It retains all columns present in the GTFS,
* including optional columns, known extensions, and unrecognized proprietary extensions.
*
* It supports several ways of putting the data into the tables: batched prepared inserts or loading from an
* intermediate tab separated text file.
*
* Our previous approach involved loading GTFS CSV tables into Java objects and then using an object-relational mapping
* to put those objects into a database. In that case a fixed number of fields are represented. If the GTFS feed
* contains extra proprietary fields, they are lost immediately on import. The Java model objects must contain fields
* for all GTFS columns that will ever be retrieved, and memory and database space are required to represent all those
* fields even when they are not present in a particular feed. The same would be true of a direct-to-database approach
* if multiple feeds were loaded into the same tables: a "lowest common denominator" set of fields would need to be
* selected. However, we create one set of tables per GTFS feed loaded into the same database and namespace them with
* what the SQL spec calls "schemas".
*
* The structure of the CSV file and the data it contains are validated line by line during the load process. This
* allows us to handle error recovery ourselves, recording detailed messages about the largest possible number of errors
* rather than failing at the first error.
*
* This is important because of the generally long turnaround for GTFS publication, repair, and validation. If a newly
* submitted feed fails to import because of a missing file, we don't want to report that single error to the feed
* producer, only to discover and report additional errors when the repaired feed is re-submitted weeks later.
*
* The fact that existing libraries would abort a GTFS import upon encountering very common, recoverable errors was the
* original motivation for creating gtfs-lib. Error recovery is all the more important when bulk-loading data into
* database systems - the Postgres 'copy' import is particularly brittle and does not provide error messages that would
* help the feed producer repair their feed.
*
* The validation performed during CSV loading includes:
* - columns are present for all required fields
* - all rows have the same number of fields as there are headers
* - fields do not contain problematic characters
* - field contents can be converted to the target data types and are in range
* - referential integrity
*/
public class JdbcGtfsLoader {
public static final long INSERT_BATCH_SIZE = 500;
// Represents null in Postgres text format
private static final String POSTGRES_NULL_TEXT = "\\N";
private static final Logger LOG = LoggerFactory.getLogger(JdbcGtfsLoader.class);
private String gtfsFilePath;
protected ZipFile zip;
private File tempTextFile;
private PrintStream tempTextFileStream;
private PreparedStatement insertStatement = null;
private final DataSource dataSource;
// These fields will be filled in once feed loading begins.
private Connection connection;
private String tablePrefix;
private SQLErrorStorage errorStorage;
// Contains references to unique entity IDs during load stage used for referential integrity check.
private ReferenceTracker referenceTracker = new ReferenceTracker();
public JdbcGtfsLoader(String gtfsFilePath, DataSource dataSource) {
this.gtfsFilePath = gtfsFilePath;
this.dataSource = dataSource;
}
/** Get SQL string for creating the feed registry table (AKA, the "feeds" table). */
public static String getCreateFeedRegistrySQL() {
return "create table if not exists feeds (namespace varchar primary key, md5 varchar, " +
"sha1 varchar, feed_id varchar, feed_version varchar, filename varchar, loaded_date timestamp, " +
"snapshot_of varchar, deleted boolean)";
}
// Hash to uniquely identify files.
// We can't use CRC32, the probability of collision on 10k items is about 1%.
// https://stackoverflow.com/a/1867252
// http://preshing.com/20110504/hash-collision-probabilities/
// On the full NL feed:
// MD5 took 820 msec, cabb18e43798f92c52d5d0e49f52c988
// Murmur took 317 msec, 5e5968f9bf5e1cdf711f6f48fcd94355
// SHA1 took 1072 msec, 9fb356af4be2750f20955203787ec6f95d32ef22
// There appears to be no advantage to loading tables in parallel, as the whole loading process is I/O bound.
public FeedLoadResult loadTables () {
// This result object will be returned to the caller to summarize the feed and report any critical errors.
FeedLoadResult result = new FeedLoadResult();
try {
// Begin tracking time. FIXME: should this follow the connect/register and begin with the table loads?
long startTime = System.currentTimeMillis();
// We get a single connection object and share it across several different methods.
// This ensures that actions taken in one method are visible to all subsequent SQL statements.
// If we create a schema or table on one connection, then access it in a separate connection, we have no
// guarantee that it exists when the accessing statement is executed.
connection = dataSource.getConnection();
File gtfsFile = new File(gtfsFilePath);
this.zip = new ZipFile(gtfsFilePath);
// Generate a unique prefix that will identify this feed.
// Prefixes ("schema" names) based on feed_id and feed_version get very messy, so we use random unique IDs.
// We don't want to use an auto-increment numeric primary key because these need to be alphabetical.
// Although ID collisions are theoretically possible, they are improbable in the extreme because our IDs
// are long enough to have as much entropy as a UUID. So we don't really need to check for uniqueness and
// retry in a loop.
// TODO handle the case where we don't want any prefix.
this.tablePrefix = randomIdString();
result.filename = gtfsFilePath;
result.uniqueIdentifier = tablePrefix;
// The order of the following four lines should not be changed because the schema needs to be in place
// before the error storage can be constructed, which in turn needs to exist in case any errors are
// encountered during the loading process.
{
createSchema(connection, tablePrefix);
//the SQLErrorStorage constructor expects the tablePrefix to contain the dot separator.
this.errorStorage = new SQLErrorStorage(connection, tablePrefix + ".", true);
//registerFeed accesses this.tablePrefix which shouldn't contain the dot separator.
registerFeed(gtfsFile);
// Include the dot separator in the table prefix from this point onwards.
// This allows everything to work even when there's no prefix.
this.tablePrefix += ".";
}
// Load each table in turn, saving some summary information about what happened during each table load
result.agency = load(Table.AGENCY);
result.calendar = load(Table.CALENDAR);
result.calendarDates = load(Table.CALENDAR_DATES);
result.routes = load(Table.ROUTES);
result.fareAttributes = load(Table.FARE_ATTRIBUTES);
result.fareRules = load(Table.FARE_RULES);
result.feedInfo = load(Table.FEED_INFO);
result.shapes = load(Table.SHAPES);
result.stops = load(Table.STOPS);
result.transfers = load(Table.TRANSFERS);
result.trips = load(Table.TRIPS); // refs routes
result.frequencies = load(Table.FREQUENCIES); // refs trips
result.stopTimes = load(Table.STOP_TIMES);
result.errorCount = errorStorage.getErrorCount();
// This will commit and close the single connection that has been shared between all preceding load steps.
errorStorage.commitAndClose();
zip.close();
result.completionTime = System.currentTimeMillis();
result.loadTimeMillis = result.completionTime - startTime;
LOG.info("Loading tables took {} sec", result.loadTimeMillis / 1000);
} catch (Exception ex) {
// TODO catch exceptions separately while loading each table so load can continue, store in TableLoadResult
LOG.error("Exception while loading GTFS file: {}", ex.toString());
ex.printStackTrace();
result.fatalException = ex.toString();
} finally {
if (connection != null) DbUtils.closeQuietly(connection);
}
return result;
}
/**
* Creates a schema/namespace in the database WITHOUT committing the changes.
* This does *not* setup any other tables or enter the schema name in a registry (@see #registerFeed).
*
* @param connection Connection to the database to create the schema on.
* @param schemaName Name of the schema (i.e. table prefix). Should not include the dot suffix.
*/
static void createSchema (Connection connection, String schemaName) {
try {
Statement statement = connection.createStatement();
// FIXME do the following only on databases that support schemas.
// SQLite does not support them. Is there any advantage of schemas over flat tables?
statement.execute("create schema " + schemaName);
LOG.info("Created new feed schema: {}", statement);
} catch (Exception ex) {
LOG.error("Exception while registering new feed namespace in feeds table: {}", ex.getMessage());
DbUtils.closeQuietly(connection);
}
}
/**
* Add a line to the list of loaded feeds showing that this feed has been loaded.
* We used to inspect feed_info here so we could make our table prefix based on feed ID and version.
* Now we just load feed_info like any other table.
* // Create a row in the table of loaded feeds for this feed
* Really this is not just making the table prefix - it's loading the feed_info and should also calculate hashes.
*
* Originally we were flattening all feed_info files into one root-level table, but that forces us to drop any
* custom fields in feed_info.
*/
private void registerFeed (File gtfsFile) {
// FIXME is this extra CSV reader used anymore? Check comment below.
// First, inspect feed_info.txt to extract the ID and version.
// We could get this with SQL after loading, but feed_info, feed_id and feed_version are all optional.
CsvReader csvReader = Table.FEED_INFO.getCsvReader(zip, errorStorage);
String feedId = "", feedVersion = "";
if (csvReader != null) {
// feed_info.txt has been found and opened.
try {
csvReader.readRecord();
// csvReader.get() returns the empty string for missing columns
feedId = csvReader.get("feed_id");
feedVersion = csvReader.get("feed_version");
} catch (IOException e) {
LOG.error("Exception while inspecting feed_info: {}", e);
}
csvReader.close();
}
try {
HashCode md5 = Files.hash(gtfsFile, Hashing.md5());
String md5Hex = md5.toString();
HashCode sha1 = Files.hash(gtfsFile, Hashing.sha1());
String shaHex = sha1.toString();
createFeedRegistryIfNotExists(connection);
// TODO try to get the feed_id and feed_version out of the feed_info table
// statement.execute("select * from feed_info");
// current_timestamp seems to be the only standard way to get the current time across all common databases.
// Record total load processing time?
PreparedStatement insertStatement = connection.prepareStatement(
"insert into feeds values (?, ?, ?, ?, ?, ?, current_timestamp, null, false)");
insertStatement.setString(1, tablePrefix);
insertStatement.setString(2, md5Hex);
insertStatement.setString(3, shaHex);
insertStatement.setString(4, feedId.isEmpty() ? null : feedId);
insertStatement.setString(5, feedVersion.isEmpty() ? null : feedVersion);
insertStatement.setString(6, zip.getName());
insertStatement.execute();
connection.commit();
LOG.info("Created new feed namespace: {}", insertStatement);
} catch (Exception ex) {
LOG.error("Exception while registering new feed namespace in feeds table", ex);
DbUtils.closeQuietly(connection);
}
}
/**
* Creates the feed registry table if it does not already exist. This must occur before the first attempt to load a
* GTFS feed or create an empty snapshot. Note: the connection MUST be committed after this method call.
*/
static void createFeedRegistryIfNotExists(Connection connection) throws SQLException {
Statement statement = connection.createStatement();
statement.execute(getCreateFeedRegistrySQL());
}
/**
* This wraps the main internal table loader method to catch exceptions and figure out how many errors happened.
*/
private TableLoadResult load (Table table) {
// This object will be returned to the caller to summarize the contents of the table and any errors.
TableLoadResult tableLoadResult = new TableLoadResult();
int initialErrorCount = errorStorage.getErrorCount();
try {
tableLoadResult.rowCount = loadInternal(table);
tableLoadResult.fileSize = getTableSize(table);
LOG.info(String.format("loaded in %d %s records", tableLoadResult.rowCount, table.name));
} catch (Exception ex) {
LOG.error("Fatal error loading table", ex);
tableLoadResult.fatalException = ex.toString();
// Rollback connection so that fatal exception does not impact loading of other tables.
try {
connection.rollback();
} catch (SQLException e) {
e.printStackTrace();
}
} finally {
// Explicitly delete the tmp file now that load is finished (either success or failure).
// Otherwise these multi-GB files clutter the drive.
if (tempTextFile != null) {
tempTextFile.delete();
}
}
int finalErrorCount = errorStorage.getErrorCount();
tableLoadResult.errorCount = finalErrorCount - initialErrorCount;
return tableLoadResult;
}
/**
* Get the uncompressed file size in bytes for the specified GTFS table.
*/
private int getTableSize(Table table) {
ZipEntry zipEntry = zip.getEntry(table.name + ".txt");
if (zipEntry == null) return 0;
return (int) zipEntry.getSize();
}
/**
* This function will throw any exception that occurs. Those exceptions will be handled by the outer load method.
* @return number of rows that were loaded.
*/
private int loadInternal (Table table) throws Exception {
CsvReader csvReader = table.getCsvReader(zip, errorStorage);
if (csvReader == null) {
LOG.info(String.format("file %s.txt not found in gtfs zipfile", table.name));
// This GTFS table could not be opened in the zip, even in a subdirectory.
if (table.isRequired()) errorStorage.storeError(NewGTFSError.forTable(table, MISSING_TABLE));
return 0;
}
LOG.info("Loading GTFS table {}", table.name);
// Use the Postgres text load format if we're connected to that DBMS.
boolean postgresText = (connection.getMetaData().getDatabaseProductName().equals("PostgreSQL"));
// TODO Strip out line returns, tabs in field contents.
// By default the CSV reader trims leading and trailing whitespace in fields.
// Build up a list of fields in the same order they appear in this GTFS CSV file.
Field[] fields = table.getFieldsFromFieldHeaders(csvReader.getHeaders(), errorStorage);
int keyFieldIndex = table.getKeyFieldIndex(fields);
// Create separate fields array with filtered list that does not include null values (for duplicate headers or
// ID field). This is solely used to construct the table and array of values to load.
Field[] cleanFields = Arrays.stream(fields).filter(Objects::nonNull).toArray(Field[]::new);
if (cleanFields.length == 0) {
// Do not create the table if there are no valid fields.
errorStorage.storeError(NewGTFSError.forTable(table, TABLE_MISSING_COLUMN_HEADERS));
return 0;
}
// Replace the GTFS spec Table with one representing the SQL table we will populate, with reordered columns.
// FIXME this is confusing, we only create a new table object so we can call a couple of methods on it, all of which just need a list of fields.
Table targetTable = new Table(tablePrefix + table.name, table.entityClass, table.required, cleanFields);
// NOTE H2 doesn't seem to work with schemas (or create schema doesn't work).
// With bulk loads it takes 140 seconds to load the data and additional 120 seconds just to index the stop times.
// SQLite also doesn't support schemas, but you can attach additional database files with schema-like naming.
// We'll just literally prepend feed identifiers to table names when supplied.
// Some databases require the table to exist before a statement can be prepared.
targetTable.createSqlTable(connection);
// TODO are we loading with or without a header row in our Postgres text file?
if (postgresText) {
// No need to output headers to temp text file, our SQL table column order exactly matches our text file.
tempTextFile = File.createTempFile(targetTable.name, "text");
tempTextFileStream = new PrintStream(new BufferedOutputStream(new FileOutputStream(tempTextFile)));
LOG.info("Loading via temporary text file at " + tempTextFile.getAbsolutePath());
} else {
insertStatement = connection.prepareStatement(targetTable.generateInsertSql());
LOG.info(insertStatement.toString()); // Logs the SQL for the prepared statement
}
// When outputting text, accumulate transformed strings to allow skipping rows when errors are encountered.
// One extra position in the array for the CSV line number.
String[] transformedStrings = new String[cleanFields.length + 1];
// Iterate over each record and prepare the record for storage in the table either through batch insert
// statements or postgres text copy operation.
while (csvReader.readRecord()) {
// The CSV reader's current record is zero-based and does not include the header line.
// Convert to a CSV file line number that will make more sense to people reading error messages.
if (csvReader.getCurrentRecord() + 2 > Integer.MAX_VALUE) {
errorStorage.storeError(NewGTFSError.forTable(table, TABLE_TOO_LONG));
break;
}
int lineNumber = ((int) csvReader.getCurrentRecord()) + 2;
if (lineNumber % 500_000 == 0) LOG.info("Processed {}", human(lineNumber));
if (csvReader.getColumnCount() != fields.length) {
String badValues = String.format("expected=%d; found=%d", fields.length, csvReader.getColumnCount());
errorStorage.storeError(NewGTFSError.forLine(table, lineNumber, WRONG_NUMBER_OF_FIELDS, badValues));
continue;
}
// Store value of key field for use in checking duplicate IDs
// FIXME: If the key field is missing (keyFieldIndex is still -1) from a loaded table, this will crash.
String keyValue = csvReader.get(keyFieldIndex);
// The first field holds the line number of the CSV file. Prepared statement parameters are one-based.
if (postgresText) transformedStrings[0] = Integer.toString(lineNumber);
else insertStatement.setInt(1, lineNumber);
// Maintain a separate columnIndex from for loop because some fields may be null and not included in the set
// of fields for this table.
int columnIndex = 0;
for (int f = 0; f < fields.length; f++) {
Field field = fields[f];
// If the field is null, it represents a duplicate header or ID field and must be skipped to maintain
// table integrity.
if (field == null) continue;
// CSV reader get on an empty field will be an empty string literal.
String string = csvReader.get(f);
// Use spec table to check that references are valid and IDs are unique.
Set errors = referenceTracker
.checkReferencesAndUniqueness(keyValue, lineNumber, field, string, table);
// Check for special case with calendar_dates where added service should not trigger ref. integrity
// error.
if (
table.name.equals("calendar_dates") &&
"service_id".equals(field.name) &&
"1".equals(csvReader.get(Field.getFieldIndex(fields, "exception_type")))
){
for (NewGTFSError error : errors) {
if (NewGTFSErrorType.REFERENTIAL_INTEGRITY.equals(error.errorType)) {
// Do not record bad service_id reference errors for calendar date entries that add service
// (exception type=1) because a corresponding service_id in calendars.txt is not required in
// this case.
LOG.info(
"A calendar_dates.txt entry added service (exception_type=1) for service_id={}, which does not have (or necessarily need) a corresponding entry in calendars.txt.",
keyValue
);
} else {
errorStorage.storeError(error);
}
}
}
// In all other cases (i.e., outside of the calendar_dates special case), store the reference errors found.
else {
errorStorage.storeErrors(errors);
}
// Add value for entry into table
setValueForField(table, columnIndex, lineNumber, field, string, postgresText, transformedStrings);
// Increment column index.
columnIndex += 1;
}
if (postgresText) {
// Print a new line in the standard postgres text format:
// https://www.postgresql.org/docs/9.1/static/sql-copy.html#AEN64380
tempTextFileStream.println(String.join("\t", transformedStrings));
} else {
insertStatement.addBatch();
if (lineNumber % INSERT_BATCH_SIZE == 0) insertStatement.executeBatch();
}
}
// Record number is zero based but includes the header record, which we don't want to count.
// But if we are working with Postgres text file (without a header row) we have to add 1
// Iteration over all rows has finished, so We are now one record past the end of the file.
int numberOfRecordsLoaded = (int) csvReader.getCurrentRecord();
if (postgresText) {
numberOfRecordsLoaded = numberOfRecordsLoaded + 1;
}
if (table.isRequired() && numberOfRecordsLoaded == 0) {
errorStorage.storeError(NewGTFSError.forTable(table, REQUIRED_TABLE_EMPTY));
}
csvReader.close();
// Finalize loading the table, either by copying the pre-validated text file into the database (for Postgres)
// or inserting any remaining rows (for all others).
if (postgresText) {
LOG.info("Loading into database table {} from temporary text file...", targetTable.name);
tempTextFileStream.close();
copyFromFile(connection, tempTextFile, targetTable.name);
} else {
insertStatement.executeBatch();
}
// Create indexes using spec table. Target table must not be used because fields could be in the wrong order
// (and the order is currently important to determining the index fields).
table.createIndexes(connection, tablePrefix);
LOG.info("Committing transaction...");
connection.commit();
LOG.info("Done.");
return numberOfRecordsLoaded;
}
/**
* Method that uses the PostgreSQL-specific copy from file command to load csv data into a table on the provided
* connection. NOTE: This method does not commit the transaction or close the connection.
*/
public static void copyFromFile(Connection connection, File file, String targetTableName) throws IOException, SQLException {
// Allows sending over network. This is only slightly slower than a local file copy.
final String copySql = String.format("copy %s from stdin", targetTableName);
// FIXME we should be reading the COPY text from a stream in parallel, not from a temporary text file.
InputStream stream = new BufferedInputStream(new FileInputStream(file.getAbsolutePath()));
// Our connection pool wraps the Connection objects, so we need to unwrap the Postgres connection interface.
CopyManager copyManager = new CopyManager(connection.unwrap(BaseConnection.class));
copyManager.copyIn(copySql, stream, 1024*1024);
stream.close();
// It is also possible to load from local file if this code is running on the database server.
// statement.execute(String.format("copy %s from '%s'", table.name, tempTextFile.getAbsolutePath()));
}
/**
* Set value for a field either as a prepared statement parameter or (if using postgres text-loading) in the
* transformed strings array provided. This also handles the case where the string is empty (i.e., field is null)
* and when an exception is encountered while setting the field value (usually due to a bad data type), in which case
* the field is set to null.
*/
public void setValueForField(Table table, int fieldIndex, int lineNumber, Field field, String string, boolean postgresText, String[] transformedStrings) {
if (string.isEmpty()) {
// CSV reader always returns empty strings, not nulls
if (field.isRequired() && !field.isEmptyValuePermitted() && errorStorage != null) {
errorStorage.storeError(NewGTFSError.forLine(table, lineNumber, MISSING_FIELD, field.name));
}
setFieldToNull(postgresText, transformedStrings, fieldIndex, field);
} else {
// Micro-benchmarks show it's only 4-5% faster to call typed parameter setter methods
// rather than setObject with a type code. I think some databases don't have setObject though.
// The Field objects throw exceptions to avoid passing the line number, table name etc. into them.
try {
// Here, we set the transformed string element even when an error occurs.
// Ideally, no errors should be signaled with exceptions, but this happens in a try/catch in case
// something goes wrong (we don't necessarily want to abort loading the feed altogether).
// FIXME Also, we should probably not be converting any GTFS field values, but some of them are coerced
// to null if they are unparseable (e.g., DateField).
// We should be saving it as-is in the database and converting upon load into our model objects.
Set errors;
if (postgresText) {
ValidateFieldResult result = field.validateAndConvert(string);
// If the result is null, use the null-setting method.
if (result.clean == null) setFieldToNull(postgresText, transformedStrings, fieldIndex, field);
// Otherwise, set the cleaned field according to its index.
else transformedStrings[fieldIndex + 1] = result.clean;
errors = result.errors;
} else {
errors = field.setParameter(insertStatement, fieldIndex + 2, string);
}
// Store any errors encountered after field value has been set.
for (NewGTFSError error : errors) {
error.entityType = table.getEntityClass();
error.lineNumber = lineNumber;
if (errorStorage != null) errorStorage.storeError(error);
}
} catch (StorageException ex) {
// FIXME many exceptions don't have an error type
if (errorStorage != null) {
errorStorage.storeError(NewGTFSError.forLine(table, lineNumber, ex.errorType, ex.badValue));
}
// Set transformedStrings or prepared statement param to null
setFieldToNull(postgresText, transformedStrings, fieldIndex, field);
}
}
}
/**
* Sets field to null in statement or string array depending on whether postgres is being used.
*/
private void setFieldToNull(boolean postgresText, String[] transformedStrings, int fieldIndex, Field field) {
if (postgresText) transformedStrings[fieldIndex + 1] = POSTGRES_NULL_TEXT;
// Adjust parameter index by two: indexes are one-based and the first one is the CSV line number.
else try {
// LOG.info("setting {} index to null", fieldIndex + 2);
field.setNull(insertStatement, fieldIndex + 2);
} catch (SQLException e) {
e.printStackTrace();
// FIXME: store error here? It appears that an exception should only be thrown if the type value is invalid,
// the connection is closed, or the index is out of bounds. So storing an error may be unnecessary.
}
}
/**
* Protect against SQL injection.
* The only place we include arbitrary input in SQL is the column names of tables.
* Implicitly (looking at all existing table names) these should consist entirely of
* lowercase letters and underscores.
*
* TODO add a test including SQL injection text (quote and semicolon)
*/
public static String sanitize (String string, SQLErrorStorage errorStorage) {
String clean = string.replaceAll("[^\\p{Alnum}_]", "");
if (!clean.equals(string)) {
LOG.warn("SQL identifier '{}' was sanitized to '{}'", string, clean);
if (errorStorage != null) errorStorage.storeError(NewGTFSError.forFeed(COLUMN_NAME_UNSAFE, string));
}
return clean;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy