com.microsoft.sqlserver.jdbc.SQLServerBulkCSVFileRecord Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mssql-jdbc Show documentation
Show all versions of mssql-jdbc Show documentation
Microsoft JDBC Driver for SQL Server.
/*
* Microsoft JDBC Driver for SQL Server Copyright(c) Microsoft Corporation All rights reserved. This program is made
* available under the terms of the MIT License. See the LICENSE file in the project root for more information.
*/
package com.microsoft.sqlserver.jdbc;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.sql.Types;
import java.text.DecimalFormat;
import java.text.MessageFormat;
import java.time.OffsetDateTime;
import java.time.OffsetTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
/**
* Provides a simple implementation of the ISQLServerBulkRecord interface that can be used to read in the basic Java
* data types from a delimited file where each line represents a row of data.
*/
public class SQLServerBulkCSVFileRecord extends SQLServerBulkRecord implements java.lang.AutoCloseable {
/**
* Update serialVersionUID when making changes to this file
*/
private static final long serialVersionUID = 1546487135640225989L;
/**
* Resources associated with reading in the file
*/
/** file reader */
private transient BufferedReader fileReader;
/** input stream reader */
private transient InputStreamReader sr;
/** file input stream */
private transient FileInputStream fis;
/**
* Current line of data to parse.
*/
private String currentLine = null;
/**
* Delimiter to parse lines with.
*/
private final String delimiter;
/**
* Escape delimiters
*/
private boolean escapeDelimiters;
/**
* Double quote character to parse lines with
*/
private static final char doubleQuoteChar = '\"';
/*
* Class names for logging.
*/
private static final String loggerClassName = "SQLServerBulkCSVFileRecord";
/**
* Constructs a simple reader to parse data from a delimited file with the given encoding.
*
* @param fileToParse
* File to parse data from.
* @param encoding
* Charset encoding to use for reading the file, or NULL for the default encoding.
* @param delimiter
* Delimiter to used to separate each column. Regex characters must be escaped with double backslashes.
* @param firstLineIsColumnNames
* True if the first line of the file should be parsed as column names; false otherwise
* @throws SQLServerException
* If the arguments are invalid, there are any errors in reading the file, or the file is empty
*/
public SQLServerBulkCSVFileRecord(String fileToParse, String encoding, String delimiter,
boolean firstLineIsColumnNames) throws SQLServerException {
initLoggerResources();
if (loggerExternal.isLoggable(java.util.logging.Level.FINER)) {
loggerExternal.entering(loggerPackageName, loggerClassName,
new Object[] {fileToParse, encoding, delimiter, firstLineIsColumnNames});
}
if (null == fileToParse) {
throwInvalidArgument("fileToParse");
} else if (null == delimiter) {
throwInvalidArgument("delimiter");
}
this.delimiter = delimiter;
try {
// Create the file reader
fis = new FileInputStream(fileToParse);
if (null == encoding || 0 == encoding.length()) {
sr = new InputStreamReader(fis);
} else {
sr = new InputStreamReader(fis, encoding);
}
initFileReader(sr, encoding, delimiter, firstLineIsColumnNames);
} catch (UnsupportedEncodingException unsupportedEncoding) {
MessageFormat form = new MessageFormat(SQLServerException.getErrString("R_unsupportedEncoding"));
throw new SQLServerException(form.format(new Object[] {encoding}), null, 0, unsupportedEncoding);
} catch (Exception e) {
throw new SQLServerException(null, e.getMessage(), null, 0, false);
}
columnMetadata = new HashMap<>();
loggerExternal.exiting(loggerPackageName, loggerClassName);
}
/**
* Constructs a SQLServerBulkCSVFileRecord to parse data from a delimited file with the given encoding.
*
* @param fileToParse
* InputStream to parse data from
* @param encoding
* Charset encoding to use for reading the file, or NULL for the default encoding.
* @param delimiter
* Delimiter to used to separate each column. Regex characters must be escaped with double backslashes.
* @param firstLineIsColumnNames
* True if the first line of the file should be parsed as column names; false otherwise
* @throws SQLServerException
* If the arguments are invalid, there are any errors in reading the file, or the file is empty
*/
public SQLServerBulkCSVFileRecord(InputStream fileToParse, String encoding, String delimiter,
boolean firstLineIsColumnNames) throws SQLServerException {
initLoggerResources();
if (loggerExternal.isLoggable(java.util.logging.Level.FINER)) {
loggerExternal.entering(loggerPackageName, loggerClassName,
new Object[] {fileToParse, encoding, delimiter, firstLineIsColumnNames});
}
if (null == fileToParse) {
throwInvalidArgument("fileToParse");
} else if (null == delimiter) {
throwInvalidArgument("delimiter");
}
this.delimiter = delimiter;
try {
if (null == encoding || 0 == encoding.length()) {
sr = new InputStreamReader(fileToParse);
} else {
sr = new InputStreamReader(fileToParse, encoding);
}
initFileReader(sr, encoding, delimiter, firstLineIsColumnNames);
} catch (UnsupportedEncodingException unsupportedEncoding) {
MessageFormat form = new MessageFormat(SQLServerException.getErrString("R_unsupportedEncoding"));
throw new SQLServerException(form.format(new Object[] {encoding}), null, 0, unsupportedEncoding);
} catch (Exception e) {
throw new SQLServerException(null, e.getMessage(), null, 0, false);
}
columnMetadata = new HashMap<>();
if (loggerExternal.isLoggable(java.util.logging.Level.FINER)) {
loggerExternal.exiting(loggerPackageName, loggerClassName);
}
}
/**
* Constructs a SQLServerBulkCSVFileRecord to parse data from a CSV file with the given encoding.
*
* @param fileToParse
* File to parse data from
* @param encoding
* Charset encoding to use for reading the file.
* @param firstLineIsColumnNames
* True if the first line of the file should be parsed as column names; false otherwise
* @throws SQLServerException
* If the arguments are invalid, there are any errors in reading the file, or the file is empty
*/
public SQLServerBulkCSVFileRecord(String fileToParse, String encoding,
boolean firstLineIsColumnNames) throws SQLServerException {
this(fileToParse, encoding, ",", firstLineIsColumnNames);
}
/**
* Constructs a SQLServerBulkCSVFileRecord to parse data from a CSV file with the default encoding.
*
* @param fileToParse
* File to parse data from
* @param firstLineIsColumnNames
* True if the first line of the file should be parsed as column names; false otherwise
* @throws SQLServerException
* If the arguments are invalid, there are any errors in reading the file, or the file is empty
*/
public SQLServerBulkCSVFileRecord(String fileToParse, boolean firstLineIsColumnNames) throws SQLServerException {
this(fileToParse, null, ",", firstLineIsColumnNames);
}
private void initFileReader(InputStreamReader sr, String encoding, String demlimeter,
boolean firstLineIsColumnNames) throws SQLServerException, IOException {
fileReader = new BufferedReader(sr);
if (firstLineIsColumnNames) {
currentLine = fileReader.readLine();
if (null != currentLine) {
columnNames = (escapeDelimiters && currentLine.contains("\""))
? escapeQuotesRFC4180(parseString(
currentLine, delimiter))
: parseString(currentLine, delimiter);
}
}
}
/*
* RFC4180 specifies that rules for quoted fields. It allows quoted string data to contain newlines data
* provided the contents otherwise conforms to the rules for escaping quotes. For example, the following is valid:
* "a","b","c"
* "aaa","b <-- newline is retained in data field
* bb","c"
* "aa","bb","cc"
* We cannot simply use fileReader.readLine() to read these records but instead must continue reading until we reach
* a newline that is not contained within quotes.
*/
private String readLineEscapeDelimiters() throws SQLServerException {
int quoteCount = 0;
StringBuilder sb = new StringBuilder();
try {
int c;
while ((c = fileReader.read()) != -1) {
if ((c == '\n' || c == '\r') && quoteCount % 2 == 0) { // newlines only end the record if we are not in quotes
fileReader.mark(1);
c = fileReader.read(); // we might have read \r of a \r\n, if so we need to read the \n as well
if (c != '\n') {
fileReader.reset(); // only delimited by \n, unread last char so it goes into the next record
}
break;
}
sb.append((char) c);
if (c == '"') {
quoteCount++;
}
}
if (c == -1 && quoteCount % 2 != 0) { // stream ended, but we are within quotes -- data problem
throw new SQLServerException(SQLServerException.getErrString("R_InvalidCSVQuotes"), null, 0, null);
}
if (c == -1) { // keep semantics of readLine() by returning a null when there is no more data
return null;
}
} catch (IOException e) {
throw new SQLServerException(e.getMessage(), null, 0, e);
}
return sb.toString();
}
private void initLoggerResources() {
super.loggerPackageName = "com.microsoft.sqlserver.jdbc.SQLServerBulkCSVFileRecord";
}
/**
* Releases any resources associated with the file reader.
*
* @throws SQLServerException
* when an error occurs
*/
public void close() throws SQLServerException {
loggerExternal.entering(loggerPackageName, "close");
// Ignore errors since we are only cleaning up here
if (fileReader != null)
try {
fileReader.close();
} catch (Exception e) {}
if (sr != null)
try {
sr.close();
} catch (Exception e) {}
if (fis != null)
try {
fis.close();
} catch (Exception e) {}
loggerExternal.exiting(loggerPackageName, "close");
}
@Override
public Object[] getRowData() throws SQLServerException {
if (null == currentLine)
return null;
else {
/*
* Binary data may be corrupted The limit in split() function should be a negative value, otherwise trailing
* empty strings are discarded. Empty string is returned if there is no value.
*/
String[] data = (escapeDelimiters && currentLine.contains("\""))
? escapeQuotesRFC4180(parseString(
currentLine, delimiter))
: parseString(currentLine, delimiter);
// Cannot go directly from String[] to Object[] and expect it to act as an array.
Object[] dataRow = new Object[data.length];
for (Entry pair : columnMetadata.entrySet()) {
ColumnMetadata cm = pair.getValue();
// Reading a column not available in csv positionInFile > number of columns retrieved after split
if (data.length < pair.getKey() - 1) {
MessageFormat form = new MessageFormat(SQLServerException.getErrString("R_invalidColumn"));
Object[] msgArgs = {pair.getKey()};
throw new SQLServerException(form.format(msgArgs), SQLState.COL_NOT_FOUND, DriverError.NOT_SET,
null);
}
// Source header has more columns than current line read
if (columnNames != null && (columnNames.length > data.length)) {
MessageFormat form = new MessageFormat(SQLServerException.getErrString("R_DataSchemaMismatch"));
Object[] msgArgs = {};
throw new SQLServerException(form.format(msgArgs), SQLState.COL_NOT_FOUND, DriverError.NOT_SET,
null);
}
try {
if (0 == data[pair.getKey() - 1].length()) {
dataRow[pair.getKey() - 1] = null;
continue;
}
switch (cm.columnType) {
/*
* Both BCP and BULK INSERT considers double quotes as part of the data and throws error if any
* data (say "10") is to be inserted into an numeric column. Our implementation does the same.
*/
case Types.INTEGER: {
// Formatter to remove the decimal part as SQL Server floors the decimal in integer types.
DecimalFormat decimalFormatter = new DecimalFormat("#");
decimalFormatter.setRoundingMode(RoundingMode.DOWN);
String formatedfInput = decimalFormatter
.format(Double.parseDouble(data[pair.getKey() - 1]));
dataRow[pair.getKey() - 1] = Integer.valueOf(formatedfInput);
break;
}
case Types.TINYINT:
case Types.SMALLINT: {
// Formatter to remove the decimal part as SQL Server floors the decimal in integer types.
DecimalFormat decimalFormatter = new DecimalFormat("#");
decimalFormatter.setRoundingMode(RoundingMode.DOWN);
String formatedfInput = decimalFormatter
.format(Double.parseDouble(data[pair.getKey() - 1]));
dataRow[pair.getKey() - 1] = Short.valueOf(formatedfInput);
break;
}
case Types.BIGINT: {
BigDecimal bd = new BigDecimal(data[pair.getKey() - 1].trim());
try {
dataRow[pair.getKey() - 1] = bd.setScale(0, RoundingMode.DOWN).longValueExact();
} catch (ArithmeticException ex) {
String value = "'" + data[pair.getKey() - 1] + "'";
MessageFormat form = new MessageFormat(
SQLServerException.getErrString("R_errorConvertingValue"));
throw new SQLServerException(
form.format(new Object[] {value, JDBCType.of(cm.columnType)}), null, 0, ex);
}
break;
}
case microsoft.sql.Types.MONEY:
case microsoft.sql.Types.SMALLMONEY:
case Types.DECIMAL:
case Types.NUMERIC: {
BigDecimal bd = new BigDecimal(data[pair.getKey() - 1].trim());
dataRow[pair.getKey() - 1] = bd.setScale(cm.scale, RoundingMode.HALF_UP);
break;
}
case Types.BIT: {
// "true" => 1, "false" => 0. Any non-zero value (integer/double) => 1, 0/0.0 => 0
try {
dataRow[pair.getKey()
- 1] = (0 == Double.parseDouble(data[pair.getKey() - 1])) ? Boolean.FALSE
: Boolean.TRUE;
} catch (NumberFormatException e) {
dataRow[pair.getKey() - 1] = Boolean.parseBoolean(data[pair.getKey() - 1]);
}
break;
}
case Types.REAL: {
dataRow[pair.getKey() - 1] = Float.parseFloat(data[pair.getKey() - 1]);
break;
}
case Types.DOUBLE: {
dataRow[pair.getKey() - 1] = Double.parseDouble(data[pair.getKey() - 1]);
break;
}
case Types.BINARY:
case Types.VARBINARY:
case Types.LONGVARBINARY:
case Types.BLOB: {
/*
* For binary data, the value in file may or may not have the '0x' prefix. We will try to
* match our implementation with 'BULK INSERT' except that we will allow 0x prefix whereas
* 'BULK INSERT' command does not allow 0x prefix. A BULK INSERT example: A sample csv file
* containing data for 2 binary columns and 1 row: 61,62 Table definition: create table
* t1(c1 varbinary(10), c2 varbinary(10)) BULK INSERT command: bulk insert t1 from
* 'C:\in.csv' with(DATAFILETYPE='char',firstrow=1, FIELDTERMINATOR=',') select * from t1
* shows 1 row with columns: 0x61, 0x62
*/
// Strip off 0x if present.
String binData = data[pair.getKey() - 1].trim();
if (binData.startsWith("0x") || binData.startsWith("0X")) {
dataRow[pair.getKey() - 1] = binData.substring(2);
} else {
dataRow[pair.getKey() - 1] = binData;
}
break;
}
case java.sql.Types.TIME_WITH_TIMEZONE: {
OffsetTime offsetTimeValue;
// The per-column DateTimeFormatter gets priority.
if (null != cm.dateTimeFormatter)
offsetTimeValue = OffsetTime.parse(data[pair.getKey() - 1], cm.dateTimeFormatter);
else if (timeFormatter != null)
offsetTimeValue = OffsetTime.parse(data[pair.getKey() - 1], timeFormatter);
else
offsetTimeValue = OffsetTime.parse(data[pair.getKey() - 1]);
dataRow[pair.getKey() - 1] = offsetTimeValue;
break;
}
case java.sql.Types.TIMESTAMP_WITH_TIMEZONE: {
OffsetDateTime offsetDateTimeValue;
// The per-column DateTimeFormatter gets priority.
if (null != cm.dateTimeFormatter)
offsetDateTimeValue = OffsetDateTime.parse(data[pair.getKey() - 1],
cm.dateTimeFormatter);
else if (dateTimeFormatter != null)
offsetDateTimeValue = OffsetDateTime.parse(data[pair.getKey() - 1], dateTimeFormatter);
else
offsetDateTimeValue = OffsetDateTime.parse(data[pair.getKey() - 1]);
dataRow[pair.getKey() - 1] = offsetDateTimeValue;
break;
}
case Types.NULL: {
dataRow[pair.getKey() - 1] = null;
break;
}
case Types.DATE:
case Types.CHAR:
case Types.NCHAR:
case Types.VARCHAR:
case Types.NVARCHAR:
case Types.LONGVARCHAR:
case Types.LONGNVARCHAR:
case Types.CLOB:
default: {
// The string is copied as is.
/*
* Handling double quotes: Both BCP (without a format file) and BULK INSERT behaves the same
* way for double quotes. They treat double quotes as part of the data. For a CSV file as
* follows, data is inserted as is: ""abc"" "abc" abc a"b"c a""b""c Excel on the other hand,
* shows data as follows. It strips off beginning and ending quotes, and sometimes quotes
* get messed up. When the same CSV is saved from Excel again, Excel adds additional quotes.
* abc"" abc abc a"b"c a""b""c In our implementation we will match the behavior with BCP and
* BULK INSERT. BCP command: bcp table1 in in.csv -c -t , -r 0x0A -S localhost -U sa -P
* BULK INSERT command: bulk insert table1 from 'in.csv' with (FIELDTERMINATOR=',')
* Handling delimiters in data: Excel allows comma in data when data is surrounded with
* quotes. For example, "Hello, world" is treated as one cell. BCP and BULK INSERT deos not
* allow field terminators in data: https://technet.microsoft.com/en-us/library/
* aa196735%28v=sql.80%29.aspx?f=255&MSPPError=- 2147217396
*/
dataRow[pair.getKey() - 1] = data[pair.getKey() - 1];
break;
}
}
} catch (IllegalArgumentException e) {
String value = "'" + data[pair.getKey() - 1] + "'";
MessageFormat form = new MessageFormat(SQLServerException.getErrString("R_errorConvertingValue"));
throw new SQLServerException(form.format(new Object[] {value, JDBCType.of(cm.columnType)}), null, 0,
e);
} catch (ArrayIndexOutOfBoundsException e) {
throw new SQLServerException(SQLServerException.getErrString("R_DataSchemaMismatch"), e);
}
}
return dataRow;
}
}
@Override
void addColumnMetadataInternal(int positionInSource, String name, int jdbcType, int precision, int scale,
DateTimeFormatter dateTimeFormatter) throws SQLServerException {
loggerExternal.entering(loggerPackageName, "addColumnMetadata",
new Object[] {positionInSource, name, jdbcType, precision, scale});
String colName = "";
if (0 >= positionInSource) {
MessageFormat form = new MessageFormat(SQLServerException.getErrString("R_invalidColumnOrdinal"));
Object[] msgArgs = {positionInSource};
throw new SQLServerException(form.format(msgArgs), SQLState.COL_NOT_FOUND, DriverError.NOT_SET, null);
}
if (null != name)
colName = name.trim();
else if ((null != columnNames) && (columnNames.length >= positionInSource))
colName = columnNames[positionInSource - 1];
if ((null != columnNames) && (positionInSource > columnNames.length)) {
MessageFormat form = new MessageFormat(SQLServerException.getErrString("R_invalidColumn"));
Object[] msgArgs = {positionInSource};
throw new SQLServerException(form.format(msgArgs), SQLState.COL_NOT_FOUND, DriverError.NOT_SET, null);
}
checkDuplicateColumnName(positionInSource, name);
switch (jdbcType) {
/*
* SQL Server supports numerous string literal formats for temporal types, hence sending them as varchar
* with approximate precision(length) needed to send supported string literals. string literal formats
* supported by temporal types are available in MSDN page on data types.
*/
case java.sql.Types.DATE:
case java.sql.Types.TIME:
case java.sql.Types.TIMESTAMP:
case microsoft.sql.Types.DATETIMEOFFSET:
columnMetadata.put(positionInSource,
new ColumnMetadata(colName, jdbcType, 50, scale, dateTimeFormatter));
break;
// Redirect SQLXML as LONGNVARCHAR
// SQLXML is not valid type in TDS
case java.sql.Types.SQLXML:
columnMetadata.put(positionInSource,
new ColumnMetadata(colName, java.sql.Types.LONGNVARCHAR, precision, scale, dateTimeFormatter));
break;
/*
* Redirecting Float as Double based on data type mapping
* https://msdn.microsoft.com/library/ms378878%28v=sql.110%29.aspx
*/
case java.sql.Types.FLOAT:
columnMetadata.put(positionInSource,
new ColumnMetadata(colName, java.sql.Types.DOUBLE, precision, scale, dateTimeFormatter));
break;
// redirecting BOOLEAN as BIT
case java.sql.Types.BOOLEAN:
columnMetadata.put(positionInSource,
new ColumnMetadata(colName, java.sql.Types.BIT, precision, scale, dateTimeFormatter));
break;
default:
columnMetadata.put(positionInSource,
new ColumnMetadata(colName, jdbcType, precision, scale, dateTimeFormatter));
}
loggerExternal.exiting(loggerPackageName, "addColumnMetadata");
}
@Override
public boolean next() throws SQLServerException {
try {
currentLine = escapeDelimiters ? readLineEscapeDelimiters() : fileReader.readLine();
} catch (IOException e) {
throw new SQLServerException(e.getMessage(), null, 0, e);
}
return (null != currentLine);
}
/**
* Returns whether the rules to escape delimiters are used.
*
* @return true if the rules are used, false otherwise.
*/
public boolean isEscapeColumnDelimitersCSV() {
return escapeDelimiters;
}
/**
* When set to true, the following rules will be used to parse CSV files: Each field may or may not be enclosed in
* double quotes. If fields are not enclosed with double quotes, then double quotes may not appear inside the
* fields. Fields containing double quotes, and delimiters should be enclosed in double quotes. If double-quotes are
* used to enclose fields, then a double-quote appearing inside a field must be escaped by preceding it with another
* double quote. Spaces are considered part of a field. Spaces before and after enclosing double quotes are ignored.
*
* @param escapeDelimiters
* true if the rules above to be used.
*/
public void setEscapeColumnDelimitersCSV(boolean escapeDelimiters) {
this.escapeDelimiters = escapeDelimiters;
}
private static String[] escapeQuotesRFC4180(String[] tokens) throws SQLServerException {
if (null == tokens) {
return tokens;
}
for (int i = 0; i < tokens.length; i++) {
boolean escaped = false;
int j = 0;
StringBuilder sb = new StringBuilder();
long quoteCount = tokens[i].chars().filter(ch -> ch == '"').count();
if (quoteCount > 0) {
tokens[i] = tokens[i].trim();
}
if (0 != quoteCount % 2 || (quoteCount > 0
&& ('"' != tokens[i].charAt(0) || '"' != tokens[i].charAt(tokens[i].length() - 1)))) {
throw new SQLServerException(SQLServerException.getErrString("R_InvalidCSVQuotes"), null, 0, null);
}
while (j < tokens[i].length()) {
if ('"' == tokens[i].charAt(j)) {
if (!escaped) {
escaped = true;
} else {
if ((j < tokens[i].length() - 1) && '"' == tokens[i].charAt(j + 1)) {
sb.append('"');
j++;
}
}
} else {
sb.append(tokens[i].charAt(j));
}
j++;
}
tokens[i] = sb.toString();
}
return tokens;
}
private static String[] parseString(String buffer, String delimiter) {
ArrayList tokens = new ArrayList<>();
int position = 0;
boolean quoted = false;
for (int i = 0; i < buffer.length(); i++) {
if (buffer.charAt(i) == doubleQuoteChar) {
quoted = !quoted;
} else if (!quoted && i + delimiter.length() <= buffer.length()
&& buffer.substring(i, i + delimiter.length()).equals(delimiter)) {
// Add field to token list when delimiter is found
tokens.add(buffer.substring(position, i));
position = i + delimiter.length();
i = position - 1; // Adjust the index to start after the delimiter
}
}
// Add the last field
tokens.add(buffer.substring(position));
return tokens.toArray(new String[0]);
}
}