
/*
* Distributed as part of superflex v0.2.0
*
* Copyright (C) 2013 Machinery For Change, Inc.
*
* Author: Steve Waldman
*
* This library is free software; you can redistribute it and/or modify
* it under the terms of EITHER:
*
* 1) The GNU Lesser General Public License (LGPL), version 2.1, as
* published by the Free Software Foundation
*
* OR
*
* 2) The Eclipse Public License (EPL), version 1.0
*
* You may choose which license to accept if you wish to redistribute
* or modify this work. You may offer derivatives of this work
* under the license you have chosen, or you may provide the same
* choice of license which you have been offered here.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received copies of both LGPL v2.1 and EPL v1.0
* along with this software; see the files LICENSE-EPL and LICENSE-LGPL.
* If not, the text of these licenses are currently available at
*
* LGPL v2.1: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
* EPL v1.0: http://www.eclipse.org/org/documents/epl-v10.php
*
*/
package com.mchange.sc.v1.superflex;
import java.io.{File, FileInputStream, InputStreamReader, BufferedReader, IOException};
import java.net.URL;
import java.sql.{Connection,DriverManager,PreparedStatement,SQLException,Statement,Types};
import java.text.{ParseException,SimpleDateFormat};
import com.mchange.v2.csv.CsvBufferedReader;
import scala.collection._;
import scala.collection.mutable.ArrayBuffer;
import com.mchange.sc.v1.util.ClosableUtils._;
import com.mchange.sc.v1.sql.ResourceUtils._;
import com.mchange.sc.v1.log.MLevel._
import scala.language.reflectiveCalls;
object SuperFlexDbArchiver {
implicit val logger = mlogger( this )
val dfltDateFormatPatterns = Array("yyyy-MM-dd", "yyyyMMdd", "MMddyyyy", "dd-MMM-yyyy", "dd-MMM-yy", "MM/dd/yy", "MM/dd/yyyy", "ddMMMyyyy");
val dfltDateTimeFormatPatternAddenda = Array("'T'HH:mm:ss"," HH:mm:ss z");
val dfltDateTimeFormatPatterns = for (p <- dfltDateFormatPatterns; sfx <- dfltDateTimeFormatPatternAddenda) yield (p + sfx);
// to distinguish dates from datetimes, since datetime patterns can
// succeed when only the prefix date matches, leading to timestamp
// cols where we want dates
val extraDateValidator = new Function2[String,String,Boolean] {
def apply(maybeDate : String, patternStr : String) : Boolean = {
def countColons(s : String) : Int = s.filter( _ == ':').length
return ( countColons( maybeDate ) == countColons( patternStr ) );
}
}
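// e.g. a datum like "2013-01-05" (no colons) can only select a pattern with no colons, while
// "2013-01-05 10:30:00 EST" (two colons) can only select a pattern with two colons, regardless
// of whether SimpleDateFormat happened to accept a prefix-only match.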
// lowercase only! -- we lowercase values before trying to match with these strings.
val trueStrings = "true"::"t"::"yes"::"y"::"1"::Nil;
val falseStrings = "false"::"f"::"no"::"n"::"0"::Nil;
val leadingZerosRegex = "0\\d".r;
type MetaData = Map[String,List[String]];
trait NamedDataFileSource {
def createBufferedReader( bufferSize : Int, fileEncoding : String ) : BufferedReader;
def sourceName : String;
override def equals( other : Any ) : Boolean;
override def hashCode() : Int;
final def forMultilineCsv : CsvDataFileSource = this match {
case ready : CsvDataFileSource => ready
case unready => CsvDataFileSource( unready )
}
}
final case class UrlDataFileSource( url : URL ) extends NamedDataFileSource {
def createBufferedReader( bufferSize : Int, fileEncoding : String ) : BufferedReader = {
new BufferedReader( new InputStreamReader( url.openStream(), fileEncoding ), bufferSize )
}
def sourceName : String = url.toString
}
final case class CsvDataFileSource( inner : NamedDataFileSource ) extends NamedDataFileSource {
def createBufferedReader( bufferSize : Int, fileEncoding : String ) : BufferedReader = new CsvBufferedReader( inner.createBufferedReader( bufferSize, fileEncoding ) )
def sourceName : String = inner.sourceName
}
object FileDataFileSource {
// can't use default args here, because the autogenerated version does, and there can be only one
def apply( fileName : String, fullpath : Boolean ) : FileDataFileSource = new FileDataFileSource( new File( fileName ), fullpath );
def apply( fileName : String ) : FileDataFileSource = apply( fileName, true )
}
final case class FileDataFileSource( file : File, fullpath : Boolean = true ) extends NamedDataFileSource {
def createBufferedReader( bufferSize : Int, fileEncoding : String ) : BufferedReader = { new BufferedReader( new InputStreamReader( new FileInputStream( file ), fileEncoding ), bufferSize ) };
def sourceName : String = { if (fullpath) file.getCanonicalPath(); else file.getName(); }
}
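// Example (illustrative only; the file and URL below are hypothetical): sources are built from
// files or URLs, and wrapped via forMultilineCsv when a CSV file may contain quoted fields with
// embedded newlines:
//
//   val fileSrc : NamedDataFileSource = FileDataFileSource( "data/prices.csv" ).forMultilineCsv
//   val urlSrc  : NamedDataFileSource = UrlDataFileSource( new URL( "http://example.com/prices.csv" ) )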
def dfltColSort( pkNames : List[String] ) : Ordering[String] = new Ordering[String] {
def compare( t : String, o : String ) = oldDfltColSort( pkNames )( t ).compare( o )
}
def oldDfltColSort( pkNames : List[String] )( cn : String) : Ordered[String] = {
new Ordered[String] {
def compare( other : String ) : Int = {
val tpki = pkNames.indexOf( cn );
val opki = pkNames.indexOf( other );
if ( tpki >= 0 && opki >= 0) {
if (tpki > opki) 1;
else if (tpki < opki) -1;
else 0;
} else if (tpki >= 0) -1;
else if (opki >=0) 1;
else cn.compareTo( other );
}
}
}
def asBoolean( maybeBoolean : String) : Option[Boolean] = {
val chk = maybeBoolean.toLowerCase;
if ( trueStrings.contains( chk ) ) Some(true);
else if ( falseStrings.contains( chk ) ) Some(false);
else None;
}
//should only be called on valid, non-null booleans, so use of Option.get should effectively be guarded
val booleanSetter = ( (ps : PreparedStatement, i : Int, s: String) => ps.setBoolean(i, asBoolean(s).get ) );
val charTypeSetter = ( (ps : PreparedStatement, i : Int, s: String) => ps.setString(i, s ) );
val integerSetter = ( (ps : PreparedStatement, i : Int, s: String) => ps.setInt(i, Integer.parseInt(s)) );
val bigintSetter = ( (ps : PreparedStatement, i : Int, s: String) => ps.setLong(i, java.lang.Long.parseLong(s)) );
val doublePrecisionSetter = ( (ps : PreparedStatement, i : Int, s: String) => ps.setDouble(i, java.lang.Double.parseDouble( s ) ) );
def dateSetter( pattern : String ) = new DateSetter( pattern )
def timestampSetter( pattern : String ) = new TimestampSetter( pattern )
// each thread should make a copy, as not all setters can be called concurrently
// setters that can be called concurrently will be passed back as "copies".
def copySetter( setter : Function3[PreparedStatement, Int, String, Unit] ) : Function3[PreparedStatement, Int, String, Unit] = {
if ( setter.isInstanceOf[CopyableSetter] ) setter.asInstanceOf[CopyableSetter].copy else setter;
}
def copyMaybeSetter( maybeSetter : Option[Function3[PreparedStatement, Int, String, Unit]] ) : Option[Function3[PreparedStatement, Int, String, Unit]] = {
if ( maybeSetter != None )
Some( copySetter( maybeSetter.get ) );
else
None;
}
sealed trait CopyableSetter extends Function3[PreparedStatement,Int,String,Unit] {
def copy : CopyableSetter
}
// ensures that multiple instances are equal while allowing for distinct instances (required
// to avoid synchronization during multithreaded use, as SimpleDateFormat is not thread
// safe).
final class DateSetter( val pattern : String ) extends CopyableSetter {
val df = new SimpleDateFormat( pattern );
override def equals( other : Any ) : Boolean = other match {
case ds : DateSetter => (this eq ds) || (this.pattern == ds.pattern)
case _ => false
}
override def hashCode = pattern.hashCode ^ (1 << 0)
override def toString = s"DateSetter(${pattern})"
override def apply( ps : PreparedStatement, i : Int, s : String) : Unit = {
try { ps.setDate(i, new java.sql.Date( df.parse( s ).getTime() ) ) }
catch{ case t : Throwable => WARNING.log( s"Pattern '${pattern}', Bad datum '${s}'"); throw t; }
}
def copy : DateSetter = new DateSetter( pattern );
}
final class TimestampSetter( val pattern : String ) extends CopyableSetter {
val df = new SimpleDateFormat( pattern );
override def apply( ps : PreparedStatement, i : Int, s : String) : Unit = {
try { ps.setTimestamp(i, new java.sql.Timestamp( df.parse( s ).getTime() ) ) }
catch{ case t : Throwable => WARNING.log(s"Pattern '${pattern}', Bad datum '${s}'"); throw t; }
}
override def equals( other : Any ) : Boolean = other match {
case ts : TimestampSetter => (this eq ts) || (this.pattern == ts.pattern)
case _ => false
}
override def hashCode = pattern.hashCode ^ (1 << 1)
override def toString = s"TimestampSetter(${pattern})"
def copy : TimestampSetter = new TimestampSetter( pattern );
}
// NOTE: the XML element and attribute names in the three snippets below are reconstructions
// (the original literal markup was garbled); the embedded Scala expressions are original.
case class FkdLine( rawLine : String, transformedLine : String, splitLine : Seq[String], probDesc : String ) {
def xmlSnippet =
<fkdLine probDesc={ this.probDesc }>
<rawLine>{ this.rawLine }</rawLine>
<transformedLine>{ this.transformedLine }</transformedLine>
<splitLine>{ this.splitLine.map('"' + _ + '"').mkString(",") }</splitLine>
</fkdLine>
};
case class FkdLineKeeper( source : NamedDataFileSource, colNames : List[String], fkdLines : List[FkdLine] ) {
def xmlSnippet =
<fkdLineKeeper>
<source>{ this.source.sourceName }</source>
<colNames>{ this.colNames.map('"' + _ + '"').mkString(",") }</colNames>
{ fkdLines.map( _.xmlSnippet ) }
</fkdLineKeeper>
}
def unreadableLinesXml( keepers : Iterable[FkdLineKeeper] ) = {
<unreadableLines>
{ keepers.filter( _.fkdLines.length > 0).map( _.xmlSnippet ) }
</unreadableLines>
}
object Key {
val COL_NAMES = "colNames";
val LABELS = "labels";
val PROLOGUE = "prologue";
}
val emptyColumnInfoArray = new Array[ColumnInfo](0);
val emptyStringArray = new Array[String](0);
}
// TODO: Add a parameter to limit the number of rows per batch in inserts.
// Currently we treat full files as one batch, which is fast but
// overindulgent of memory.
abstract class SuperFlexDbArchiver extends Splitter {
import SuperFlexDbArchiver._; //bring in unqualified reference to companion objects members
protected val priorTableInfo : TableInfo;
protected val files : Seq[NamedDataFileSource];
protected val debugColumnInspection = false;
protected val concurrentInserts : Boolean = true;
/**
* -1 means unlimited. Only matters if concurrentInserts is true
*/
protected val maxConcurrency : Int = -1;
protected val quoteColNames : Boolean = false;
protected val shouldDrop = false;
//println ("priorTableInfo " + priorTableInfo);
protected val colSort = dfltColSort( if (priorTableInfo.pkNames == None) Nil; else priorTableInfo.pkNames.get );
protected val dateFormatPatterns : Iterable[String] = dfltDateFormatPatterns;
protected val dateTimeFormatPatterns : Iterable[String] = dfltDateTimeFormatPatterns;
protected val defaultTypeDeclaration = ( "INTEGER", Types.INTEGER );
protected val bufferSize = 512 * 1024 * 1024; //512M
protected val fileEncoding = "ISO-8859-1";
protected val tmpTablePfx = "tmp_";
protected val leadingZerosNonNumeric = true;
// use -1 to treat full files as batches, irrespective of size
protected val maxBatchSize = 50000;
// if true, read will die with an exception if there are unreadable lines
// if false, bad lines will be retained for logging / correcting, and then skipped
protected val strict = true;
protected val mustInfer : Boolean = true;
protected def transformColName( colName : String ) : String = colName;
protected def readMetaData( br: BufferedReader ) : MetaData;
protected def isNull( datum : String, colName : String ) : Boolean = (datum == null || datum.length == 0);
protected def readDataLine( br : BufferedReader ) : String = br.readLine();
/**
* Note -- this method will only be called if quoteColNames is
* overridden to true.
*/
protected def quoteColName( colName : String ) = '"' + colName + '"';
// we need users to set transformation and quoting conventions distinctly,
// because we do not want to quote when we index columns. So the
// combined method is final.
private final def transformQuoteColName( colName : String ) : String = {
var out = transformColName( colName );
if ( quoteColNames )
out = quoteColName( out );
out;
}
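// Example (a sketch, not part of the library): a subclass might normalize raw header names to
// lower_snake_case and turn quoting on, keeping the two concerns separate as described above:
//
//   protected override def transformColName( colName : String ) : String =
//     colName.trim.toLowerCase.replaceAll( "\\W+", "_" );
//   protected override val quoteColNames : Boolean = true;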
/**
* if table structure inference is disabled, then the synthetic columns must be fully specified. Otherwise, you can use nameOnlyColumnInfos(...) below.
*/
protected def syntheticColumns( f : NamedDataFileSource, colNames : Seq[String] ) : Array[ColumnInfo] = emptyColumnInfoArray;
protected def syntheticColumnValues( data : Array[String], f : NamedDataFileSource, colNames : Seq[String] ) : Array[String] = emptyStringArray;
/**
* Just a utility for specifying synthetic columns whose type should be inferred like "natural" columns
*/
protected final def nameOnlyColumnInfos( names : String* ) : Array[ColumnInfo] = names.map( ColumnInfo( _, None, None, None, None ) ).toArray;
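// Example (a sketch; the column name "source_file" is hypothetical): a subclass that wants each
// row tagged with the file it came from, with the column's type inferred like any natural column,
// could declare and populate a synthetic column as follows:
//
//   protected override def syntheticColumns( f : NamedDataFileSource, colNames : Seq[String] ) : Array[ColumnInfo] =
//     nameOnlyColumnInfos( "source_file" );
//   protected override def syntheticColumnValues( data : Array[String], f : NamedDataFileSource, colNames : Seq[String] ) : Array[String] =
//     Array( f.sourceName );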
/**
* when overriding, place the result of super.prepareTransformFileData() in out._1, and your own info in out._2.
*
* super.prepareTransformFileData() must always be called, as SuperFlexDbArchiver uses the prepare / transform methods internally. They are not just hooks.
*
* Note -- any class that overrides one of the prepare / transform methods must override ALL THREE, passing
* the left side of the prepObj pair to, and/or calling, the superclass method. (A sketch of such an override
* follows transformSplitData below.)
*/
protected def prepareTransformFileData( f : NamedDataFileSource, colNames : Array[String] ) : Tuple2[Any,Any] = Tuple2( f, colNames );
/**
* when overriding, get the parent class' line using super.transformUnsplitDataLine( line, prepObj._1 ), and retrieve
* your own info from prepObj._2
*
* super.transformUnsplitDataLine() must always be called, as SuperFlexDbArchiver uses the prepare / transform methods internally. They are not just hooks.
*
* Note -- any class that overrides one of the prepare / transform methods must override ALL THREE, passing
* the left side of the prepObj pair to, and/or calling, the superclass method.
*/
protected def transformUnsplitDataLine( line : String, prepInfo : Tuple2[Any,Any] ) : String = line;
/**
* when overriding, get the parent class' data using super.transformSplitData( data, prepObj._1 ), and retrieve
* your own info from prepObj._2
*
* super.transformSplitData() must always be called, as SuperFlexDbArchiver uses the prepare / transform methods internally. They are not just hooks.
*
* Note -- any class that overrides one of the prepare / transform methods must override ALL THREE, passing
* the left side of the prepObj pair to, and/or calling, the superclass method.
*/
protected def transformSplitData( data : Array[String], prepInfo : Tuple2[Any,Any] ) : Array[String] = {
val ( f, colNames ) = prepInfo.asInstanceOf[Tuple2[NamedDataFileSource,Array[String]]];
val synthCols = syntheticColumns( f, colNames );
val sz = synthCols.size;
if (sz == 0) data;
else {
val synthData = syntheticColumnValues( data, f, colNames );
val out = new Array[String]( data.length + sz );
// printf("colNames: %s\n", colNames.mkString("[ ", ", "," ]"));
// printf("synthCols.size: %s, synthData.size %s\n", synthCols.size.toString, synthData.size.toString);
System.arraycopy(data, 0, out, 0, data.length );
System.arraycopy(synthData, 0, out, data.length, sz);
out;
}
}
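// Example (a sketch; the "ticker" column is hypothetical): a subclass that trims each raw line and
// uppercases one field must thread the superclass' prepObj through all three methods, as the notes
// above require:
//
//   protected override def prepareTransformFileData( f : NamedDataFileSource, colNames : Array[String] ) : Tuple2[Any,Any] =
//     Tuple2( super.prepareTransformFileData( f, colNames ), colNames.indexOf( "ticker" ) );
//   protected override def transformUnsplitDataLine( line : String, prepInfo : Tuple2[Any,Any] ) : String =
//     super.transformUnsplitDataLine( line.trim, prepInfo._1.asInstanceOf[Tuple2[Any,Any]] );
//   protected override def transformSplitData( data : Array[String], prepInfo : Tuple2[Any,Any] ) : Array[String] = {
//     val out = super.transformSplitData( data, prepInfo._1.asInstanceOf[Tuple2[Any,Any]] );
//     val idx = prepInfo._2.asInstanceOf[Int];
//     if ( idx >= 0 && out( idx ) != null ) out( idx ) = out( idx ).toUpperCase;
//     out;
//   }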
protected def afterTableCreate( con : Connection ) : Unit = {};
protected def afterRowInsert( con : Connection, data : Array[String], f : NamedDataFileSource, colNames : Seq[String] ) : Unit = {};
protected def afterAllInserts( csrc : ConnectionSource ) : Unit = {};
/**
* Can be set to override the usual inference process for a
* given column. The return value is the SQL type that should be
* declared and the java.sql.Types code it should be associated
* with. Returning None means to use the usual inference
* process.
*/
protected def forceInferredType( colName : String, maxLength : Int ) : Option[Tuple2[String,Int]] = None;
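// Example (a sketch; the column name "zip_code" is hypothetical): force a column that happens to
// look numeric to be stored as fixed-width text, leaving all other columns to normal inference:
//
//   protected override def forceInferredType( colName : String, maxLength : Int ) : Option[Tuple2[String,Int]] =
//     if ( colName == "zip_code" ) Some( ( "CHAR(%d)".format( maxLength max 5 ), Types.CHAR ) ) else None;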
// we let this be a def, because we'll find it convenient to let this depend
// on the column count sometimes in order to work around a postgres bug.
//
// search the pg-sql mailing list for threads
// "Deadlock detection"
// "Connection hanging on INSERT apparently due to large batch size and 4 CPU cores"
// "deadlocks on JDBC batch inserts into tables with many columns"
//
// note that table inspection has completed by the time this "parameter" is accessed,
// so we can check, e.g. _unifiedNamesToColInfos
//
protected def batchFileInserts : Boolean = true;
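// Example (a sketch; the 100-column threshold is arbitrary): since table inspection has completed
// by the time this is read, a subclass could disable batching only for very wide tables:
//
//   protected override def batchFileInserts : Boolean = _unifiedNamesToColInfos.get.size < 100;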
/** throw an Exception if not valid */
protected def validateMetaData( mds : MetaData) : Unit = {}
protected def padFieldLength( observedMaxLength : Int) : Int = observedMaxLength;
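// Example (a sketch): a subclass might insist that column names were actually found during
// metadata parsing, and leave 20% slack in inferred VARCHAR widths:
//
//   protected override def validateMetaData( mds : MetaData ) : Unit =
//     require( mds.contains( Key.COL_NAMES ), "No column names found in file header." );
//   protected override def padFieldLength( observedMaxLength : Int ) : Int =
//     math.ceil( observedMaxLength * 1.2 ).toInt;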
// Note: this object imposes the sort that will determine the ordering of columns
protected object FilesInfo {
// note that synth columns are always appended to "natural" file columns in the
// order they are presented by syntheticColumns. parse and inspection should
// supply synthetic values appropriately
def apply( metaDatas : Array[MetaData] ) : FilesInfo = {
val allColNames = mutable.Set.empty[String];
val colNamesByFile = mutable.HashMap.empty[NamedDataFileSource, Array[String]];
for (i <- 0 until files.length) {
val fileColNames = metaDatas(i)( Key.COL_NAMES ).toArray;
val synthCols = syntheticColumns( files(i), fileColNames );
val synthColNames = synthCols.map( _.name );
val fileSynthColNames = fileColNames ++ synthColNames;
//printf( "fileSynthColNames %s", fileSynthColNames.mkString("[ ", ", ", " ]") );
allColNames ++= fileSynthColNames;
colNamesByFile += ( files(i) -> fileSynthColNames.toArray );
}
val fcbcn = mutable.Map.empty[String, Iterable[(NamedDataFileSource, Int)]] ++
( for ( cn <- allColNames ) yield ( Tuple2(cn, for (f <- files; fcn = colNamesByFile(f).indexOf( cn ); if fcn >= 0) yield ( f, fcn ) ) ) );
new FilesInfo( allColNames, colNamesByFile, fcbcn, colSort );
}
}
protected class FilesInfo( cnames : Iterable[String],
cnbf : Map[NamedDataFileSource,Array[String]],
fcbcn : Map[String, Iterable[(NamedDataFileSource, Int)]],
sort : Ordering[String] ) {
val allColumnNames : SortedSet[String] = immutable.TreeSet.empty(sort) ++ cnames;
val colNamesByFile : Map[NamedDataFileSource, Array[String]] = immutable.HashMap.empty ++ cnbf;
val fileColsByColumnName : SortedMap[String, Iterable[ (NamedDataFileSource, Int) ] ] = immutable.TreeMap.empty(sort) ++ fcbcn;
}
/**
* An empty String, rather than a None value, signifies no prologues
*/
private def mergePrologues(metaDatas : Array[MetaData]) : String = {
// XXX hardcoded 8k buffer
val sb : StringBuilder = new StringBuilder(8192);
for (i <- 0 until metaDatas.length; prologue = metaDatas(i).getOrElse(Key.PROLOGUE, null); if (prologue != null)) {
assert( prologue.length == 1, "Prologues should be parsed to a single string (the first value of the metadata List)." );
sb.append( "Source file: ");
sb.append( files(i).sourceName );
sb.append(':');
sb.append("\n");
sb.append( prologue(0).trim() );
sb.append( "\n" );
}
sb.toString();
}
// if label vals would be interpreted as nulls in data, they
// are interpreted as nulls here
protected def createLabelMap(md : MetaData, file : NamedDataFileSource) : Option[Map[String,String]] = {
val colNames : Option[List[String]] = md.get( Key.COL_NAMES );
val labels : Option[List[String]] = md.get( Key.LABELS );
if (colNames == None || labels == None) {
None;
} else if (colNames.get.length == labels.get.length) {
val cng = colNames.get;
val lg = labels.get;
val pairs : Seq[ (String, String) ] = for (i <- 0 until cng.length; if (! isNull(lg(i), cng(i)))) yield Tuple2( cng(i), lg(i) );
val synthPairs = {
val synthCols = syntheticColumns( file, cng );
for ( ci <- synthCols; label = ci.label.getOrElse( null ); if (! isNull( label, ci.name ) ) ) yield ( ci.name, label );
};
Some( immutable.Map( (pairs ++ synthPairs) : _* ) );
} else {
WARNING.log("Uh oh... length of colNames and labels lists don't match. Reporting no labels.");
None;
}
}
/**
* An empty map, rather than a None value, signifies no labels
*/
protected def findLabels(metaDatas : Array[MetaData], files : Seq[NamedDataFileSource]) : Map[String,String] = {
val labelMaps = for (i <- 0 until metaDatas.length) yield ( createLabelMap( metaDatas(i), files(i) ) );
val goodLabelMaps = labelMaps.filter( _ != None ).map( _.get );
val bindingsUnion : Set[Tuple2[String,String]] = goodLabelMaps.foldLeft( immutable.Set.empty[Tuple2[String,String]] )( _ ++ _ );
// merge duplicate values
val keySet : Set[String] = immutable.Set( bindingsUnion.map( _._1 ).toSeq : _* );
val outBuilder = mutable.Map.empty[String,String];
for ( k <- keySet ) {
outBuilder += Tuple2( k, bindingsUnion.filter( _._1 == k ).map( _._2 ).mkString(" || ") );
}
return immutable.Map.empty ++ outBuilder;
}
// these options all get filled in sequence
// it is safe to assume they have values if the function
// that fills in the data has been called
//
// however some vals (empty strings, empty maps) may signify
// unknown or null
protected var _metaDatas : Option[Array[MetaData]] = None;
protected var _labels : Option[Map[String,String]] = None; //empty if unknown
protected var _mergedPrologues : Option[String] = None; //"" if no prologues provided
protected var _filesInfo : Option[FilesInfo] = None;
protected var _unifiedTableInfo : Option[TableInfo] = None;
protected var _unifiedNamesToColInfos : Option[Map[String,ColumnInfo]] = None;
protected var _maybeQualifiedTableName : Option[String] = None;
protected var _createDdl : Option[String] = None;
protected var _fkdLineKeepers : Option[Map[NamedDataFileSource,FkdLineKeeper]] = None;
// pkConstraint need NOT be filled in, if the table being
// generated has specified no primary key fields. calls
// to get should always be guarded.
protected var _pkConstraint : Option[String] = None;
def archiveFiles( csrc : ConnectionSource ) : Unit = {
inferTableInfo();
generateCreateDdl();
if (shouldDrop) dropDeindexTable( csrc );
executeCreateDdl( csrc );
insertFiles( csrc );
}
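// Example (a sketch of typical usage; the table name, file, tab delimiter, TableInfo argument
// order, and the assumption that Splitter declares split( line : String ) : Array[String] are all
// illustrative, not guaranteed by this file):
//
//   object PriceArchiver extends SuperFlexDbArchiver {
//     protected val priorTableInfo = TableInfo( None, Some( "prices" ), None, Some( "trade_date" :: "ticker" :: Nil ) );
//     protected val files = FileDataFileSource( "prices.tsv" ) :: Nil;
//     protected def readMetaData( br : BufferedReader ) : MetaData =
//       Map( Key.COL_NAMES -> br.readLine().split( "\\t" ).toList ); // header line holds the column names
//     def split( line : String ) : Array[String] = line.split( "\\t", -1 );
//   }
//   // ... then: PriceArchiver.archiveFiles( connectionSource ) or PriceArchiver.archiveFilesNoDups( connectionSource )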
def archiveFilesNoDups( csrc : ConnectionSource ) : Unit = archiveFilesNoDups( csrc, true );
def archiveFilesNoDups( csrc : ConnectionSource, imposePkConstraint : Boolean ) : Unit = {
inferTableInfo();
// asserts that _values are set during inference
val maybeQualifiedTableName = _maybeQualifiedTableName.get;
val unifiedTableInfo = _unifiedTableInfo.get;
val tmpTableName = {
if (unifiedTableInfo.tschema == None ) {
tmpTablePfx + maybeQualifiedTableName; //schema-less case
} else {
unifiedTableInfo.tschema.get + '.' + tmpTablePfx + unifiedTableInfo.tname.get;
}
}
FINE.log( s"tmpTableName: ${tmpTableName}" );
// if for some reason there is an old shadow of the temporary table,
// drop it
dropTable( csrc, tmpTableName );
generateCreateDdl( false, tmpTableName );
executeCreateDdl( csrc, tmpTableName );
insertFiles( csrc, this.files, _filesInfo.get, concurrentInserts, tmpTableName );
if ( shouldDrop ) dropDeindexTable(csrc);
var con : Connection = null;
var stmt : Statement = null;
try {
con = csrc.getConnection();
stmt = con.createStatement();
stmt.executeUpdate("CREATE TABLE %s AS ( SELECT DISTINCT * FROM %s )".format(maybeQualifiedTableName, tmpTableName));
stmt.executeUpdate("DROP TABLE %s".format(tmpTableName));
if ( imposePkConstraint ) {
if ( _pkConstraint != None) stmt.executeUpdate("ALTER TABLE %s ADD %s".format( maybeQualifiedTableName, _pkConstraint.get ));
else WARNING.log("Could not impose PRIMARY KEY constraint on generated table. No PRIMARY KEY columns specified.");
}
} finally {
attemptClose( stmt, con );
}
}
def dropDeindexTable( csrc : ConnectionSource ) : Unit = dropTable( csrc );
protected def prologuesDefined() : Boolean = {
// should only be called after inference is complete. effectively asserts...
_mergedPrologues.get.length > 0;
}
final def dropTable( csrc : ConnectionSource ) : Unit = {
withConnection( csrc ) { con =>
withStatement( con ) { stmt =>
dropTable( stmt );
}
}
}
final def dropTable( csrc : ConnectionSource, mqtn : String ) : Unit = {
withConnection( csrc ) { con =>
withStatement( con ) { stmt =>
dropTable( stmt, mqtn );
}
}
}
final def dropTable( stmt : Statement ) : Unit = {
// effectively asserts that _maybeQualifiedTableName has been set
val maybeQualifiedTableName = _maybeQualifiedTableName.get;
dropTable( stmt, maybeQualifiedTableName );
}
final def dropTable(stmt : Statement, mqtn : String) : Unit = {
try { stmt.executeUpdate("DROP TABLE %s".format(mqtn)); }
catch { case (exc : SQLException) => ; } //ignore, assume table wasn't there
}
private def learnFromMetaDatas() : Unit = {
if ( _metaDatas == None ) readAllMetaDatas();
val metaDatas = _metaDatas.get;
if ( _filesInfo == None ) _filesInfo = Some( FilesInfo( metaDatas ) );
val filesInfo = _filesInfo.get;
val labels : Map[String,String] = findLabels( metaDatas, files );
this._labels = Some( labels );
this._mergedPrologues = Some( mergePrologues( metaDatas ) );
}
private def learnFromUnifiedTableInfo() : Unit = {
//println( "getting unified table info");
// effectively asserts that _unifiedTableInfo has been set
val unifiedTableInfo = _unifiedTableInfo.get;
val unifiedNamesToColInfos = immutable.TreeMap[String,ColumnInfo]()(colSort) ++ unifiedTableInfo.cols.get.map( ci => (ci.name -> ci) ); //sorted
this._unifiedNamesToColInfos = Some( unifiedNamesToColInfos );
if (unifiedTableInfo.pkNames != None) {
val pkeyExtras = unifiedTableInfo.pkNames.get.filterNot( unifiedNamesToColInfos.keySet.contains(_) );
if (! pkeyExtras.isEmpty )
throw new DbArchiverException("Column names specified as primary keys don't exist in the inferred or specified table. Unknown primary keys: " + pkeyExtras.mkString(", ") );
}
this._maybeQualifiedTableName = {
if (unifiedTableInfo.tschema == None) unifiedTableInfo.tname else Some( unifiedTableInfo.tschema.get + '.' + unifiedTableInfo.tname.get )
}
}
private def inferTableInfo(): Unit = {
if (priorTableInfo.tname == None)
throw new DbArchiverException("A table name must be specified in priorTableInfo. Currently: " + priorTableInfo);
if (mustInfer) {
learnFromMetaDatas();
val metaDatas = _metaDatas.get;
val filesInfo = _filesInfo.get;
val labels = _labels.get;
// this has the side effect of setting up _fkdLineKeepers
val inferredColMap : Map[ NamedDataFileSource, Seq[ExaminedColumn] ] = inferFileCols( filesInfo );
val inferredCols : SortedMap[String, ExaminedColumn] = {
immutable.TreeMap.empty[String, ExaminedColumn]( colSort ) ++
( for ( cn <- filesInfo.allColumnNames ) yield cn -> (filesInfo.fileColsByColumnName( cn ).map( fc => inferredColMap( fc._1 ).apply(fc._2) ) ).reduceLeft( _ && _ ) );
}
val inferredColInfos = inferredCols.map( tup => new ColumnInfo(tup._1,
if (! labels.contains(tup._1)) None; else Some( labels(tup._1) ),
Some(tup._2.bestTypeDeclaration._1),
Some(tup._2.bestTypeDeclaration._2),
Some(tup._2.setter) ) ).toSeq;
val inferredTableInfo = new TableInfo( None, None, Some(inferredColInfos), None ); // name and any schema or pkeys should be set on priorTableInfo
// unified table info will contain information about synthetic columns by inference (they were included in filesInfo)
val unifiedTableInfo = priorTableInfo.reconcileOver( inferredTableInfo );
//val unifiedTableInfo = TableInfo( priorTableInfo.tschema, priorTableInfo.tname, Some( allColSeq ), priorTableInfo.pkNames );
this._unifiedTableInfo = Some( unifiedTableInfo );
//printf("unifiedTableInfo: %s\n", unifiedTableInfo);
learnFromUnifiedTableInfo();
val unifiedNamesToColInfos = _unifiedNamesToColInfos.get;
val extraCols = immutable.Set(unifiedNamesToColInfos.keySet.toSeq : _*) -- filesInfo.allColumnNames;
if (! extraCols.isEmpty )
throw new DbArchiverException("Information is specified (via priorTableInfo) about columns not in the data. Extras: " + extraCols.mkString(", "));
} else {
// we still have to make sure all the member variables that
// would normally have been filled by inspection get filled.
learnFromMetaDatas();
// we still need to infer what synth columns will be required
// metaDatas was set above
val metaDatas = _metaDatas.get;
val synthCols = {
val fileSynthCols = for (i <- 0 until metaDatas.length) yield syntheticColumns( files(i), metaDatas(i).apply( Key.COL_NAMES ) );
val tmp = mutable.Map.empty[String, ColumnInfo];
for ( fileseq <- fileSynthCols; ci <- fileseq ) {
val check = tmp.get( ci.name );
if (check != None) {
tmp += ( ci.name -> check.get.reconcile( ci ));
} else {
tmp += ( ci.name -> ci );
}
}
tmp.map( _._2 );
};
if (synthCols.isEmpty) {
this._unifiedTableInfo = Some( priorTableInfo );
} else {
this._unifiedTableInfo = Some( priorTableInfo.reconcileOver( TableInfo( None, None, Some(synthCols), None ) ) );
}
FINE.log(s"priorTableInfo: ${priorTableInfo}");
FINE.log(s"set table info: ${this._unifiedTableInfo}");
learnFromUnifiedTableInfo();
this._fkdLineKeepers = Some( createEmptyFkdLineKeepers() );
}
}
// for when we are skipping inference and fully specifying the expected table structure
// in this case, we don't have the opportunity to pre-inspect files for bad records
private def createEmptyFkdLineKeepers() : Map[NamedDataFileSource,FkdLineKeeper] =
{ immutable.Map( files.map( f => ( f, FkdLineKeeper(f, _filesInfo.get.colNamesByFile(f).toList, Nil) ) ) : _* ); }
private def generateCreateDdl() : Unit = generateCreateDdl( true, _maybeQualifiedTableName.get /* asserts _maybeQualifiedTableName set */ );
private def generateCreateDdl( includePkConstraint : Boolean, maybeQualifiedTableName : String ) : Unit = {
val unifiedTableInfo = _unifiedTableInfo.get;
val unifiedNamesToColInfos = _unifiedNamesToColInfos.get;
//printf("In generateCreateDdl( ... ) -- unifiedTableInfo %s\n", unifiedTableInfo);
// XXX: 4096 is hardcoded here...
val sb : StringBuilder = new StringBuilder(4096);
sb.append("CREATE TABLE ");
sb.append( maybeQualifiedTableName );
val decls : List[String] = {
val rawPkColNames = unifiedTableInfo.pkNames;
val noPkDecls = unifiedNamesToColInfos.toList.map( tup => ( transformQuoteColName( tup._2.name ) + " " + tup._2.sqlTypeDecl.get ) );
if (rawPkColNames != None && rawPkColNames.get != Nil) {
val pkConstraint = "PRIMARY KEY( " + rawPkColNames.get.map( transformQuoteColName _ ).mkString(", ") + " )";
this._pkConstraint = Some( pkConstraint );
if (includePkConstraint) {
noPkDecls:::pkConstraint::Nil;
} else {
noPkDecls;
}
} else {
noPkDecls;
}
};
sb.append( decls.mkString(" ( ", ", ", " )") );
_createDdl = Some( sb.toString() );
}
private def executeCreateDdl( csrc : ConnectionSource ) : Unit = {
val maybeQualifiedTableName = _maybeQualifiedTableName.get; // effectively asserts that _maybeQualifiedTableName has been set
executeCreateDdl( csrc, maybeQualifiedTableName );
}
private def executeCreateDdl( csrc : ConnectionSource, maybeQualifiedTableName : String ) : Unit = {
val createDdl = _createDdl.get; // effectively asserts that _createDdl has been set
var con : Connection = null;
var stmt : Statement = null;
try {
con = csrc.getConnection();
stmt = con.createStatement();
// if ( drop )
// {
// printf("Attempting to drop table '%s', if present.\n", maybeQualifiedTableName);
// drop( stmt );
// }
INFO.log(s"Creating table '${maybeQualifiedTableName}'.");
INFO.log( createDdl );
stmt.executeUpdate( createDdl );
afterTableCreate( con );
INFO.log("CREATE was executed without exceptions.");
} finally {
attemptClose( stmt, con );
}
}
private def insertFile( csrc : ConnectionSource, f : NamedDataFileSource, filesInfo : FilesInfo ) : Unit = withConnection( csrc ) { insertFile( _, f, filesInfo) }
private def insertFile( csrc : ConnectionSource, f : NamedDataFileSource, filesInfo : FilesInfo, maybeQualifiedTableName : String ) : Unit =
withConnection( csrc ) { insertFile( _, f, filesInfo, maybeQualifiedTableName) }
private def insertFile( con : Connection, f : NamedDataFileSource, filesInfo : FilesInfo ) : Unit = {
// effectively asserts that member variables have been set...
val maybeQualifiedTableName = _maybeQualifiedTableName.get;
insertFile( con, f, filesInfo, maybeQualifiedTableName );
}
private def insertFile( con : Connection, f : NamedDataFileSource, filesInfo : FilesInfo, maybeQualifiedTableName : String ) : Unit = {
FINE.log(s"Populating '${maybeQualifiedTableName}' with data from ${f.sourceName}.")
//printf( "insertFile -- %s\n", f.sourceName );
if (batchFileInserts) con.setAutoCommit(false)
// effectively asserts that member variables have been set...
val unifiedTableInfo = _unifiedTableInfo.get;
val unifiedNamesToColInfos = _unifiedNamesToColInfos.get;
val fkdLineKeepers = _fkdLineKeepers.get;
val myFkdLineKeeper = fkdLineKeepers(f); // there should be one for every file
val myFkdRawLines = immutable.HashSet( myFkdLineKeeper.fkdLines.map( _.rawLine ) : _* );
//printf("%s -- myFkdRawLines\n%s\n", f.sourceName, myFkdRawLines.mkString("\n"));
var skipped : Int = 0;
var br : BufferedReader = null;
var ps : PreparedStatement = null;
try {
val fileColNames = Array[String]( filesInfo.colNamesByFile( f ) : _* );
val psColList = fileColNames.map( transformQuoteColName( _ ) ).mkString("( ", ", ", " )"); // in File order, not table order
val psInsertionPoints = ( (for (fcn <- fileColNames) yield '?').mkString("( ", ", ", " )") );
// generate appropriate data setting functions, mixing inferred with preset columns
val setters = for ( cn <- fileColNames) yield copyMaybeSetter( unifiedNamesToColInfos( cn ).setter ); //not all setters are thread safe... we must copy
//val fcns = fileColNames.mkString(", ")
//println( s"fileColNames: ${fcns}" )
//println( s"SETTERS: ${setters.map( _.get ).mkString(", ")}" )
val typeCode = for ( cn <- fileColNames) yield unifiedNamesToColInfos( cn ).typeCode;
val insertStmt = String.format("INSERT INTO %s %s VALUES %s", maybeQualifiedTableName, psColList, psInsertionPoints );
//printf("insertStmt: %s\n", insertStmt);
ps = con.prepareStatement( insertStmt );
br = f.createBufferedReader( bufferSize, fileEncoding );
readMetaData( br ); // skip preliminaries...
val prepObj = prepareTransformFileData( f, fileColNames );
var batchCount = 0;
var line = readDataLine( br );
//println("line: " + line);
while ( goodLine( line ) ) {
if (! myFkdRawLines( line ) ) { // if the line is known to be fkd, we don't try to parse, read, and insert it
line = transformUnsplitDataLine( line, prepObj );
val data = transformSplitData( split( line ), prepObj );
/*
printf( "data %s\n", data.mkString("[ ", ", " , " ]") );
printf( "psColList %s\n", psColList );
printf( "psInsertionPoints %s\n", psInsertionPoints );
*/
for (i <- 0 until data.length) {
if ( isNull( data(i), fileColNames(i) ) ) {
ps.setNull( i + 1, typeCode(i).get); // XXX: should guard this get
} else {
try {
setters(i).get.apply(ps, i + 1, data(i)); // XXX: should guard this get, ps slots are indexed by 1, arrays are indexed by zero
} catch {
case e : Exception => {
WARNING.log(s"BAD VALUE: COLUMN ${i+1} of\n${line}");
throw e;
}
}
}
}
// println(ps);
if ( batchFileInserts ) {
ps.addBatch();
batchCount += 1;
if (batchCount == maxBatchSize) {
FINE.log(s"${f.sourceName}: batch size limit ${batchCount} reached. executing, then resetting.");
ps.executeBatch();
con.commit()
batchCount = 0;
FINE.log(s"${f.sourceName}: executed batch and reset batch size count.");
}
} else {
ps.executeUpdate();
afterRowInsert( con, data, f, fileColNames );
}
} else {
skipped += 1;
WARNING.log(s"${f.sourceName}: Skipped bad line: ${line}");
}
line = readDataLine( br );
}
if (batchFileInserts) {
ps.executeBatch();
con.commit()
}
if ( skipped > 0 ) {
WARNING.log( s"${f.sourceName}: Skipped ${skipped} lines. Expected to skip ${myFkdLineKeeper.fkdLines.length} lines.");
}
} catch {
case t : Throwable => {
if (batchFileInserts) con.rollback()
printThrowable(t);
throw t;
}
} finally {
attemptClose( ps, br );
}
}
def insertFiles(csrc : ConnectionSource) : Unit = {
// effectively asserts that _filesInfo and _maybeQualifiedTableName have been set
insertFiles(csrc, this.files, _filesInfo.get, concurrentInserts, _maybeQualifiedTableName.get );
}
def insertFiles(csrc : ConnectionSource, files : Iterable[NamedDataFileSource], filesInfo : FilesInfo, concurrent : Boolean, maybeQualifiedTableName : String) : Unit = {
if (concurrent) {
val fcn : (NamedDataFileSource) => Option[Throwable] = {
(f : NamedDataFileSource) => {
try { insertFile( csrc, f, filesInfo, maybeQualifiedTableName ); None; }
catch { case (t : Throwable) => { printThrowable(t); Some(t); } };
}
}
files.par.map( fcn );
} else {
withConnection( csrc )( con => files.foreach( insertFile( con, _, filesInfo, maybeQualifiedTableName ) ) )
}
afterAllInserts( csrc );
}
private def printThrowable( t : Throwable ) : Unit = {
if ( t != null ) {
t.printStackTrace();
if ( t.isInstanceOf[SQLException] ) printThrowable( t.asInstanceOf[SQLException].getNextException )
}
}
// XXX: hardcoded 8K starting buffer for headers
private def buildStreams() : Seq[BufferedReader] = {
var buildStreams = new ArrayBuffer[BufferedReader];
try {
for (f <- files) buildStreams += f.createBufferedReader(8192, fileEncoding);
buildStreams.toVector;
} catch {
case ex : Exception => { buildStreams.foreach( attemptClose _ ); throw ex; }
}
}
private def readAllMetaDatas() : Unit = {
val streams = buildStreams();
try {
_metaDatas = Some( for( br <- streams.toArray[BufferedReader] ) yield ( readValidateMetaData( br ) ) );
} finally{
streams.foreach( attemptClose _ );
}
}
private def readValidateMetaData( br : BufferedReader ) : MetaData = {
val out = readMetaData( br );
validateMetaData( out );
out;
}
// as a side effect, sets up _fkdLineKeepers
private def inferFileCols( fi : FilesInfo ) : Map[ NamedDataFileSource, Seq[ExaminedColumn] ] = {
// since synthetic file cols are set up in filesInfo, they will be passed to doInferFile
def doInferFile( f : NamedDataFileSource, colNames : Array[String] ) : (NamedDataFileSource, Seq[ExaminedColumn], FkdLineKeeper) = {
var numCols : Int = colNames.length;
INFO.log(s"Examining ${f.sourceName}, which has ${numCols} columns.");
val br = f.createBufferedReader( bufferSize, fileEncoding );
readMetaData( br ); //skip preliminaries
try {
val (recSeq, flk) = examineColumns( br, colNames, f );
( f, recSeq.map( _.toExaminedColumn), flk )
} finally {
br.close();
}
}
val xformFcn = (pair : Tuple2[NamedDataFileSource,Array[String]]) => { doInferFile( pair._1, pair._2 ) };
//val processor = CollectionProcessor( xformFcn, maxConcurrency );
//val triples = processor.process( files.map( f => (f, fi.colNamesByFile(f)) ) );
val triples = files.par.map( f => (f, fi.colNamesByFile(f)) ).map( xformFcn ).toVector;
val keepers : Map[NamedDataFileSource,FkdLineKeeper] = immutable.Map.empty ++ triples.map( trip => (trip._3.source -> trip._3 ) );
this._fkdLineKeepers = Some( keepers );
val pairs = triples.map( tup => Tuple2(tup._1, tup._2) )
immutable.HashMap( pairs.toSeq : _*);
}
private[SuperFlexDbArchiver] case class ExaminedColumn(val colName : String,
val booleanOnly : Boolean,
val numericOnly : Boolean,
val integerOnly : Boolean,
val fixedLength : Option[Int],
val maxLength : Int, // if maxLength < 0, all known entries of this column are null
val dateFormatStr : Option[String],
val dateTimeFormatStr : Option[String]) {
def &&( other : ExaminedColumn ) = {
require( this.colName == other.colName );
ExaminedColumn(colName,
this.booleanOnly && other.booleanOnly,
this.numericOnly && other.numericOnly,
this.integerOnly && other.integerOnly,
if (this.fixedLength == other.fixedLength) { fixedLength } else { None },
this.maxLength max other.maxLength,
if (this.dateFormatStr == other.dateFormatStr) { dateFormatStr } else { None },
if (this.dateTimeFormatStr == other.dateTimeFormatStr) { dateTimeFormatStr } else { None } );
}
// XXX: Put all this type crap into some formal enumeration
// TODO: Make this customizable to different databases
val bestTypeDeclaration : Tuple2[String, Int] = {
val forced = forceInferredType( colName, maxLength );
if (forced != None ) {
val out = ( forced.get._1.toUpperCase, forced.get._2 );
assert( ("TIMESTAMP"::"DATE"::"INTEGER"::"BIGINT"::"DOUBLE PRECISION"::"BOOLEAN"::Nil).contains( out._1 ) || out._1.indexOf("CHAR") >= 0,
"Forced column types must be one of 'TIMESTAMP', 'DATE', 'INTEGER', 'BIGINT', 'DOUBLE PRECISION', 'BOOLEAN', or something containg 'CHAR'" );
if (debugColumnInspection) FINE.log(s"[${colName}] Column type '${out._1}' was forced by a subclass override.");
out
} else if (maxLength < 1) { // all null
defaultTypeDeclaration;
} else if (booleanOnly) {
("BOOLEAN", Types.BOOLEAN)
} else if (dateTimeFormatStr != None) {
("TIMESTAMP", Types.TIMESTAMP);
} else if (dateFormatStr != None) {
("DATE", Types.DATE);
} else if (integerOnly) {
if (maxLength < 10) {
("INTEGER", Types.INTEGER);
} else {
("BIGINT", Types.BIGINT);
}
} else if (numericOnly) {
("DOUBLE PRECISION", Types.DOUBLE);
} else if (fixedLength != None) {
(String.format("CHAR(%d)", int2Integer(fixedLength.get.asInstanceOf[Int]) ), Types.CHAR); //workaround of weird type, ambiguous conversion problems
} else {
(String.format("VARCHAR(%d)", int2Integer(padFieldLength( maxLength ))), Types.VARCHAR);
}
}
val setter : (PreparedStatement, Int, String) => Unit = {
bestTypeDeclaration._1 match {
case "BOOLEAN" => booleanSetter;
case "TIMESTAMP" => timestampSetter( dateTimeFormatStr.get );
case "DATE" => dateSetter( dateFormatStr.get );
case "INTEGER" => integerSetter;
case "BIGINT" => bigintSetter;
case "DOUBLE PRECISION" => doublePrecisionSetter;
case _ => charTypeSetter;
}
}
}
private def goodLine( line : String ) : Boolean = { line != null && line.trim().length > 0 }
// colNames includes synthetic colNames (appended at end), as they are set up in filesInfo
private def examineColumns(br : BufferedReader, colNames : Array[String], f : NamedDataFileSource) : Tuple2[Seq[Rec], FkdLineKeeper] = {
var fkdLines : List[FkdLine] = Nil;
var numCols = colNames.length;
val out = new Array[Rec](numCols);
for (i <- 0 until numCols)
out(i) = new Rec(colNames(i));
// the prepare / transform API takes care of appending any synthetic values
// to the parsed data
val prepObj = prepareTransformFileData( f, colNames );
var rawLine = readDataLine( br );
while (goodLine(rawLine)) {
val line = transformUnsplitDataLine( rawLine, prepObj );
//println(line);
//print('.');
var data = transformSplitData( split(line), prepObj );
if ( data.length == numCols ) {
for (i <- 0 until numCols) out(i).update(data(i));
} else {
var fkd = FkdLine( rawLine, line, data, String.format("data.length (%s) and numCols (%s) should be equal.", data.length.asInstanceOf[Object], numCols.asInstanceOf[Object]) );
//printf("BAD LINE: %S\n", fkd);
if (strict) {
throw new DbArchiverException("UNREADABLE LINE in %s! ABORTING (since we are in strict mode). INFO: %s".format(f.sourceName, fkd.toString));
} else {
fkdLines = fkd::fkdLines;
}
}
rawLine = readDataLine( br );
}
Tuple2(out, FkdLineKeeper( f, colNames.toList, fkdLines.reverse ) );
}
/**
* extraValidator is necessary because sometimes SimpleDateFormat fails to
* reject non-exact matches, filling in missing info, even with lenient set to
* false. The params of extraValidator should be maybeDate and patternStr, in
* that order
*/
private[SuperFlexDbArchiver] class DateFormatGuesser( colName : String, patternStrs : Iterable[String], extraValidator : (String,String)=>Boolean ) {
def this( colName : String, patternStrs: Iterable[String] ) = this( colName, patternStrs, null)
import scala.collection.mutable.Queue;
var inPlay = new Queue[SimpleDateFormat];
inPlay ++= patternStrs.map( new SimpleDateFormat( _ ) );
inPlay.foreach( _.setLenient( false ) );
var anyHope = !inPlay.isEmpty;
def check( maybeDate : String) : Unit = {
def check(maybeDate : String, df : SimpleDateFormat) : Boolean = {
var out = {
try { df.parse( maybeDate ); true; }
catch { case ex : ParseException => false; }
}
if ( debugColumnInspection ) {
if (! out) FINE.log(s"[${colName}] Pattern ${df.toPattern} ruled out by datum '${maybeDate}'");
}
if ( out && extraValidator != null ) {
out = extraValidator( maybeDate, df.toPattern );
if ( debugColumnInspection && !out) FINE.log(s"[${colName}] Datum ${maybeDate} conforms to pattern ${df.toPattern}, but is ruled out extra validator.");
}
out;
}
inPlay.dequeueAll( ! check( maybeDate, _ ) );
anyHope = !inPlay.isEmpty;
if (!anyHope && debugColumnInspection) FINE.log(s"[${colName}] now hopeless in ${this}.");
}
def guess = { inPlay.front; }
}
class Rec( colName : String ) {
var booleanOnly : Boolean = true;
var numericOnly : Boolean = true;
var integerOnly : Boolean = true;
var fixedLength : Int = -1; // -2 means not fixed, -1 means unknown, nonneg means putatively fixed
var maxLength : Int = -1;
val dfg : DateFormatGuesser = new DateFormatGuesser( colName, dateFormatPatterns, extraDateValidator);
val dtfg : DateFormatGuesser = new DateFormatGuesser( colName, dateTimeFormatPatterns, extraDateValidator);
//val dfg : DateFormatGuesser = new DateFormatGuesser( colName, dateFormatPatterns);
//val dtfg : DateFormatGuesser = new DateFormatGuesser( colName, dateTimeFormatPatterns);
def toExaminedColumn = ExaminedColumn(colName,
booleanOnly,
numericOnly,
integerOnly,
if (fixedLength > 0) { Some(fixedLength) } else { None },
maxLength,
if ( dfg.anyHope ) { Some( dfg.guess.toPattern ) } else { None },
if ( dtfg.anyHope ) { Some( dtfg.guess.toPattern ) } else { None }
);
//note that we don't treat all zero strings of any length as containing "leading zeros"
def hasLeadingZeros( datum : String ) : Boolean = {
val m = leadingZerosRegex.findFirstMatchIn(datum);
(m != None && m.get.start == 0 && !datum.forall( _ == '0'))
}
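// hasLeadingZeros examples: "007" and "0123" count as having leading zeros (and, when
// leadingZerosNonNumeric is true, rule numeric types out for the column), while "0", "000"
// and "10012" do not.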
private def badPlusOrMinus( str : String ) : Boolean = {
if ( str.indexOf('+') < 0 && str.indexOf('-') < 0 ) { // no plus or minus
false
} else {
val indices = mutable.SortedSet.empty[Int]
var i = 0
val len = str.length
while (i < len) {
val c = str.charAt(i)
if ( c == '+' || c == '-' ) indices += i
i += 1
}
def leading( index : Int ) = index == 0
def scientific( index : Int ) = str.toLowerCase.charAt( index - 1 ) == 'e'
def leadingOrScientific( index : Int ) = leading( index ) || scientific( index )
indices.size match {
case 1 => if (leadingOrScientific( indices.head )) false else true
case 2 => if ( leading( indices.head ) && scientific( indices.last ) ) false else true
case _ => true
}
}
}
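// badPlusOrMinus examples: "-1.5", "1.2e-3" and "-2E+10" pass (leading sign, or sign immediately
// after an 'e'/'E' exponent marker), while "12-34" and "555-1212" are flagged as bad.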
def update( datum : String ) : Unit = {
if (datum == null || isNull(datum, colName)) //we can draw no inferences from nulls
return;
booleanOnly = ( booleanOnly && (asBoolean(datum) != None) ); // does &&= work?
if (numericOnly) { // integerOnly implies numericOnly
if (datum.length == 0) { //we interpret this case as empty string, since we know it isn't interpreted as null
numericOnly = false;
integerOnly = false;
if ( debugColumnInspection ) FINE.log(s"[${colName}] Numeric types ruled out by empty string not interpreted as NULL");
} else if ( (! datum.forall( "+-0123456789.eE".contains( _ ) )) ||
(! "0123456789.".contains(datum.last)) ) { //we accept the letter E for representations in scientific notation
numericOnly = false;
integerOnly = false;
if ( debugColumnInspection ) FINE.log(s"[${colName}] Numeric types ruled out by datum '${datum}', which cannot be interpreted as a number.");
} else if ( numericOnly && badPlusOrMinus(datum) ) { //we have to deal with negatives from scientific notation... yes i should use parseDouble or NumberFormat...
numericOnly = false;
integerOnly = false;
if ( debugColumnInspection ) FINE.log(s"[${colName}] Numeric types ruled out by datum '${datum}', which cannot be interpreted as a number because of an internal, not-scientific-notation plus or minus.");
} else if( leadingZerosNonNumeric && numericOnly && hasLeadingZeros( datum ) ) {
numericOnly = false;
integerOnly = false;
if ( debugColumnInspection ){
FINE.log("[${colName}] Numeric types ruled out by datum '${datum}', since config param 'leadingZerosNonNumeric' is true, and the datum contains leading zeros.");
}
} else if ( integerOnly && (datum.contains('.') || datum.contains('e') || datum.contains('E') ) ) {
integerOnly = false;
if ( debugColumnInspection ) FINE.log("[${colName}] Integral types ruled out by datum '${datum}', which contains a '.'");
}
}
if (fixedLength > -2) {
if (fixedLength >= 0) {
val newFixedLength = if (datum.length == fixedLength) { datum.length } else { -2 }; //a variation, -2 means not fixed
if ( debugColumnInspection && newFixedLength == -2) {
FINE.log(s"[${colName}] Putative fixed length of ${fixedLength} invalidated by '${datum}' of length ${datum.length}");
}
fixedLength = newFixedLength;
} else {
fixedLength = datum.length;
if ( debugColumnInspection ) FINE.log(s"[${colName}] Putative fixed length of ${fixedLength} set by first non-null value, '${datum}'.");
}
}
maxLength = maxLength max datum.length;
if ( dfg.anyHope ) dfg.check ( datum );
if (dtfg.anyHope ) dtfg.check ( datum );
}
}
}