
com.mchange.sc.v1.superflex.package.scala Maven / Gradle / Ivy
The newest version!
/*
* Distributed as part of superflex v0.2.0
*
* Copyright (C) 2013 Machinery For Change, Inc.
*
* Author: Steve Waldman
*
* This library is free software; you can redistribute it and/or modify
* it under the terms of EITHER:
*
* 1) The GNU Lesser General Public License (LGPL), version 2.1, as
* published by the Free Software Foundation
*
* OR
*
* 2) The Eclipse Public License (EPL), version 1.0
*
* You may choose which license to accept if you wish to redistribute
* or modify this work. You may offer derivatives of this work
* under the license you have chosen, or you may provide the same
* choice of license which you have been offered here.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received copies of both LGPL v2.1 and EPL v1.0
* along with this software; see the files LICENSE-EPL and LICENSE-LGPL.
* If not, the text of these licenses are currently available at
*
* LGPL v2.1: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html
* EPL v1.0: http://www.eclipse.org/org/documents/epl-v10.php
*
*/
package com.mchange.sc.v1;
import java.io.{BufferedReader,BufferedWriter,File,FileReader,FileWriter,PrintWriter};
import java.sql.{Connection,DriverManager,Statement,SQLException};
import scala.collection.mutable.ArrayBuffer;
import scala.collection.immutable.Set;
import scala.collection.immutable.SortedSet;
import scala.collection.immutable.TreeSet;
import scala.collection.immutable.HashSet;
import com.mchange.v2.csv.FastCsvUtils;
import com.mchange.sc.v1.sql.ResourceUtils._;
import com.mchange.sc.v1.util.ClosableUtils;
package object superflex {
def attemptCreate (stmt : Statement, objName : String, createDdl : String ) : Unit = {
try {
stmt.executeUpdate(createDdl);
} catch {
// assume an SQLException means that the object is already there.
case (t : SQLException) => {
printf("Failed to create %s. Verify that it is already present.\n", objName);
t.printStackTrace();
}
}
}
def attemptCreateSchema( csrc : ConnectionSource, sname : String ) = {
withConnection( csrc ) { con =>
withStatement( con ) { stmt =>
attemptCreate( stmt, sname, "CREATE SCHEMA " + sname );
}
}
}
private def headers( f : File ) : Array[String] = headers( f, None );
private def headers( f : File, xform : Option[Function1[Array[String],Array[String]]] ) : Array[String] = {
//println(f);
// XXX: hardcoded buffer-size of 8K
ClosableUtils.withClosable( () => new BufferedReader( new FileReader( f ), 8192 ) ) {
br => headers( br, xform );
}
}
// BufferedReader should be prior to the first line
private def headers( br : BufferedReader, xform : Option[Function1[Array[String],Array[String]]] ) : Array[String] = {
val raw = FastCsvUtils.splitRecord( br.readLine() );
if ( xform != None ) {
xform.get.apply( raw );
} else {
raw;
}
}
def allColNamesOneRowHeaderCsv( files : Iterable[File] ) : SortedSet[String] = {
allColNamesOneRowHeaderCsv( files, None );
}
def allColNamesOneRowHeaderCsv( files : Iterable[File], xform : Option[Function1[Array[String],Array[String]]] ) : SortedSet[String] = {
files.foldLeft( (new TreeSet[String]).asInstanceOf[SortedSet[String]] )( (set : SortedSet[String], f : File) => set ++ headers( f, xform ) );
}
def findBounds( files : Iterable[File], excludeKeyCols : Iterable[String], numBounds : Int ) : List[String] = {
findBounds( files, excludeKeyCols, numBounds, None )
}
def findBounds( files : Iterable[File], excludeKeyCols : Iterable[String], numBounds : Int, xform : Option[Function1[Array[String],Array[String]]] ) : List[String] = {
var colList = (allColNamesOneRowHeaderCsv( files, xform ).filterNot( excludeKeyCols.toSet.contains(_) ) ).toList;
val spaceBetween = Math.round( Math.ceil( colList.length.asInstanceOf[Float] / (numBounds + 1) ).asInstanceOf[Float] );
println( "colList.length: " + colList.length );
println( "spaceBetween: " + spaceBetween );
println( (spaceBetween until colList.length by spaceBetween).mkString(", ") );
(spaceBetween until colList.length by spaceBetween).map( colList(_) ).toList;
}
def csvColumnCount( f : File ) : Int = headers(f).size; //no need to transform, number of columns must be invariant to transformations
def splitOneRowHeaderCsvFile( splitMe : File, primaryKeyColNames : Set[String], maxCols: Int, splitBufferSize : Int, splitFileDir : File ) : Iterable[File] = {
splitOneRowHeaderCsvFile( splitMe, primaryKeyColNames, maxCols, splitBufferSize, splitFileDir, None );
}
def splitOneRowHeaderCsvFile(splitMe : File,
primaryKeyColNames : Set[String],
maxCols: Int,
splitBufferSize : Int,
splitFileDir : File,
xform : Option[Function1[Array[String],Array[String]]] ) : Iterable[File] = {
require( splitMe.getName().endsWith(".csv") );
var totalCols = csvColumnCount( splitMe );
//printf("%s: totalCols: %d, maxCols: %d\n", splitMe, totalCols, maxCols);
if (totalCols > maxCols) {
val baseTableName = splitMe.getName().substring(0, splitMe.getName().length() - 4);
var numFiles = totalCols / maxCols + 1;
var outFiles = (1 to numFiles).map( n => new File( splitFileDir, baseTableName + "_"+ n + ".csv" ) ).toList;
printf("Splitting %s into %s\n", splitMe, outFiles.mkString(", "));
divideOneRowHeaderCsvFile( primaryKeyColNames, splitMe, outFiles, splitBufferSize, xform );
outFiles;
} else {
splitMe::Nil;
}
}
def splitOneRowHeaderCsvFileByBounds(splitMe : File,
allColNames : List[String],
primaryKeyColNames : Set[String],
boundaryCols : List[String],
splitBufferSize : Int,
splitFileDir : File ) : Iterable[File] = {
splitOneRowHeaderCsvFileByBounds(splitMe, allColNames, primaryKeyColNames, boundaryCols, splitBufferSize, splitFileDir, None);
}
def splitOneRowHeaderCsvFileByBounds(splitMe : File,
allColNames : List[String],
primaryKeyColNames : Set[String],
boundaryCols : List[String],
splitBufferSize : Int,
splitFileDir : File,
xform : Option[Function1[Array[String],Array[String]]] ) : Iterable[File] = {
require( splitMe.getName().endsWith(".csv") );
val baseTableName = splitMe.getName().substring(0, splitMe.getName().length() - 4);
var numFiles = boundaryCols.length + 1;
var outFiles = (1 to numFiles).map( n => new File( splitFileDir, baseTableName + "_"+ n + ".csv" ) ).toList;
//printf("Splitting %s into %s on bounds %s.\n", splitMe, outFiles.mkString(", "), boundaryCols.mkString(", "));
divideOneRowHeaderCsvFileByBounds( primaryKeyColNames, allColNames, splitMe, boundaryCols, outFiles, splitBufferSize, xform );
outFiles;
}
def divideOneRowHeaderCsvFile( keyColNames : Set[String], inFile: File, outFiles : List[File], bufferSize : Int) : Unit = {
divideOneRowHeaderCsvFile( keyColNames, inFile, outFiles, bufferSize, None );
}
def divideOneRowHeaderCsvFile( keyColNames : Set[String], inFile: File, outFiles : List[File], bufferSize : Int, xform : Option[Function1[Array[String],Array[String]]]) : Unit = {
val br = new BufferedReader( new FileReader( inFile ), bufferSize );
try {
val headerList = headers( br, xform ).toList;
var line = br.readLine();
var rows = new Iterator[List[String]] {
def hasNext : Boolean = (line != null);
def next : List[String] = {
var out = FastCsvUtils.splitRecord( line ).toList;
line = br.readLine();
out;
}
}
val sinks = outFiles.map( new CsvFileWritableTable( _ , bufferSize ) );
try {
divideIntoJoinableByNumSinks(headerList, keyColNames, rows, sinks );
}
finally {
ClosableUtils.attemptCloseAll( sinks : _* );
}
}
finally {
ClosableUtils.attemptClose( br );
}
}
def divideOneRowHeaderCsvFileByBounds(keyColNames : Set[String],
allColNames : List[String],
inFile: File,
boundaryColsExclusive : List[String],
outFiles : List[File],
bufferSize : Int) : Unit = {
divideOneRowHeaderCsvFileByBounds(keyColNames,
allColNames,
inFile,
boundaryColsExclusive,
outFiles,
bufferSize,
None);
}
def divideOneRowHeaderCsvFileByBounds(keyColNames : Set[String],
allColNames : List[String],
inFile: File,
boundaryColsExclusive : List[String],
outFiles : List[File],
bufferSize : Int,
xform : Option[Function1[Array[String],Array[String]]]) : Unit = {
assert( boundaryColsExclusive.length == outFiles.length - 1 );
val br = new BufferedReader( new FileReader( inFile ), bufferSize );
try {
val headerList = headers( br, xform ).toList;
//printf("file header list: %s\n", headerList.mkString(", "));
var line = br.readLine();
var rows = new Iterator[List[String]] {
def hasNext : Boolean = (line != null);
def next : List[String] = {
var out = FastCsvUtils.splitRecord( line ).toList;
line = br.readLine();
out;
}
}
val sinks = outFiles.map( new CsvFileWritableTable( _ , bufferSize ) );
try {
divideIntoJoinableByBounds(headerList, allColNames, keyColNames, boundaryColsExclusive, rows, sinks );
}
finally {
ClosableUtils.attemptCloseAll( sinks : _* );
}
}
finally {
ClosableUtils.attemptClose( br );
}
}
trait WritableTable {
def setColNames( colNames : List[String] ) : Unit;
def addDataRow( row : List[String] ) : Unit;
def close() : Unit;
}
class CsvFileWritableTable( f : File, bufferSize : Int ) extends WritableTable {
private var pw = new PrintWriter( new BufferedWriter( new FileWriter(f), bufferSize ) );
private def mkStringCsv( data : List[String] ) : String = data.mkString("\"","\",\"","\"");
def setColNames( colNames : List[String] ) : Unit = pw.println( mkStringCsv(colNames) );
def addDataRow( row : List[String] ) : Unit = pw.println( mkStringCsv(row) );
def close() : Unit = pw.close();
}
private def divideIntoJoinableByNumSinks(colNames : List[String],
keyColNames : Set[String],
inputRows : Iterator[List[String]],
sinks : List[WritableTable]) : Unit =
divideIntoJoinableByNumSinks(colNames, keyColNames, inputRows, sinks, None);
private def divideIntoJoinableByBounds(colNames : List[String],
allColNames : List[String],
keyColNames : Set[String],
boundaryColsExclusive : List[String],
inputRows : Iterator[List[String]],
sinks : List[WritableTable]) : Unit =
divideIntoJoinableByBounds(colNames, allColNames, keyColNames, boundaryColsExclusive, inputRows, sinks, None);
private def splitColSort( colNames : List[String] ) : (String) => Ordered[String] = {
me => {
new Ordered[String] {
def compare( other : String ) : Int = {
val meIdx = colNames.indexOf( me );
val othIdx = colNames.indexOf( other );
assert( meIdx >= 0 && othIdx >= 0, "colNames: %s | me: %s | meIdx: %s | other: %s | othIdx: %s\n".format( colNames.mkString(", "), me, meIdx, other, othIdx ) );
if ( meIdx > othIdx ) 1;
else if ( meIdx < othIdx ) -1;
else 0;
}
}
}
}
private def splitByNumSinks(colNames : List[String], keyColNames : Set[String], numSinks : Int) : List[List[String]] = {
val numKeys = keyColNames.size;
implicit val sort = splitColSort( colNames );
val keyColList = (TreeSet.empty[String] ++ keyColNames).toList;
val othColList = ( (TreeSet.empty[String] ++ colNames).filterNot( keyColNames.contains(_) ) ).toList;
//println( keyColList.mkString(", ") );
//println( othColList.mkString(", ") );
val tailTableNonKeyLen = (othColList.length / numSinks);
val firstTableNonKeyLen = (tailTableNonKeyLen + (othColList.length % numSinks));
val firstTableCols = keyColList ++ othColList.take( firstTableNonKeyLen );
val otherTablesColsList = (for ( start <- firstTableNonKeyLen until othColList.length by tailTableNonKeyLen )
yield ( keyColList ++ othColList.slice(start, start + tailTableNonKeyLen) )).toList;
firstTableCols::otherTablesColsList;
}
private def splitByBounds( colNames : List[String], allColNames : List[String], keyColNames : Set[String], boundaryColsExclusive : List[String] ) : List[List[String]] = {
assert ( ! boundaryColsExclusive.isEmpty );
val numKeys = keyColNames.size;
implicit val sort = splitColSort( allColNames );
val keyColList : List[String] = (TreeSet.empty[String] ++ keyColNames).toList;
val othColSet : SortedSet[String] = TreeSet[String]( (colNames.filterNot( keyColNames.contains( _ ) ) ) : _* );
val othColList : List[String] = othColSet.toList;
printf( "keyColList: %s\n\n", keyColList );
val boundaryColsIter = (TreeSet.empty[String] ++ boundaryColsExclusive).iterator;
val outBuf = new ArrayBuffer[List[String]];
var from : Option[String] = None;
var to : Option[String] = None;
do {
to = {
if (boundaryColsIter.hasNext) Some(boundaryColsIter.next);
else None;
}
outBuf += (
( from, to ) match {
case ( None, Some(_) ) => keyColList:::((othColSet.to( to.get )).toList);
case ( Some(_), Some(_) ) => keyColList:::((othColSet.range( from.get, to.get )).toList);
case ( Some(_), None ) => keyColList:::((othColSet.from( from.get )).toList);
case pair => throw new RuntimeException("Huh? Unexpected pair: " + pair);
}
);
println( Tuple2( from, to ) );
from = to
}
while ( to != None );
printf("outBuf.size: %d\n", outBuf.size);
val out = outBuf.toList;
println( out.mkString("\n~~~\n") );
out;
}
def divideIntoJoinableByNumSinks(colNames : List[String],
keyColNames : Set[String],
inputRows : Iterator[List[String]],
sinks : List[WritableTable],
badRowWriter : Option[PrintWriter]) : Unit = {
def createTableColList() : List[List[String]] = {
splitByNumSinks(colNames, keyColNames, sinks.size);
}
divideIntoJoinable(colNames, keyColNames, inputRows, sinks, badRowWriter, createTableColList);
}
def divideIntoJoinableByBounds(colNames : List[String],
allColNames : List[String],
keyColNames : Set[String],
boundaryColsExclusive : List[String],
inputRows : Iterator[List[String]],
sinks : List[WritableTable],
badRowWriter : Option[PrintWriter]) : Unit = {
printf("Unexpected? %s\n", (Set( colNames : _* ) -- allColNames).mkString(", "));
def createTableColList() : List[List[String]] = {
printf("Calling split by bounds.\n");
splitByBounds(colNames, allColNames, keyColNames, boundaryColsExclusive);
}
divideIntoJoinable(colNames, keyColNames, inputRows, sinks, badRowWriter, createTableColList);
}
private def divideIntoJoinable(colNames : List[String],
keyColNames : Set[String],
inputRows : Iterator[List[String]],
sinks : List[WritableTable],
badRowWriter : Option[PrintWriter],
createTableColList : () => List[List[String]]) : Unit = {
val numSinks = sinks.length;
val allTablesCols = createTableColList();
val headerSinkTuples = allTablesCols.zip( sinks );
headerSinkTuples.foreach( tup => tup._2.setColNames( tup._1 ) );
inputRows.foreach {
row => {
if ( colNames.size != row.size ) {
if ( row.size >= 3 )
printf("badRow: \"%s\",\"%s\",\"%s\"...\n", row(0), row(1), row(2));
else
println("short bad row... "+ row.mkString("\"","\",\"","\""));
if ( badRowWriter != None ) {
var badRow = row.mkString("\"","\",\"","\"");
badRowWriter.get.println( badRow );
}
} else {
val rowMap = Map.empty ++ colNames.zip( row );
headerSinkTuples.foreach {
tup => {
val headers = tup._1;
val sink = tup._2;
val dataRow = headers.map( rowMap( _ ) );
sink.addDataRow( dataRow );
}
}
}
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy