fuzzycsv.FuzzyCSV.groovy Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of fuzzy-csv Show documentation
Show all versions of fuzzy-csv Show documentation
A groovy/java tabular Data (from CSV,SQL,JSON) processing library that supports fuzzy column matching,tranformations/merging/querying etc
package fuzzycsv
import com.github.kayr.phrasematcher.PhraseMatcher
import com.opencsv.CSVParser
import com.opencsv.CSVReader
import com.opencsv.CSVWriter
import fuzzycsv.rdbms.DDLUtils
import groovy.json.JsonOutput
import groovy.json.JsonSlurper
import groovy.sql.Sql
import groovy.transform.CompileStatic
import org.slf4j.Logger
import org.slf4j.LoggerFactory
import java.sql.Clob
import java.sql.ResultSet
import java.sql.ResultSetMetaData
import java.sql.SQLException
import static fuzzycsv.RecordFx.fx
class FuzzyCSV {
private static Logger log = LoggerFactory.getLogger(FuzzyCSV.class)
public static ThreadLocal ACCURACY_THRESHOLD = new ThreadLocal() {
@Override
protected Double initialValue() {
return 1.0
}
}
public static ThreadLocal THROW_EXCEPTION_ON_ABSENT_COLUMN = new ThreadLocal() {
@Override
protected Boolean initialValue() {
return true
}
}
@CompileStatic
static List parseCsv(String csv,
char separator = CSVParser.DEFAULT_SEPARATOR,
char quoteChar = CSVParser.DEFAULT_QUOTE_CHARACTER,
char escapeChar = CSVParser.DEFAULT_ESCAPE_CHARACTER) {
return parseCsv(new StringReader(csv), separator, quoteChar, escapeChar)
}
@CompileStatic
static List parseCsv(Reader reader,
char separator = CSVParser.DEFAULT_SEPARATOR,
char quoteChar = CSVParser.DEFAULT_QUOTE_CHARACTER,
char escapeChar = CSVParser.DEFAULT_ESCAPE_CHARACTER) {
def rd = new CSVReader(reader, separator, quoteChar, escapeChar)
return rd.readAll()
}
static List getValuesForColumn(List extends List> csvList, int colIdx) {
csvList.collect { it[colIdx] }
}
static List getAt(List csv, IntRange range) {
def csvSize = csv.size()
if (csvSize <= 1) return csv
def header = csv[0]
def maxValue = csvSize - 1
def fromInt = range.fromInt
def toInt = range.toInt
def isReverse = range.reverse
if (fromInt.abs() >= csvSize) {
fromInt = (maxValue) * (fromInt < 0 ? -1 : 1)
}
if (toInt.abs() >= csvSize) {
toInt = (maxValue) * (toInt < 0 ? -1 : 1)
}
range = isReverse ? new IntRange(true, toInt, fromInt) : new IntRange(true, fromInt, toInt)
def tail = csv[range]
def newCsv = [header]; newCsv.addAll(tail)
return newCsv
}
static List putInCellWithHeader(List extends List> csv, String columnHeader, int rowIdx, Object value) {
def position = Fuzzy.findPosition(csv[0], columnHeader)
return putInCell(csv, position, rowIdx, value)
}
static List putInCell(List extends List> csv, int colIdx, int rowIdx, Object value) {
csv[rowIdx][colIdx] = value
return csv
}
@CompileStatic
static List putInColumn(List extends List> csvList, List column, int insertIdx) {
csvList.eachWithIndex { List entry, int lstIdx ->
def entryList = entry
def cellValue = lstIdx >= column.size() ? null : column[lstIdx]
entryList[insertIdx] = cellValue
}
return csvList
}
@CompileStatic
static List copyColumn(List extends List> src, List dest, int srcIdx, int destIdx) {
def size = src.size()
for (int i = 0; i < size; i++) {
dest[i][destIdx] = src[i][srcIdx]
}
return dest
}
@CompileStatic
static List filter(List extends List> csvList, RecordFx fx) {
def header = csvList[0]
def newCsv = [header]
csvList.eachWithIndex { List entry, Integer idx ->
if (idx == 0) return
def rec = Record.getRecord(header, entry, csvList, idx)
def value = fx.getValue(rec)
if (value == true) newCsv.add entry
}
return newCsv
}
@CompileStatic
static List> toCSV(ResultSet resultSet) {
def metaData = resultSet.getMetaData()
def columnCount = metaData.columnCount
def columns = getColumns(metaData)
def resultSetSize = resultSet.fetchSize
def csv = new ArrayList(resultSetSize >= 1 ? resultSetSize : 10)
csv << columns
while (resultSet.next()) {
List record = new ArrayList(columnCount)
for (int i = 0; i < columnCount; i++) {
def object = resultSet.getObject(i + 1)
if (object instanceof Clob) {
def stream = DDLUtils.clobToString(object)
record.add(stream)
} else
record.add(object)
}
csv << record
}
return csv
}
@CompileStatic
static int writeCsv(Sql sql, String query, Writer stream, boolean includeNames = true, boolean trim = false) {
def rt = -1
sql.query(query) { ResultSet rs ->
rt = writeCsv(rs, stream, includeNames, trim)
}
return rt
}
@CompileStatic
static int writeCsv(ResultSet resultSet, Writer stream, boolean includeNames = true, boolean trim = false) {
def writer = new CSVWriter(stream)
return writer.writeAll(resultSet, includeNames, trim)
}
@SuppressWarnings("GroovyVariableNotAssigned")
@CompileStatic
static List> toCSV(Sql sql, String query) {
List> csv
sql.query(query) { ResultSet rs ->
csv = toCSV(rs)
}
return csv
}
@CompileStatic
static List getColumns(ResultSetMetaData metadata) throws SQLException {
int columnCount = metadata.getColumnCount()
List nextLine = new ArrayList(columnCount)
for (int i = 0; i < columnCount; i++) {
nextLine[i] = metadata.getColumnLabel(i + 1)
}
return nextLine
}
@CompileStatic
static List putInColumn(List extends List> csvList, RecordFx column, int insertIdx, List extends List> sourceCSV = null) {
def header = csvList[0]
csvList.eachWithIndex { List entry, int lstIdx ->
def cellValue
if (lstIdx == 0) {
cellValue = column.name
} else {
def record = Record.getRecord(header, entry, csvList, lstIdx).setLeftCsv(sourceCSV)
if (sourceCSV) {
def oldCSVRecord = sourceCSV[lstIdx]
def oldCSVHeader = sourceCSV[0]
record.leftRecord = oldCSVRecord
record.leftHeaders = oldCSVHeader
}
cellValue = column.getValue(record)
}
entry[insertIdx] = cellValue
}
return csvList
}
static void writeToFile(List extends List> csv, String file) {
writeToFile(csv, new File(file))
}
static void writeToFile(List extends List> csv, File sysFile) {
if (sysFile.exists())
sysFile.delete()
if (!sysFile.parentFile?.exists()) {
sysFile.parentFile?.mkdirs()
}
sysFile.withWriter { fileWriter ->
writeCSV(csv, fileWriter)
}
}
static void writeCSV(List extends List> csv, Writer fileWriter) {
CSVWriter writer = new FuzzyCSVWriter(fileWriter)
writer.writeAll(csv)
}
@CompileStatic
static String csvToString(List extends List> csv) {
def stringWriter = new StringWriter()
def writer = new FuzzyCSVWriter(stringWriter)
writer.writeAll(csv)
stringWriter.toString()
}
@CompileStatic
static void writeJson(List extends List> csv, String file) {
writeJson(csv, new File(file))
}
@CompileStatic
static void writeJson(List extends List> csv, File file) {
file.withWriter { BufferedWriter w -> writeJson(csv, w) }
}
@CompileStatic
static void writeJson(List extends List> csv, Writer fileWriter) {
def json = toJsonText(csv)
fileWriter.write(json)
}
static String toJsonText(List extends List> csv) {
return JsonOutput.toJson(csv)
}
static Object fromJsonText(String text) {
new JsonSlurper().parseText(text)
}
private static List toListofList(Object object) {
if (object instanceof List) {
return object;
}
throw new IllegalArgumentException("Json is not a valid csv")
}
@CompileStatic
static def fromJson(Reader reader) {
return new JsonSlurper().parse(reader)
}
@CompileStatic
static def fromJson(File source) {
return new JsonSlurper().parse(source)
}
private final static NULL_ON_FUNCTION = null
static List join(List extends List> csv1, List extends List> csv2, String[] joinColumns) {
return superJoin(csv1, csv2, selectAllHeaders(csv1, csv2, joinColumns), NULL_ON_FUNCTION, false, false, hpRightRecordFinder(joinColumns))
}
static List leftJoin(List extends List> csv1, List extends List> csv2, String[] joinColumns) {
return superJoin(csv1, csv2, selectAllHeaders(csv1, csv2, joinColumns), NULL_ON_FUNCTION, true, false, hpRightRecordFinder(joinColumns))
}
static List rightJoin(List extends List> csv1, List extends List> csv2, String[] joinColumns) {
return superJoin(csv1, csv2, selectAllHeaders(csv1, csv2, joinColumns), NULL_ON_FUNCTION, false, true, hpRightRecordFinder(joinColumns))
}
static List fullJoin(List extends List> csv1, List extends List> csv2, String[] joinColumns) {
return superJoin(csv1, csv2, selectAllHeaders(csv1, csv2, joinColumns), NULL_ON_FUNCTION, true, true, hpRightRecordFinder(joinColumns))
}
static List join(List extends List> csv1, List extends List> csv2, RecordFx onExpression, String[] selectColumns) {
return superJoin(csv1, csv2, selectColumns as List, onExpression, false, false)
}
static List leftJoin(List extends List> csv1, List extends List> csv2, RecordFx onExpression, String[] selectColumns) {
return superJoin(csv1, csv2, selectColumns as List, onExpression, true, false)
}
static List rightJoin(List extends List> csv1, List extends List> csv2, RecordFx onExpression, String[] selectColumns) {
return superJoin(csv1, csv2, selectColumns as List, onExpression, false, true)
}
static List fullJoin(List extends List> csv1, List extends List> csv2, RecordFx onExpression, String[] selectColumns) {
return superJoin(csv1, csv2, selectColumns as List, onExpression, true, true)
}
static List joinOnIdx(List extends List> csv1, List extends List> csv2) {
return superJoin(csv1, csv2, selectAllHeaders(csv1, csv2), NULL_ON_FUNCTION, false, false, fnJoinOnIndexFinder())
}
static List leftJoinOnIdx(List extends List> csv1, List extends List> csv2) {
return superJoin(csv1, csv2, selectAllHeaders(csv1, csv2), NULL_ON_FUNCTION, true, false, fnJoinOnIndexFinder())
}
static List rightJoinOnIdx(List extends List> csv1, List extends List> csv2) {
return superJoin(csv1, csv2, selectAllHeaders(csv1, csv2), NULL_ON_FUNCTION, false, true, fnJoinOnIndexFinder())
}
static List fullJoinOnIdx(List extends List> csv1, List extends List> csv2) {
return superJoin(csv1, csv2, selectAllHeaders(csv1, csv2), NULL_ON_FUNCTION, true, true, fnJoinOnIndexFinder())
}
/*
* Returns a function that matches a left record to a right record with the help of an index cache.
* This is an attempt to improve performance while doing joins
*/
@CompileStatic
private static Closure> hpRightRecordFinder(String[] joinColumns) {
//A map to store all left record indexes.
Map> rightIdx = [:]
Closure> c = { Record leftRecord, RecordFx mOnFunction, List extends List> mRCsv ->
def header = mRCsv[0]
if (rightIdx.isEmpty()) {
//build the right index
def csvSize = mRCsv.size()
for (int i = 0; i < csvSize; i++) {
def rRecord = mRCsv[i]
def rRecObj = Record.getRecord(header, rRecord, mRCsv, i)
def rightString = joinColumns.collect { String colName -> rRecObj.val(colName) }.join('-')
if (!rightIdx[rightString]) {
rightIdx[rightString] = (List) newList()
}
rightIdx[rightString] << rRecObj
}
}
def leftString = joinColumns.collect { String colName -> leftRecord.val(colName) }.join('-')
return rightIdx[leftString]
}
return c
}
@CompileStatic
private static Closure> getDefaultRightRecordFinder() {
Closure> c = { Record r, RecordFx mOnFunction, List extends List> mRCsv ->
def rSize = mRCsv.size()
List finalValues = []
for (int rIdx = 0; rIdx < rSize; rIdx++) {
List rightRecord = mRCsv[rIdx]
if (rIdx == 0) continue
r.rightRecord = rightRecord
if (mOnFunction.getValue(r)) {
def rec = Record.getRecord(mRCsv[0], rightRecord, mRCsv, rIdx)
finalValues << rec
}
}
finalValues
}
return c
}
@CompileStatic
private static Closure> fnJoinOnIndexFinder() {
Closure> c = { Record r, RecordFx mOnFunction, List extends List> mRCsv ->
def idx = r.idx()
def get = FuzzyCSVUtils.safeGet(mRCsv, idx)
List returnValues = []
if (get != null)
returnValues.add(Record.getRecord(mRCsv[0], (List) get, mRCsv, idx))
return returnValues
}
return c
}
@CompileStatic
private static List superJoin(List extends List> leftCsv,
List extends List> rightCsv,
List selectColumns,
RecordFx onFunction,
boolean doLeftJoin,
boolean doRightJoin,
Closure> findRightRecord = null) {
//container to keep track the matchedCSV2 records
def matchedRightRecordIndices = new HashSet()
def finalCSV = [selectColumns]
Record recObj = new Record(leftHeaders: leftCsv[0], rightHeaders: rightCsv[0], recordIdx: -1, leftCsv: leftCsv, rightCsv: rightCsv, finalCsv: finalCSV)
if (!findRightRecord) {
findRightRecord = getDefaultRightRecordFinder()
}
def lSize = leftCsv.size()
for (int lIdx = 0; lIdx < lSize; lIdx++) {
List leftRecord = leftCsv[lIdx]
if (lIdx == 0) continue
recObj.rightRecord = Collections.EMPTY_LIST
recObj.leftRecord = leftRecord
recObj.setRecordIdx(lIdx)
def rightRecords = findRightRecord.call(recObj, onFunction, rightCsv) as List
if (!rightRecords) {
if (doLeftJoin) {
recObj.rightRecord = Collections.EMPTY_LIST
List