
prerna.poi.main.helper.excel.ExcelBlock Maven / Gradle / Ivy
The newest version!
package prerna.poi.main.helper.excel;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import prerna.algorithm.api.SemossDataType;
import prerna.poi.main.helper.FileHelperUtil;
public class ExcelBlock {
// keep track of statistics on the start column
private SummaryStatistics startColumnIndexStats = new SummaryStatistics();
// keep track of the statistics on the end column
private SummaryStatistics totalColumnsInRowStats = new SummaryStatistics();
// contain a list of the indicies that this block contains
private List rowIndicesInBlock = new Vector();
private int lastColMaxIndex = -1;
private Map columnToRowIndexStats = new Hashtable ();
// for data types
private Map> columnToTypeStats = new Hashtable >();
private Map> additionalFormatTracker = new HashMap>();
/**
* Get the ranges of the block
* This will split up the entire segment if there are empty columns
* @return
*/
public List getRanges() {
List ranges = new Vector();
boolean started = false;
int startCol = new Double(startColumnIndexStats.getMin()).intValue();
int max = new Double(columnToRowIndexStats.get(startCol).getMax()).intValue();
for(int colIndex = 0; colIndex <= lastColMaxIndex; colIndex++) {
// get max row by comparing every max in each column
SummaryStatistics rowIndexStats = columnToRowIndexStats.get(new Integer(colIndex - 1));
if (rowIndexStats != null) {
int tempMax = new Double(rowIndexStats.getMax()).intValue();
if (tempMax > max) {
max = tempMax;
}
}
if(!columnToRowIndexStats.containsKey( new Integer(colIndex) ) && started) {
if(columnToRowIndexStats.containsKey(startCol)) {
int min = new Double(columnToRowIndexStats.get(new Integer(startCol)).getMin()).intValue();
ExcelRange r = new ExcelRange(startCol + 1, colIndex, min, max);
ranges.add(r);
}
startCol = 0;
started = false;
}
if(columnToRowIndexStats.containsKey(new Integer(colIndex)) && !started ) {
started = true;
startCol = colIndex;
}
}
return ranges;
}
public Object[][] getRangeTypes(ExcelRange range) {
int[] rangeIndex = range.getIndices();
int numCols = rangeIndex[2] - rangeIndex[0] + 1;
Object[][] predictedTypes = new Object[numCols][3];
// loop through based on the range to figure out the types
// note, this is just < and not <= in the loop
int colIndex = 0;
COLUMN_LOOP : for(int cellIndex = rangeIndex[0]-1; cellIndex < rangeIndex[2]; cellIndex++, colIndex++) {
// get column index
int startRow = rangeIndex[1];
int endRow = rangeIndex[3];
Map typesMap = columnToTypeStats.get(cellIndex);
// we gotta see based on the range
// what the type will be
// basically
// if the summary stats min/max contains the startrow/endrow
// we know it is that type
// but the order of the checks is important
// if it contains 1 string, then string
// it it contains multiple types, then string
// if it is combination of int and double, then double
// if it is combination of timestamp and date, then timestamp
// if other combination, then string
boolean containsStr = false;
boolean containsInt = false;
boolean containsDouble = false;
boolean containsDate = false;
boolean containsTimestamp = false;
if(typesMap.containsKey(SemossDataType.STRING)) {
containsStr = testTypeContainedWtihinRange(startRow, endRow, typesMap.get(SemossDataType.STRING));
// if we have string, we are done
if(containsStr) {
Object[] columnPrediction = new Object[2];
columnPrediction[0] = SemossDataType.STRING;
predictedTypes[colIndex] = columnPrediction;
continue COLUMN_LOOP;
}
}
if(typesMap.containsKey(SemossDataType.INT)) {
containsInt = testTypeContainedWtihinRange(startRow, endRow, typesMap.get(SemossDataType.INT));
}
if(typesMap.containsKey(SemossDataType.DOUBLE)) {
containsDouble = testTypeContainedWtihinRange(startRow, endRow, typesMap.get(SemossDataType.DOUBLE));
}
if(typesMap.containsKey(SemossDataType.DATE)) {
containsDate = testTypeContainedWtihinRange(startRow, endRow, typesMap.get(SemossDataType.DATE));
}
if(typesMap.containsKey(SemossDataType.TIMESTAMP)) {
containsTimestamp = testTypeContainedWtihinRange(startRow, endRow, typesMap.get(SemossDataType.TIMESTAMP));
}
// if we have some kind of number
// if only int, then int
// otherwise, double
if(!containsDate && !containsTimestamp && (containsInt || containsDouble)) {
Object[] columnPrediction = new Object[2];
if(containsInt && !containsDouble) {
columnPrediction[0] = SemossDataType.INT;
} else {
columnPrediction[0] = SemossDataType.DOUBLE;
}
predictedTypes[colIndex] = columnPrediction;
continue COLUMN_LOOP;
}
// if we have some kind of date or timestamp
// if only date, then date
// otherwise, timestamp
if(!containsInt && !containsDouble && (containsDate || containsTimestamp)) {
Map formatting = additionalFormatTracker.get(cellIndex);
Map mostPopularFormat = new HashMap();
for(String s : formatting.keySet()) {
mostPopularFormat.put(s, (int) formatting.get(s).getN());
}
// reconcile formats
FileHelperUtil.reconcileDateFormats(mostPopularFormat);
String mostOccuringFormat = Collections.max(mostPopularFormat.entrySet(), Comparator.comparingInt(Map.Entry::getValue)).getKey();
Object[] columnPrediction = new Object[2];
if(containsDate && !containsTimestamp) {
columnPrediction[0] = SemossDataType.DATE;
} else {
columnPrediction[0] = SemossDataType.TIMESTAMP;
}
columnPrediction[1] = mostOccuringFormat;
predictedTypes[colIndex] = columnPrediction;
continue COLUMN_LOOP;
}
// we have mixed types
// return string
Object[] columnPrediction = new Object[2];
columnPrediction[0] = SemossDataType.STRING;
predictedTypes[colIndex] = columnPrediction;
continue COLUMN_LOOP;
}
return predictedTypes;
}
private boolean testTypeContainedWtihinRange(int startRow, int endRow, SummaryStatistics stats) {
// we need to ignore the type for the startRow
double minRow = stats.getMin();
double maxRow = stats.getMax();
if(startRow > minRow) {
return true;
}
if(endRow <= maxRow) {
return true;
}
return false;
}
/**
* Determine if 2 blocks are the same
* @param block
* @return
*/
public boolean sameAs(ExcelBlock newBlock) {
boolean retValue = false;
double startDiff = Math.abs(this.startColumnIndexStats.getMean() - newBlock.startColumnIndexStats.getMean());
double endDiff = Math.abs(this.totalColumnsInRowStats.getMean() - newBlock.totalColumnsInRowStats.getMean());
// need to check to see if they are within the standard deviation
// if the start column is similar
// and if the number of columns is similar then there is a possibilty this is the same block
if(startDiff <= this.startColumnIndexStats.getStandardDeviation()) {
if(endDiff <= this.totalColumnsInRowStats.getStandardDeviation()) {
retValue = true;
}
}
return retValue;
}
public void merge(ExcelBlock equivBlock) {
this.rowIndicesInBlock.addAll(equivBlock.rowIndicesInBlock);
// try to set the last column
this.trySetLastColMaxIndex(equivBlock.lastColMaxIndex);
}
/**
* Add the start column index of this row
* @param colIndex
*/
public void addStartColumnIndex(int colIndex) {
startColumnIndexStats.addValue(colIndex);
}
/**
* Add the total number of columns to this row
* @param numRows
*/
public void addTotalColumnsInRowStats(int numRows) {
totalColumnsInRowStats.addValue(numRows);
}
/**
* Add the row index if it contains data
* @param rowIndex
*/
public void addRowIndexContainingData(int rowIndex) {
rowIndicesInBlock.add(rowIndex);
}
/**
* Determine if this block is empty
* @return
*/
public boolean isEmpty() {
return rowIndicesInBlock.isEmpty();
}
/**
* Get the number of indices in the block
* @return
*/
public int numIndicesInBlock() {
return rowIndicesInBlock.size();
}
public void addColumnToRowIndexWithData(int columnIndex, int rowIndex, SemossDataType type, String additionalType) {
Integer objColumnIndex = new Integer(columnIndex);
// update index stats
SummaryStatistics stats = null;
if(columnToRowIndexStats.containsKey(objColumnIndex)) {
stats = columnToRowIndexStats.get(objColumnIndex);
} else {
stats = new SummaryStatistics();
columnToRowIndexStats.put(objColumnIndex, stats);
}
stats.addValue(rowIndex);
// update data type stats
Map typeMap = null;
if(columnToTypeStats.containsKey(objColumnIndex)) {
typeMap = columnToTypeStats.get(objColumnIndex);
} else {
typeMap = new HashMap();
columnToTypeStats.put(objColumnIndex, typeMap);
}
SummaryStatistics typeStats = null;
if(typeMap.containsKey(type)) {
typeStats = typeMap.get(type);
} else {
typeStats = new SummaryStatistics();
typeMap.put(type, typeStats);
}
typeStats.addValue(rowIndex);
// also acount for additional types
if(additionalType != null) {
Map tracker = null;
if(additionalFormatTracker.containsKey(objColumnIndex)) {
tracker = additionalFormatTracker.get(objColumnIndex);
} else {
tracker = new HashMap();
additionalFormatTracker.put(objColumnIndex, tracker);
}
SummaryStatistics additionalTypeStats = null;
if(tracker.containsKey(additionalType)) {
additionalTypeStats = tracker.get(additionalType);
} else {
additionalTypeStats = new SummaryStatistics();
tracker.put(additionalType, additionalTypeStats);
}
stats.addValue(rowIndex);
}
}
/**
* Try to set a new last col max index
* @param startColMinIndex
*/
public void trySetLastColMaxIndex(int lastColMaxIndex) {
if(this.lastColMaxIndex == -1) {
this.lastColMaxIndex = lastColMaxIndex;
} else if(this.lastColMaxIndex < lastColMaxIndex) {
this.lastColMaxIndex = lastColMaxIndex;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy