// Listing header: com.datastax.data.exploration.biz.stat.Cross (Maven / Gradle / Ivy).
// The newest version of the class follows.
package com.datastax.data.exploration.biz.stat;
import com.datastax.data.exploration.biz.chiSquare.ChiSquare;
import com.datastax.data.exploration.biz.datatable.DataTable;
import com.datastax.data.exploration.common.DataOperator;
import com.datastax.data.exploration.dto.dataChart.stackedBar.Crosstab;
import com.datastax.data.exploration.util.Consts;
import org.javatuples.Pair;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import static com.datastax.data.exploration.biz.stat.Cross.Func.*;
/**
 * Cross-tabulation (cross-grouping table) calculations.
 */
public class Cross {
/**
 * Builds the six-cell summary of a data set.
 *
 * <p>The cells are, in order: valid-record count, valid-record share, error-record count,
 * error-record share, overall record count, and the fixed overall share {@code "100.0%"}.
 *
 * @param id data set id
 * @return the summary array (always six elements)
 */
public static String[] getSummary(String id) {
    long exactCount = DataOperator.readExactData(id).size();
    long errorCount = DataOperator.readErrorData(id).size();
    long total = exactCount + errorCount;

    // With no records at all the ratio would be 0/0 = NaN, which has no BigDecimal
    // representation for setCale to round; report both shares as 0 instead of failing.
    double exactPercent = total == 0 ? 0.0 : (double) exactCount / total * 100;
    double errorPercent = total == 0 ? 0.0 : (double) errorCount / total * 100;

    return new String[] {
        String.valueOf(exactCount),
        setCale(exactPercent) + "%",
        String.valueOf(errorCount),
        setCale(errorPercent) + "%",
        String.valueOf(total),
        "100.0%"
    };
}
/**
* 获取交叉分组表内容信息
*
* @param map 计算式
* @param groupSubs 分组子分项
* @param pair 包含groupCounts和stackCountTotals
* @param pairTotal 包含countSum和groupCountTotals
* @return 交叉分组表内容信息
*/
public static Crosstab.ModuleData getContentModule(Map map, String group, List groupSubs, Pair>, List> pair, Pair> pairTotal) {
List> groupCounts = pair.getValue0();
List stackCountTotals = pair.getValue1();
long countSum = pairTotal.getValue0();
List groupCountTotals = pairTotal.getValue1();
List groupDatas = new ArrayList<>();
for (int grp = 0; grp < groupSubs.size(); grp++) {
List dataList = new ArrayList<>();
long stackCountTotal = stackCountTotals.get(grp);
for (Func func : map.keySet()) {
List rowValue = rowCal(func, groupCountTotals, groupCounts.get(grp), stackCountTotal, countSum);
dataList.add(new Crosstab.Data(map.get(func), rowValue));
}
groupDatas.add(new Crosstab.GroupData(groupSubs.get(grp), dataList));
}
return new Crosstab.ModuleData(group, groupDatas);
}
/**
 * Builds the totals group (labelled "合计") of the cross table.
 *
 * <p>Every row holds one cell per entry of {@code groupCountTotals} followed by one cell for
 * the overall sum. The percent-of-total row reuses the percent-of-row values.
 *
 * <p>The input list is not modified: the overall sum is appended to a private copy. The
 * caller's list may be immutable (for example an empty-list constant), and it is also read by
 * the per-group calculations, which derive the column count from its size.
 *
 * @param map              statistics mapped to their display labels
 * @param groupCountTotals per-column count totals
 * @param countSum         overall count sum
 * @return the totals group of the cross table
 */
public static Crosstab.GroupData getTotalModule(Map<Func, String> map, List<Long> groupCountTotals, long countSum) {
    List<Object> countTotal = new ArrayList<>(groupCountTotals);
    countTotal.add(countSum);

    int cells = groupCountTotals.size() + 1;
    List<Object> expectTotal = new ArrayList<>(cells);
    List<Object> perRowTotal = new ArrayList<>(cells);
    List<Object> perColTotal = new ArrayList<>(cells);
    for (long columnTotal : groupCountTotals) {
        expectTotal.add(setCale(columnTotal));
        perRowTotal.add(setCale((double) columnTotal / countSum * 100) + "%");
        perColTotal.add("100.0%");
    }
    // Trailing cell: the overall sum column.
    expectTotal.add(setCale(countSum));
    perRowTotal.add("100.0%");
    perColTotal.add("100.0%");

    List<Crosstab.Data> totalDatas = new ArrayList<>(5);
    totalDatas.add(new Crosstab.Data(map.get(COUNT), countTotal));
    totalDatas.add(new Crosstab.Data(map.get(EXPECT), expectTotal));
    totalDatas.add(new Crosstab.Data(map.get(PERCENT_ROW), perRowTotal));
    totalDatas.add(new Crosstab.Data(map.get(PERCENT_COLUMN), perColTotal));
    // In the totals group the share of the grand total equals the row share.
    totalDatas.add(new Crosstab.Data(map.get(PERCENT_TOTAL), perRowTotal));
    return new Crosstab.GroupData("合计", totalDatas);
}
/**
* 获取卡方验证的信息
*
* @param groupCounts 每分组的count值
* @param n 有效案例个数
* @return 返回的是卡方验证的信息
*/
public static List getChiSquare(List> groupCounts, long n) {
List chiSquares = new ArrayList<>();
int groupSize = groupCounts.size();
if (groupSize > 1 && groupCounts.get(0).size() >1) {
int stackSize = groupCounts.get(0).size();
int df = (groupSize - 1) * (stackSize - 1);
long[][] counts = new long[groupSize][stackSize];
for (int grp = 0; grp < groupSize; grp++) {
for (int co = 0; co < stackSize; co++) {
counts[grp][co] = groupCounts.get(grp).get(co);
}
}
BigDecimal chiVal = setCale(ChiSquare.chiValue(counts));
BigDecimal pVal = setCale(ChiSquare.chiPvalue(counts));
chiSquares.add(new Crosstab.ChiSquare("Perarson 卡方", String.valueOf(chiVal), String.valueOf(df), String.valueOf(pVal)));
chiSquares.add(new Crosstab.ChiSquare("有效案例中的N", String.valueOf(n), "", ""));
} else {
chiSquares.add(new Crosstab.ChiSquare("Perarson 卡方","无","无","无"));
chiSquares.add(new Crosstab.ChiSquare("有效案例中的N", String.valueOf(n), "", ""));
}
return chiSquares;
}
/**
* 获取每分组的count数据,以及分组的count总和,是后面各计算的基础
*
* @return 返回的是分组的count数据和分组的count总和 两元组
*/
public static Pair>, List> getCount(DataTable table, String group, String stack) {
Map> aggSubCounts = aggSubCounts(table, group, stack);
List stackCountTotals = new ArrayList();
List> groupCounts = new ArrayList();
aggSubCounts.forEach((grp, stks) -> {
List counts = new ArrayList();
AtomicLong countTotal = new AtomicLong();
stks.forEach((k, v) -> {
countTotal.addAndGet(v);
counts.add(v);
});
stackCountTotals.add(countTotal.get());
groupCounts.add(counts);
});
return Pair.with(groupCounts, stackCountTotals);
}
/**
* 获取 count总和 和 合计中的count值
*
* @param groupCounts 每分组的count数据
* @return
*/
public static Pair> getCountTotals(List> groupCounts) {
if (groupCounts.size() > 0) {
long countSum = 0;
List groupCountTotals = new ArrayList();
for (int stk = 0; stk < groupCounts.get(0).size(); stk++) {
long groupConutTotal = 0;
for (int grp = 0; grp < groupCounts.size(); grp++) {
groupConutTotal += groupCounts.get(grp).get(stk);
}
countSum += groupConutTotal;
groupCountTotals.add(groupConutTotal);
}
return Pair.with(countSum, groupCountTotals);
}
return new Pair<>(0L, Collections.EMPTY_LIST);
}
/**
 * Computes one display row of a group for the given statistic.
 *
 * <p>Every row has one cell per entry of {@code groupCountTotals} followed by a trailing
 * summary cell for the group as a whole:
 * <ul>
 *   <li>{@code COUNT}: the raw counts, then the group's count total</li>
 *   <li>{@code EXPECT}: column total / overall sum * group total, then the group total</li>
 *   <li>{@code PERCENT_ROW}: count / group total, then {@code "100.0%"}</li>
 *   <li>{@code PERCENT_COLUMN}: count / column total, then group total / overall sum</li>
 *   <li>{@code PERCENT_TOTAL}: count / overall sum, then group total / overall sum</li>
 *   <li>{@code RESIDUAL_UN}: count minus expected count, then {@code Consts.EMPTY}</li>
 * </ul>
 *
 * @param func             statistic to compute
 * @param groupCountTotals per-column count totals; its size defines the number of columns
 * @param groupCounts      counts of the current group, one per column
 * @param stackCountTotal  count total of the current group
 * @param countSum         overall count sum
 * @return the cells of the row, or an empty list for any other statistic
 */
private static List<Object> rowCal(Func func, List<Long> groupCountTotals, List<Long> groupCounts, long stackCountTotal, long countSum) {
    int stackSubSize = groupCountTotals.size();
    List<Object> row = new ArrayList<>(stackSubSize + 1);
    switch (func) {
        case COUNT:
            row.addAll(groupCounts);
            row.add(stackCountTotal);
            return row;
        case EXPECT:
            for (int stk = 0; stk < stackSubSize; stk++) {
                long columnTotal = groupCountTotals.get(stk);
                row.add(setCale((double) columnTotal / countSum * stackCountTotal));
            }
            row.add(setCale(stackCountTotal));
            return row;
        case PERCENT_ROW:
            for (int stk = 0; stk < stackSubSize; stk++) {
                long count = groupCounts.get(stk);
                row.add(setCale((double) count / stackCountTotal * 100) + "%");
            }
            row.add("100.0%");
            return row;
        case PERCENT_COLUMN:
            for (int stk = 0; stk < stackSubSize; stk++) {
                long count = groupCounts.get(stk);
                long columnTotal = groupCountTotals.get(stk);
                row.add(setCale((double) count / columnTotal * 100) + "%");
            }
            row.add(setCale((double) stackCountTotal / countSum * 100) + "%");
            return row;
        case PERCENT_TOTAL:
            for (int stk = 0; stk < stackSubSize; stk++) {
                long count = groupCounts.get(stk);
                row.add(setCale((double) count / countSum * 100) + "%");
            }
            row.add(setCale((double) stackCountTotal / countSum * 100) + "%");
            return row;
        case RESIDUAL_UN:
            for (int stk = 0; stk < stackSubSize; stk++) {
                long columnTotal = groupCountTotals.get(stk);
                double expectValue = (double) columnTotal / countSum * stackCountTotal;
                double value = groupCounts.get(stk) - expectValue;
                row.add(setCale(value));
            }
            // The residual has no meaningful group-level summary cell.
            row.add(Consts.EMPTY);
            return row;
        default:
            return Collections.emptyList();
    }
}
/**
 * Lists the statistics of the cross table together with their display labels.
 *
 * <p>The map keeps insertion order, so iterating it yields the statistics in the order they
 * are added here. The row- and column-percent labels embed the given names.
 *
 * @param group group name used in the row-percent label
 * @param stack stack name used in the column-percent label
 * @return insertion-ordered map from statistic to display label
 */
public static Map<Func, String> getFuncs(String group, String stack) {
    Map<Func, String> map = new LinkedHashMap<>();
    map.put(COUNT, "计数");
    map.put(EXPECT, "期望的计数");
    map.put(PERCENT_ROW, group + " 中的%");
    map.put(PERCENT_COLUMN, stack + " 中的%");
    map.put(PERCENT_TOTAL, "总数的%");
    map.put(RESIDUAL_UN, "残差");
    return map;
}
private static Map> aggSubCounts(DataTable table, String groupName, String valueName) {
Map> map = new HashMap<>();
table.getRows().aggList(groupName).forEach((k, v) -> {
List