/*
 * All downloads are FREE. Search and download functionality uses the official Maven repository.
 *
 * com.datastax.data.exploration.biz.stat.Cross Maven / Gradle / Ivy
 *
 * The newest version!
 */
package com.datastax.data.exploration.biz.stat;

import com.datastax.data.exploration.biz.chiSquare.ChiSquare;
import com.datastax.data.exploration.biz.datatable.DataTable;
import com.datastax.data.exploration.common.DataOperator;
import com.datastax.data.exploration.dto.dataChart.stackedBar.Crosstab;
import com.datastax.data.exploration.util.Consts;
import org.javatuples.Pair;

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;

import static com.datastax.data.exploration.biz.stat.Cross.Func.*;

/**
 * 交叉分组表
 */
public class Cross {

    /**
     * 获取summary摘要
     *
     * @param id 数据集 id
     * @return 返回的是summary数组
     */
    public static String[] getSummary(String id) {
        String[] summary = new String[6];
        long exactCount = DataOperator.readExactData(id).size();
        long errorCount = DataOperator.readErrorData(id).size();
        summary[0] = String.valueOf(exactCount);
        summary[1] = setCale((double) exactCount / (exactCount + errorCount) * 100) + "%";
        summary[2] = String.valueOf(errorCount);
        summary[3] = setCale((double) errorCount / (exactCount + errorCount) * 100) + "%";
        summary[4] = String.valueOf(exactCount + errorCount);
        summary[5] = "100.0%";
        return summary;
    }

    /**
     * 获取交叉分组表内容信息
     *
     * @param map       计算式
     * @param groupSubs 分组子分项
     * @param pair      包含groupCounts和stackCountTotals
     * @param pairTotal 包含countSum和groupCountTotals
     * @return 交叉分组表内容信息
     */
    public static Crosstab.ModuleData getContentModule(Map map, String group, List groupSubs, Pair>, List> pair, Pair> pairTotal) {
        List> groupCounts = pair.getValue0();
        List stackCountTotals = pair.getValue1();
        long countSum = pairTotal.getValue0();
        List groupCountTotals = pairTotal.getValue1();
        List groupDatas = new ArrayList<>();
        for (int grp = 0; grp < groupSubs.size(); grp++) {
            List dataList = new ArrayList<>();
            long stackCountTotal = stackCountTotals.get(grp);
            for (Func func : map.keySet()) {
                List rowValue = rowCal(func, groupCountTotals, groupCounts.get(grp), stackCountTotal, countSum);
                dataList.add(new Crosstab.Data(map.get(func), rowValue));
            }
            groupDatas.add(new Crosstab.GroupData(groupSubs.get(grp), dataList));
        }
        return new Crosstab.ModuleData(group, groupDatas);
    }

    /**
     * 获取交叉分组表合计信息
     *
     * @param map              计算式
     * @param groupCountTotals 合计中的count值
     * @param countSum         合计总数
     * @return 返回的是交叉分组表合计信息
     */
    public static Crosstab.GroupData getTotalModule(Map map, List groupCountTotals, long countSum) {
        int stackSubSize = groupCountTotals.size();
        groupCountTotals.add(countSum);
        List totalDatas = new ArrayList();
        List expectTotal = new ArrayList();
        List perRowTotal = new ArrayList();
        List perColTotal = new ArrayList();
        for (int stk = 0; stk < stackSubSize + 1; stk++) {
            if (stk < stackSubSize) {
                expectTotal.add(setCale(groupCountTotals.get(stk)));
                perRowTotal.add(setCale((double) groupCountTotals.get(stk) / countSum * 100) + "%");
            } else {
                expectTotal.add(setCale(countSum));
                perRowTotal.add("100.0%");
            }
            perColTotal.add("100.0%");
        }
        List perSumTotal = perRowTotal;
        totalDatas.add(new Crosstab.Data(map.get(COUNT), groupCountTotals));
        totalDatas.add(new Crosstab.Data(map.get(EXPECT), expectTotal));
        totalDatas.add(new Crosstab.Data(map.get(PERCENT_ROW), perRowTotal));
        totalDatas.add(new Crosstab.Data(map.get(PERCENT_COLUMN), perColTotal));
        totalDatas.add(new Crosstab.Data(map.get(PERCENT_TOTAL), perSumTotal));
        return new Crosstab.GroupData("合计", totalDatas);
    }

    /**
     * 获取卡方验证的信息
     *
     * @param groupCounts 每分组的count值
     * @param n           有效案例个数
     * @return 返回的是卡方验证的信息
     */
    public static List getChiSquare(List> groupCounts, long n) {
        List chiSquares = new ArrayList<>();
        int groupSize = groupCounts.size();
        if (groupSize > 1 && groupCounts.get(0).size() >1) {
            int stackSize = groupCounts.get(0).size();
            int df = (groupSize - 1) * (stackSize - 1);
            long[][] counts = new long[groupSize][stackSize];
            for (int grp = 0; grp < groupSize; grp++) {
                for (int co = 0; co < stackSize; co++) {
                    counts[grp][co] = groupCounts.get(grp).get(co);
                }
            }
            BigDecimal chiVal = setCale(ChiSquare.chiValue(counts));
            BigDecimal pVal = setCale(ChiSquare.chiPvalue(counts));
            chiSquares.add(new Crosstab.ChiSquare("Perarson 卡方", String.valueOf(chiVal), String.valueOf(df), String.valueOf(pVal)));
            chiSquares.add(new Crosstab.ChiSquare("有效案例中的N", String.valueOf(n), "", ""));
        } else {
            chiSquares.add(new Crosstab.ChiSquare("Perarson 卡方","无","无","无"));
            chiSquares.add(new Crosstab.ChiSquare("有效案例中的N", String.valueOf(n), "", ""));
        }
        return chiSquares;
    }

    /**
     * 获取每分组的count数据,以及分组的count总和,是后面各计算的基础
     *
     * @return 返回的是分组的count数据和分组的count总和 两元组
     */
    public static Pair>, List> getCount(DataTable table, String group, String stack) {
        Map> aggSubCounts = aggSubCounts(table, group, stack);
        List stackCountTotals = new ArrayList();
        List> groupCounts = new ArrayList();
        aggSubCounts.forEach((grp, stks) -> {
            List counts = new ArrayList();
            AtomicLong countTotal = new AtomicLong();
            stks.forEach((k, v) -> {
                countTotal.addAndGet(v);
                counts.add(v);
            });
            stackCountTotals.add(countTotal.get());
            groupCounts.add(counts);
        });
        return Pair.with(groupCounts, stackCountTotals);
    }

    /**
     * 获取 count总和 和 合计中的count值
     *
     * @param groupCounts 每分组的count数据
     * @return
     */
    public static Pair> getCountTotals(List> groupCounts) {
        if (groupCounts.size() > 0) {
            long countSum = 0;
            List groupCountTotals = new ArrayList();
            for (int stk = 0; stk < groupCounts.get(0).size(); stk++) {
                long groupConutTotal = 0;
                for (int grp = 0; grp < groupCounts.size(); grp++) {
                    groupConutTotal += groupCounts.get(grp).get(stk);
                }
                countSum += groupConutTotal;
                groupCountTotals.add(groupConutTotal);
            }
            return Pair.with(countSum, groupCountTotals);
        }
        return new Pair<>(0L, Collections.EMPTY_LIST);
    }

    /**
     * 对展示内容对应分组的计算
     *
     * @param func 计算式
     * @return 返回的是每行的计算结果
     */
    private static List rowCal(Func func, List groupCountTotals, List groupCounts, long stackCountTotal, long countSum) {
        int stackSubSize = groupCountTotals.size();
        switch (func) {
            case COUNT: {
                List count = new ArrayList();
                count.addAll(groupCounts);
                count.add(stackCountTotal);
                return count;
            }
            case EXPECT: {
                List expectValue = new ArrayList();
                for (int stk = 0; stk < stackSubSize; stk++) {
                    double value = (double) groupCountTotals.get(stk) / countSum * stackCountTotal;
                    expectValue.add(setCale(value));
                }
                expectValue.add(setCale(stackCountTotal));
                return expectValue;
            }
            case PERCENT_ROW: {
                List perValue = new ArrayList();
                for (int stk = 0; stk < stackSubSize; stk++) {
                    double value = (double) groupCounts.get(stk) / stackCountTotal * 100;
                    perValue.add(setCale(value) + "%");
                }
                perValue.add("100.0%");
                return perValue;
            }
            case PERCENT_COLUMN: {
                List perColValue = new ArrayList();
                for (int stk = 0; stk < stackSubSize; stk++) {
                    double value = (double) groupCounts.get(stk) / groupCountTotals.get(stk) * 100;
                    perColValue.add(setCale(value) + "%");
                }
                perColValue.add(setCale((double) stackCountTotal / countSum * 100) + "%");
                return perColValue;
            }
            case PERCENT_TOTAL: {
                List perTotalValue = new ArrayList();
                for (int stk = 0; stk < stackSubSize; stk++) {
                    double value = (double) groupCounts.get(stk) / countSum * 100;
                    perTotalValue.add(setCale(value) + "%");
                }
                perTotalValue.add(setCale((double) stackCountTotal / countSum * 100) + "%");
                return perTotalValue;
            }
            case RESIDUAL_UN: {
                List residualValue = new ArrayList();
                for (int stk = 0; stk < stackSubSize; stk++) {
                    double expectValue = (double) groupCountTotals.get(stk) / countSum * stackCountTotal;
                    double value = groupCounts.get(stk) - expectValue;
                    residualValue.add(setCale(value));
                }
                residualValue.add(Consts.EMPTY);
                return residualValue;
            }
            default:
                ;
        }
        return Collections.EMPTY_LIST;
    }

    public static Map getFuncs(String group, String stack) {
        Map map = new LinkedHashMap<>();
        map.put(COUNT, "计数");
        map.put(EXPECT, "期望的计数");
        map.put(PERCENT_ROW, group + " 中的%");
        map.put(PERCENT_COLUMN, stack + " 中的%");
        map.put(PERCENT_TOTAL, "总数的%");
        map.put(RESIDUAL_UN, "残差");
        return map;
    }

    private static Map> aggSubCounts(DataTable table, String groupName, String valueName) {
        Map> map = new HashMap<>();
        table.getRows().aggList(groupName).forEach((k, v) -> {
            List list = v.parallelStream().map(row -> row.getValue(valueName)).collect(Collectors.toList());
            map.put((String) k, list.parallelStream().map(e -> String.valueOf(e)).collect(Collectors.groupingBy(s -> s, Collectors.counting())));
        });
        return map;
    }

    public enum Func {
        COUNT, EXPECT, PERCENT_ROW, PERCENT_COLUMN, PERCENT_TOTAL, RESIDUAL_UN;
    }

    private static BigDecimal setCale(double value) {
        return new BigDecimal(value).setScale(1, RoundingMode.UP);
    }

}