// co.cask.wrangler.statistics.BasicStatistics (Maven / Gradle / Ivy)
/*
* Copyright © 2017 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.wrangler.statistics;
import co.cask.wrangler.api.Record;
import co.cask.wrangler.api.statistics.ColumnMetric;
import co.cask.wrangler.api.statistics.Statistics;
import io.dataapps.chlorine.finder.FinderEngine;
import java.util.List;
import java.util.Map;
/**
 * Basic implementation of {@link Statistics} that aggregates per-column counters over a
 * list of {@link Record}s.
 *
 * <p>Two groups of counters are collected, each in its own {@link ColumnMetric}:
 * <ul>
 *   <li><b>stats</b> - {@code "null"}, {@code "non-null"} and {@code "empty"} occurrences;</li>
 *   <li><b>types</b> - one counter per key reported by
 *       {@link FinderEngine#findWithType(String)} for non-empty string values.</li>
 * </ul>
 *
 * Created by nitin on 2/4/17.
 */
public class BasicStatistics implements Statistics {
  /** Engine used to detect the types present in non-empty string values. */
  private final FinderEngine engine;

  /**
   * Creates the statistics aggregator backed by a {@link FinderEngine} configured from
   * {@code wrangler-finder.xml}.
   *
   * @throws Exception propagated from the {@link FinderEngine} constructor
   */
  public BasicStatistics() throws Exception {
    // NOTE(review): the meaning of the two boolean flags is defined by FinderEngine and is
    // not visible from this file - confirm against the chlorine-finder documentation.
    engine = new FinderEngine("wrangler-finder.xml", true, false);
  }

  /**
   * Aggregates null/non-null/empty counters and detected-type counters for every column of
   * the given records.
   *
   * @param records records to aggregate; every column of every record is inspected
   * @return a record with three fields: {@code "types"} and {@code "stats"} (each a nested
   *         record holding, per column, the result of
   *         {@code ColumnMetric.percentage(column, total)}) and {@code "total"} (the number
   *         of records processed, as a {@code Double})
   */
  @Override
  public Record aggregate(List<Record> records) {
    ColumnMetric types = new ColumnMetric();
    ColumnMetric stats = new ColumnMetric();

    // Kept as a primitive so the per-record increment does not box/unbox; it is boxed to a
    // Double only when handed to ColumnMetric/Record below.
    double count = 0;

    for (Record record : records) {
      ++count;
      for (int i = 0; i < record.length(); ++i) {
        String column = record.getColumn(i);
        Object object = record.getValue(i);

        stats.increment(column, object == null ? "null" : "non-null");

        // Only string values are checked for emptiness and scanned for types;
        // a null value fails the instanceof test and is skipped here.
        if (!(object instanceof String)) {
          continue;
        }

        String value = (String) object;
        if (value.isEmpty()) {
          stats.increment(column, "empty");
          continue;
        }

        // Each key of the result is counted once per value, regardless of how many
        // matches are listed under it.
        Map<String, List<String>> finds = engine.findWithType(value);
        for (String find : finds.keySet()) {
          types.increment(column, find);
        }
      }
    }

    Record record = new Record();
    record.add("types", toPercentages(types, count));
    record.add("stats", toPercentages(stats, count));
    record.add("total", count);
    return record;
  }

  /**
   * Builds a record holding, for every column tracked by {@code metric}, the value of
   * {@code metric.percentage(column, total)}.
   */
  private static Record toPercentages(ColumnMetric metric, double total) {
    Record result = new Record();
    for (String column : metric.getColumns()) {
      result.add(column, metric.percentage(column, total));
    }
    return result;
  }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy