/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.dq;
import co.cask.cdap.api.Config;
import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.table.Put;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceConfigurer;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.api.plugin.PluginProperties;
import co.cask.cdap.api.schedule.Schedules;
import co.cask.cdap.api.stream.GenericStreamEventData;
import co.cask.cdap.dq.functions.BasicAggregationFunction;
import co.cask.cdap.dq.functions.CombinableAggregationFunction;
import co.cask.cdap.dq.rowkey.AggregationsRowKey;
import co.cask.cdap.dq.rowkey.ValuesRowKey;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.etl.batch.mapreduce.MapReduceSourceContext;
import co.cask.cdap.etl.common.DatasetContextLookupProvider;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
/**
 * Data Quality Application: runs a scheduled MapReduce job that computes
 * configurable aggregations over a batch source and writes the results to a
 * Table dataset exposed through {@code DataQualityService}.
 */
public class DataQualityApp extends AbstractApplication<DataQualityConfig> {
private static final Logger LOG = LoggerFactory.getLogger(DataQualityApp.class);
private static final Gson GSON = new Gson();
  private static final Type TOKEN_TYPE_MAP_STRING_SET_STRING = new TypeToken<Map<String, Set<String>>>() { }.getType();
/**
   * Configuration class for the application.
   * Sets the following fields: workflowScheduleMinutes, source, datasetName,
   * and fieldAggregations (a map from each field to aggregate over to the
   * set of aggregations to compute for it).
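   * <p>
   * A minimal illustrative configuration (the values and the
   * "DiscreteValuesHistogram" aggregation name are assumptions for the sake
   * of the example; aggregation names are the simple class names of functions
   * in co.cask.cdap.dq.functions):
   * <pre>{@code
   * {
   *   "workflowScheduleMinutes": 10,
   *   "source": { "name": "Stream", "id": "logStream", "properties": { ... } },
   *   "datasetName": "dataQuality",
   *   "fieldAggregations": { "status": ["DiscreteValuesHistogram"] }
   * }
   * }</pre>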
*/
public static class DataQualityConfig extends Config {
private int workflowScheduleMinutes;
private DataQualitySource source;
private String datasetName;
    private Map<String, Set<String>> fieldAggregations;
    public DataQualityConfig(int workflowScheduleMinutes, DataQualitySource source,
                             String datasetName, Map<String, Set<String>> fieldAggregations) {
this.workflowScheduleMinutes = workflowScheduleMinutes;
this.source = source;
this.datasetName = datasetName;
this.fieldAggregations = fieldAggregations;
}
}
@Override
public void configure() {
DataQualityConfig configObj = getContext().getConfig();
// TODO: CDAP-3552 : Support different units for workflowSchedule - hours, days etc.
    Preconditions.checkArgument(configObj.workflowScheduleMinutes > 0,
                                "Workflow schedule frequency (in minutes) must be greater than 0");
Preconditions.checkArgument(configObj.source != null, "Configuration for DataQualityApp Source is missing");
Preconditions.checkArgument(!Strings.isNullOrEmpty(configObj.source.getName()),
"Data Quality source name should not be null or empty");
Preconditions.checkArgument(!Strings.isNullOrEmpty(configObj.source.getId()),
"Data Quality source id should not be null or empty");
Preconditions.checkArgument(!Strings.isNullOrEmpty(configObj.datasetName),
"Output Dataset name should be not be null or empty");
Preconditions.checkArgument(configObj.fieldAggregations != null, "fieldAggregations needs to be specified");
Preconditions.checkArgument(!configObj.fieldAggregations.isEmpty(), "fieldAggregations should not be empty");
boolean validEntry = false;
    for (Map.Entry<String, Set<String>> entry : configObj.fieldAggregations.entrySet()) {
if (!Strings.isNullOrEmpty(entry.getKey()) && !entry.getValue().isEmpty()) {
validEntry = true;
break;
}
}
Preconditions.checkArgument(validEntry, "At least one field with one or more aggregations must be provided");
    int scheduleMinutes = configObj.workflowScheduleMinutes;
addMapReduce(new FieldAggregator(configObj.source,
configObj.datasetName,
configObj.fieldAggregations));
setName("DataQualityApp");
setDescription("Application with MapReduce job to determine the data quality in a Batch Source");
createDataset(configObj.datasetName, Table.class);
addService(new DataQualityService(configObj.datasetName));
addWorkflow(new DataQualityWorkflow());
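    // Standard cron syntax: "*/N * * * *" triggers every N minutes.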
String schedule = "*/" + scheduleMinutes + " * * * *";
scheduleWorkflow(Schedules.builder("aggregatorSchedule")
.setDescription("Schedule execution every " + scheduleMinutes + " min")
.createTimeSchedule(schedule),
"DataQualityWorkflow");
}
/**
   * MapReduce job that ingests data from the configured batch source and builds
   * an aggregation that maps (timestamp, sourceId, field, value) to frequency.
*/
public static final class FieldAggregator extends AbstractMapReduce {
private static final String PLUGIN_ID = "input";
private final String datasetName;
    private final Map<String, Set<String>> fieldAggregations;
private DataQualitySource source;
@SuppressWarnings("unused")
private Metrics metrics;
    public FieldAggregator(DataQualitySource source, String datasetName,
                           Map<String, Set<String>> fieldAggregations) {
this.source = source;
this.datasetName = datasetName;
this.fieldAggregations = fieldAggregations;
}
@Override
public void configure() {
super.configure();
final MapReduceConfigurer mrConfigurer = getConfigurer();
BatchSource batchSource = usePlugin("batchsource", source.getName(), PLUGIN_ID,
PluginProperties.builder().addAll(source.getProperties()).build());
Preconditions.checkNotNull(batchSource, "Could not find plugin %s of type 'source'", source.getName());
// We use pluginId as the prefixId
batchSource.configurePipeline(new MapReducePipelineConfigurer(mrConfigurer, PLUGIN_ID));
setName("FieldAggregator");
      setProperties(ImmutableMap.<String, String>builder()
.put("fieldAggregations", GSON.toJson(fieldAggregations))
.put("sourceId", source.getId())
.put("sourceName", source.getName())
.put("datasetName", datasetName)
.build());
}
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
Job job = context.getHadoopJob();
job.setMapperClass(AggregationMapper.class);
job.setReducerClass(AggregationReducer.class);
BatchSource batchSource = context.newPluginInstance(PLUGIN_ID);
// Constructs a BatchSourceContext. The stageId needs to match the format expected by PluginID
BatchSourceContext sourceContext = new MapReduceSourceContext(
context, metrics, new DatasetContextLookupProvider(context),
"batchsource:" + context.getSpecification().getProperty("sourceName") + ":0", context.getRuntimeArguments());
batchSource.prepareRun(sourceContext);
context.addOutput(Output.ofDataset(context.getSpecification().getProperty("datasetName")));
}
}
/**
   * Takes in data and outputs the field name as the key and a DataQualityWritable
   * containing the field value as the associated value.
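   * <p>
   * Illustrative example: an event whose body has an INT field status = 200 is
   * emitted as key "status" with a writable wrapping IntWritable(200), provided
   * "status" is in the configured field set or that set is empty.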
*/
  public static class AggregationMapper extends
    Mapper<LongWritable, GenericStreamEventData<StructuredRecord>, Text, DataQualityWritable>
    implements ProgramLifecycle<MapReduceContext> {
    private Set<String> fieldsSet = new HashSet<>();
@Override
public void initialize(MapReduceContext mapReduceContext) throws Exception {
      Map<String, Set<String>> fieldAggregations =
GSON.fromJson(mapReduceContext.getSpecification().getProperty("fieldAggregations"),
TOKEN_TYPE_MAP_STRING_SET_STRING);
if (fieldAggregations != null) {
fieldsSet = fieldAggregations.keySet();
}
}
@Override
    public void map(LongWritable key, GenericStreamEventData<StructuredRecord> event, Context context)
throws IOException, InterruptedException {
StructuredRecord body = event.getBody();
for (Schema.Field field : body.getSchema().getFields()) {
String fieldName = field.getName();
Object fieldVal = body.get(fieldName);
if (fieldVal != null) {
DataQualityWritable outputValue;
if (field.getSchema().isNullable()) {
outputValue = getOutputValue(field.getSchema().getNonNullable().getType(), fieldVal);
} else {
outputValue = getOutputValue(field.getSchema().getType(), fieldVal);
}
if (outputValue != null && (fieldsSet.contains(fieldName) || fieldsSet.isEmpty())) {
context.write(new Text(fieldName), outputValue);
}
}
}
}
@Nullable
private DataQualityWritable getOutputValue(Schema.Type type, Object o) {
DataQualityWritable writable = new DataQualityWritable();
switch (type) {
case STRING:
writable.set(new Text((String) o));
break;
case INT:
writable.set(new IntWritable((Integer) o));
break;
case LONG:
writable.set(new LongWritable((Long) o));
break;
case DOUBLE:
writable.set(new DoubleWritable((Double) o));
break;
case FLOAT:
writable.set(new FloatWritable((Float) o));
break;
case BOOLEAN:
writable.set(new BooleanWritable((Boolean) o));
break;
case BYTES:
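          // Note: assumes the value is a single boxed Byte; a BYTES field
          // delivered as byte[] or ByteBuffer would need different handling here.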
writable.set(new ByteWritable((Byte) o));
break;
default:
return null;
}
return writable;
}
@Override
public void destroy() {
}
}
/**
   * Generates and writes aggregations over the data collected by the mapper.
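   * <p>
   * As built below: one row per (timestamp, field, sourceId) stores each
   * aggregated value under a column named after the aggregation type, and a
   * summary row per (timestamp, sourceId) records, per field, the list of
   * aggregation types that were computed.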
*/
  public static class AggregationReducer extends Reducer<Text, DataQualityWritable, byte[], Put>
    implements ProgramLifecycle<MapReduceContext> {
private static final Gson GSON = new Gson();
private String sourceId;
    private Map<String, Set<String>> fieldAggregations;
long timeKey = 0;
@Override
public void initialize(MapReduceContext mapReduceContext) throws Exception {
timeKey = mapReduceContext.getLogicalStartTime();
sourceId = mapReduceContext.getSpecification().getProperty("sourceId");
fieldAggregations = GSON.fromJson(mapReduceContext.getSpecification().getProperty("fieldAggregations"),
TOKEN_TYPE_MAP_STRING_SET_STRING);
}
@Override
    public void reduce(Text key, Iterable<DataQualityWritable> values, Context context)
throws IOException, InterruptedException {
LOG.trace("timestamp: {}", timeKey);
      Set<String> aggregationTypesSet = fieldAggregations.get(key.toString());
      List<AggregationTypeValue> aggregationTypeValueList = new ArrayList<>();
AggregationsRowKey aggregationsRowKey = new AggregationsRowKey(timeKey, sourceId);
byte[] fieldColumnKey = Bytes.toBytes(key.toString());
for (String aggregationType : aggregationTypesSet) {
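        // aggregationType must be the simple class name of a
        // BasicAggregationFunction implementation in co.cask.cdap.dq.functions.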
try {
          Class<?> aggregationClass = Class.forName("co.cask.cdap.dq.functions." + aggregationType);
BasicAggregationFunction instance = (BasicAggregationFunction) aggregationClass.newInstance();
boolean isCombinable = instance instanceof CombinableAggregationFunction;
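          // Caveat: Hadoop's reduce-side Iterable is single-pass, so only the
          // first aggregation type iterated here sees the full value stream.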
for (DataQualityWritable value : values) {
instance.add(value);
}
ValuesRowKey valuesRowKey = new ValuesRowKey(timeKey, key.toString(), sourceId);
context.write(valuesRowKey.getTableRowKey(), new Put(valuesRowKey.getTableRowKey(),
Bytes.toBytes(aggregationType),
instance.aggregate()));
AggregationTypeValue aggregationTypeValue = new AggregationTypeValue(aggregationType, isCombinable);
aggregationTypeValueList.add(aggregationTypeValue);
} catch (ClassNotFoundException | RuntimeException | InstantiationException | IllegalAccessException e) {
throw new RuntimeException(e);
}
}
byte[] aggregationTypeListBytes = Bytes.toBytes(GSON.toJson(aggregationTypeValueList));
context.write(aggregationsRowKey.getTableRowKey(),
new Put(aggregationsRowKey.getTableRowKey(), fieldColumnKey, aggregationTypeListBytes));
//TODO: CDAP-3538 There are issues if multiple apps use same source, same fields, same dataset
//same aggregations but different schedule minutes
}
@Override
public void destroy() {
}
}
}