/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.dq;

import co.cask.cdap.api.annotation.Property;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.api.service.AbstractService;
import co.cask.cdap.api.service.http.AbstractHttpServiceHandler;
import co.cask.cdap.api.service.http.HttpServiceContext;
import co.cask.cdap.api.service.http.HttpServiceRequest;
import co.cask.cdap.api.service.http.HttpServiceResponder;
import co.cask.cdap.dq.functions.BasicAggregationFunction;
import co.cask.cdap.dq.functions.CombinableAggregationFunction;
import co.cask.cdap.dq.rowkey.AggregationsRowKey;
import co.cask.cdap.dq.rowkey.ValuesRowKey;
import com.google.common.base.Charsets;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import java.io.IOException;
import java.lang.reflect.Type;
import java.net.HttpURLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.ws.rs.DefaultValue;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.QueryParam;

/**
* Service for querying values in the data quality histogram
*/
public class DataQualityService extends AbstractService {
public static final String SERVICE_NAME = "DataQualityService";
private static final Gson GSON = new Gson();
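  // Generic type arguments are erased at runtime, so Gson needs an explicit
  // TypeToken to deserialize JSON into a Set<AggregationTypeValue>.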
  private static final Type TOKEN_TYPE_SET_AGGREGATION_TYPE_VALUES =
      new TypeToken<Set<AggregationTypeValue>>() { }.getType();
private final String datasetName;
public DataQualityService(String datasetName) {
this.datasetName = datasetName;
}
@Override
protected void configure() {
setName(SERVICE_NAME);
setDescription("Service to query data quality histogram.");
addHandler(new ValuesLookup(datasetName));
}
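
  // Usage sketch (illustrative, not part of the original file): a CDAP application
  // would typically create the backing Table dataset and register this service in
  // its own configure() method. The app and dataset names below are assumptions.
  //
  //   public class DataQualityApp extends AbstractApplication {
  //     @Override
  //     public void configure() {
  //       setName("DataQualityApp");
  //       createDataset("dataQualityDataset", Table.class);
  //       addService(new DataQualityService("dataQualityDataset"));
  //     }
  //   }
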
  /**
   * Handler exposing lookups over the data quality histogram: queryable fields,
   * their aggregation types, combined totals, and per-timestamp time series
   */
@Path("/v1")
public static final class ValuesLookup extends AbstractHttpServiceHandler {
@Property
private final String datasetName;
    private Table dataStore;
public ValuesLookup(String datasetName) {
this.datasetName = datasetName;
}
@Override
public void initialize(HttpServiceContext context) throws Exception {
super.initialize(context);
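      // Resolve the Table dataset named at construction time; all histogram
      // queries in this handler read from it.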
dataStore = context.getDataset(datasetName);
}
    /**
     * Gets the fields that are queryable for a given time range and sourceID,
     * together with the aggregation types recorded for each field
     */
@Path("sources/{sourceID}/fields")
@GET
public void fieldsGetter(HttpServiceRequest request, HttpServiceResponder responder,
@PathParam("sourceID") String sourceID,
@QueryParam("startTimestamp") @DefaultValue("0") long startTimestamp,
@QueryParam("endTimestamp") @DefaultValue("9223372036854775807")
long endTimestamp) throws IOException {
      AggregationsRowKey aggregationsRowKeyStart = new AggregationsRowKey(startTimestamp, sourceID);
      // Scan rows inclusive of endTimestamp; avoid relying on signed overflow when
      // endTimestamp is the default Long.MAX_VALUE.
      long endTimestampExclusive = endTimestamp == Long.MAX_VALUE ? Long.MAX_VALUE : endTimestamp + 1;
      AggregationsRowKey aggregationsRowKeyEnd = new AggregationsRowKey(endTimestampExclusive, sourceID);
Scanner scanner = dataStore.scan(aggregationsRowKeyStart.getTableRowKey(),
aggregationsRowKeyEnd.getTableRowKey());
Row row;
      Map<String, FieldDetail> fieldDetailMap = new HashMap<>();
try {
while ((row = scanner.next()) != null) {
          Map<byte[], byte[]> columnsMapBytes = row.getColumns();
          List<FieldDetail> timestampSpecificFieldDetailList = new ArrayList<>();
          for (Map.Entry<byte[], byte[]> columnMapEntry : columnsMapBytes.entrySet()) {
String fieldName = Bytes.toString(columnMapEntry.getKey());
byte[] output = columnMapEntry.getValue();
String outputString = Bytes.toString(output);
            Set<AggregationTypeValue> aggregationTypeValuesSet =
                GSON.fromJson(outputString, TOKEN_TYPE_SET_AGGREGATION_TYPE_VALUES);
FieldDetail fieldDetail = new FieldDetail(fieldName, aggregationTypeValuesSet);
timestampSpecificFieldDetailList.add(fieldDetail);
}
for (FieldDetail fdTimestampSpecific : timestampSpecificFieldDetailList) {
String fdTimestampSpecificFieldName = fdTimestampSpecific.getFieldName();
if (fieldDetailMap.containsKey(fdTimestampSpecificFieldName)) {
FieldDetail fdCombined = fieldDetailMap.get(fdTimestampSpecificFieldName);
fdCombined.addAggregations(fdTimestampSpecific.getAggregationTypeSet());
} else {
fieldDetailMap.put(fdTimestampSpecificFieldName, fdTimestampSpecific);
}
}
}
} finally {
scanner.close();
}
if (fieldDetailMap.isEmpty()) {
responder.sendString(HttpURLConnection.HTTP_NOT_FOUND,
String.format("No fields for source '%s' found within time range.", sourceID), Charsets.UTF_8);
} else {
responder.sendJson(HttpURLConnection.HTTP_OK, fieldDetailMap.values());
}
}
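
    // Example request (illustrative; the CDAP router host, namespace, and app name
    // are assumptions, following the usual CDAP service routing scheme):
    //
    //   GET /v3/namespaces/default/apps/DataQualityApp/services/DataQualityService/
    //       methods/v1/sources/logSource/fields?startTimestamp=0&endTimestamp=1453094400000
    //
    // Responds 200 with a JSON array of FieldDetail objects, or 404 if no fields
    // were aggregated for the source within the time range.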
/**
* Gets the aggregation functions that are queryable for a given time range, sourceID, and field name
*/
@Path("sources/{sourceID}/fields/{fieldName}")
@GET
public void aggregationTypesGetter(HttpServiceRequest request, HttpServiceResponder responder,
@PathParam("fieldName") String fieldName,
@PathParam("sourceID") String sourceID,
@QueryParam("startTimestamp") @DefaultValue("0") long startTimestamp,
@QueryParam("endTimestamp") @DefaultValue("9223372036854775807")
long endTimestamp) throws IOException {
      AggregationsRowKey aggregationsRowKeyStart = new AggregationsRowKey(startTimestamp, sourceID);
      // Scan rows inclusive of endTimestamp; avoid relying on signed overflow when
      // endTimestamp is the default Long.MAX_VALUE.
      long endTimestampExclusive = endTimestamp == Long.MAX_VALUE ? Long.MAX_VALUE : endTimestamp + 1;
      AggregationsRowKey aggregationsRowKeyEnd = new AggregationsRowKey(endTimestampExclusive, sourceID);
Scanner scanner = dataStore.scan(aggregationsRowKeyStart.getTableRowKey(),
aggregationsRowKeyEnd.getTableRowKey());
Row row;
byte[] fieldNameBytes = Bytes.toBytes(fieldName);
      Set<AggregationTypeValue> commonAggregationTypeValues = new HashSet<>();
try {
        while ((row = scanner.next()) != null) {
          Map<byte[], byte[]> columnsMapBytes = row.getColumns();
          byte[] output = columnsMapBytes.get(fieldNameBytes);
          // Not every row in the range contains this field; skip misses rather than
          // pass a null set to addAll().
          if (output != null) {
            String outputString = Bytes.toString(output);
            Set<AggregationTypeValue> aggregationTypeValuesSet =
                GSON.fromJson(outputString, TOKEN_TYPE_SET_AGGREGATION_TYPE_VALUES);
            commonAggregationTypeValues.addAll(aggregationTypeValuesSet);
          }
        }
} finally {
scanner.close();
}
if (commonAggregationTypeValues.isEmpty()) {
responder.sendString(HttpURLConnection.HTTP_NOT_FOUND,
String.format("No aggregations for source '%s' and field '%s' found within time range.",
sourceID, fieldName), Charsets.UTF_8);
} else {
responder.sendJson(HttpURLConnection.HTTP_OK, commonAggregationTypeValues);
}
}
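
    // Example request (illustrative; same routing assumptions as above):
    //
    //   GET .../methods/v1/sources/logSource/fields/content_length?startTimestamp=0
    //
    // Responds 200 with the JSON set of AggregationTypeValue entries recorded for
    // that field, or 404 if the field was never aggregated for the source.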
    /**
     * Gets the combined total of a given aggregation type for a field name, sourceID, and time interval
     */
@Path("sources/{sourceID}/fields/{fieldName}/aggregations/{aggregationType}/totals")
@GET
public void combinableAggregationGetter(HttpServiceRequest request, HttpServiceResponder responder,
@PathParam("fieldName") String fieldName,
@PathParam("aggregationType") String aggregationType,
@PathParam("sourceID") String sourceID,
@QueryParam("startTimestamp") @DefaultValue("0") long startTimestamp,
@QueryParam("endTimestamp") @DefaultValue("9223372036854775807")
long endTimestamp) throws IOException {
      ValuesRowKey valuesRowKeyStart = new ValuesRowKey(startTimestamp, fieldName, sourceID);
      // Scan rows inclusive of endTimestamp; avoid relying on signed overflow when
      // endTimestamp is the default Long.MAX_VALUE.
      long endTimestampExclusive = endTimestamp == Long.MAX_VALUE ? Long.MAX_VALUE : endTimestamp + 1;
      ValuesRowKey valuesRowKeyEnd = new ValuesRowKey(endTimestampExclusive, fieldName, sourceID);
try {
        Class<?> aggregationClass = Class.forName("co.cask.cdap.dq.functions." + aggregationType);
CombinableAggregationFunction aggregationClassInstance =
(CombinableAggregationFunction) aggregationClass.newInstance();
Scanner scanner = dataStore.scan(valuesRowKeyStart.getTableRowKey(),
valuesRowKeyEnd.getTableRowKey());
Row row;
byte[] aggregationTypeBytes = Bytes.toBytes(aggregationType);
try {
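          // Each row stores one serialized partial aggregate per aggregation type;
          // fold each matching column into the running combined result.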
while ((row = scanner.next()) != null) {
            Map<byte[], byte[]> columnsMapBytes = row.getColumns();
byte[] output = columnsMapBytes.get(aggregationTypeBytes);
if (output != null) {
aggregationClassInstance.combine(output);
}
}
} finally {
scanner.close();
}
Object output = aggregationClassInstance.retrieveAggregation();
if (output == null) {
responder.sendString(HttpURLConnection.HTTP_NOT_FOUND,
"No aggregation for the given parameters", Charsets.UTF_8);
} else {
responder.sendJson(HttpURLConnection.HTTP_OK, output);
}
      } catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
        responder.sendString(HttpURLConnection.HTTP_NOT_FOUND,
          String.format("Aggregation type '%s' for source '%s' and field '%s' could not be found.",
                        aggregationType, sourceID, fieldName), Charsets.UTF_8);
} catch (ClassCastException e) {
responder.sendString(HttpURLConnection.HTTP_BAD_REQUEST,
"Aggregation function is not a Combinable Aggregation Function", Charsets.UTF_8);
}
}
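
    // Example request (illustrative; {aggregationType} must be the simple class name
    // of a CombinableAggregationFunction in co.cask.cdap.dq.functions, e.g.
    // DiscreteValuesHistogram, assuming that function ships with the library):
    //
    //   GET .../methods/v1/sources/logSource/fields/status/aggregations/
    //       DiscreteValuesHistogram/totals?startTimestamp=0&endTimestamp=1453094400000
    //
    // Responds 200 with the single combined aggregation over the whole time range.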
    /**
     * Gets a time series of a given aggregation type for a field name, sourceID, and time interval
     */
@Path("sources/{sourceID}/fields/{fieldName}/aggregations/{aggregationType}/timeseries")
@GET
public void basicAggregationGetter(HttpServiceRequest request, HttpServiceResponder responder,
@PathParam("fieldName") String fieldName,
@PathParam("aggregationType") String aggregationType,
@PathParam("sourceID") String sourceID,
@QueryParam("startTimestamp") @DefaultValue("0") long startTimestamp,
@QueryParam("endTimestamp") @DefaultValue("9223372036854775807")
long endTimestamp) throws IOException {
      ValuesRowKey valuesRowKeyStart = new ValuesRowKey(startTimestamp, fieldName, sourceID);
      // Scan rows inclusive of endTimestamp; avoid relying on signed overflow when
      // endTimestamp is the default Long.MAX_VALUE.
      long endTimestampExclusive = endTimestamp == Long.MAX_VALUE ? Long.MAX_VALUE : endTimestamp + 1;
      ValuesRowKey valuesRowKeyEnd = new ValuesRowKey(endTimestampExclusive, fieldName, sourceID);
      List<TimestampValue> timestampValueList = new ArrayList<>();
try {
        Class<?> aggregationClass = Class.forName("co.cask.cdap.dq.functions." + aggregationType);
BasicAggregationFunction aggregationClassInstance =
(BasicAggregationFunction) aggregationClass.newInstance();
Scanner scanner = dataStore.scan(valuesRowKeyStart.getTableRowKey(),
valuesRowKeyEnd.getTableRowKey());
Row row;
byte[] aggregationTypeBytes = Bytes.toBytes(aggregationType);
try {
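          // The row key ends with the aggregation timestamp; pair it with the
          // deserialized column value to build the time series.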
while ((row = scanner.next()) != null) {
byte[] rowBytes = row.getRow();
            long timestamp = Bytes.toLong(rowBytes, rowBytes.length - Bytes.SIZEOF_LONG);
            Map<byte[], byte[]> columnsMapBytes = row.getColumns();
byte[] output = columnsMapBytes.get(aggregationTypeBytes);
if (output != null) {
Object deserializedOutput = aggregationClassInstance.deserialize(output);
TimestampValue tsValue = new TimestampValue(timestamp, deserializedOutput);
timestampValueList.add(tsValue);
}
}
} finally {
scanner.close();
}
if (timestampValueList.isEmpty()) {
responder.sendString(HttpURLConnection.HTTP_NOT_FOUND,
"No aggregation for the given parameters", Charsets.UTF_8);
} else {
responder.sendJson(HttpURLConnection.HTTP_OK, timestampValueList);
}
} catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
responder.sendString(HttpURLConnection.HTTP_NOT_FOUND,
String.format("Aggregations '%s' for source '%s' and field '%s' could not be found within time range.",
aggregationType, sourceID, fieldName), Charsets.UTF_8);
} catch (ClassCastException e) {
responder.sendString(HttpURLConnection.HTTP_BAD_REQUEST,
"Aggregation function is not a Basic Aggregation Function", Charsets.UTF_8);
}
}
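
    // Example request (illustrative; same assumptions as the totals endpoint):
    //
    //   GET .../methods/v1/sources/logSource/fields/status/aggregations/
    //       DiscreteValuesHistogram/timeseries?startTimestamp=0&endTimestamp=1453094400000
    //
    // Responds 200 with a JSON array of TimestampValue objects, one per timestamp
    // bucket that recorded data for the field.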
}
}