Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
net.e6tech.elements.cassandra.etl.PartitionStrategy Maven / Gradle / Ivy
/*
* Copyright 2017 Futeh Kao
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.e6tech.elements.cassandra.etl;
import net.e6tech.elements.cassandra.Session;
import net.e6tech.elements.cassandra.Sibyl;
import net.e6tech.elements.cassandra.async.AsyncPrepared;
import net.e6tech.elements.cassandra.driver.cql.Prepared;
import net.e6tech.elements.cassandra.driver.cql.ResultSet;
import net.e6tech.elements.cassandra.driver.cql.Row;
import net.e6tech.elements.common.reflection.ObjectConverter;
import net.e6tech.elements.common.resources.Resources;
import net.e6tech.elements.common.util.SystemException;
import net.e6tech.elements.common.util.TextBuilder;
import net.e6tech.elements.common.util.datastructure.Pair;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
@SuppressWarnings({"unchecked", "java:S1192"})
/**
* The default behavior is to use select ${pk} ... where ... allow filtering in one call.
* However, if asyncTimeUnitStepSize is set it will break the query into separate queries. In addition,
* if asyncUseFutures is set to true, each query is broken down into smaller queries.
* See detailed explanation in execAsyncQuery2.
*
* ETLContext
* context.setAsyncTimeUnitStepSize(100);
* context.setAsyncUseFutures(true);
*/
public class PartitionStrategy implements BatchStrategy {
public static final String QUERY_PARTITION = "select ${pk}, count(*) from ${table} " +
"where ${pk} > ${start} and ${pk} < ${end} group by ${pk} allow filtering";
public static final String ASYNC_QUERY_PARTITION = "select ${pk}, count(*) from ${table} where ${pk} = :pk";
public static final String QUERY_RANGE = "select distinct ${pk} from ${table} " +
"where ${pk} > ${start} and ${pk} < ${end} allow filtering";
public static final String ASYNC_QUERY_RANGE = "select ${pk} from ${table} where ${pk} = :pk";
private ObjectConverter converter = new ObjectConverter();
private String partitionTiming;
/**
 * Hands the extracted batch to the load delegate configured on the context.
 *
 * @param context ETL context that may carry a load delegate
 * @param source  batch of extracted objects
 * @return the delegate's {@code applyAsInt} result, or 0 when the context has no load delegate
 */
@Override
public int load(C context, List source) {
    return context.getLoadDelegate() == null
            ? 0
            : context.getLoadDelegate().applyAsInt(source);
}
/**
 * Return a map of partitions and count. The partitions are sorted from small to large.
 * <p>
 * Issues QUERY_PARTITION as one statement, with the partition key column, the table name,
 * p.lastUpdate.getLastUpdate() and p.end substituted into the statement text, and records how
 * long the call took in partitionTiming.
 *
 * @param p PartitionQuery
 * @return map of partition and count
 */
public Map, Long> queryPartitions(PartitionQuery p) {
long start = System.currentTimeMillis();
// Both bounds are written into the statement text; QUERY_PARTITION compares with > and <.
String query = TextBuilder.using(QUERY_PARTITION)
.build("pk", p.partitionKey, "table", p.table, "start", p.lastUpdate.getLastUpdate(), "end", p.end);
Map, Long> map = Collections.synchronizedMap(new TreeMap<>());
List> partitions = Collections.synchronizedList(new LinkedList<>());
p.context.open().accept(Resources.class, res -> {
try {
ResultSet rs = res.getInstance(Session.class).execute(query);
List rows = rs.all();
for (Row row : rows) {
// Column 0 is the partition key, column 1 is the count(*) for that key.
Comparable pk = (Comparable) row.get(0, p.context.getPartitionKeyType());
map.put(pk, row.get(1, Long.class));
partitions.add(pk);
}
} catch (Exception ex) {
// Log the failing statement text, then propagate the failure to the caller.
logger.warn("queryPartitions failed: " + query, ex);
throw ex;
}
});
Map, Long> ret = sortPartitions(map, partitions);
partitionTiming = "queryPartition for " + p.table + " took " + (System.currentTimeMillis() - start) + "ms";
return ret;
}
/**
 * Variant of queryPartitions that can split the scan into several asynchronous queries.
 * <p>
 * Falls back to queryPartitions when the last update value cannot be parsed as a BigDecimal,
 * when the context has no time unit, or when p.asyncStep is null or not positive. Otherwise the
 * statement text is produced by buildQuery and executed through asyncQuery.
 *
 * @param p PartitionQuery
 * @return partition keys and their counts, ordered by sortPartitions
 */
public Map, Long> queryPartitions2(PartitionQuery p) {
try {
// Probe only: the parsed value is discarded; a failure selects the single-query path.
new BigDecimal(p.lastUpdate.getLastUpdate());
} catch (Exception ex) {
// NOTE(review): ex is not passed to the logger, so the cause of the parse failure is not logged;
// the message text also reads "latUpdate".
logger.warn("Cannot parse latUpdate " + p.lastUpdate.getLastUpdate() + " for " + p.lastUpdate.getExtractor());
return queryPartitions(p);
}
if (p.context.getTimeUnit() != null && p.asyncStep != null && p.asyncStep > 0) {
long start = System.currentTimeMillis();
String query = buildQuery(p, QUERY_PARTITION, ASYNC_QUERY_PARTITION);
Map, Long> map = Collections.synchronizedMap(new TreeMap<>());
List> partitions = Collections.synchronizedList(new LinkedList<>());
asyncQuery(p, query, row -> {
// Column 0 is the partition key, column 1 is the count(*) for that key.
Comparable pk = (Comparable) row.get(0, p.context.getPartitionKeyType());
map.put(pk, row.get(1, Long.class));
partitions.add(pk);
});
Map, Long> ret = sortPartitions(map, partitions);
partitionTiming = "queryPartition2 for " + p.table + " took " + (System.currentTimeMillis() - start) + "ms";
return ret;
} else {
return queryPartitions(p);
}
}
/**
 * Expands the statement template that matches the context's async mode.
 * <p>
 * When the context uses futures, {@code asyncQuery} is expanded with the partition key column
 * and table only. Otherwise {@code query} is expanded with the bounds replaced by the named
 * bind markers {@code :start} and {@code :end}.
 *
 * @param p          query parameters supplying the context, partition key column and table
 * @param query      range template
 * @param asyncQuery per-key template
 * @return the expanded statement text
 */
private String buildQuery(PartitionQuery p, String query, String asyncQuery) {
    if (!p.context.isAsyncUseFutures()) {
        return TextBuilder.using(query)
                .build("pk", p.partitionKey, "table", p.table, "start", ":start", "end", ":end");
    }
    return TextBuilder.using(asyncQuery)
            .build("pk", p.partitionKey, "table", p.table);
}
/**
 * Builds a map whose iteration order follows the sorted partition keys.
 * <p>
 * The given list is sorted in place; each key is then copied, with the value looked up in
 * {@code map}, into a LinkedHashMap so the sorted order is kept on iteration.
 *
 * @param map        partition key to count
 * @param partitions partition keys; sorted in place by this call
 * @return a new LinkedHashMap with one entry per element of {@code partitions}
 */
private Map, Long> sortPartitions(Map, Long> map, List> partitions) {
// A null comparator makes List.sort use the elements' natural ordering.
partitions.sort(null);
// Capacity of size + 1 with load factor 1.0, so the inserts below stay under the resize threshold.
Map, Long> result = new LinkedHashMap<>(partitions.size() + 1, 1.0f);
for (Comparable partition : partitions) {
result.put(partition, map.get(partition));
}
return result;
}
/**
 * Computes the numeric interval to scan.
 * <p>
 * The lower bound is the last update value and the upper bound is the context cutoff, both
 * parsed as BigDecimal. A lower bound of zero or less is replaced by the cutoff computed from
 * the current time and the context's max past.
 *
 * @param p query parameters supplying the last update and the context
 * @return pair whose key is the lower bound and whose value is the upper bound
 */
private Pair fromAndTo(PartitionQuery p) {
    BigDecimal lower = new BigDecimal(p.lastUpdate.getLastUpdate());
    BigDecimal upper = new BigDecimal(p.context.getCutoff().toString());
    if (lower.signum() <= 0) {
        Object earliest = p.context.getCutoff(System.currentTimeMillis(), p.context.getMaxPast());
        lower = new BigDecimal(earliest.toString());
    }
    return new Pair<>(lower, upper);
}
/**
 * Runs the query over the interval returned by fromAndTo, one list of ranges at a time.
 * <p>
 * getAsyncRanges is called repeatedly; each returned list is passed to execAsyncQuery, and the
 * next call starts at the end of the last range minus one. The loop stops when getAsyncRanges
 * returns an empty list. A failing list is retried up to p.retries more times, sleeping
 * context.getRetrySleep() between attempts; the exception from the final attempt is rethrown.
 *
 * @param p           query parameters
 * @param query       statement text passed to execAsyncQuery
 * @param rowConsumer callback handed to execAsyncQuery
 */
protected void asyncQuery(PartitionQuery p, String query, Consumer rowConsumer) {
C context = p.context;
Pair fromTo = fromAndTo(p);
BigDecimal from = fromTo.key();
BigDecimal to = fromTo.value();
BigDecimal start = from;
List ranges = getAsyncRanges(query, start, to, p.asyncStep, p.asyncMaxChunkSize);
while (!ranges.isEmpty()) {
List working = ranges;
// The retry budget is reset for every list of ranges.
int retries = p.retries;
while (retries >= 0) {
try {
execAsyncQuery(p.context, query, working, rowConsumer);
break;
} catch (Exception ex) {
// info is only included in the log message of the non-final attempts below.
String info = "extractor=" + context.extractor() +
" sourceClass=" + context.getSourceClass() +
" tableName=" + context.tableName();
if (retries == 0) {
// Last attempt failed: give up and propagate.
logger.warn("Cannot transmutate " + query, ex);
throw ex;
} else {
logger.warn("Cannot transmutate " + query + ", " + retries + " retry attempts left, " + info, ex);
}
try {
Thread.sleep(context.getRetrySleep());
} catch (InterruptedException e) {
// Restore the interrupt flag; the retry loop itself keeps going.
Thread.currentThread().interrupt();
}
} finally {
// Runs on success (after the break) and on failure alike.
retries--;
}
}
Range lastRange = ranges.get(ranges.size() - 1);
// Step back by one so the next list starts where this one ended; see the note on strict bounds in getAsyncRanges.
start = lastRange.end.subtract(BigDecimal.ONE);
ranges = getAsyncRanges(query, start, to, p.asyncStep, p.asyncMaxChunkSize);
}
}
/**
 * Executes the query once per range, binding each range's bounds to the {@code start} and
 * {@code end} markers, and passes the resulting rows to {@code rowConsumer}.
 * <p>
 * Intended for statements of the form
 * select ${pk}, count(*) from ${table} where ${pk} > ${start} and ${pk} < ${end} group by ${pk} allow filtering.
 * When the context uses futures the call is delegated to execAsyncQuery2 instead.
 *
 * @param context     ETL context supplying the connection and the partition key type
 * @param query       statement text containing the {@code :start} and {@code :end} markers
 * @param working     ranges to execute
 * @param rowConsumer callback receiving the rows
 * @throws SystemException when a bound cannot be converted to the partition key type
 */
protected void execAsyncQuery(C context, String query, List working, Consumer rowConsumer) {
    if (context.isAsyncUseFutures()) {
        execAsyncQuery2(context, query, working, rowConsumer);
        return;
    }
    context.open().accept(Sibyl.class, sibyl ->
            sibyl.createAsync(query)
                    .execute(working, (range, bound) -> {
                        try {
                            // Both bounds use toPlainString: BigDecimal.toString may switch to scientific
                            // notation (for example "1E+3") for values with a negative scale.
                            bound.set("start", converter.convert(range.start.toPlainString(), context.getPartitionKeyType(), null), context.getPartitionKeyType());
                            bound.set("end", converter.convert(range.end.toPlainString(), context.getPartitionKeyType(), null), context.getPartitionKeyType());
                        } catch (IOException ex) {
                            throw new SystemException(ex);
                        }
                    })
                    .inExecutionOrderRows(rowConsumer)
    );
}
/**
 * Per-key alternative to the range query.
 * <p>
 * Instead of using
 * select ${pk}, count(*) from ${table} where ${pk} > ${start} and ${pk} < ${end} group by ${pk} allow filtering
 * this method breaks each range into a number of queries of the form
 * select ${pk}, count(*) from ${table} where ${pk} = :pk, where pk is start + 1, start + 2 ... end - 1.
 * asyncStepSize controls the chunk size and should be around 100.
 *
 * @param context     ETL context supplying the connection and the partition key type
 * @param query       statement text containing the {@code :pk} marker
 * @param working     ranges whose interior keys are queried
 * @param rowConsumer callback receiving every row whose first column is not null
 */
protected void execAsyncQuery2(C context, String query, List working, Consumer rowConsumer) {
    for (Range span : working) {
        // Enumerate the keys strictly between the range bounds.
        List keys = new LinkedList<>();
        BigDecimal key = span.start.add(BigDecimal.ONE);
        while (key.compareTo(span.end) < 0) {
            keys.add(key);
            key = key.add(BigDecimal.ONE);
        }
        context.open().accept(Sibyl.class, sibyl ->
                sibyl.createAsync(query)
                        .execute(keys, (pk, bound) -> {
                            try {
                                bound.set("pk", converter.convert(pk.toPlainString(), context.getPartitionKeyType(), null), context.getPartitionKeyType());
                            } catch (IOException ex) {
                                throw new SystemException(ex);
                            }
                        })
                        .inExecutionOrderRows(row -> {
                            // Rows whose first column is null are dropped.
                            if (row.isNull(0)) {
                                return;
                            }
                            rowConsumer.accept(row);
                        })
        );
    }
}
/**
 * Breaks the interval between {@code from} and {@code to} into a list of ranges.
 * <p>
 * Each range starts one below the end of the previous range and ends at start + asyncStepSize,
 * clamped to {@code to}. The first range is always added; further ranges are added only while
 * their end is below {@code to} and the list holds fewer than asyncMaxChunkSize entries. The
 * caller is expected to call this method over and over again, each time starting at the end of
 * the last returned range minus one, until the returned list is empty.
 *
 * @param query             statement text, used only in the warning message
 * @param from              exclusive lower bound of the first range
 * @param to                upper limit for every range end
 * @param asyncStepSize     width of a range
 * @param asyncMaxChunkSize limit on the number of ranges added after the first
 * @return the ranges, or an empty list when the first range would span one or less
 */
protected List getAsyncRanges(String query, BigDecimal from, BigDecimal to, int asyncStepSize, int asyncMaxChunkSize) {
    BigDecimal step = new BigDecimal(asyncStepSize);
    BigDecimal lower = from;
    BigDecimal upper = from.add(step).min(to);
    // Nothing to return when the first range would span one or less.
    if (upper.subtract(lower).compareTo(BigDecimal.ONE) <= 0) {
        return new ArrayList<>();
    }

    List chunks = new LinkedList<>();
    do {
        chunks.add(new Range(lower, upper));
        lower = upper.subtract(BigDecimal.ONE); // because ${pk} > ${start} and ${pk} < ${end}, > and <
        upper = lower.add(step).min(to);
    } while (upper.compareTo(to) < 0 && chunks.size() < asyncMaxChunkSize);

    Range first = chunks.get(0);
    if (from.compareTo(first.start) != 0) {
        logger.warn("async range error for {}, first {} ", query, first);
    }
    return new ArrayList<>(chunks);
}
/**
 * Reads the rows of every partition currently set on the context.
 * <p>
 * A "select * from table where pk = :partitionKey" statement is prepared through
 * context.getPreparedStatements() under the key "extract", executed asynchronously once per
 * entry of context.getPartitions(), and each result set is mapped to context.getSourceClass().
 *
 * @param context ETL context supplying the table, partition key column and partitions
 * @return the mapped objects of all queried partitions
 */
@Override
public List extract(C context) {
return context.open().apply(Sibyl.class, sibyl -> {
String query = TextBuilder.using("select * from ${tbl} where ${pk} = :partitionKey")
.build("tbl", context.tableName(), "pk", context.getInspector().getPartitionKeyColumn(0));
// computeIfAbsent: the statement is prepared only when no "extract" entry exists yet.
Prepared pstmt = context.getPreparedStatements().computeIfAbsent("extract",
key -> sibyl.getSession().prepare(query));
AsyncPrepared> async = sibyl.createAsync(pstmt);
for (Comparable> hour : context.getPartitions()) {
// The bind type is taken from the runtime class of the partition value.
async.execute(bound -> bound.set("partitionKey", hour, (Class) hour.getClass()));
}
List list = new ArrayList<>();
async.inExecutionOrder(rs -> list.addAll(sibyl.mapAll(context.getSourceClass(), rs)));
return list;
});
}
/**
 * Runs one extract-and-load pass for the context.
 * <p>
 * Partitions and their counts are obtained from queryPartitions2. They are then processed in
 * batches: each batch always contains the first remaining partition plus following partitions
 * as long as the summed count stays within context.getBatchSize(). After each batch the
 * processed partitions are removed and the last update is advanced and saved. When the query
 * found no partitions, the context has a time unit and uses futures, the last update is moved
 * forward towards the cutoff so the same empty interval is not scanned again.
 *
 * @param context ETL context
 * @return total of the counts returned by run(context, partitions) for all batches
 */
@Override
public int run(C context) {
long start = System.currentTimeMillis();
int importedCount = 0;
context.initialize();
Map, Long> partitions; // partition key vs count
PartitionQuery p = new PartitionQuery<>(context);
partitions = queryPartitions2(p);
context.reset();
List> concurrent = new LinkedList<>();
// Captured before the loop, which removes entries from partitions as batches complete.
boolean empty = partitions.isEmpty();
while (partitions.size() > 0) {
LastUpdate lastUpdate = context.getLastUpdate();
concurrent.clear();
boolean first = true;
long count = 0;
// Collect partitions in map order until the next one would push the batch over the batch size.
for (Map.Entry, Long> entry : partitions.entrySet()) {
if (first) { // always add first regardless of batch size
concurrent.add(entry.getKey());
first = false;
count = entry.getValue();
} else if (count + entry.getValue() <= context.getBatchSize()) {
count += entry.getValue();
concurrent.add(entry.getKey());
} else {
break;
}
}
int processedCount = run(context, concurrent);
importedCount += processedCount;
if (logger.isInfoEnabled())
logger.info("Batch loaded {} instances of {}", processedCount, context.extractor());
// Drop the processed partitions and advance the last update to each of them in turn.
for (Comparable> partition : concurrent) {
partitions.remove(partition);
lastUpdate.update(partition);
}
context.saveLastUpdate(lastUpdate);
}
if (context.getTimeUnit() != null && empty && context.isAsyncUseFutures()) {
LastUpdate lastUpdate = context.getLastUpdate();
// NOTE(review): unlike queryPartitions2, this parse is not guarded; a non-numeric last update would throw here.
BigDecimal from = new BigDecimal(lastUpdate.getLastUpdate());
BigDecimal to = new BigDecimal(p.context.getCutoff().toString());
if (to.compareTo(from) > 0) {
// Advance to one below the smaller of the cutoff and the current time expressed in the context's time unit.
BigDecimal currentTimeInTimeUnit = timeExpressedInCorrectTimeUnit(context.getTimeUnit(), System.currentTimeMillis());
if (to.compareTo(currentTimeInTimeUnit) > 0)
lastUpdate.update(currentTimeInTimeUnit.subtract(BigDecimal.ONE));
else
lastUpdate.update(to.subtract(BigDecimal.ONE));
context.saveLastUpdate(lastUpdate);
}
}
if (logger.isInfoEnabled()) {
logger.info("{}.run for {} loaded {} instances of {} took {}ms, {}",
getClass().getSimpleName(), context.extractor(),
importedCount, context.getSourceClass().getSimpleName(),
System.currentTimeMillis() - start, partitionTiming
);
}
context.reset();
return importedCount;
}
/**
 * Expresses {@code time} in whole units of the given time unit.
 * <p>
 * For DAYS, HOURS, MINUTES and SECONDS the value is divided by the matching ETLContext constant
 * and truncated to scale 0. For any other unit, or {@code null}, the value is returned undivided.
 *
 * @param timeUnit target unit; may be {@code null}
 * @param time     value to convert
 * @return the converted value
 */
public BigDecimal timeExpressedInCorrectTimeUnit(TimeUnit timeUnit, long time) {
    BigDecimal value = new BigDecimal(time);
    if (timeUnit == null) {
        return value;
    }
    switch (timeUnit) {
        case DAYS:
            return value.divide(new BigDecimal(ETLContext.DAY), 0, RoundingMode.DOWN);
        case HOURS:
            return value.divide(new BigDecimal(ETLContext.HOUR), 0, RoundingMode.DOWN);
        case MINUTES:
            return value.divide(new BigDecimal(ETLContext.MINUTE), 0, RoundingMode.DOWN);
        case SECONDS:
            return value.divide(new BigDecimal(ETLContext.SECOND), 0, RoundingMode.DOWN);
        default:
            return value;
    }
}
/**
 * Extracts and loads one batch of partitions.
 *
 * @param context    ETL context; its partitions are replaced by {@code partitions}
 * @param partitions partition keys to process in this batch
 * @return the value returned by load for the extracted batch
 */
public int run(C context, List> partitions) {
    context.setPartitions(partitions);
    return load(context, extract(context));
}
/**
 * Returns the distinct partition keys between the last update and the cutoff, sorted.
 * <p>
 * Issues QUERY_RANGE as one statement with the partition key column, table name, last update
 * value and cutoff substituted into the statement text, and records how long the call took in
 * partitionTiming.
 *
 * @param p query parameters; the table, partition key column, last update and cutoff are read from p.context
 * @return sorted list of partition keys
 */
public List> queryRange(PartitionQuery p) {
long start = System.currentTimeMillis();
C context = p.context;
LastUpdate lastUpdate = context.getLastUpdate();
Comparable> end = context.getCutoff();
String partitionKey = context.getInspector().getPartitionKeyColumn(0);
String table = context.tableName();
List> list = Collections.synchronizedList(new LinkedList<>());
// select distinct only works when selecting partition keys
String query = TextBuilder.using(QUERY_RANGE)
.build("pk", partitionKey, "table", table,
"start", lastUpdate.getLastUpdate(), "end", end);
context.open().accept(Resources.class, res -> {
try {
ResultSet rs = res.getInstance(Session.class).execute(query);
for (Row row : rs.all()) {
// Column 0 is the partition key.
list.add((Comparable) row.get(0, context.getPartitionKeyType()));
}
} catch (Exception ex) {
// Log the failing statement text, then propagate the failure to the caller.
logger.warn("PartitionStrategy.queryRange failed: " + query, ex);
throw ex;
}
});
// A null comparator makes List.sort use the elements' natural ordering.
list.sort(null);
partitionTiming = "queryRange for " + p.table + " took " + (System.currentTimeMillis() - start) + "ms";
return list;
}
/**
 * Variant of queryRange that can split the scan into several asynchronous queries.
 * <p>
 * Falls back to queryRange when the last update value cannot be parsed as a BigDecimal, when the
 * context has no time unit, or when p.asyncStep is null or not positive. Otherwise the statement
 * text is produced by buildQuery and executed through asyncQuery.
 *
 * @param p query parameters
 * @return sorted list of partition keys
 */
public List> queryRange2(PartitionQuery p) {
try {
// Probe only: the parsed value is discarded; a failure selects the single-query path.
new BigDecimal(p.lastUpdate.getLastUpdate());
} catch (Exception ex) {
// NOTE(review): ex is not passed to the logger, so the cause of the parse failure is not logged;
// the message text also reads "latUpdate".
logger.warn("Cannot parse latUpdate " + p.lastUpdate.getLastUpdate() + " for " + p.lastUpdate.getExtractor());
return queryRange(p);
}
if (p.context.getTimeUnit() != null && p.asyncStep != null && p.asyncStep > 0) {
long start = System.currentTimeMillis();
List> list = Collections.synchronizedList(new LinkedList<>());
String query = buildQuery(p, QUERY_RANGE, ASYNC_QUERY_RANGE);
// Column 0 of every row is collected as a partition key.
asyncQuery(p, query, row -> list.add((Comparable) row.get(0, p.context.getPartitionKeyType())));
// A null comparator makes List.sort use the elements' natural ordering.
list.sort(null);
partitionTiming = "queryRange2 for " + p.table + " took " + (System.currentTimeMillis() - start) + "ms";
return list;
} else {
return queryRange(p);
}
}
/**
 * Processes partition keys, rather than extracted rows, in batches.
 * <p>
 * The keys come from queryRange2. They are grouped into batches of context.getBatchSize() keys;
 * each batch is passed to runPartitions(batch, context), after which the last update is advanced
 * to the last key of the batch and saved. A final partial batch is handled the same way.
 *
 * @param context ETL context
 * @return the number of partition keys returned by queryRange2 (the summed delegate results are
 *         only written to the log)
 */
public int runPartitions(C context) {
long start = System.currentTimeMillis();
context.initialize();
PartitionQuery p = new PartitionQuery<>(context);
List> list = queryRange2(p);
List> batch = new ArrayList<>(context.getBatchSize());
int processCount = 0;
for (Comparable> c : list) {
batch.add(c);
// Flush as soon as the batch holds exactly getBatchSize() keys.
if (batch.size() == context.getBatchSize()) {
LastUpdate lastUpdate = context.getLastUpdate();
processCount += runPartitions(batch, context);
// list is sorted, so the last element of the batch is its largest key.
lastUpdate.update(batch.get(batch.size() - 1));
context.saveLastUpdate(lastUpdate);
batch.clear();
}
}
// Remaining keys that did not fill a complete batch.
if (!batch.isEmpty()) {
LastUpdate lastUpdate = context.getLastUpdate();
processCount += runPartitions(batch, context);
lastUpdate.update(batch.get(batch.size() - 1));
context.saveLastUpdate(lastUpdate);
batch.clear();
}
if (logger.isInfoEnabled()) {
logger.info("{}.runPartitions for {} loaded {} instances of {} took {}ms, {}",
getClass().getSimpleName(), context.extractor(),
processCount, context.getSourceClass().getSimpleName(),
System.currentTimeMillis() - start, partitionTiming
);
}
return list.size();
}
/**
 * Sets the given partition keys on the context and passes them to the load delegate.
 *
 * @param list    partition keys of one batch
 * @param context ETL context
 * @return the value returned by the load delegate's applyAsInt
 */
public int runPartitions(List> list, C context) {
context.setPartitions(list);
// NOTE(review): unlike load, the delegate is not null-checked here — confirm a delegate is always set on this path.
return context.getLoadDelegate().applyAsInt(list);
}
/**
 * A pair of bounds, {@code start} and {@code end}, describing one slice of a larger interval.
 */
public static class Range {
    BigDecimal start;
    BigDecimal end;

    /**
     * @param start lower bound of the slice
     * @param end   upper bound of the slice
     */
    public Range(BigDecimal start, BigDecimal end) {
        this.start = start;
        this.end = end;
    }

    /**
     * @return text of the form {@code range start= <start> end=<end>}
     */
    @Override
    public String toString() {
        return new StringBuilder("range start= ")
                .append(start)
                .append(" end=")
                .append(end)
                .toString();
    }
}
/**
 * Snapshot of the values a partition query needs, all read from the context at construction.
 */
public static class PartitionQuery {
    C context;
    LastUpdate lastUpdate;
    Comparable> end;
    String partitionKey;
    String table;
    int retries;
    Integer asyncStep;
    Integer asyncMaxChunkSize;

    /**
     * Copies the query settings out of the context. A missing max number of chunks falls back to
     * ETLContext.ASYNC_MAX_NUM_OF_CHUNKS, and a negative retry count is treated as 0.
     *
     * @param context ETL context to read from
     */
    public PartitionQuery(C context) {
        this.context = context;
        this.lastUpdate = context.getLastUpdate();
        this.end = context.getCutoff();
        this.partitionKey = context.getInspector().getPartitionKeyColumn(0);
        this.table = context.tableName();
        this.asyncStep = context.getAsyncTimeUnitStepSize();
        this.asyncMaxChunkSize = context.getAsyncMaxNumOfChunks() != null
                ? context.getAsyncMaxNumOfChunks()
                : ETLContext.ASYNC_MAX_NUM_OF_CHUNKS;
        this.retries = Math.max(context.getRetries(), 0);
    }
}
}