/*
* GRAKN.AI - THE KNOWLEDGE GRAPH
* Copyright (C) 2018 Grakn Labs Ltd
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package ai.grakn.kb.internal.computer;
import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.ColumnMetadata;
import com.datastax.driver.core.Metadata;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.SimpleStatement;
import com.datastax.driver.core.TableMetadata;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.hadoop.ColumnFamilySplit;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.HadoopCompat;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.janusgraph.diskstorage.Entry;
import org.janusgraph.diskstorage.StaticBuffer;
import org.janusgraph.diskstorage.util.StaticArrayBuffer;
import org.janusgraph.diskstorage.util.StaticArrayEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static java.util.stream.Collectors.toList;
/**
* Background: The {@link org.apache.cassandra.hadoop.cql3.CqlRecordReader} class changed
* significantly between Cassandra-2 and Cassandra-3. This class bridges that gap: it
* recreates the Cassandra-3 CqlRecordReader without referring to it directly (otherwise we
* would pick up the Cassandra-2 implementation of CqlRecordReader from the classpath, which
* we do not want).
*
* @see Issue 172.
*/
public class GraknCqlBridgeRecordReader extends RecordReader<StaticBuffer, Iterable<Entry>> {
/* Implementation note: This is inspired by Cassandra-3's org/apache/cassandra/hadoop/cql3/CqlRecordReader.java */
private static final Logger log = LoggerFactory.getLogger(GraknCqlBridgeRecordReader.class);
private ColumnFamilySplit split;
private DistinctKeyIterator distinctKeyIterator;
private int totalRowCount; // total number of rows to fetch
private String keyspace;
private String cfName;
private String cqlQuery;
private Cluster cluster;
private Session session;
private IPartitioner partitioner;
private String inputColumns;
private String userDefinedWhereClauses;
private final List<String> partitionKeys = new ArrayList<>();
// partition keys -- key aliases
private final LinkedHashMap<String, Boolean> partitionBoundColumns = Maps.newLinkedHashMap();
private int nativeProtocolVersion = 1;
// binary type mapping code from CassandraBinaryRecordReader
private KV currentKV;
GraknCqlBridgeRecordReader() { //package private
super();
}
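/*
 * Usage sketch (illustrative only; the wiring below is an assumption, not part of this
 * file). The constructor is package-private, so an InputFormat in this package would
 * create the reader, and the Hadoop framework would then drive the standard lifecycle:
 *
 *   GraknCqlBridgeRecordReader reader = new GraknCqlBridgeRecordReader();
 *   reader.initialize(split, context); // connects to Cassandra and prepares the CQL query
 *   while (reader.nextKeyValue()) {
 *       StaticBuffer key = reader.getCurrentKey();
 *       Iterable<Entry> columns = reader.getCurrentValue();
 *       // process one distinct row key and its column entries
 *   }
 *   reader.close(); // releases the session and the cluster
 */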
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
this.split = (ColumnFamilySplit) split;
Configuration conf = HadoopCompat.getConfiguration(context);
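// a split that reports a concrete length gives the row-count estimate directly;
// a length of Long.MAX_VALUE means "unknown", so fall back to the configured input split size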
totalRowCount = (this.split.getLength() < Long.MAX_VALUE)
? (int) this.split.getLength()
: ConfigHelper.getInputSplitSize(conf);
cfName = ConfigHelper.getInputColumnFamily(conf);
keyspace = ConfigHelper.getInputKeyspace(conf);
partitioner = ConfigHelper.getInputPartitioner(conf);
inputColumns = CqlConfigHelper.getInputcolumns(conf);
userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);
try {
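// already initialized with a Cluster instance; nothing further to do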
if (cluster != null) {
return;
}
// create a Cluster instance
String[] locations = split.getLocations();
// deliberately disregard the conf when building the Cluster, as conf-derived settings bring some unforeseen issues.
cluster = Cluster.builder()
.addContactPoints(locations)
.build();
} catch (Exception e) {
throw new RuntimeException("Unable to create cluster for table: " + cfName + ", in keyspace: " + keyspace, e);
}
// the cluster should now point to a valid Cassandra cluster
session = cluster.connect(quote(keyspace));
Preconditions.checkState(session != null, "Can't create connection session");
//get negotiated serialization protocol
nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion().toInt();
// If the user provides a CQL query then we use it without validation;
// otherwise we fall back to building a query from:
//   inputColumns
//   userDefinedWhereClauses
cqlQuery = CqlConfigHelper.getInputCql(conf);
// validate that the user hasn't tried to give us a custom query along with input columns
// and where clauses
if (StringUtils.isNotEmpty(cqlQuery) && (StringUtils.isNotEmpty(inputColumns) ||
StringUtils.isNotEmpty(userDefinedWhereClauses))) {
throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
}
if (StringUtils.isEmpty(cqlQuery)) {
cqlQuery = buildQuery();
}
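// Sketch of what buildQuery() produces, assuming behaviour equivalent to Cassandra-3's
// CqlRecordReader (an assumption, not verified here): roughly
//   SELECT <inputColumns> FROM "<keyspace>"."<cfName>"
//   WHERE token(<partitionKeys>) > ? AND token(<partitionKeys>) <= ? [AND <userDefinedWhereClauses>]
// so that each input split scans exactly one token range.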
log.trace("cqlQuery {}", cqlQuery);
distinctKeyIterator = new DistinctKeyIterator();
log.trace("created {}", distinctKeyIterator);
}
@Override
public void close() {
if (session != null) {
session.close();
}
if (cluster != null) {
cluster.close();
}
}
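/**
 * A simple holder that pairs one row key with the column {@link Entry} values read for it.
 * nextKeyValue() builds one KV per distinct key.
 */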
private static class KV {
private final StaticArrayBuffer key;
private ArrayList<Entry> entries;
KV(StaticArrayBuffer key) {
this.key = key;
}
void addEntries(Collection<Entry> toAdd) {
if (entries == null) {
entries = new ArrayList<>(toAdd.size());
}
entries.addAll(toAdd);
}
}
@Override
public StaticBuffer getCurrentKey() {
return currentKV.key;
}
@Override
public Iterable<Entry> getCurrentValue() throws IOException {
return currentKV.entries;
}
@Override
public float getProgress() {
if (!distinctKeyIterator.hasNext()) {
return 1.0F;
}
// the progress is likely to be reported slightly off the actual but close enough
float progress = ((float) distinctKeyIterator.totalRead / totalRowCount);
return progress > 1.0F ? 1.0F : progress;
}
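/**
 * Advances to the next distinct key. The {@link DistinctKeyIterator} yields, for each key,
 * a map from column buffer to value buffer; this method flattens that map into a list of
 * {@link Entry} objects and stores the result as the current key/value pair.
 *
 * @return true while another distinct key is available, false once the iterator is exhausted.
 */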
@Override
public boolean nextKeyValue() throws IOException {
final Map<StaticArrayBuffer, Map<StaticBuffer, StaticBuffer>> kv = distinctKeyIterator.next();
if (kv == null) {
return false;
}
final Map.Entry<StaticArrayBuffer, Map<StaticBuffer, StaticBuffer>> onlyEntry = Iterables.getOnlyElement(kv.entrySet());
final KV newKV = new KV(onlyEntry.getKey());
final Map<StaticBuffer, StaticBuffer> v = onlyEntry.getValue();
final List<Entry> entries = v.keySet()
.stream()
.map(column -> StaticArrayEntry.of(column, v.get(column)))
.collect(toList());
newKV.addEntries(entries);
currentKV = newKV;
return true;
}
/**
* Returns the native protocol version negotiated for the cluster connection.
*
* @return serialization protocol version.
*/
public int getNativeProtocolVersion() {
return nativeProtocolVersion;
}
/**
* A non-static nested class that iterates over distinct keys, backed by the row iterator
* from the DataStax driver. In JanusGraph's use of Cassandra, a single key is usually
* associated with more than one row, so consecutive rows sharing a key are grouped into a
* single element; for example, rows (k1, c1), (k1, c2), (k2, c3) collapse into two
* elements: k1 -> {c1, c2} and k2 -> {c3}.
*/
private class DistinctKeyIterator implements Iterator<Map<StaticArrayBuffer, Map<StaticBuffer, StaticBuffer>>>