ai.grakn.kb.internal.computer.GraknCqlBridgeRecordReader
/*
 * GRAKN.AI - THE KNOWLEDGE GRAPH
 * Copyright (C) 2018 Grakn Labs Ltd
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package ai.grakn.kb.internal.computer;

import com.datastax.driver.core.Cluster;
import com.datastax.driver.core.ColumnMetadata;
import com.datastax.driver.core.Metadata;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.SimpleStatement;
import com.datastax.driver.core.TableMetadata;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.hadoop.ColumnFamilySplit;
import org.apache.cassandra.hadoop.ConfigHelper;
import org.apache.cassandra.hadoop.HadoopCompat;
import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.janusgraph.diskstorage.Entry;
import org.janusgraph.diskstorage.StaticBuffer;
import org.janusgraph.diskstorage.util.StaticArrayBuffer;
import org.janusgraph.diskstorage.util.StaticArrayEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static java.util.stream.Collectors.toList;

/**
 * Background: The {@link org.apache.cassandra.hadoop.cql3.CqlRecordReader} class has changed
 * significantly in Cassandra-3 from Cassandra-2. This class acts as a bridge between
 * CqlRecordReader in Cassandra-2 and Cassandra-3. In essence, this class recreates CqlRecordReader
 * from Cassandra-3 without referring to it (because otherwise we'd get the Cassandra-2
 * functionality of CqlRecordReader, which we don't want).
 *
 * @see Issue 172.
 */
public class GraknCqlBridgeRecordReader extends RecordReader<StaticBuffer, Iterable<Entry>> {

    /* Implementation note: This is inspired by Cassandra-3's
     * org/apache/cassandra/hadoop/cql3/CqlRecordReader.java */

    private static final Logger log = LoggerFactory.getLogger(GraknCqlBridgeRecordReader.class);

    private ColumnFamilySplit split;
    private DistinctKeyIterator distinctKeyIterator;
    private int totalRowCount; // total number of rows to fetch
    private String keyspace;
    private String cfName;
    private String cqlQuery;
    private Cluster cluster;
    private Session session;
    private IPartitioner partitioner;
    private String inputColumns;
    private String userDefinedWhereClauses;

    private final List<String> partitionKeys = new ArrayList<>();

    // partition keys -- key aliases
    private final LinkedHashMap<String, Boolean> partitionBoundColumns = Maps.newLinkedHashMap();
    private int nativeProtocolVersion = 1;

    // binary type mapping code from CassandraBinaryRecordReader
    private KV currentKV;

    GraknCqlBridgeRecordReader() { // package private
        super();
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
        this.split = (ColumnFamilySplit) split;
        Configuration conf = HadoopCompat.getConfiguration(context);
        totalRowCount = (this.split.getLength() < Long.MAX_VALUE)
                ? (int) this.split.getLength()
                : ConfigHelper.getInputSplitSize(conf);
        cfName = ConfigHelper.getInputColumnFamily(conf);
        keyspace = ConfigHelper.getInputKeyspace(conf);
        partitioner = ConfigHelper.getInputPartitioner(conf);
        inputColumns = CqlConfigHelper.getInputcolumns(conf);
        userDefinedWhereClauses = CqlConfigHelper.getInputWhereClauses(conf);

        try {
            if (cluster != null) {
                return;
            }
            // create a Cluster instance
            String[] locations = split.getLocations();
            // disregard the conf as it brings some unforeseen issues
            cluster = Cluster.builder()
                    .addContactPoints(locations)
                    .build();
        } catch (Exception e) {
            throw new RuntimeException("Unable to create cluster for table: " + cfName
                    + ", in keyspace: " + keyspace, e);
        }
        // cluster should now represent a valid cluster
        session = cluster.connect(quote(keyspace));
        Preconditions.checkState(session != null, "Can't create connection session");
        // get the negotiated serialization protocol
        nativeProtocolVersion = cluster.getConfiguration().getProtocolOptions().getProtocolVersion().toInt();

        // If the user provides a CQL query then we will use it without validation;
        // otherwise we will fall back to building a query using:
        //   inputColumns
        //   whereClauses
        cqlQuery = CqlConfigHelper.getInputCql(conf);
        // validate that the user hasn't tried to give us a custom query along with input columns
        // and where clauses
        if (StringUtils.isNotEmpty(cqlQuery)
                && (StringUtils.isNotEmpty(inputColumns) || StringUtils.isNotEmpty(userDefinedWhereClauses))) {
            throw new AssertionError("Cannot define a custom query with input columns and / or where clauses");
        }

        if (StringUtils.isEmpty(cqlQuery)) {
            cqlQuery = buildQuery();
        }
        log.trace("cqlQuery {}", cqlQuery);
        distinctKeyIterator = new DistinctKeyIterator();
        log.trace("created {}", distinctKeyIterator);
    }
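
    /*
     * For reference: initialize() reads the standard Cassandra Hadoop settings from the job
     * Configuration. A hypothetical setup (the setters are the stock Cassandra ConfigHelper
     * API; the keyspace, table, and partitioner values are illustrative only) might be:
     *
     *   Configuration conf = new Configuration();
     *   ConfigHelper.setInputColumnFamily(conf, "grakn", "edgestore");
     *   ConfigHelper.setInputPartitioner(conf, "org.apache.cassandra.dht.Murmur3Partitioner");
     */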
    public void close() {
        if (session != null) {
            session.close();
        }
        if (cluster != null) {
            cluster.close();
        }
    }

    private static class KV {
        private final StaticArrayBuffer key;
        private ArrayList<Entry> entries;

        KV(StaticArrayBuffer key) {
            this.key = key;
        }

        void addEntries(Collection<Entry> toAdd) {
            if (entries == null) {
                entries = new ArrayList<>(toAdd.size());
            }
            entries.addAll(toAdd);
        }
    }

    @Override
    public StaticBuffer getCurrentKey() {
        return currentKV.key;
    }

    @Override
    public Iterable<Entry> getCurrentValue() throws IOException {
        return currentKV.entries;
    }

    public float getProgress() {
        if (!distinctKeyIterator.hasNext()) {
            return 1.0F;
        }
        // the reported progress is likely to be slightly off the actual value, but close enough
        float progress = ((float) distinctKeyIterator.totalRead / totalRowCount);
        return progress > 1.0F ? 1.0F : progress;
    }

    public boolean nextKeyValue() throws IOException {
        final Map<StaticArrayBuffer, Map<StaticBuffer, StaticBuffer>> kv = distinctKeyIterator.next();
        if (kv == null) {
            return false;
        }
        final Map.Entry<StaticArrayBuffer, Map<StaticBuffer, StaticBuffer>> onlyEntry =
                Iterables.getOnlyElement(kv.entrySet());
        final KV newKV = new KV(onlyEntry.getKey());
        final Map<StaticBuffer, StaticBuffer> v = onlyEntry.getValue();
        final List<Entry> entries = v.keySet()
                .stream()
                .map(column -> StaticArrayEntry.of(column, v.get(column)))
                .collect(toList());
        newKV.addEntries(entries);
        currentKV = newKV;
        return true;
    }

    /**
     * Return the native protocol version of the cluster connection.
     *
     * @return serialization protocol version.
     */
    public int getNativeProtocolVersion() {
        return nativeProtocolVersion;
    }
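
    /*
     * Sketch of the per-key flow with hypothetical data: a DistinctKeyIterator element such as
     *   {key1 -> {columnA: valueA, columnB: valueB}}
     * becomes a KV whose entries are
     *   [StaticArrayEntry.of(columnA, valueA), StaticArrayEntry.of(columnB, valueB)],
     * which the Hadoop framework then reads back through getCurrentKey()/getCurrentValue().
     */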
    /**
     * A non-static nested class that represents an iterator for distinct keys based on the
     * row iterator from the DataStax driver. In the usual case, more than one row will be
     * associated with a single key in JanusGraph's use of Cassandra.
     */
    private class DistinctKeyIterator implements Iterator<Map<StaticArrayBuffer, Map<StaticBuffer, StaticBuffer>>> {
        public static final String KEY = "key";
        public static final String COLUMN_NAME = "column1";
        public static final String VALUE = "value";

        private final Iterator<Row> rowIterator;
        long totalRead;
        Row previousRow = null;

        DistinctKeyIterator() {
            AbstractType<?> type = partitioner.getTokenValidator();
            Object startToken = type.compose(type.fromString(split.getStartToken()));
            Object endToken = type.compose(type.fromString(split.getEndToken()));
            SimpleStatement statement = new SimpleStatement(cqlQuery, startToken, endToken);
            rowIterator = session.execute(statement).iterator();
            for (ColumnMetadata meta : cluster.getMetadata().getKeyspace(quote(keyspace))
                    .getTable(quote(cfName)).getPartitionKey()) {
                partitionBoundColumns.put(meta.getName(), Boolean.TRUE);
            }
        }

        @Override
        public boolean hasNext() {
            // a buffered row means there is still one more key group to return
            return previousRow != null || rowIterator.hasNext();
        }

        /**
         * Implements the business logic of the outer class.
         * Relies on the {@linkplain Iterator} of {@linkplain Row} to get a map of rows that
         * correspond to the same key.
         *
         * Note: This is not a general-purpose iterator. There is no provision for
         * {@linkplain java.util.ConcurrentModificationException} while iterating using this
         * iterator.
         *
         * @return the next element in the iteration of distinct keys; returns null to indicate
         * the end of the iteration
         */
        @Override
        public Map<StaticArrayBuffer, Map<StaticBuffer, StaticBuffer>> next() {
            if (previousRow == null && !rowIterator.hasNext()) {
                return null; // null means no more data
            }
            Map<StaticArrayBuffer, Map<StaticBuffer, StaticBuffer>> keyColumnValues = new HashMap<>(); // key -> (column1 -> value)
            Row row;
            if (previousRow == null) {
                row = rowIterator.next(); // just the first time, should succeed
            } else {
                row = previousRow;
                previousRow = null; // consume the row buffered by the previous call
            }
            totalRead++;
            StaticArrayBuffer key = StaticArrayBuffer.of(row.getBytesUnsafe(KEY));
            StaticBuffer column1 = StaticArrayBuffer.of(row.getBytesUnsafe(COLUMN_NAME));
            StaticBuffer value = StaticArrayBuffer.of(row.getBytesUnsafe(VALUE));
            Map<StaticBuffer, StaticBuffer> cvs = new HashMap<>();
            cvs.put(column1, value);
            keyColumnValues.put(key, cvs);
            while (rowIterator.hasNext()) {
                Row nextRow = rowIterator.next();
                StaticArrayBuffer nextKey = StaticArrayBuffer.of(nextRow.getBytesUnsafe(KEY));
                if (!key.equals(nextKey)) {
                    // first row of the next key group; buffer it for the next call
                    previousRow = nextRow;
                    break;
                }
                StaticBuffer nextColumn = StaticArrayBuffer.of(nextRow.getBytesUnsafe(COLUMN_NAME));
                StaticBuffer nextValue = StaticArrayBuffer.of(nextRow.getBytesUnsafe(VALUE));
                cvs.put(nextColumn, nextValue);
                totalRead++;
            }
            return keyColumnValues;
        }
    }
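
    /*
     * Illustration of the grouping behaviour with a hypothetical row stream
     *   (k1, c1, v1), (k1, c2, v2), (k2, c3, v3):
     * the first call to DistinctKeyIterator.next() consumes the two k1 rows and returns
     * {k1 -> {c1: v1, c2: v2}}, buffering (k2, c3, v3) in previousRow; the second call
     * returns {k2 -> {c3: v3}}; the third returns null.
     */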
    /**
     * Build a query for the reader, of the form:
     *
     * SELECT * FROM ks.cf WHERE token(pk1,...pkn) > ? AND token(pk1,...pkn) <= ?
     * [AND user where clauses] [ALLOW FILTERING]
     */
    private String buildQuery() {
        fetchKeys();

        List<String> columns = getSelectColumns();
        String selectColumnList = columns.size() == 0 ? "*" : makeColumnList(columns);
        String partitionKeyList = makeColumnList(partitionKeys);

        return String.format("SELECT %s FROM %s.%s WHERE token(%s)>? AND token(%s)<=?" + getAdditionalWhereClauses(),
                selectColumnList, quote(keyspace), quote(cfName), partitionKeyList, partitionKeyList);
    }

    private String getAdditionalWhereClauses() {
        String whereClause = "";
        if (StringUtils.isNotEmpty(userDefinedWhereClauses)) {
            whereClause += " AND " + userDefinedWhereClauses;
        }
        if (StringUtils.isNotEmpty(userDefinedWhereClauses)) {
            // ALLOW FILTERING is only required when user-defined restrictions are present
            whereClause += " ALLOW FILTERING";
        }
        return whereClause;
    }
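
    /*
     * Example of a generated query (hypothetical keyspace/table): for keyspace "grakn",
     * table "edgestore", a single partition key "key", no input columns, and no user where
     * clauses, buildQuery() produces:
     *   SELECT * FROM "grakn"."edgestore" WHERE token("key")>? AND token("key")<=?
     */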
    private List<String> getSelectColumns() {
        List<String> selectColumns = new ArrayList<>();

        if (StringUtils.isNotEmpty(inputColumns)) {
            // we must select all the partition keys plus any other columns the user wants
            selectColumns.addAll(partitionKeys);
            for (String column : Splitter.on(',').split(inputColumns)) {
                if (!partitionKeys.contains(column)) {
                    selectColumns.add(column);
                }
            }
        }
        return selectColumns;
    }

    private String makeColumnList(Collection<String> columns) {
        return columns.stream().map(this::quote).collect(Collectors.joining(","));
    }

    private void fetchKeys() {
        // get the column family metadata
        TableMetadata tableMetadata = session.getCluster()
                .getMetadata()
                .getKeyspace(Metadata.quote(keyspace))
                .getTable(Metadata.quote(cfName));
        if (tableMetadata == null) {
            throw new RuntimeException("No table metadata found for " + keyspace + "." + cfName);
        }
        // here we assume that tableMetadata.getPartitionKey() always
        // returns the list of columns in order of component_index
        for (ColumnMetadata partitionKey : tableMetadata.getPartitionKey()) {
            partitionKeys.add(partitionKey.getName());
        }
    }

    private String quote(String identifier) {
        return "\"" + identifier.replaceAll("\"", "\"\"") + "\"";
    }
}
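
For context, here is a minimal sketch of how the Hadoop framework would drive this reader. It is an illustration only: the `split` and `context` arguments are assumed to be supplied by the job, and the helper must live in the ai.grakn.kb.internal.computer package because the reader's constructor is package-private.

    // Hypothetical driver method, same package as the reader (illustration only).
    static void readSplit(ColumnFamilySplit split, TaskAttemptContext context) throws IOException {
        GraknCqlBridgeRecordReader reader = new GraknCqlBridgeRecordReader();
        reader.initialize(split, context);
        try {
            while (reader.nextKeyValue()) {
                StaticBuffer key = reader.getCurrentKey();
                for (Entry entry : reader.getCurrentValue()) {
                    // consume one (column, value) entry for this row key
                }
            }
        } finally {
            reader.close();
        }
    }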




