io.cdap.plugin.gcp.spanner.source.SpannerInputFormat Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of google-cloud Show documentation
Show all versions of google-cloud Show documentation
Plugins for Google Big Query
/*
* Copyright © 2018 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.plugin.gcp.spanner.source;
import com.google.cloud.spanner.BatchTransactionId;
import com.google.cloud.spanner.Partition;
import com.google.cloud.spanner.ResultSet;
import io.cdap.plugin.gcp.spanner.SpannerConstants;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
/**
* Spanner input format, deserializes and gets the partitions list from configuration to create input splits.
*/
public class SpannerInputFormat extends InputFormat {
private static final Logger LOG = LoggerFactory.getLogger(SpannerInputFormat.class);
/**
* Deserialize partitions list from configuration and create {@link PartitionInputSplit PartitionInputSplits}
* using them.
*/
@Override
public List getSplits(JobContext jobContext) throws IOException, InterruptedException {
Configuration configuration = jobContext.getConfiguration();
List partitionSplits = new ArrayList<>();
try {
ArrayList partitions = deserializeObject(configuration, SpannerConstants.PARTITIONS_LIST);
for (Partition partition : partitions) {
PartitionInputSplit split = new PartitionInputSplit(partition);
partitionSplits.add(split);
}
} catch (Exception e) {
throw new IOException("Exception while trying to initialize spanner and create partition splits", e);
}
LOG.debug("Initialized and configured {} splits", partitionSplits.size());
return partitionSplits;
}
private T deserializeObject(Configuration configuration, String property) throws IOException {
String propertyValue = configuration.get(property);
if (propertyValue == null) {
throw new IOException(String.format("Spanner %s is not available in the configuration", property));
}
byte[] propertyValueBytes = Base64.getDecoder().decode(propertyValue);
try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(propertyValueBytes);
ObjectInputStream objectInputStream = new ObjectInputStream(byteArrayInputStream)) {
return (T) objectInputStream.readObject();
} catch (ClassNotFoundException cfe) {
throw new IOException("Exception while trying to deserialize object ", cfe);
}
}
@Override
public RecordReader createRecordReader(
InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
Configuration configuration = taskAttemptContext.getConfiguration();
BatchTransactionId batchTransactionId =
deserializeObject(configuration, SpannerConstants.SPANNER_BATCH_TRANSACTION_ID);
return new SpannerRecordReader(batchTransactionId);
}
}