org.apache.phoenix.mapreduce.PhoenixInputFormat Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.mapreduce;
import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.RegionSizeCalculator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.phoenix.compile.QueryPlan;
import org.apache.phoenix.coprocessor.BaseScannerRegionObserver;
import org.apache.phoenix.iterate.MapReduceParallelScanGrouper;
import org.apache.phoenix.iterate.ParallelScanGrouper;
import org.apache.phoenix.jdbc.PhoenixStatement;
import org.apache.phoenix.mapreduce.util.ConnectionUtil;
import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil;
import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.MRJobType;
import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil.SchemaType;
import org.apache.phoenix.query.HBaseFactoryProvider;
import org.apache.phoenix.query.KeyRange;
import org.apache.phoenix.schema.stats.StatisticsUtil;
import org.apache.phoenix.util.PhoenixRuntime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
/**
* {@link InputFormat} implementation from Phoenix.
*
*/
public class PhoenixInputFormat extends InputFormat {
private static final Logger LOGGER = LoggerFactory.getLogger(PhoenixInputFormat.class);
/**
* instantiated by framework
*/
public PhoenixInputFormat() {
}
@Override
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
final Configuration configuration = context.getConfiguration();
final QueryPlan queryPlan = getQueryPlan(context,configuration);
@SuppressWarnings("unchecked")
final Class inputClass = (Class) PhoenixConfigurationUtil.getInputClass(configuration);
return getPhoenixRecordReader(inputClass, configuration, queryPlan);
}
@Override
public List getSplits(JobContext context) throws IOException, InterruptedException {
final Configuration configuration = context.getConfiguration();
final QueryPlan queryPlan = getQueryPlan(context,configuration);
return generateSplits(queryPlan, configuration);
}
private List generateSplits(final QueryPlan qplan, Configuration config) throws IOException {
// We must call this in order to initialize the scans and splits from the query plan
setupParallelScansFromQueryPlan(qplan);
final List splits = qplan.getSplits();
Preconditions.checkNotNull(splits);
// Get the RegionSizeCalculator
try(org.apache.hadoop.hbase.client.Connection connection =
HBaseFactoryProvider.getHConnectionFactory().createConnection(config)) {
RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf(qplan
.getTableRef().getTable().getPhysicalName().toString()));
RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(regionLocator, connection
.getAdmin());
final List psplits = Lists.newArrayListWithExpectedSize(splits.size());
for (List scans : qplan.getScans()) {
// Get the region location
HRegionLocation location = regionLocator.getRegionLocation(
scans.get(0).getStartRow(),
false
);
String regionLocation = location.getHostname();
// Get the region size
long regionSize = sizeCalculator.getRegionSize(
location.getRegionInfo().getRegionName()
);
// Generate splits based off statistics, or just region splits?
boolean splitByStats = PhoenixConfigurationUtil.getSplitByStats(config);
if(splitByStats) {
for(Scan aScan: scans) {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Split for scan : " + aScan + "with scanAttribute : " + aScan
.getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : [" +
aScan.getCaching() + ", " + aScan.getCacheBlocks() + ", " + aScan
.getBatch() + "] and regionLocation : " + regionLocation);
}
psplits.add(new PhoenixInputSplit(Collections.singletonList(aScan), regionSize, regionLocation));
}
} else {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Scan count[" + scans.size() + "] : " + Bytes.toStringBinary(scans
.get(0).getStartRow()) + " ~ " + Bytes.toStringBinary(scans.get(scans
.size() - 1).getStopRow()));
LOGGER.debug("First scan : " + scans.get(0) + "with scanAttribute : " + scans
.get(0).getAttributesMap() + " [scanCache, cacheBlock, scanBatch] : " +
"[" + scans.get(0).getCaching() + ", " + scans.get(0).getCacheBlocks()
+ ", " + scans.get(0).getBatch() + "] and regionLocation : " +
regionLocation);
for (int i = 0, limit = scans.size(); i < limit; i++) {
LOGGER.debug("EXPECTED_UPPER_REGION_KEY[" + i + "] : " + Bytes
.toStringBinary(scans.get(i).getAttribute
(BaseScannerRegionObserver.EXPECTED_UPPER_REGION_KEY)));
}
}
psplits.add(new PhoenixInputSplit(scans, regionSize, regionLocation));
}
}
return psplits;
}
}
/**
* Returns the query plan associated with the select query.
* @param context
* @return
* @throws IOException
* @throws SQLException
*/
protected QueryPlan getQueryPlan(final JobContext context, final Configuration configuration)
throws IOException {
Preconditions.checkNotNull(context);
try {
final String txnScnValue = configuration.get(PhoenixConfigurationUtil.TX_SCN_VALUE);
final String currentScnValue = configuration.get(PhoenixConfigurationUtil.CURRENT_SCN_VALUE);
final String tenantId = configuration.get(PhoenixConfigurationUtil.MAPREDUCE_TENANT_ID);
final Properties overridingProps = new Properties();
if(txnScnValue==null && currentScnValue!=null) {
overridingProps.put(PhoenixRuntime.CURRENT_SCN_ATTRIB, currentScnValue);
}
if (tenantId != null && configuration.get(PhoenixRuntime.TENANT_ID_ATTRIB) == null){
overridingProps.put(PhoenixRuntime.TENANT_ID_ATTRIB, tenantId);
}
try (final Connection connection = ConnectionUtil.getInputConnection(configuration, overridingProps);
final Statement statement = connection.createStatement()) {
MRJobType mrJobType = PhoenixConfigurationUtil.getMRJobType(configuration, MRJobType.QUERY.name());
String selectStatement;
switch (mrJobType) {
case UPDATE_STATS:
// This select statement indicates MR job for full table scan for stats collection
selectStatement = "SELECT * FROM " + PhoenixConfigurationUtil.getInputTableName(configuration);
break;
default:
selectStatement = PhoenixConfigurationUtil.getSelectStatement(configuration);
}
Preconditions.checkNotNull(selectStatement);
final PhoenixStatement pstmt = statement.unwrap(PhoenixStatement.class);
// Optimize the query plan so that we potentially use secondary indexes
final QueryPlan queryPlan = pstmt.optimizeQuery(selectStatement);
final Scan scan = queryPlan.getContext().getScan();
if (mrJobType == MRJobType.UPDATE_STATS) {
StatisticsUtil.setScanAttributes(scan, null);
}
// since we can't set a scn on connections with txn set TX_SCN attribute so that the max time range is set by BaseScannerRegionObserver
if (txnScnValue != null) {
scan.setAttribute(BaseScannerRegionObserver.TX_SCN, Bytes.toBytes(Long.valueOf(txnScnValue)));
}
// setting the snapshot configuration
String snapshotName = configuration.get(PhoenixConfigurationUtil.SNAPSHOT_NAME_KEY);
if (snapshotName != null) {
PhoenixConfigurationUtil.setSnapshotNameKey(queryPlan.getContext().getConnection().
getQueryServices().getConfiguration(), snapshotName);
}
return queryPlan;
}
} catch (Exception exception) {
LOGGER.error(String.format("Failed to get the query plan with error [%s]",
exception.getMessage()));
throw new RuntimeException(exception);
}
}
void setupParallelScansFromQueryPlan(QueryPlan queryPlan) {
setupParallelScansWithScanGrouper(queryPlan, MapReduceParallelScanGrouper.getInstance());
}
RecordReader getPhoenixRecordReader(Class inputClass,
Configuration configuration, QueryPlan queryPlan) {
return new PhoenixRecordReader<>(inputClass , configuration, queryPlan,
MapReduceParallelScanGrouper.getInstance());
}
/**
* Initialize the query plan so it sets up the parallel scans
* @param queryPlan Query plan corresponding to the select query
* @param scanGrouper Parallel scan grouper
*/
void setupParallelScansWithScanGrouper(QueryPlan queryPlan, ParallelScanGrouper scanGrouper) {
Preconditions.checkNotNull(queryPlan);
try {
queryPlan.iterator(scanGrouper);
} catch (SQLException e) {
LOGGER.error(String.format("Setting up parallel scans for the query plan failed "
+ "with error [%s]", e.getMessage()));
throw new RuntimeException(e);
}
}
}