org.apache.phoenix.iterate.OrderedResultIterator Maven / Gradle / Ivy
Show all versions of phoenix-server-hbase-2.6
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.iterate;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.compile.ExplainPlanAttributes.ExplainPlanAttributesBuilder;
import org.apache.phoenix.coprocessorclient.BaseScannerRegionObserverConstants;
import org.apache.phoenix.exception.PhoenixIOException;
import org.apache.phoenix.execute.DescVarLengthFastByteComparisons;
import org.apache.phoenix.expression.Expression;
import org.apache.phoenix.expression.OrderByExpression;
import org.apache.phoenix.query.QueryServices;
import org.apache.phoenix.schema.SortOrder;
import org.apache.phoenix.schema.tuple.Tuple;
import org.apache.phoenix.thirdparty.com.google.common.base.Function;
import org.apache.phoenix.thirdparty.com.google.common.collect.Collections2;
import org.apache.phoenix.thirdparty.com.google.common.collect.Lists;
import org.apache.phoenix.thirdparty.com.google.common.collect.Ordering;
import org.apache.phoenix.util.ByteUtil;
import org.apache.phoenix.util.ClientUtil;
import org.apache.phoenix.util.EnvironmentEdgeManager;
import org.apache.phoenix.util.PhoenixKeyValueUtil;
import org.apache.phoenix.util.ScanUtil;
import org.apache.phoenix.util.SizedUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import static org.apache.phoenix.thirdparty.com.google.common.base.Preconditions.checkArgument;
import static org.apache.phoenix.thirdparty.com.google.common.base.Preconditions.checkPositionIndex;
import static org.apache.phoenix.util.ScanUtil.isDummy;
/**
* Result scanner that sorts aggregated rows by columns specified in the ORDER BY clause.
*
* Note that currently the sort is entirely done in memory.
*
*
* @since 0.1
*/
public class OrderedResultIterator implements PeekingResultIterator {
private static final Logger LOGGER = LoggerFactory.getLogger(OrderedResultIterator.class);
/** A container that holds pointers to a {@link Result} and its sort keys. */
protected static class ResultEntry {
protected final ImmutableBytesWritable[] sortKeys;
protected final Tuple result;
ResultEntry(ImmutableBytesWritable[] sortKeys, Tuple result) {
this.sortKeys = sortKeys;
this.result = result;
}
ImmutableBytesWritable getSortKey(int index) {
checkPositionIndex(index, sortKeys.length);
return sortKeys[index];
}
Tuple getResult() {
return result;
}
static long sizeOf(ResultEntry e) {
return sizeof(e.sortKeys) + sizeof(toKeyValues(e));
}
private static long sizeof(List kvs) {
long size = Bytes.SIZEOF_INT; // totalLen
for (KeyValue kv : kvs) {
size += kv.getLength();
size += Bytes.SIZEOF_INT; // kv.getLength
}
return size;
}
private static long sizeof(ImmutableBytesWritable[] sortKeys) {
long size = Bytes.SIZEOF_INT;
if (sortKeys != null) {
for (ImmutableBytesWritable sortKey : sortKeys) {
if (sortKey != null) {
size += sortKey.getLength();
}
size += Bytes.SIZEOF_INT;
}
}
return size;
}
private static List toKeyValues(ResultEntry entry) {
Tuple result = entry.getResult();
int size = result.size();
List kvs = new ArrayList(size);
for (int i = 0; i < size; i++) {
kvs.add(PhoenixKeyValueUtil.maybeCopyCell(result.getValue(i)));
}
return kvs;
}
}
/** A function that returns Nth key for a given {@link ResultEntry}. */
private static class NthKey implements Function {
private final int index;
NthKey(int index) {
this.index = index;
}
@Override
public ImmutableBytesWritable apply(ResultEntry entry) {
return entry.getSortKey(index);
}
}
/** Returns the expression of a given {@link OrderByExpression}. */
private static final Function TO_EXPRESSION = new Function() {
@Override
public Expression apply(OrderByExpression column) {
return column.getExpression();
}
};
private final boolean spoolingEnabled;
private final long thresholdBytes;
private final Integer limit;
private final Integer offset;
private final ResultIterator delegate;
private final List orderByExpressions;
private final long estimatedByteSize;
private PeekingResultIterator resultIterator;
private boolean resultIteratorReady = false;
private Tuple dummyTuple = null;
private long byteSize;
private long pageSizeMs;
private Scan scan;
private byte[] scanStartRowKey;
private byte[] actualScanStartRowKey;
private Boolean actualScanIncludeStartRowKey;
private RegionInfo regionInfo = null;
private boolean includeStartRowKey;
private boolean serverSideIterator = false;
private boolean firstScan = true;
private boolean skipValidRowsSent = false;
protected ResultIterator getDelegate() {
return delegate;
}
public OrderedResultIterator(ResultIterator delegate, List orderByExpressions,
boolean spoolingEnabled, long thresholdBytes, Integer limit, Integer offset) {
this(delegate, orderByExpressions, spoolingEnabled, thresholdBytes, limit, offset, 0, Long.MAX_VALUE);
}
public OrderedResultIterator(ResultIterator delegate, List orderByExpressions,
boolean spoolingEnabled, long thresholdBytes) throws SQLException {
this(delegate, orderByExpressions, spoolingEnabled, thresholdBytes, null, null);
}
public OrderedResultIterator(ResultIterator delegate,
List orderByExpressions, boolean spoolingEnabled,
long thresholdBytes, Integer limit, Integer offset, int estimatedRowSize) {
this(delegate, orderByExpressions, spoolingEnabled, thresholdBytes, limit, offset, estimatedRowSize, Long.MAX_VALUE);
}
public OrderedResultIterator(ResultIterator delegate,
List orderByExpressions,
boolean spoolingEnabled,
long thresholdBytes, Integer limit, Integer offset,
int estimatedRowSize, long pageSizeMs, Scan scan,
RegionInfo regionInfo) {
this(delegate, orderByExpressions, spoolingEnabled, thresholdBytes, limit, offset,
estimatedRowSize, pageSizeMs);
this.scan = scan;
// If scan start rowkey is empty, use region boundaries. Reverse region boundaries
// for reverse scan.
// Keep this same as ServerUtil#getScanStartRowKeyFromScanOrRegionBoundaries.
this.scanStartRowKey =
scan.getStartRow().length > 0 ? scan.getStartRow()
: (scan.isReversed() ? regionInfo.getEndKey()
: regionInfo.getStartKey());
// Retrieve start rowkey of the previous scan. This would be different than
// current scan start rowkey if the region has recently moved or split or merged.
this.actualScanStartRowKey =
scan.getAttribute(BaseScannerRegionObserverConstants.SCAN_ACTUAL_START_ROW);
this.actualScanIncludeStartRowKey = true;
this.includeStartRowKey = scan.includeStartRow();
this.serverSideIterator = true;
this.regionInfo = regionInfo;
}
public OrderedResultIterator(ResultIterator delegate,
List orderByExpressions, boolean spoolingEnabled,
long thresholdBytes, Integer limit, Integer offset, int estimatedRowSize, long pageSizeMs) {
checkArgument(!orderByExpressions.isEmpty());
this.delegate = delegate;
this.orderByExpressions = orderByExpressions;
this.spoolingEnabled = spoolingEnabled;
this.thresholdBytes = thresholdBytes;
this.offset = offset == null ? 0 : offset;
if (limit != null) {
this.limit = limit + this.offset;
} else {
this.limit = null;
}
long estimatedEntrySize =
// ResultEntry
SizedUtil.OBJECT_SIZE +
// ImmutableBytesWritable[]
SizedUtil.ARRAY_SIZE + orderByExpressions.size() * SizedUtil.IMMUTABLE_BYTES_WRITABLE_SIZE +
// Tuple
SizedUtil.OBJECT_SIZE + estimatedRowSize;
// Make sure we don't overflow Long, though this is really unlikely to happen.
assert(limit == null || Long.MAX_VALUE / estimatedEntrySize >= limit + this.offset);
// Both BufferedSortedQueue and SizeBoundQueue won't allocate more than thresholdBytes.
this.estimatedByteSize = limit == null ? 0 : Math.min((limit + this.offset) * estimatedEntrySize, thresholdBytes);
this.pageSizeMs = pageSizeMs;
}
public Integer getLimit() {
return limit;
}
public long getEstimatedByteSize() {
return estimatedByteSize;
}
public long getByteSize() {
return byteSize;
}
/**
* Builds a comparator from the list of columns in ORDER BY clause.
* @param orderByExpressions the columns in ORDER BY clause.
* @return the comparator built from the list of columns in ORDER BY clause.
*/
// ImmutableBytesWritable.Comparator doesn't implement generics
@SuppressWarnings("unchecked")
private static Comparator buildComparator(List orderByExpressions) {
Ordering ordering = null;
int pos = 0;
for (OrderByExpression col : orderByExpressions) {
Expression e = col.getExpression();
Comparator comparator =
e.getSortOrder() == SortOrder.DESC && !e.getDataType().isFixedWidth()
? buildDescVarLengthComparator()
: new ImmutableBytesWritable.Comparator();
Ordering o = Ordering.from(comparator);
if(!col.isAscending()) o = o.reverse();
o = col.isNullsLast() ? o.nullsLast() : o.nullsFirst();
Ordering entryOrdering = o.onResultOf(new NthKey(pos++));
ordering = ordering == null ? entryOrdering : ordering.compound(entryOrdering);
}
return ordering;
}
/*
* Same as regular comparator, but if all the bytes match and the length is
* different, returns the longer length as bigger.
*/
private static Comparator buildDescVarLengthComparator() {
return new Comparator() {
@Override
public int compare(ImmutableBytesWritable o1, ImmutableBytesWritable o2) {
return DescVarLengthFastByteComparisons.compareTo(
o1.get(), o1.getOffset(), o1.getLength(),
o2.get(), o2.getOffset(), o2.getLength());
}
};
}
@Override
public Tuple next() throws SQLException {
try {
if (firstScan && serverSideIterator && actualScanStartRowKey != null
&& actualScanIncludeStartRowKey != null) {
if (scanStartRowKey.length > 0 && !ScanUtil.isLocalIndex(scan)) {
if (Bytes.compareTo(actualScanStartRowKey, scanStartRowKey) != 0
|| actualScanIncludeStartRowKey != includeStartRowKey) {
LOGGER.info("Region has moved. Actual scan start rowkey {} is not same as"
+ " current scan start rowkey {}",
Bytes.toStringBinary(actualScanStartRowKey),
Bytes.toStringBinary(scanStartRowKey));
// If region has moved in the middle of the scan operation, after resetting
// the scanner, hbase client uses (latest received rowkey + \x00) as new
// start rowkey for resuming the scan operation on the new scanner.
if (Bytes.compareTo(
ByteUtil.concat(actualScanStartRowKey, ByteUtil.ZERO_BYTE),
scanStartRowKey) == 0) {
scan.setAttribute(QueryServices.PHOENIX_PAGING_NEW_SCAN_START_ROWKEY,
actualScanStartRowKey);
scan.setAttribute(
QueryServices.PHOENIX_PAGING_NEW_SCAN_START_ROWKEY_INCLUDE,
Bytes.toBytes(actualScanIncludeStartRowKey));
} else {
// This happens when the server side scanner has already sent some
// rows back to the client and region has moved, so now we need to
// use skipValidRowsSent flag and also reset the scanner
// at paging region scanner level to re-read the previously sent
// values in order to re-compute the aggregation and then return
// only the next rowkey that was not yet sent back to the client.
skipValidRowsSent = true;
scan.setAttribute(QueryServices.PHOENIX_PAGING_NEW_SCAN_START_ROWKEY,
actualScanStartRowKey);
scan.setAttribute(
QueryServices.PHOENIX_PAGING_NEW_SCAN_START_ROWKEY_INCLUDE,
Bytes.toBytes(actualScanIncludeStartRowKey));
}
}
}
}
if (firstScan) {
firstScan = false;
}
getResultIterator();
if (!resultIteratorReady) {
return dummyTuple;
}
Tuple result = resultIterator.next();
if (skipValidRowsSent) {
while (true) {
if (result == null) {
skipValidRowsSent = false;
return null;
}
ImmutableBytesWritable ptr = new ImmutableBytesWritable();
result.getKey(ptr);
byte[] resultRowKey = new byte[ptr.getLength()];
System.arraycopy(ptr.get(), ptr.getOffset(), resultRowKey, 0,
resultRowKey.length);
// In case of regular scans, if the region moves and scanner is reset,
// hbase client checks the last returned row by the server, gets the
// rowkey and appends "\x00" byte, before resuming the scan. With this,
// scan includeStartRowKey is set to true.
// However, same is not the case with reverse scans. For the reverse scan,
// hbase client checks the last returned row by the server, gets the
// rowkey and treats it as startRowKey for resuming the scan. With this,
// scan includeStartRowKey is set to false.
// Hence, we need to cover both cases here.
if (Bytes.compareTo(resultRowKey, scanStartRowKey) == 0) {
// This can be true for reverse scan case.
skipValidRowsSent = false;
if (includeStartRowKey) {
return result;
}
// If includeStartRowKey is false and the current rowkey is matching
// with scanStartRowKey, return the next row result.
return resultIterator.next();
} else if (
Bytes.compareTo(
ByteUtil.concat(resultRowKey, ByteUtil.ZERO_BYTE),
scanStartRowKey) == 0) {
// This can be true for regular scan case.
skipValidRowsSent = false;
if (includeStartRowKey) {
// If includeStartRowKey is true and the (current rowkey + "\0xx") is
// matching with scanStartRowKey, return the next row result.
return resultIterator.next();
}
}
result = resultIterator.next();
}
}
return result;
} catch (Exception e) {
LOGGER.error("Ordered result iterator next encountered error " + (regionInfo != null
? " for region: " + regionInfo.getRegionNameAsString() : "."), e);
if (e instanceof SQLException) {
throw e;
} else {
throw new PhoenixIOException(e);
}
}
}
private PeekingResultIterator getResultIterator() throws SQLException {
if (resultIteratorReady) {
// The results have not been ordered yet. When the results are ordered then the result iterator
// will be ready to iterate over them
return resultIterator;
}
final int numSortKeys = orderByExpressions.size();
List expressions = Lists.newArrayList(Collections2.transform(orderByExpressions, TO_EXPRESSION));
final Comparator comparator = buildComparator(orderByExpressions);
try{
if (resultIterator == null) {
resultIterator = new RecordPeekingResultIterator(PhoenixQueues.newResultEntrySortedQueue(comparator,
limit, spoolingEnabled, thresholdBytes));
}
final SizeAwareQueue queueEntries = ((RecordPeekingResultIterator)resultIterator).getQueueEntries();
long startTime = EnvironmentEdgeManager.currentTimeMillis();
for (Tuple result = delegate.next(); result != null; result = delegate.next()) {
// result might be empty if it was filtered by a local index
if (result.size() == 0) {
continue;
}
if (isDummy(result)) {
getDummyResult();
return resultIterator;
}
int pos = 0;
ImmutableBytesWritable[] sortKeys = new ImmutableBytesWritable[numSortKeys];
for (Expression expression : expressions) {
final ImmutableBytesWritable sortKey = new ImmutableBytesWritable();
boolean evaluated = expression.evaluate(result, sortKey);
// set the sort key that failed to get evaluated with null
sortKeys[pos++] = evaluated && sortKey.getLength() > 0 ? sortKey : null;
}
queueEntries.add(new ResultEntry(sortKeys, result));
if (EnvironmentEdgeManager.currentTimeMillis() - startTime >= pageSizeMs) {
getDummyResult();
return resultIterator;
}
}
resultIteratorReady = true;
this.byteSize = queueEntries.getByteSize();
} catch (IOException e) {
LOGGER.error("Error while getting result iterator from OrderedResultIterator.", e);
ClientUtil.createIOException(e.getMessage(), e);
throw new SQLException(e);
} finally {
delegate.close();
}
return resultIterator;
}
/**
* Retrieve dummy rowkey.
*/
private void getDummyResult() {
if (scanStartRowKey.length > 0 && !ScanUtil.isLocalIndex(scan)) {
if (Bytes.compareTo(actualScanStartRowKey, scanStartRowKey) != 0
|| actualScanIncludeStartRowKey != includeStartRowKey) {
byte[] lastByte =
new byte[]{scanStartRowKey[scanStartRowKey.length - 1]};
if (scanStartRowKey.length > 1 && Bytes.compareTo(lastByte,
ByteUtil.ZERO_BYTE) == 0) {
byte[] prevKey = new byte[scanStartRowKey.length - 1];
System.arraycopy(scanStartRowKey, 0, prevKey, 0,
prevKey.length);
dummyTuple = ScanUtil.getDummyTuple(prevKey);
} else {
dummyTuple = ScanUtil.getDummyTuple(scanStartRowKey);
}
} else {
dummyTuple = ScanUtil.getDummyTuple(scanStartRowKey);
}
} else {
dummyTuple = ScanUtil.getDummyTuple(scanStartRowKey);
}
}
@Override
public Tuple peek() throws SQLException {
return getResultIterator().peek();
}
@Override
public void close() throws SQLException {
// Guard against resultIterator being null
if (null != resultIterator) {
resultIterator.close();
}
resultIterator = PeekingResultIterator.EMPTY_ITERATOR;
}
@Override
public void explain(List planSteps) {
delegate.explain(planSteps);
planSteps.add("CLIENT" + (offset == null || offset == 0 ? "" : " OFFSET " + offset)
+ (limit == null ? "" : " TOP " + limit + " ROW" + (limit == 1 ? "" : "S")) + " SORTED BY "
+ orderByExpressions.toString());
}
@Override
public void explain(List planSteps,
ExplainPlanAttributesBuilder explainPlanAttributesBuilder) {
delegate.explain(planSteps, explainPlanAttributesBuilder);
explainPlanAttributesBuilder.setClientOffset(offset);
explainPlanAttributesBuilder.setClientRowLimit(limit);
explainPlanAttributesBuilder.setClientSortedBy(
orderByExpressions.toString());
planSteps.add("CLIENT" + (offset == null || offset == 0 ? "" : " OFFSET " + offset)
+ (limit == null ? "" : " TOP " + limit + " ROW" + (limit == 1 ? "" : "S"))
+ " SORTED BY " + orderByExpressions.toString());
}
@Override
public String toString() {
return "OrderedResultIterator [thresholdBytes=" + thresholdBytes
+ ", limit=" + limit + ", offset=" + offset + ", delegate=" + delegate
+ ", orderByExpressions=" + orderByExpressions
+ ", estimatedByteSize=" + estimatedByteSize
+ ", resultIterator=" + resultIterator + ", byteSize="
+ byteSize + "]";
}
private class RecordPeekingResultIterator implements PeekingResultIterator {
int count = 0;
private SizeAwareQueue queueEntries;
RecordPeekingResultIterator(SizeAwareQueue queueEntries){
this.queueEntries = queueEntries;
}
public SizeAwareQueue getQueueEntries() {
return queueEntries;
}
@Override
public Tuple next() throws SQLException {
ResultEntry entry = queueEntries.poll();
while (entry != null && offset != null && count < offset) {
count++;
if (entry.getResult() == null) { return null; }
entry = queueEntries.poll();
}
if (entry == null || (limit != null && count++ > limit)) {
resultIterator.close();
resultIterator = PeekingResultIterator.EMPTY_ITERATOR;
return null;
}
return entry.getResult();
}
@Override
public Tuple peek() throws SQLException {
ResultEntry entry = queueEntries.peek();
while (entry != null && offset != null && count < offset) {
entry = queueEntries.poll();
count++;
if (entry == null) { return null; }
}
if (limit != null && count > limit) { return null; }
entry = queueEntries.peek();
if (entry == null) { return null; }
return entry.getResult();
}
@Override
public void explain(List planSteps) {
}
@Override
public void explain(List planSteps,
ExplainPlanAttributesBuilder explainPlanAttributesBuilder) {
}
@Override
public void close() throws SQLException {
try {
queueEntries.close();
} catch (IOException e) {
throw new SQLException(e);
}
}
}
}