org.apache.hadoop.hbase.codec.prefixtree.encode.PrefixTreeEncoder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of alihbase-prefix-tree Show documentation
Show all versions of alihbase-prefix-tree Show documentation
Prefix Tree Data Block Encoder
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.codec.prefixtree.encode;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.codec.prefixtree.PrefixTreeBlockMeta;
import org.apache.hadoop.hbase.codec.prefixtree.encode.column.ColumnSectionWriter;
import org.apache.hadoop.hbase.codec.prefixtree.encode.other.CellTypeEncoder;
import org.apache.hadoop.hbase.codec.prefixtree.encode.other.ColumnNodeType;
import org.apache.hadoop.hbase.codec.prefixtree.encode.other.LongEncoder;
import org.apache.hadoop.hbase.codec.prefixtree.encode.row.RowSectionWriter;
import org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize.Tokenizer;
import org.apache.hadoop.hbase.io.CellOutputStream;
import org.apache.hadoop.hbase.util.ArrayUtils;
import org.apache.hadoop.hbase.util.ByteRange;
import org.apache.hadoop.hbase.util.SimpleMutableByteRange;
import org.apache.hadoop.hbase.util.byterange.ByteRangeSet;
import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeHashSet;
import org.apache.hadoop.hbase.util.byterange.impl.ByteRangeTreeSet;
import org.apache.hadoop.hbase.util.vint.UFIntTool;
import org.apache.hadoop.io.WritableUtils;
/**
* This is the primary class for converting a CellOutputStream into an encoded byte[]. As Cells are
* added they are completely copied into the various encoding structures. This is important because
* usually the cells being fed in during compactions will be transient.
*
* Usage:
* 1) constructor
* 2) append cells in sorted order: write(Cell cell)
* 3) flush()
*/
@InterfaceAudience.Private
public class PrefixTreeEncoder implements CellOutputStream {
/**************** static ************************/
protected static final Log LOG = LogFactory.getLog(PrefixTreeEncoder.class);
//future-proofing for when HBase supports multiple families in a data block.
public static final boolean MULITPLE_FAMILIES_POSSIBLE = false;
private static final boolean USE_HASH_COLUMN_SORTER = true;
private static final int INITIAL_PER_CELL_ARRAY_SIZES = 256;
private static final int VALUE_BUFFER_INIT_SIZE = 64 * 1024;
/**************** fields *************************/
protected long numResets = 0L;
protected OutputStream outputStream;
/*
* Cannot change during a single block's encoding. If false, then substitute incoming Cell's
* mvccVersion with zero and write out the block as usual.
*/
protected boolean includeMvccVersion;
/*
* reusable ByteRanges used for communicating with the sorters/compilers
*/
protected ByteRange rowRange;
protected ByteRange familyRange;
protected ByteRange qualifierRange;
protected ByteRange tagsRange;
/*
* incoming Cell fields are copied into these arrays
*/
protected long[] timestamps;
protected long[] mvccVersions;
protected byte[] typeBytes;
protected int[] valueOffsets;
protected int[] tagsOffsets;
protected byte[] values;
protected byte[] tags;
protected PrefixTreeBlockMeta blockMeta;
/*
* Sub-encoders for the simple long/byte fields of a Cell. Add to these as each cell arrives and
* compile before flushing.
*/
protected LongEncoder timestampEncoder;
protected LongEncoder mvccVersionEncoder;
protected CellTypeEncoder cellTypeEncoder;
/*
* Structures used for collecting families and qualifiers, de-duplicating them, and sorting them
* so they can be passed to the tokenizers. Unlike row keys where we can detect duplicates by
* comparing only with the previous row key, families and qualifiers can arrive in unsorted order
* in blocks spanning multiple rows. We must collect them all into a set to de-duplicate them.
*/
protected ByteRangeSet familyDeduplicator;
protected ByteRangeSet qualifierDeduplicator;
protected ByteRangeSet tagsDeduplicator;
/*
* Feed sorted byte[]s into these tokenizers which will convert the byte[]s to an in-memory
* trie structure with nodes connected by memory pointers (not serializable yet).
*/
protected Tokenizer rowTokenizer;
protected Tokenizer familyTokenizer;
protected Tokenizer qualifierTokenizer;
protected Tokenizer tagsTokenizer;
/*
* Writers take an in-memory trie, sort the nodes, calculate offsets and lengths, and write
* all information to an output stream of bytes that can be stored on disk.
*/
protected RowSectionWriter rowWriter;
protected ColumnSectionWriter familyWriter;
protected ColumnSectionWriter qualifierWriter;
protected ColumnSectionWriter tagsWriter;
/*
* Integers used for counting cells and bytes. We keep track of the size of the Cells as if they
* were full KeyValues because some parts of HBase like to know the "unencoded size".
*/
protected int totalCells = 0;
protected int totalUnencodedBytes = 0;//numBytes if the cells were KeyValues
protected int totalValueBytes = 0;
protected int totalTagBytes = 0;
protected int maxValueLength = 0;
protected int maxTagLength = 0;
protected int totalBytes = 0;//
/***************** construct ***********************/
/**
 * Allocates every reusable buffer, sub-encoder, tokenizer and section writer, then prepares the
 * encoder for its first block by calling {@link #reset(OutputStream, boolean)}.
 *
 * @param outputStream passed straight to reset(); flush() later writes the encoded block to it
 * @param includeMvccVersion passed straight to reset(); when false, mvccVersions are stored as 0
 */
public PrefixTreeEncoder(OutputStream outputStream, boolean includeMvccVersion) {
  // structures that are filled while cells are being appended
  blockMeta = new PrefixTreeBlockMeta();
  rowRange = new SimpleMutableByteRange();
  familyRange = new SimpleMutableByteRange();
  qualifierRange = new SimpleMutableByteRange();
  timestamps = new long[INITIAL_PER_CELL_ARRAY_SIZES];
  mvccVersions = new long[INITIAL_PER_CELL_ARRAY_SIZES];
  typeBytes = new byte[INITIAL_PER_CELL_ARRAY_SIZES];
  valueOffsets = new int[INITIAL_PER_CELL_ARRAY_SIZES];
  values = new byte[VALUE_BUFFER_INIT_SIZE];

  // structures that are only consulted when the block is compiled
  if (USE_HASH_COLUMN_SORTER) {
    familyDeduplicator = new ByteRangeHashSet();
    qualifierDeduplicator = new ByteRangeHashSet();
  } else {
    familyDeduplicator = new ByteRangeTreeSet();
    qualifierDeduplicator = new ByteRangeTreeSet();
  }
  timestampEncoder = new LongEncoder();
  mvccVersionEncoder = new LongEncoder();
  cellTypeEncoder = new CellTypeEncoder();
  rowTokenizer = new Tokenizer();
  familyTokenizer = new Tokenizer();
  qualifierTokenizer = new Tokenizer();
  rowWriter = new RowSectionWriter();
  familyWriter = new ColumnSectionWriter();
  qualifierWriter = new ColumnSectionWriter();

  // the tag helpers must exist before reset() runs, because reset() resets them too
  initializeTagHelpers();
  reset(outputStream, includeMvccVersion);
}
/**
 * Prepares this encoder for a new block so that a single instance can be reused.
 *
 * Increments numResets, stores the new destination stream and mvcc flag, resets every
 * de-duplicator, tokenizer, sub-encoder and section writer, and zeroes all per-block counters,
 * including the tag counters.
 *
 * @param outputStream stream that the next flush() writes to
 * @param includeMvccVersion whether incoming cells' mvccVersions are recorded; when false they
 *          are replaced with zero
 */
public void reset(OutputStream outputStream, boolean includeMvccVersion) {
  ++numResets;
  this.includeMvccVersion = includeMvccVersion;
  this.outputStream = outputStream;
  // the first value always starts at the beginning of the values buffer
  valueOffsets[0] = 0;

  familyDeduplicator.reset();
  qualifierDeduplicator.reset();
  tagsDeduplicator.reset();
  tagsWriter.reset();
  tagsTokenizer.reset();
  rowTokenizer.reset();
  timestampEncoder.reset();
  mvccVersionEncoder.reset();
  cellTypeEncoder.reset();
  familyTokenizer.reset();
  qualifierTokenizer.reset();
  rowWriter.reset();
  familyWriter.reset();
  qualifierWriter.reset();

  totalCells = 0;
  totalUnencodedBytes = 0;
  totalValueBytes = 0;
  // totalTagBytes is added into totalBytes by compile(), so a value left over from the previous
  // block must not survive a reset
  totalTagBytes = 0;
  maxValueLength = 0;
  maxTagLength = 0;
  totalBytes = 0;
}
/**
 * Creates the reusable range, de-duplicator, tokenizer and section writer used for cell tags.
 * Called by the constructor before the first reset().
 */
protected void initializeTagHelpers() {
  tagsRange = new SimpleMutableByteRange();
  if (USE_HASH_COLUMN_SORTER) {
    tagsDeduplicator = new ByteRangeHashSet();
  } else {
    tagsDeduplicator = new ByteRangeTreeSet();
  }
  tagsTokenizer = new Tokenizer();
  tagsWriter = new ColumnSectionWriter();
}
/**
 * Makes sure the per-cell arrays (timestamps, mvccVersions, typeBytes, valueOffsets) have room
 * for the cell that is about to be added. Because the PrefixTreeEncoder is cached between uses,
 * the arrays may grow during the first few block encodings but should stabilize quickly.
 */
protected void ensurePerCellCapacities() {
  // some things write one index ahead. +2 to be safe
  final int required = totalCells + 2;
  if (required >= valueOffsets.length) {
    // using the required size as the padding as well is meant to double the array size
    final int extra = required;
    timestamps = ArrayUtils.growIfNecessary(timestamps, required, extra);
    mvccVersions = ArrayUtils.growIfNecessary(mvccVersions, required, extra);
    typeBytes = ArrayUtils.growIfNecessary(typeBytes, required, extra);
    valueOffsets = ArrayUtils.growIfNecessary(valueOffsets, required, extra);
  }
}
/******************** CellOutputStream methods *************************/
/**
* Note: Unused until support is added to the scanner/heap
*
* The following methods are optimized versions of write(Cell cell). The result should be
* identical, however the implementation may be able to execute them much more efficiently because
* it does not need to compare the unchanged fields with the previous cell's.
*
* Consider the benefits during compaction when paired with a CellScanner that is also aware of
* row boundaries. The CellScanner can easily use these methods instead of blindly passing Cells
* to the write(Cell cell) method.
*
* The savings of skipping duplicate row detection are significant with long row keys. A
* DataBlockEncoder may store a row key once in combination with a count of how many cells are in
* the row. With a 100 byte row key, we can replace 100 byte comparisons with a single increment
* of the counter, and that is for every cell in the row.
*/
/**
 * Add a Cell to the output stream but repeat the previous row.
 *
 * Instead of comparing the cell's row with the previous one, the occurrence counter of the
 * latest row in the row tokenizer is incremented. Every other part of the cell (family,
 * qualifier, tags, timestamp, mvccVersion, type and value) is recorded exactly as write(Cell)
 * records it, so both entry points yield the same encoded block.
 *
 * @param cell a cell whose row equals the row of the previously added cell
 */
//@Override
public void writeWithRepeatRow(Cell cell) {
  ensurePerCellCapacities();//can we optimize away some of this?
  //save a relatively expensive row comparison, incrementing the row's counter instead
  rowTokenizer.incrementNumOccurrencesOfLatestValue();
  addFamilyPart(cell);
  addQualifierPart(cell);
  // tags must be registered here as well, otherwise cells added through this method are missing
  // from the tag de-duplicator that write(Cell) populates
  addTagPart(cell);
  addAfterRowFamilyQualifier(cell);
}
/**
 * Appends one cell to the block being built. The row is handed to the row tokenizer, the
 * family, qualifier and tags go to their de-duplicators, and the remaining fields are copied
 * into the per-cell arrays.
 *
 * @param cell the next cell; the row tokenizer is fed through addSorted, so cells are expected
 *          in sorted order
 */
@Override
public void write(Cell cell) {
  ensurePerCellCapacities();

  // row
  final ByteRange filledRow = CellUtil.fillRowRange(cell, rowRange);
  rowTokenizer.addSorted(filledRow);

  // column parts and tags
  addFamilyPart(cell);
  addQualifierPart(cell);
  addTagPart(cell);

  // timestamp, mvccVersion, type, value and counters
  addAfterRowFamilyQualifier(cell);
}
/**
 * Points the reusable tagsRange at the cell's tags and adds it to the tag de-duplicator.
 */
private void addTagPart(Cell cell) {
  // NOTE(review): relies on fillTagRange returning the range it was given - confirm
  tagsDeduplicator.add(CellUtil.fillTagRange(cell, tagsRange));
}
/***************** internal add methods ************************/
/**
 * Records everything about a cell that comes after its row, family and qualifier: timestamp,
 * mvccVersion, type byte and value. Also updates the running size counters and finally advances
 * totalCells.
 */
private void addAfterRowFamilyQualifier(Cell cell){
  final int slot = totalCells;

  // timestamps
  final long timestamp = cell.getTimestamp();
  timestamps[slot] = timestamp;
  timestampEncoder.add(timestamp);

  // memstore timestamps
  if (!includeMvccVersion) {
    //must overwrite in case there was a previous version in this array slot
    mvccVersions[slot] = 0L;
    //only need to do this for the first cell added
    if (slot == 0) {
      mvccVersionEncoder.add(0L);
    }
    //totalUnencodedBytes is left alone: mvccVersion takes zero bytes when disabled
  } else {
    final long mvccVersion = cell.getMvccVersion();
    mvccVersions[slot] = mvccVersion;
    mvccVersionEncoder.add(mvccVersion);
    totalUnencodedBytes += WritableUtils.getVIntSize(mvccVersion);
  }

  // types
  final byte typeByte = cell.getTypeByte();
  typeBytes[slot] = typeByte;
  cellTypeEncoder.add(typeByte);

  // values: grow the shared buffer if needed, then copy this value in at its start offset
  final int valueLength = cell.getValueLength();
  totalValueBytes += valueLength;
  values = ArrayUtils.growIfNecessary(values, totalValueBytes, 2 * totalValueBytes);
  CellUtil.copyValueTo(cell, values, valueOffsets[slot]);
  maxValueLength = Math.max(maxValueLength, valueLength);
  // the end of this value is the start of the next one
  valueOffsets[slot + 1] = totalValueBytes;

  // general
  totalUnencodedBytes += KeyValueUtil.length(cell);
  ++totalCells;
}
/**
 * Adds the cell's family to the family de-duplicator. While MULITPLE_FAMILIES_POSSIBLE is
 * false, only the first cell of a block is inspected.
 */
private void addFamilyPart(Cell cell) {
  if (!MULITPLE_FAMILIES_POSSIBLE && totalCells != 0) {
    // single-family block whose family was already recorded by the first cell
    return;
  }
  CellUtil.fillFamilyRange(cell, familyRange);
  familyDeduplicator.add(familyRange);
}
/**
 * Points the reusable qualifierRange at the cell's qualifier and adds it to the qualifier
 * de-duplicator.
 */
private void addQualifierPart(Cell cell) {
  // NOTE(review): relies on fillQualifierRange returning the range it was given - confirm
  qualifierDeduplicator.add(CellUtil.fillQualifierRange(cell, qualifierRange));
}
/****************** compiling/flushing ********************/
/**
 * Expensive method. The second half of the encoding work happens here.
 *
 * Compiles all accumulated structures and then writes them to the outputStream as one block:
 * block meta, rows, families, qualifiers, tags, timestamps, mvccVersions and finally the raw
 * value bytes.
 *
 * @throws IOException if writing to the outputStream fails
 */
@Override
public void flush() throws IOException {
  compile();

  // do the actual flushing to the output stream. Order matters.
  final OutputStream sink = outputStream;
  blockMeta.writeVariableBytesToOutputStream(sink);
  rowWriter.writeBytes(sink);
  familyWriter.writeBytes(sink);
  qualifierWriter.writeBytes(sink);
  tagsWriter.writeBytes(sink);
  timestampEncoder.writeBytes(sink);
  mvccVersionEncoder.writeBytes(sink);
  //CellType bytes are in the row nodes. there is no additional type section
  sink.write(values, 0, totalValueBytes);
}
/**
 * Runs once all cells have been added: fills in the block meta and reduces the accumulated
 * structures to byte[] fragments that are ready to be written to the output stream. totalBytes
 * is built up along the way.
 */
protected void compile(){
  // value-related meta comes straight from the counters kept while cells were added
  blockMeta.setNumKeyValueBytes(totalUnencodedBytes);
  blockMeta.setValueOffsetWidth(UFIntTool.numBytes(valueOffsets[totalCells]));
  blockMeta.setValueLengthWidth(UFIntTool.numBytes(maxValueLength));
  blockMeta.setNumValueBytes(totalValueBytes);
  totalBytes += totalTagBytes + totalValueBytes;

  //these compile methods will add to totalBytes
  compileTypes();
  compileMvccVersions();
  compileTimestamps();
  compileTags();
  compileQualifiers();
  compileFamilies();
  compileRows();

  // the meta section is sized last, after the section compilers above have filled in blockMeta
  final int metaSize = blockMeta.calculateNumMetaBytes();
  blockMeta.setNumMetaBytes(metaSize);
  totalBytes += metaSize;
}
/*
 * The following "compile" methods do any intermediate work necessary to transform the cell
 * fragments collected during the writing phase into structures that are ready to write to the
 * outputStream.
 *
 * The family and qualifier treatment is almost identical, as is timestamp and mvccVersion.
 */

/**
 * Records in the block meta whether every cell shares one type and, if so, which type it is.
 */
protected void compileTypes() {
  final boolean singleType = cellTypeEncoder.areAllSameType();
  blockMeta.setAllSameType(singleType);
  if (!singleType) {
    return;
  }
  blockMeta.setAllTypes(cellTypeEncoder.getOnlyType());
}
/**
 * Compiles the mvccVersion encoder, copies its fields into the block meta and adds its output
 * length to totalBytes.
 */
protected void compileMvccVersions() {
  mvccVersionEncoder.compile();
  blockMeta.setMvccVersionFields(mvccVersionEncoder);
  totalBytes += mvccVersionEncoder.getOutputArrayLength();
}
/**
 * Compiles the timestamp encoder, copies its fields into the block meta and adds its output
 * length to totalBytes.
 */
protected void compileTimestamps() {
  timestampEncoder.compile();
  blockMeta.setTimestampFields(timestampEncoder);
  totalBytes += timestampEncoder.getOutputArrayLength();
}
/**
 * Turns the collected qualifiers into their section: records the unique count, feeds the
 * sorted ranges to the qualifier tokenizer, compiles the qualifier writer, and accounts for the
 * section size in the block meta and totalBytes.
 */
protected void compileQualifiers() {
  // de-duplicate and sort
  blockMeta.setNumUniqueQualifiers(qualifierDeduplicator.size());
  qualifierDeduplicator.compile();

  // tokenize, then compile the writer
  qualifierTokenizer.addAll(qualifierDeduplicator.getSortedRanges());
  qualifierWriter.reconstruct(blockMeta, qualifierTokenizer, ColumnNodeType.QUALIFIER);
  qualifierWriter.compile();

  // bookkeeping
  final int sectionSize = qualifierWriter.getNumBytes();
  blockMeta.setNumQualifierBytes(sectionSize);
  totalBytes += sectionSize;
}
/**
 * Turns the collected families into their section: records the unique count, feeds the sorted
 * ranges to the family tokenizer, compiles the family writer, and accounts for the section size
 * in the block meta and totalBytes.
 */
protected void compileFamilies() {
  // de-duplicate and sort
  blockMeta.setNumUniqueFamilies(familyDeduplicator.size());
  familyDeduplicator.compile();

  // tokenize, then compile the writer
  familyTokenizer.addAll(familyDeduplicator.getSortedRanges());
  familyWriter.reconstruct(blockMeta, familyTokenizer, ColumnNodeType.FAMILY);
  familyWriter.compile();

  // bookkeeping
  final int sectionSize = familyWriter.getNumBytes();
  blockMeta.setNumFamilyBytes(sectionSize);
  totalBytes += sectionSize;
}
/**
 * Turns the collected tags into their section: records the unique count, feeds the sorted
 * ranges to the tags tokenizer, compiles the tags writer, and accounts for the section size in
 * the block meta and totalBytes.
 */
protected void compileTags() {
  // de-duplicate and sort
  blockMeta.setNumUniqueTags(tagsDeduplicator.size());
  tagsDeduplicator.compile();

  // tokenize, then compile the writer
  tagsTokenizer.addAll(tagsDeduplicator.getSortedRanges());
  tagsWriter.reconstruct(blockMeta, tagsTokenizer, ColumnNodeType.TAGS);
  tagsWriter.compile();

  // bookkeeping
  final int sectionSize = tagsWriter.getNumBytes();
  blockMeta.setNumTagsBytes(sectionSize);
  totalBytes += sectionSize;
}
/**
 * Compiles the row section writer against this encoder, then records the row section size and
 * the row tokenizer's tree depth in the block meta and adds the size to totalBytes.
 */
protected void compileRows() {
  rowWriter.reconstruct(this);
  rowWriter.compile();

  final int sectionSize = rowWriter.getNumBytes();
  blockMeta.setNumRowBytes(sectionSize);
  blockMeta.setRowTreeDepth(rowTokenizer.getTreeDepth());
  totalBytes += sectionSize;
}
/********************* convenience getters ********************************/
/**
 * @param index position of a cell in insertion order
 * @return the offset in the values buffer at which that cell's value starts
 */
public long getValueOffset(int index) {
  final long startOffset = valueOffsets[index];
  return startOffset;
}
/**
 * @param index position of a cell in insertion order
 * @return the length of that cell's value, i.e. the distance between its start offset and the
 *         offset stored for the following index
 */
public int getValueLength(int index) {
  final int start = valueOffsets[index];
  final int end = valueOffsets[index + 1];
  return end - start;
}
/************************* get/set *************************************/
/** @return the block meta that compile() fills in */
public PrefixTreeBlockMeta getBlockMeta() {
  return this.blockMeta;
}

/** @return the tokenizer that receives the row of every written cell */
public Tokenizer getRowTokenizer() {
  return this.rowTokenizer;
}

/** @return the encoder that collects cell timestamps */
public LongEncoder getTimestampEncoder() {
  return this.timestampEncoder;
}

/** @return the running total of encoded bytes accumulated by compile() */
public int getTotalBytes() {
  return this.totalBytes;
}

/** @return the internal per-cell timestamp array itself, not a copy */
public long[] getTimestamps() {
  return this.timestamps;
}

/** @return the internal per-cell mvccVersion array itself, not a copy */
public long[] getMvccVersions() {
  return this.mvccVersions;
}

/** @return the internal per-cell type byte array itself, not a copy */
public byte[] getTypeBytes() {
  return this.typeBytes;
}

/** @return the encoder that collects cell mvccVersions */
public LongEncoder getMvccVersionEncoder() {
  return this.mvccVersionEncoder;
}

/** @return the set used to de-duplicate families */
public ByteRangeSet getFamilySorter() {
  return this.familyDeduplicator;
}

/** @return the set used to de-duplicate qualifiers */
public ByteRangeSet getQualifierSorter() {
  return this.qualifierDeduplicator;
}

/** @return the set used to de-duplicate tags */
public ByteRangeSet getTagSorter() {
  return this.tagsDeduplicator;
}

/** @return the section writer for families */
public ColumnSectionWriter getFamilyWriter() {
  return this.familyWriter;
}

/** @return the section writer for qualifiers */
public ColumnSectionWriter getQualifierWriter() {
  return this.qualifierWriter;
}

/** @return the section writer for tags */
public ColumnSectionWriter getTagWriter() {
  return this.tagsWriter;
}

/** @return the section writer for rows */
public RowSectionWriter getRowWriter() {
  return this.rowWriter;
}

/**
 * @return a new SimpleMutableByteRange constructed from the values buffer, 0 and
 *         totalValueBytes
 */
public ByteRange getValueByteRange() {
  return new SimpleMutableByteRange(this.values, 0, this.totalValueBytes);
}
}