com.facebook.hive.orc.lazy.LazyStringDictionaryTreeReader
DWRF file format for Hive
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.hive.orc.lazy;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import com.facebook.hive.orc.BitFieldReader;
import com.facebook.hive.orc.DynamicByteArray;
import com.facebook.hive.orc.InStream;
import com.facebook.hive.orc.MemoryEstimate;
import com.facebook.hive.orc.OrcProto;
import com.facebook.hive.orc.RunLengthIntegerReader;
import com.facebook.hive.orc.StreamName;
import com.facebook.hive.orc.WriterImpl;
import org.apache.hadoop.io.Text;
import com.facebook.hive.orc.OrcProto.RowIndex;
import com.facebook.hive.orc.OrcProto.RowIndexEntry;
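/**
* A lazy tree reader for string columns written with dictionary encoding. Each value is
* stored as an integer index into either the stripe-level dictionary (DICTIONARY_DATA and
* LENGTH streams) or, when the IN_DICTIONARY bit stream marks it as absent from that
* dictionary, into a smaller per-row-index-stride dictionary (STRIDE_DICTIONARY and
* STRIDE_DICTIONARY_LENGTH streams).
*/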
public class LazyStringDictionaryTreeReader extends LazyTreeReader {
private static final MemoryEstimate DUMMY_MEMORY_ESTIMATE = new MemoryEstimate();
// Stripe-level dictionary: concatenated entry bytes plus an offset table with one extra
// slot, so entry i occupies bytes [dictionaryOffsets[i], dictionaryOffsets[i + 1]).
private DynamicByteArray dictionaryBuffer = null;
// Same layout for the per-row-index-stride dictionary, which holds values that are not
// in the stripe-level dictionary.
private DynamicByteArray strideDictionaryBuffer;
private int dictionarySize;
private int[] strideDictionarySizes;
private int[] dictionaryOffsets;
private int[] strideDictionaryOffsets;
// DATA stream: one dictionary index per non-null row.
private RunLengthIntegerReader reader;
// IN_DICTIONARY stream: 1 when a row's value is in the stripe dictionary, 0 when it is
// in the stride dictionary; null when every value is in the stripe dictionary.
private BitFieldReader inDictionary;
// STRIDE_DICTIONARY blob and STRIDE_DICTIONARY_LENGTH entry lengths.
private InStream directReader;
private RunLengthIntegerReader directLengths;
// Row index entry whose stride dictionary is currently loaded, or -1 if none.
private int currentUnitDictionary = -1;
LazyStringDictionaryTreeReader(int columnId, long rowIndexStride) {
super(columnId, rowIndexStride);
}
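/**
* Sets up this column's streams for a new stripe: the LENGTH stream is decoded into the
* stripe dictionary's offset table, the DICTIONARY_DATA blob is loaded into
* dictionaryBuffer, and readers are created for the DATA, IN_DICTIONARY,
* STRIDE_DICTIONARY, and STRIDE_DICTIONARY_LENGTH streams where present.
*/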
@Override
public void startStripe(Map<StreamName, InStream> streams, List<OrcProto.ColumnEncoding> encodings,
    RowIndex[] indexes, long rowBaseInStripe) throws IOException {
super.startStripe(streams, encodings, indexes, rowBaseInStripe);
dictionarySize = encodings.get(columnId).getDictionarySize();
// read the lengths
StreamName name = new StreamName(columnId, OrcProto.Stream.Kind.LENGTH);
InStream in = streams.get(name);
RunLengthIntegerReader lenReader = new RunLengthIntegerReader(in, false,
WriterImpl.INT_BYTE_SIZE);
int offset = 0;
if (dictionaryOffsets == null ||
dictionaryOffsets.length < dictionarySize + 1) {
dictionaryOffsets = new int[dictionarySize + 1];
}
for(int i=0; i < dictionarySize; ++i) {
dictionaryOffsets[i] = offset;
offset += (int) lenReader.next();
}
dictionaryOffsets[dictionarySize] = offset;
in.close();
// read the dictionary blob
name = new StreamName(columnId,
OrcProto.Stream.Kind.DICTIONARY_DATA);
in = streams.get(name);
if (in.available() > 0) {
dictionaryBuffer = new DynamicByteArray(dictionaryOffsets[dictionarySize],
DUMMY_MEMORY_ESTIMATE);
dictionaryBuffer.readAll(in);
} else {
dictionaryBuffer = null;
}
in.close();
// set up the row reader
name = new StreamName(columnId, OrcProto.Stream.Kind.DATA);
reader = new RunLengthIntegerReader(streams.get(name), false, WriterImpl.INT_BYTE_SIZE);
InStream inDictionaryStream = streams.get(new StreamName(columnId, OrcProto.Stream.Kind.IN_DICTIONARY));
inDictionary = inDictionaryStream == null ? null : new BitFieldReader(inDictionaryStream);
directReader = streams.get(new StreamName(columnId, OrcProto.Stream.Kind.STRIDE_DICTIONARY));
InStream directLengthsStream = streams.get(new StreamName(columnId,
OrcProto.Stream.Kind.STRIDE_DICTIONARY_LENGTH));
directLengths = directLengthsStream == null ? null : new RunLengthIntegerReader(
directLengthsStream, false, WriterImpl.INT_BYTE_SIZE);
if (indexes[columnId] != null) {
loadIndeces(indexes[columnId].getEntryList(), 0);
}
}
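/**
* Positions the dictionary-index reader (and the IN_DICTIONARY reader, when present) at
* the given row index entry.
*/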
@Override
public void seek(int index) throws IOException {
reader.seek(index);
if (inDictionary != null) {
inDictionary.seek(index);
}
}
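/**
* Consumes this column's recorded positions from each row index entry. When an
* IN_DICTIONARY stream is present this also captures the per-stride dictionary sizes and
* the positions of the stride dictionary streams; otherwise only the DATA stream's
* positions are loaded. Returns the updated position offset.
*/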
@Override
public int loadIndeces(List<RowIndexEntry> rowIndexEntries, int startIndex) {
int updatedStartIndex = super.loadIndeces(rowIndexEntries, startIndex);
if (inDictionary != null) {
updatedStartIndex = directReader.loadIndeces(rowIndexEntries, updatedStartIndex);
updatedStartIndex = directLengths.loadIndeces(rowIndexEntries, updatedStartIndex);
int numIndeces = rowIndexEntries.size();
strideDictionarySizes = new int[numIndeces + 1];
int i = 0;
for (RowIndexEntry rowIndexEntry : rowIndexEntries) {
strideDictionarySizes[i] = (int) rowIndexEntry.getPositions(updatedStartIndex);
i++;
}
updatedStartIndex++;
updatedStartIndex = reader.loadIndeces(rowIndexEntries, updatedStartIndex);
return inDictionary.loadIndeces(rowIndexEntries, updatedStartIndex);
} else {
updatedStartIndex = reader.loadIndeces(rowIndexEntries, updatedStartIndex);
return updatedStartIndex;
}
}
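/**
* Reads the next dictionary index from the DATA stream and copies the corresponding
* stripe-dictionary entry into result.
*/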
private void nextFromDictionary(Text result) throws IOException {
int entry = (int) reader.next();
int offset = dictionaryOffsets[entry];
int length = dictionaryOffsets[entry + 1] - dictionaryOffsets[entry];
// If the column is just empty strings, the size will be zero, so the buffer will be null,
// in that case just return result as it will default to empty
if (dictionaryBuffer != null) {
dictionaryBuffer.setText(result, offset, length);
} else {
result.clear();
}
}
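/**
* Loads the dictionary for the given row index stride: rebuilds the stride offset table
* from STRIDE_DICTIONARY_LENGTH and reads the stride's blob from STRIDE_DICTIONARY.
*/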
private void loadStrideDictionary(int indexEntry) throws IOException {
currentUnitDictionary = indexEntry;
int offset = 0;
int unitDictionarySize = strideDictionarySizes[indexEntry];
if (strideDictionaryOffsets == null ||
strideDictionaryOffsets.length < unitDictionarySize + 1) {
strideDictionaryOffsets = new int[unitDictionarySize + 1];
}
directLengths.seek(indexEntry);
for(int i=0; i < unitDictionarySize; ++i) {
strideDictionaryOffsets[i] = offset;
offset += (int) directLengths.next();
}
strideDictionaryOffsets[unitDictionarySize] = offset;
if (offset != 0) {
directReader.seek(indexEntry);
strideDictionaryBuffer = new DynamicByteArray(offset, DUMMY_MEMORY_ESTIMATE);
strideDictionaryBuffer.read(directReader, offset);
} else {
// It only contains the empty string
strideDictionaryBuffer = null;
}
}
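/**
* Reads the next value as an index into the current stride's dictionary, loading that
* stride's dictionary first if the row has crossed into a different index stride.
*/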
private void nextFromStrideDictionary(Text result) throws IOException {
int indexEntry = computeRowIndexEntry(previousRow);
if (indexEntry != currentUnitDictionary) {
loadStrideDictionary(indexEntry);
}
int entry = (int) reader.next();
int offset = strideDictionaryOffsets[entry];
int length;
// if it isn't the last entry, subtract the offsets otherwise use
// the buffer length.
if (entry < strideDictionaryOffsets.length - 1) {
length = strideDictionaryOffsets[entry + 1] - offset;
} else {
length = strideDictionaryBuffer.size() - offset;
}
// If the column is just empty strings, the size will be zero, so the buffer will be null,
// in that case just return result as it will default to empty
if (strideDictionaryBuffer != null) {
strideDictionaryBuffer.setText(result, offset, length);
} else {
result.clear();
}
}
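/**
* Returns the next string value, reusing the previous Text object when possible, or null
* when the value is null. The IN_DICTIONARY bit decides whether the value comes from the
* stripe dictionary or the current stride's dictionary; when that stream is absent, every
* value is read from the stripe dictionary.
*/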
@Override
public Object next(Object previous) throws IOException {
Text result = null;
if (valuePresent) {
if (previous == null) {
result = new Text();
} else {
result = (Text) previous;
}
boolean isDictionaryEncoded = inDictionary == null || inDictionary.next() == 1;
if (isDictionaryEncoded) {
nextFromDictionary(result);
} else {
nextFromStrideDictionary(result);
}
}
return result;
}
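/**
* Skips the given number of non-null values in the DATA stream and, when present, the
* IN_DICTIONARY stream.
*/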
@Override
public void skipRows(long numNonNullValues) throws IOException {
reader.skip(numNonNullValues);
if (inDictionary != null) {
inDictionary.skip(numNonNullValues);
}
}
@Override
public void close() throws IOException {
super.close();
if (reader != null) {
reader.close();
}
if (inDictionary != null) {
inDictionary.close();
}
if (directReader != null) {
directReader.close();
}
if (directLengths != null) {
directLengths.close();
}
dictionaryBuffer = null;
strideDictionaryBuffer = null;
strideDictionarySizes = null;
dictionaryOffsets = null;
strideDictionaryOffsets = null;
}
}
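For readers unfamiliar with the encoding, the offset-table arithmetic used in startStripe, nextFromDictionary, and loadStrideDictionary can be shown in isolation. The sketch below is a hypothetical, self-contained demo (DictionaryOffsetsDemo is not part of hive-dwrf and uses none of its stream classes): the writer stores a dictionary as one concatenated blob plus one length per entry, the reader turns the lengths into an offset array with one extra slot, and decoding a row is then an array lookup plus a slice of the blob.

import java.nio.charset.StandardCharsets;

public class DictionaryOffsetsDemo {
  public static void main(String[] args) {
    // A three-entry dictionary {"orc", "hive", "dwrf"} as a writer would store it:
    // one concatenated byte blob plus one length per entry (the LENGTH stream).
    byte[] blob = "orchivedwrf".getBytes(StandardCharsets.UTF_8);
    int[] lengths = {3, 4, 4};

    // Turn the lengths into an offset table with one extra slot, exactly as
    // startStripe and loadStrideDictionary do: offsets[i] is where entry i begins
    // and the final slot marks the end of the last entry.
    int[] offsets = new int[lengths.length + 1];
    int offset = 0;
    for (int i = 0; i < lengths.length; ++i) {
      offsets[i] = offset;
      offset += lengths[i];
    }
    offsets[lengths.length] = offset; // offsets is now {0, 3, 7, 11}

    // Decoding a row is then one array lookup plus a slice of the blob, mirroring
    // nextFromDictionary: entry is the index read from the DATA stream.
    int entry = 1;
    String value = new String(blob, offsets[entry], offsets[entry + 1] - offsets[entry],
        StandardCharsets.UTF_8);
    System.out.println(value); // prints "hive"
  }
}

The extra slot is what lets nextFromDictionary compute every entry's length as dictionaryOffsets[entry + 1] - dictionaryOffsets[entry] without special-casing the final entry.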