org.apache.parquet.io.RecordReaderImplementation Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.io;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* used to read reassembled records
*
* @param the type of the materialized record
*/
class RecordReaderImplementation extends RecordReader {
private static final Logger LOG = LoggerFactory.getLogger(RecordReaderImplementation.class);
public static class Case {
private int id;
private final int startLevel;
private final int depth;
private final int nextLevel;
private final boolean goingUp;
private final boolean goingDown;
private final int nextState;
private final boolean defined;
public Case(int startLevel, int depth, int nextLevel, int nextState, boolean defined) {
this.startLevel = startLevel;
this.depth = depth;
this.nextLevel = nextLevel;
this.nextState = nextState;
this.defined = defined;
// means going up the tree (towards the leaves) of the record
// true if we need to open up groups in this case
goingUp = startLevel <= depth;
// means going down the tree (towards the root) of the record
// true if we need to close groups in this case
goingDown = depth + 1 > nextLevel;
}
public void setID(int id) {
this.id = id;
}
@Override
// this implementation is buggy but the simpler one bellow has duplicates.
// it still works but generates more code than necessary
// a middle ground is necessary
// public int hashCode() {
// int hashCode = 0;
// if (goingUp) {
// hashCode += 1 * (1 + startLevel) + 2 * (1 + depth);
// }
// if (goingDown) {
// hashCode += 3 * (1 + depth) + 5 * (1 + nextLevel);
// }
// return hashCode;
// }
public int hashCode() {
int hashCode = 17;
hashCode += 31 * startLevel;
hashCode += 31 * depth;
hashCode += 31 * nextLevel;
hashCode += 31 * nextState;
hashCode += 31 * (defined ? 0 : 1);
return hashCode;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof Case) {
return equals((Case)obj);
}
return false;
}
// see comment for hashCode above
// public boolean equals(Case other) {
// if (goingUp && !other.goingUp || !goingUp && other.goingUp) {
// return false;
// }
// if (goingUp && other.goingUp && (startLevel != other.startLevel || depth != other.depth)) {
// return false;
// }
// if (goingDown && !other.goingDown || !goingDown && other.goingDown) {
// return false;
// }
// if (goingDown && other.goingDown && (depth != other.depth || nextLevel != other.nextLevel)) {
// return false;
// }
// return true;
// }
public boolean equals(Case other) {
return other != null
&& startLevel == other.startLevel
&& depth == other.depth
&& nextLevel == other.nextLevel
&& nextState == other.nextState
&& ((defined && other.defined) || (!defined && !other.defined));
}
public int getID() {
return id;
}
public int getStartLevel() {
return startLevel;
}
public int getDepth() {
return depth;
}
public int getNextLevel() {
return nextLevel;
}
public int getNextState() {
return nextState;
}
public boolean isGoingUp() {
return goingUp;
}
public boolean isGoingDown() {
return goingDown;
}
public boolean isDefined() {
return defined;
}
@Override
public String toString() {
return "Case " + startLevel + " -> " + depth + " -> " + nextLevel + "; goto sate_"+getNextState();
}
}
public static class State {
public final int id;
public final PrimitiveColumnIO primitiveColumnIO;
public final int maxDefinitionLevel;
public final int maxRepetitionLevel;
public final PrimitiveTypeName primitive;
public final ColumnReader column;
public final String[] fieldPath; // indexed by currentLevel
public final int[] indexFieldPath; // indexed by currentLevel
public final GroupConverter[] groupConverterPath;
public final PrimitiveConverter primitiveConverter;
public final String primitiveField;
public final int primitiveFieldIndex;
public final int[] nextLevel; //indexed by next r
private int[] definitionLevelToDepth; // indexed by current d
private State[] nextState; // indexed by next r
private Case[][][] caseLookup;
private List definedCases;
private List undefinedCases;
private State(int id, PrimitiveColumnIO primitiveColumnIO, ColumnReader column, int[] nextLevel, GroupConverter[] groupConverterPath, PrimitiveConverter primitiveConverter) {
this.id = id;
this.primitiveColumnIO = primitiveColumnIO;
this.maxDefinitionLevel = primitiveColumnIO.getDefinitionLevel();
this.maxRepetitionLevel = primitiveColumnIO.getRepetitionLevel();
this.column = column;
this.nextLevel = nextLevel;
this.groupConverterPath = groupConverterPath;
this.primitiveConverter = primitiveConverter;
this.primitive = primitiveColumnIO.getType().asPrimitiveType().getPrimitiveTypeName();
this.fieldPath = primitiveColumnIO.getFieldPath();
this.primitiveField = fieldPath[fieldPath.length - 1];
this.indexFieldPath = primitiveColumnIO.getIndexFieldPath();
this.primitiveFieldIndex = indexFieldPath[indexFieldPath.length - 1];
}
public int getDepth(int definitionLevel) {
return definitionLevelToDepth[definitionLevel];
}
public List getDefinedCases() {
return definedCases;
}
public List getUndefinedCases() {
return undefinedCases;
}
public Case getCase(int currentLevel, int d, int nextR) {
return caseLookup[currentLevel][d][nextR];
}
public State getNextState(int nextR) {
return nextState[nextR];
}
}
private final GroupConverter recordRootConverter;
private final RecordMaterializer recordMaterializer;
private State[] states;
private ColumnReader[] columnReaders;
private boolean shouldSkipCurrentRecord = false;
/**
* @param root the root of the schema
* @param recordMaterializer responsible of materializing the records
* @param validating whether we should validate against the schema
* @param columnStore where to read the column data from
*/
public RecordReaderImplementation(MessageColumnIO root, RecordMaterializer recordMaterializer, boolean validating, ColumnReadStoreImpl columnStore) {
this.recordMaterializer = recordMaterializer;
this.recordRootConverter = recordMaterializer.getRootConverter(); // TODO: validator(wrap(recordMaterializer), validating, root.getType());
PrimitiveColumnIO[] leaves = root.getLeaves().toArray(new PrimitiveColumnIO[0]);
columnReaders = new ColumnReader[leaves.length];
int[][] nextColumnIdxForRepLevel = new int[leaves.length][];
int[][] levelToClose = new int[leaves.length][];
GroupConverter[][] groupConverterPaths = new GroupConverter[leaves.length][];
PrimitiveConverter[] leafConverters = new PrimitiveConverter[leaves.length];
int[] firstIndexForLevel = new int[256]; // "256 levels of nesting ought to be enough for anybody"
// build the automaton
for (int i = 0; i < leaves.length; i++) {
PrimitiveColumnIO leafColumnIO = leaves[i];
//generate converters along the path from root to leaf
final int[] indexFieldPath = leafColumnIO.getIndexFieldPath();
groupConverterPaths[i] = new GroupConverter[indexFieldPath.length - 1];
GroupConverter current = this.recordRootConverter;
for (int j = 0; j < indexFieldPath.length - 1; j++) {
current = current.getConverter(indexFieldPath[j]).asGroupConverter();
groupConverterPaths[i][j] = current;
}
leafConverters[i] = current.getConverter(indexFieldPath[indexFieldPath.length - 1]).asPrimitiveConverter();
columnReaders[i] = columnStore.getColumnReader(leafColumnIO.getColumnDescriptor());
int maxRepetitionLevel = leafColumnIO.getRepetitionLevel();
nextColumnIdxForRepLevel[i] = new int[maxRepetitionLevel+1];
levelToClose[i] = new int[maxRepetitionLevel+1]; //next level
for (int nextRepLevel = 0; nextRepLevel <= maxRepetitionLevel; ++nextRepLevel) {
// remember which is the first for this level
if (leafColumnIO.isFirst(nextRepLevel)) {
firstIndexForLevel[nextRepLevel] = i;
}
int nextColIdx;
//TODO: when we use nextColumnIdxForRepLevel, should we provide current rep level or the rep level for next item
// figure out automaton transition
if (nextRepLevel == 0) { // 0 always means jump to the next (the last one being a special case)
nextColIdx = i + 1;
} else if (leafColumnIO.isLast(nextRepLevel)) { // when we are at the last of the next repetition level we jump back to the first
nextColIdx = firstIndexForLevel[nextRepLevel];
} else { // otherwise we just go back to the next.
nextColIdx = i + 1;
}
// figure out which level down the tree we need to go back
if (nextColIdx == leaves.length) { // reached the end of the record => close all levels
levelToClose[i][nextRepLevel] = 0;
} else if (leafColumnIO.isLast(nextRepLevel)) { // reached the end of this level => close the repetition level
ColumnIO parent = leafColumnIO.getParent(nextRepLevel);
levelToClose[i][nextRepLevel] = parent.getFieldPath().length - 1;
} else { // otherwise close until the next common parent
levelToClose[i][nextRepLevel] = getCommonParentLevel(
leafColumnIO.getFieldPath(),
leaves[nextColIdx].getFieldPath());
}
// sanity check: that would be a bug
if (levelToClose[i][nextRepLevel] > leaves[i].getFieldPath().length-1) {
throw new ParquetEncodingException(Arrays.toString(leaves[i].getFieldPath())+" -("+nextRepLevel+")-> "+levelToClose[i][nextRepLevel]);
}
nextColumnIdxForRepLevel[i][nextRepLevel] = nextColIdx;
}
}
states = new State[leaves.length];
for (int i = 0; i < leaves.length; i++) {
states[i] = new State(i, leaves[i], columnReaders[i], levelToClose[i], groupConverterPaths[i], leafConverters[i]);
int[] definitionLevelToDepth = new int[states[i].primitiveColumnIO.getDefinitionLevel() + 1];
// for each possible definition level, determine the depth at which to create groups
final ColumnIO[] path = states[i].primitiveColumnIO.getPath();
int depth = 0;
for (int d = 0; d < definitionLevelToDepth.length; ++d) {
while (depth < (states[i].fieldPath.length - 1)
&& d >= path[depth + 1].getDefinitionLevel()
) {
++ depth;
}
definitionLevelToDepth[d] = depth - 1;
}
states[i].definitionLevelToDepth = definitionLevelToDepth;
}
for (int i = 0; i < leaves.length; i++) {
State state = states[i];
int[] nextStateIds = nextColumnIdxForRepLevel[i];
state.nextState = new State[nextStateIds.length];
for (int j = 0; j < nextStateIds.length; j++) {
state.nextState[j] = nextStateIds[j] == states.length ? null : states[nextStateIds[j]];
}
}
for (int i = 0; i < states.length; i++) {
State state = states[i];
final Map definedCases = new HashMap<>();
final Map undefinedCases = new HashMap<>();
Case[][][] caseLookup = new Case[state.fieldPath.length][][];
for (int currentLevel = 0; currentLevel < state.fieldPath.length; ++ currentLevel) {
caseLookup[currentLevel] = new Case[state.maxDefinitionLevel+1][];
for (int d = 0; d <= state.maxDefinitionLevel; ++ d) {
caseLookup[currentLevel][d] = new Case[state.maxRepetitionLevel+1];
for (int nextR = 0; nextR <= state.maxRepetitionLevel; ++ nextR) {
int caseStartLevel = currentLevel;
int caseDepth = Math.max(state.getDepth(d), caseStartLevel - 1);
int caseNextLevel = Math.min(state.nextLevel[nextR], caseDepth + 1);
Case currentCase = new Case(caseStartLevel, caseDepth, caseNextLevel, getNextReader(state.id, nextR), d == state.maxDefinitionLevel);
Map cases = currentCase.isDefined() ? definedCases : undefinedCases;
if (!cases.containsKey(currentCase)) {
currentCase.setID(cases.size());
cases.put(currentCase, currentCase);
} else {
currentCase = cases.get(currentCase);
}
caseLookup[currentLevel][d][nextR] = currentCase;
}
}
}
state.caseLookup = caseLookup;
state.definedCases = new ArrayList<>(definedCases.values());
state.undefinedCases = new ArrayList<>(undefinedCases.values());
Comparator caseComparator = new Comparator() {
@Override
public int compare(Case o1, Case o2) {
return o1.id - o2.id;
}
};
Collections.sort(state.definedCases, caseComparator);
Collections.sort(state.undefinedCases, caseComparator);
}
}
//TODO: have those wrappers for a converter
private RecordConsumer validator(RecordConsumer recordConsumer, boolean validating, MessageType schema) {
return validating ? new ValidatingRecordConsumer(recordConsumer, schema) : recordConsumer;
}
private RecordConsumer wrap(RecordConsumer recordConsumer) {
if (LOG.isDebugEnabled()) {
return new RecordConsumerLoggingWrapper(recordConsumer);
}
return recordConsumer;
}
/**
* @see org.apache.parquet.io.RecordReader#read()
*/
@Override
public T read() {
int currentLevel = 0;
recordRootConverter.start();
State currentState = states[0];
do {
ColumnReader columnReader = currentState.column;
int d = columnReader.getCurrentDefinitionLevel();
// creating needed nested groups until the current field (opening tags)
int depth = currentState.definitionLevelToDepth[d];
for (; currentLevel <= depth; ++currentLevel) {
currentState.groupConverterPath[currentLevel].start();
}
// currentLevel = depth + 1 at this point
// set the current value
if (d >= currentState.maxDefinitionLevel) {
// not null
columnReader.writeCurrentValueToConverter();
}
columnReader.consume();
int nextR = currentState.maxRepetitionLevel == 0 ? 0 : columnReader.getCurrentRepetitionLevel();
// level to go to close current groups
int next = currentState.nextLevel[nextR];
for (; currentLevel > next; currentLevel--) {
currentState.groupConverterPath[currentLevel - 1].end();
}
currentState = currentState.nextState[nextR];
} while (currentState != null);
recordRootConverter.end();
T record = recordMaterializer.getCurrentRecord();
shouldSkipCurrentRecord = record == null;
if (shouldSkipCurrentRecord) {
recordMaterializer.skipCurrentRecord();
}
return record;
}
@Override
public boolean shouldSkipCurrentRecord() {
return shouldSkipCurrentRecord;
}
private static void log(String string) {
LOG.debug(string);
}
int getNextReader(int current, int nextRepetitionLevel) {
State nextState = states[current].nextState[nextRepetitionLevel];
return nextState == null ? states.length : nextState.id;
}
int getNextLevel(int current, int nextRepetitionLevel) {
return states[current].nextLevel[nextRepetitionLevel];
}
private int getCommonParentLevel(String[] previous, String[] next) {
int i = 0;
while (i < Math.min(previous.length, next.length) && previous[i].equals(next[i])) {
++i;
}
return i;
}
protected int getStateCount() {
return states.length;
}
protected State getState(int i) {
return states[i];
}
protected RecordMaterializer getMaterializer() {
return recordMaterializer;
}
protected Converter getRecordConsumer() {
return recordRootConverter;
}
protected Iterable getColumnReaders() {
// Converting the array to an iterable ensures that the array cannot be altered
return Arrays.asList(columnReaders);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy