/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.flink.data;
import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.util.List;
import java.util.stream.Stream;
import org.apache.flink.table.data.ArrayData;
import org.apache.flink.table.data.DecimalData;
import org.apache.flink.table.data.MapData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.data.orc.GenericOrcWriters;
import org.apache.iceberg.orc.OrcValueWriter;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.orc.storage.common.type.HiveDecimal;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
import org.apache.orc.storage.ql.exec.vector.ColumnVector;
import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector;
import org.apache.orc.storage.ql.exec.vector.ListColumnVector;
import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
import org.apache.orc.storage.ql.exec.vector.MapColumnVector;
import org.apache.orc.storage.ql.exec.vector.TimestampColumnVector;
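/**
 * Factories and implementations of {@link OrcValueWriter} that copy Flink's internal data
 * structures ({@link RowData}, {@link StringData}, {@link DecimalData}, ...) into ORC column
 * vectors.
 *
 * <p>A minimal usage sketch (in practice these writers are assembled by a schema visitor rather
 * than constructed by hand):
 *
 * <pre>{@code
 * OrcValueWriter<DecimalData> writer = FlinkOrcWriters.decimals(10, 2);
 * DecimalColumnVector vector = new DecimalColumnVector(1024, 10, 2);
 * writer.write(0, DecimalData.fromBigDecimal(new java.math.BigDecimal("12.34"), 10, 2), vector);
 * }</pre>
 */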
class FlinkOrcWriters {
private FlinkOrcWriters() {}
  static OrcValueWriter<StringData> strings() {
return StringWriter.INSTANCE;
}
  static OrcValueWriter<Integer> dates() {
return DateWriter.INSTANCE;
}
  static OrcValueWriter<Integer> times() {
return TimeWriter.INSTANCE;
}
  static OrcValueWriter<TimestampData> timestamps() {
return TimestampWriter.INSTANCE;
}
  static OrcValueWriter<TimestampData> timestampTzs() {
return TimestampTzWriter.INSTANCE;
}
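  /**
   * Returns a decimal writer for the given precision and scale. Precisions up to 18 fit the
   * unscaled value in a {@code long} and use the cheaper {@link Decimal18Writer}; precisions up
   * to 38 (the ORC maximum) are routed through {@link HiveDecimal} in {@link Decimal38Writer}.
   */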
  static OrcValueWriter<DecimalData> decimals(int precision, int scale) {
if (precision <= 18) {
return new Decimal18Writer(precision, scale);
} else if (precision <= 38) {
return new Decimal38Writer(precision, scale);
} else {
throw new IllegalArgumentException("Invalid precision: " + precision);
}
}
  static <T> OrcValueWriter<ArrayData> list(
      OrcValueWriter<T> elementWriter, LogicalType elementType) {
return new ListWriter<>(elementWriter, elementType);
}
  static <K, V> OrcValueWriter<MapData> map(
      OrcValueWriter<K> keyWriter,
      OrcValueWriter<V> valueWriter,
LogicalType keyType,
LogicalType valueType) {
return new MapWriter<>(keyWriter, valueWriter, keyType, valueType);
}
  static OrcValueWriter<RowData> struct(List<OrcValueWriter<?>> writers, List<LogicalType> types) {
return new RowDataWriter(writers, types);
}
  private static class StringWriter implements OrcValueWriter<StringData> {
private static final StringWriter INSTANCE = new StringWriter();
@Override
public void nonNullWrite(int rowId, StringData data, ColumnVector output) {
byte[] value = data.toBytes();
((BytesColumnVector) output).setRef(rowId, value, 0, value.length);
}
}
  private static class DateWriter implements OrcValueWriter<Integer> {
private static final DateWriter INSTANCE = new DateWriter();
@Override
public void nonNullWrite(int rowId, Integer data, ColumnVector output) {
((LongColumnVector) output).vector[rowId] = data;
}
}
  private static class TimeWriter implements OrcValueWriter<Integer> {
private static final TimeWriter INSTANCE = new TimeWriter();
@Override
public void nonNullWrite(int rowId, Integer millis, ColumnVector output) {
      // Flink stores time as milliseconds of the day, while Iceberg's standard time type is in
      // microseconds, so convert milliseconds to microseconds here.
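      // e.g. 12:00:00.123 is 43_200_123 ms in Flink and is written as 43_200_123_000 microseconds.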
((LongColumnVector) output).vector[rowId] = millis * 1000L;
}
}
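  /**
   * Writes timestamps without time zone. {@link TimestampColumnVector} expects epoch milliseconds
   * in {@code time} and the full nanos-of-second in {@code nanos}; nanos are truncated to
   * microsecond precision, Iceberg's timestamp granularity.
   */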
  private static class TimestampWriter implements OrcValueWriter<TimestampData> {
private static final TimestampWriter INSTANCE = new TimestampWriter();
@Override
public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) {
TimestampColumnVector cv = (TimestampColumnVector) output;
cv.setIsUTC(true);
// millis
OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC);
cv.time[rowId] =
offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000;
// truncate nanos to only keep microsecond precision.
cv.nanos[rowId] = (offsetDateTime.getNano() / 1_000) * 1_000;
}
}
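  /**
   * Writes timestamps with time zone. Unlike {@link TimestampWriter}, the epoch millis can be
   * read straight off the instant, with nanos again truncated to microsecond precision.
   */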
  private static class TimestampTzWriter implements OrcValueWriter<TimestampData> {
private static final TimestampTzWriter INSTANCE = new TimestampTzWriter();
@SuppressWarnings("JavaInstantGetSecondsGetNano")
@Override
public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) {
TimestampColumnVector cv = (TimestampColumnVector) output;
// millis
Instant instant = data.toInstant();
cv.time[rowId] = instant.toEpochMilli();
// truncate nanos to only keep microsecond precision.
cv.nanos[rowId] = (instant.getNano() / 1_000) * 1_000;
}
}
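  /** Writer for decimals with precision up to 18, whose unscaled value fits in a {@code long}. */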
  private static class Decimal18Writer implements OrcValueWriter<DecimalData> {
private final int precision;
private final int scale;
Decimal18Writer(int precision, int scale) {
this.precision = precision;
this.scale = scale;
}
@Override
public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) {
Preconditions.checkArgument(
scale == data.scale(),
"Cannot write value as decimal(%s,%s), wrong scale: %s",
precision,
scale,
data);
Preconditions.checkArgument(
data.precision() <= precision,
"Cannot write value as decimal(%s,%s), too large: %s",
precision,
scale,
data);
((DecimalColumnVector) output)
.vector[rowId].setFromLongAndScale(data.toUnscaledLong(), data.scale());
}
}
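  /** Writer for decimals with precision 19 to 38, backed by {@link HiveDecimal}. */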
  private static class Decimal38Writer implements OrcValueWriter<DecimalData> {
private final int precision;
private final int scale;
Decimal38Writer(int precision, int scale) {
this.precision = precision;
this.scale = scale;
}
@Override
public void nonNullWrite(int rowId, DecimalData data, ColumnVector output) {
Preconditions.checkArgument(
scale == data.scale(),
"Cannot write value as decimal(%s,%s), wrong scale: %s",
precision,
scale,
data);
Preconditions.checkArgument(
data.precision() <= precision,
"Cannot write value as decimal(%s,%s), too large: %s",
precision,
scale,
data);
((DecimalColumnVector) output)
.vector[rowId].set(HiveDecimal.create(data.toBigDecimal(), false));
}
}
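  /**
   * Writes Flink {@link ArrayData} into an ORC {@link ListColumnVector}: each row records its
   * offset and length into the shared child vector, and elements are appended at the current end
   * of the child.
   */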
  static class ListWriter<T> implements OrcValueWriter<ArrayData> {
    private final OrcValueWriter<T> elementWriter;
    private final ArrayData.ElementGetter elementGetter;
    ListWriter(OrcValueWriter<T> elementWriter, LogicalType elementType) {
this.elementWriter = elementWriter;
this.elementGetter = ArrayData.createElementGetter(elementType);
}
@Override
@SuppressWarnings("unchecked")
public void nonNullWrite(int rowId, ArrayData data, ColumnVector output) {
ListColumnVector cv = (ListColumnVector) output;
cv.lengths[rowId] = data.size();
cv.offsets[rowId] = cv.childCount;
cv.childCount = (int) (cv.childCount + cv.lengths[rowId]);
// make sure the child is big enough.
growColumnVector(cv.child, cv.childCount);
for (int e = 0; e < cv.lengths[rowId]; ++e) {
Object value = elementGetter.getElementOrNull(data, e);
elementWriter.write((int) (e + cv.offsets[rowId]), (T) value, cv.child);
}
}
@Override
    public Stream<FieldMetrics<?>> metrics() {
return elementWriter.metrics();
}
}
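  /**
   * Writes Flink {@link MapData} into an ORC {@link MapColumnVector}: each entry's key and value
   * land at the same position in the parallel {@code keys} and {@code values} child vectors.
   */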
  static class MapWriter<K, V> implements OrcValueWriter<MapData> {
    private final OrcValueWriter<K> keyWriter;
    private final OrcValueWriter<V> valueWriter;
    private final ArrayData.ElementGetter keyGetter;
    private final ArrayData.ElementGetter valueGetter;
    MapWriter(
        OrcValueWriter<K> keyWriter,
        OrcValueWriter<V> valueWriter,
LogicalType keyType,
LogicalType valueType) {
this.keyWriter = keyWriter;
this.valueWriter = valueWriter;
this.keyGetter = ArrayData.createElementGetter(keyType);
this.valueGetter = ArrayData.createElementGetter(valueType);
}
@Override
@SuppressWarnings("unchecked")
public void nonNullWrite(int rowId, MapData data, ColumnVector output) {
MapColumnVector cv = (MapColumnVector) output;
ArrayData keyArray = data.keyArray();
ArrayData valArray = data.valueArray();
// record the length and start of the list elements
cv.lengths[rowId] = data.size();
cv.offsets[rowId] = cv.childCount;
cv.childCount = (int) (cv.childCount + cv.lengths[rowId]);
// make sure the child is big enough
growColumnVector(cv.keys, cv.childCount);
growColumnVector(cv.values, cv.childCount);
// Add each element
for (int e = 0; e < cv.lengths[rowId]; ++e) {
int pos = (int) (e + cv.offsets[rowId]);
keyWriter.write(pos, (K) keyGetter.getElementOrNull(keyArray, e), cv.keys);
valueWriter.write(pos, (V) valueGetter.getElementOrNull(valArray, e), cv.values);
}
}
@Override
    public Stream<FieldMetrics<?>> metrics() {
return Stream.concat(keyWriter.metrics(), valueWriter.metrics());
}
}
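  /**
   * Writes Flink {@link RowData} structs by delegating each field to the corresponding writer;
   * field getters are created once per position rather than resolved per row.
   */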
  static class RowDataWriter extends GenericOrcWriters.StructWriter<RowData> {
    private final List<RowData.FieldGetter> fieldGetters;
    RowDataWriter(List<OrcValueWriter<?>> writers, List<LogicalType> types) {
super(writers);
this.fieldGetters = Lists.newArrayListWithExpectedSize(types.size());
for (int i = 0; i < types.size(); i++) {
fieldGetters.add(RowData.createFieldGetter(types.get(i), i));
}
}
@Override
protected Object get(RowData struct, int index) {
return fieldGetters.get(index).getFieldOrNull(struct);
}
}
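  /** Ensures {@code cv} can hold at least {@code requestedSize} entries, over-allocating 3x. */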
private static void growColumnVector(ColumnVector cv, int requestedSize) {
if (cv.isNull.length < requestedSize) {
// Use growth factor of 3 to avoid frequent array allocations
cv.ensureSize(requestedSize * 3, true);
}
}
}