
blue.strategic.parquet.ParquetWriter (parquet-floor)
A lightweight Java library that facilitates reading and writing Apache Parquet files without Hadoop dependencies.
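For orientation, here is a minimal usage sketch. It is an illustration only: it assumes parquet-floor's Dehydrator is a single-method callback interface (its dehydrate(record, valueWriter) call is visible in SimpleWriteSupport below) and that ValueWriter exposes a write(name, value) method matching writeField; the Example class, the Person record, the column names, and the output file name are all hypothetical.

import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

import java.io.File;
import java.io.IOException;

import blue.strategic.parquet.ParquetWriter;

public class Example {
    // Hypothetical record type used only for this sketch.
    record Person(long id, String name) { }

    public static void main(String[] args) throws IOException {
        // Declare the Parquet schema: one required INT64 column and one required UTF-8 string column.
        MessageType schema = Types.buildMessage()
                .required(PrimitiveType.PrimitiveTypeName.INT64).named("id")
                .required(PrimitiveType.PrimitiveTypeName.BINARY)
                        .as(LogicalTypeAnnotation.stringType()).named("name")
                .named("person");

        // The Dehydrator lambda maps a Person onto the named columns above;
        // valueWriter.write(...) is assumed to forward to SimpleWriteSupport.writeField.
        try (ParquetWriter<Person> writer = ParquetWriter.writeFile(schema, new File("people.parquet"),
                (person, valueWriter) -> {
                    valueWriter.write("id", person.id());
                    valueWriter.write("name", person.name());
                })) {
            writer.write(new Person(1L, "Ada"));
            writer.write(new Person(2L, "Grace"));
        }
    }
}

The full ParquetWriter source follows.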
package blue.strategic.parquet;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.io.DelegatingPositionOutputStream;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.PositionOutputStream;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collections;
public final class ParquetWriter<T> implements Closeable {

    private final org.apache.parquet.hadoop.ParquetWriter<T> writer;

    /**
     * Creates a writer for the given schema that writes records to a local file.
     * The file is wrapped in an {@link OutputFile} backed by a plain {@link FileOutputStream},
     * so no Hadoop {@code FileSystem} is required.
     */
    public static <T> ParquetWriter<T> writeFile(MessageType schema, File out, Dehydrator<T> dehydrator) throws IOException {
        OutputFile f = new OutputFile() {
            @Override
            public PositionOutputStream create(long blockSizeHint) throws IOException {
                return createOrOverwrite(blockSizeHint);
            }

            @Override
            public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
                FileOutputStream fos = new FileOutputStream(out);
                return new DelegatingPositionOutputStream(fos) {
                    @Override
                    public long getPos() throws IOException {
                        // Report the current write position from the underlying file channel.
                        return fos.getChannel().position();
                    }
                };
            }

            @Override
            public boolean supportsBlockSize() {
                return false;
            }

            @Override
            public long defaultBlockSize() {
                return 1024L;
            }
        };
        return writeOutputFile(schema, f, dehydrator);
    }

    private static <T> ParquetWriter<T> writeOutputFile(MessageType schema, OutputFile file, Dehydrator<T> dehydrator) throws IOException {
        return new ParquetWriter<>(file, schema, dehydrator);
    }

    private ParquetWriter(OutputFile outputFile, MessageType schema, Dehydrator<T> dehydrator) throws IOException {
        this.writer = new Builder<T>(outputFile)
                .withType(schema)
                .withDehydrator(dehydrator)
                .withCompressionCodec(CompressionCodecName.SNAPPY)
                .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
                .build();
    }

    public void write(T record) throws IOException {
        writer.write(record);
    }

    @Override
    public void close() throws IOException {
        this.writer.close();
    }

    private static final class Builder<T> extends org.apache.parquet.hadoop.ParquetWriter.Builder<T, ParquetWriter.Builder<T>> {
        private MessageType schema;
        private Dehydrator<T> dehydrator;

        private Builder(OutputFile file) {
            super(file);
        }

        public ParquetWriter.Builder<T> withType(MessageType schema) {
            this.schema = schema;
            return this;
        }

        public ParquetWriter.Builder<T> withDehydrator(Dehydrator<T> dehydrator) {
            this.dehydrator = dehydrator;
            return this;
        }

        @Override
        protected ParquetWriter.Builder<T> self() {
            return this;
        }

        @Override
        protected WriteSupport<T> getWriteSupport(Configuration conf) {
            return new SimpleWriteSupport<>(schema, dehydrator);
        }
    }

    /**
     * WriteSupport implementation that turns a record into Parquet fields by handing
     * the user-supplied Dehydrator a callback ({@code valueWriter}) for each named value.
     */
    private static class SimpleWriteSupport<T> extends WriteSupport<T> {
        private final MessageType schema;
        private final Dehydrator<T> dehydrator;
        private final ValueWriter valueWriter = SimpleWriteSupport.this::writeField;

        private RecordConsumer recordConsumer;

        SimpleWriteSupport(MessageType schema, Dehydrator<T> dehydrator) {
            this.schema = schema;
            this.dehydrator = dehydrator;
        }

        @Override
        public WriteContext init(Configuration configuration) {
            return new WriteContext(schema, Collections.emptyMap());
        }

        @Override
        public void prepareForWrite(RecordConsumer recordConsumer) {
            this.recordConsumer = recordConsumer;
        }

        @Override
        public void write(T record) {
            recordConsumer.startMessage();
            dehydrator.dehydrate(record, valueWriter);
            recordConsumer.endMessage();
        }

        @Override
        public String getName() {
            return "blue.strategic.parquet.ParquetWriter";
        }

        /** Writes a single named value, dispatching on the field's primitive type in the schema. */
        private void writeField(String name, Object value) {
            int fieldIndex = schema.getFieldIndex(name);
            PrimitiveType type = schema.getType(fieldIndex).asPrimitiveType();
            recordConsumer.startField(name, fieldIndex);

            switch (type.getPrimitiveTypeName()) {
            case INT32: recordConsumer.addInteger((int) value); break;
            case INT64: recordConsumer.addLong((long) value); break;
            case DOUBLE: recordConsumer.addDouble((double) value); break;
            case BOOLEAN: recordConsumer.addBoolean((boolean) value); break;
            case FLOAT: recordConsumer.addFloat((float) value); break;
            case BINARY:
                if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) {
                    recordConsumer.addBinary(Binary.fromString((String) value));
                } else {
                    throw new UnsupportedOperationException("We don't support writing " + type.getLogicalTypeAnnotation());
                }
                break;
            default:
                throw new UnsupportedOperationException("We don't support writing " + type.getPrimitiveTypeName());
            }
            recordConsumer.endField(name, fieldIndex);
        }
    }
}