// parquet.hadoop.ColumnChunkPageWriteStore Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.hadoop;
import static parquet.Log.INFO;
import static parquet.column.statistics.Statistics.getStatsBasedOnType;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import parquet.Log;
import parquet.bytes.BytesInput;
import parquet.bytes.ConcatenatingByteArrayCollector;
import parquet.column.ColumnDescriptor;
import parquet.column.Encoding;
import parquet.column.page.DictionaryPage;
import parquet.column.page.PageWriteStore;
import parquet.column.page.PageWriter;
import parquet.column.statistics.Statistics;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.CodecFactory.BytesCompressor;
import parquet.io.ParquetEncodingException;
import parquet.schema.MessageType;
class ColumnChunkPageWriteStore implements PageWriteStore {
/** Logger for this class. */
private static final Log LOG = Log.getLog(ColumnChunkPageWriteStore.class);
/** Converter used by the page writers to serialize data page headers; never reassigned, hence final. */
private static final ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
private static final class ColumnChunkPageWriter implements PageWriter {
/** Column whose pages this writer accumulates. */
private final ColumnDescriptor path;
/** Compressor applied to page data and to dictionary bytes. */
private final BytesCompressor compressor;
/** Scratch stream holding one serialized page header at a time; it is reset before each header is written. */
private final ByteArrayOutputStream tempOutputStream = new ByteArrayOutputStream();
/** Collects, page after page, the header bytes followed by the page content. */
private final ConcatenatingByteArrayCollector buf;
/** Dictionary page in compressed form, or {@code null} while none has been written. */
private DictionaryPage dictionaryPage;
/** Running sum of the uncompressed sizes of the pages written. */
private long uncompressedLength;
/** Running sum of the compressed sizes of the pages written. */
private long compressedLength;
/** Running sum of the value counts of the pages written. */
private long totalValueCount;
/** Number of pages written since the last call to writeToFileWriter. */
private int pageCount;
/** Distinct encodings seen so far; typed so callers get {@code Encoding} elements without casts. */
private final Set<Encoding> encodings = new HashSet<Encoding>();
/** Statistics merged across all pages written; assigned once in the constructor. */
private final Statistics totalStatistics;
/**
 * Creates a writer that buffers the pages of a single column.
 *
 * @param path column whose pages are collected
 * @param compressor compressor applied to page and dictionary bytes
 * @param pageSize accepted for callers' convenience; not used by this constructor
 */
private ColumnChunkPageWriter(ColumnDescriptor path, BytesCompressor compressor, int pageSize) {
  this.buf = new ConcatenatingByteArrayCollector();
  this.compressor = compressor;
  this.path = path;
  // the statistics accumulator is chosen from the column's type
  this.totalStatistics = getStatsBasedOnType(path.getType());
}
/**
 * Compresses one data page, serializes its header, and appends the header
 * followed by the compressed content to this column's buffer. The running
 * size, value-count, page-count and statistics totals are updated, and the
 * three encodings are recorded.
 *
 * @param bytes uncompressed page content
 * @param valueCount number of values in the page
 * @param statistics statistics of the page, merged into the running total
 * @param rlEncoding encoding of the repetition levels
 * @param dlEncoding encoding of the definition levels
 * @param valuesEncoding encoding of the values
 * @throws ParquetEncodingException if the uncompressed or the compressed size
 *         is larger than {@code Integer.MAX_VALUE}
 * @throws IOException declared by the interface; propagated from the calls made here
 */
@Override
public void writePage(BytesInput bytes,
                      int valueCount,
                      Statistics statistics,
                      Encoding rlEncoding,
                      Encoding dlEncoding,
                      Encoding valuesEncoding) throws IOException {
  final long rawSize = bytes.size();
  if (rawSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " + rawSize);
  }

  final BytesInput packed = compressor.compress(bytes);
  final long packedSize = packed.size();
  if (packedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: " + packedSize);
  }

  // the scratch stream only ever holds the header of the page being written
  tempOutputStream.reset();
  parquetMetadataConverter.writeDataPageHeader(
      (int) rawSize, (int) packedSize,
      valueCount, statistics,
      rlEncoding, dlEncoding, valuesEncoding,
      tempOutputStream);

  uncompressedLength += rawSize;
  compressedLength += packedSize;
  totalValueCount += valueCount;
  pageCount++;
  totalStatistics.mergeStatistics(statistics);

  // header and content are concatenated first so the collector copies them
  // into a single buffer rather than one buffer per part
  final BytesInput headerBytes = BytesInput.from(tempOutputStream);
  buf.collect(BytesInput.concat(headerBytes, packed));

  encodings.add(rlEncoding);
  encodings.add(dlEncoding);
  encodings.add(valuesEncoding);
}
@Override
public void writePageV2(
int rowCount, int nullCount, int valueCount,
BytesInput repetitionLevels, BytesInput definitionLevels,
Encoding dataEncoding, BytesInput data,
Statistics> statistics) throws IOException {
int rlByteLength = toIntWithCheck(repetitionLevels.size());
int dlByteLength = toIntWithCheck(definitionLevels.size());
int uncompressedSize = toIntWithCheck(
data.size() + repetitionLevels.size() + definitionLevels.size()
);
// TODO: decide if we compress
BytesInput compressedData = compressor.compress(data);
int compressedSize = toIntWithCheck(
compressedData.size() + repetitionLevels.size() + definitionLevels.size()
);
tempOutputStream.reset();
parquetMetadataConverter.writeDataPageV2Header(
uncompressedSize, compressedSize,
valueCount, nullCount, rowCount,
statistics,
dataEncoding,
rlByteLength,
dlByteLength,
tempOutputStream);
this.uncompressedLength += uncompressedSize;
this.compressedLength += compressedSize;
this.totalValueCount += valueCount;
this.pageCount += 1;
this.totalStatistics.mergeStatistics(statistics);
// by concatenating before collecting instead of collecting twice,
// we only allocate one buffer to copy into instead of multiple.
buf.collect(
BytesInput.concat(
BytesInput.from(tempOutputStream),
repetitionLevels,
definitionLevels,
compressedData)
);
encodings.add(dataEncoding);
}
/**
 * Narrows a byte count to {@code int}, refusing values that do not fit.
 *
 * @param size byte count to convert
 * @return {@code size} as an {@code int}
 * @throws ParquetEncodingException if {@code size} is greater than {@code Integer.MAX_VALUE}
 */
private int toIntWithCheck(long size) {
  if (size <= Integer.MAX_VALUE) {
    return (int) size;
  }
  throw new ParquetEncodingException(
      "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " + size);
}
/**
 * @return the size reported by the page buffer, i.e. {@code buf.size()}
 */
@Override
public long getMemSize() {
  final long bufferedBytes = buf.size();
  return bufferedBytes;
}
/**
 * Hands the buffered column over to {@code writer}: starts the column,
 * writes the dictionary page if one was recorded, writes all buffered data
 * pages with their accumulated totals, and ends the column. A summary is
 * logged when INFO logging is enabled.
 *
 * Afterwards only the encoding set and the page counter are reset; the
 * buffer, size/value totals, statistics and dictionary page are left as is
 * by this method.
 *
 * @param writer file writer that receives the column
 * @throws IOException propagated from the file writer
 */
public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
  writer.startColumn(path, totalValueCount, compressor.getCodecName());
  if (dictionaryPage != null) {
    writer.writeDictionaryPage(dictionaryPage);
    encodings.add(dictionaryPage.getEncoding());
  }
  // pass a typed snapshot of the encodings rather than the live set
  writer.writeDataPages(buf, uncompressedLength, compressedLength, totalStatistics, new ArrayList<Encoding>(encodings));
  writer.endColumn();
  if (INFO) {
    LOG.info(
        String.format(
            "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
            buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, encodings)
        + (dictionaryPage != null ? String.format(
            ", dic { %,d entries, %,dB raw, %,dB comp}",
            // the stored page carries the compressed bytes, so their size is the "comp" figure
            dictionaryPage.getDictionarySize(), dictionaryPage.getUncompressedSize(), dictionaryPage.getBytes().size())
            : ""));
  }
  encodings.clear();
  pageCount = 0;
}
/**
 * @return the size reported by the page buffer, i.e. {@code buf.size()}
 */
@Override
public long allocatedSize() {
  final long bufferedBytes = buf.size();
  return bufferedBytes;
}
/**
 * Records the dictionary page of this column in compressed form. It is not
 * written to the buffer here; it is kept until the column is flushed.
 *
 * @param dictionaryPage uncompressed dictionary page
 * @throws ParquetEncodingException if a dictionary page was already recorded,
 *         or if the dictionary is larger than {@code Integer.MAX_VALUE} bytes
 * @throws IOException propagated from the compressor or from copying its output
 */
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  // checked narrowing: a plain (int) cast would silently truncate an oversized dictionary
  int uncompressedSize = toIntWithCheck(dictionaryBytes.size());
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  // NOTE(review): the copy presumably detaches the page from a buffer the compressor may reuse — confirm
  this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize, dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
/**
 * Delegates to the page buffer's memory usage description.
 *
 * @param prefix text placed in front of this writer's label
 * @return the description produced by the buffer for the labelled prefix
 */
@Override
public String memUsageString(String prefix) {
  final String label = prefix + " ColumnChunkPageWriter";
  return buf.memUsageString(label);
}
}
/** One page writer per column of the schema; typed so lookups yield a ColumnChunkPageWriter without casts. */
private final Map<ColumnDescriptor, ColumnChunkPageWriter> writers = new HashMap<ColumnDescriptor, ColumnChunkPageWriter>();
/** Schema whose columns this store writes. */
private final MessageType schema;
/**
 * Creates the store and registers one page writer for every column
 * returned by {@code schema.getColumns()}.
 *
 * @param compressor compressor shared by all page writers
 * @param schema schema whose columns are written
 * @param pageSize forwarded to each page writer
 */
public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, int pageSize) {
  this.schema = schema;
  for (ColumnDescriptor column : schema.getColumns()) {
    ColumnChunkPageWriter columnWriter = new ColumnChunkPageWriter(column, compressor, pageSize);
    writers.put(column, columnWriter);
  }
}
/**
 * Looks up the page writer registered for a column.
 *
 * @param path column to look up
 * @return the writer registered for {@code path}, or {@code null} when the map has no entry for it
 */
@Override
public PageWriter getPageWriter(ColumnDescriptor path) {
  PageWriter registered = writers.get(path);
  return registered;
}
/**
 * Writes every column to {@code writer}, visiting the columns in the order
 * returned by {@code schema.getColumns()}.
 *
 * @param writer file writer that receives the columns
 * @throws IOException propagated from the individual page writers
 */
public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
  for (ColumnDescriptor column : schema.getColumns()) {
    ColumnChunkPageWriter columnWriter = writers.get(column);
    columnWriter.writeToFileWriter(writer);
  }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy