/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.hadoop;
import static parquet.Log.DEBUG;
import static parquet.format.Util.writeFileMetaData;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import parquet.Log;
import parquet.Version;
import parquet.bytes.BytesInput;
import parquet.bytes.BytesUtils;
import parquet.column.ColumnDescriptor;
import parquet.column.page.DictionaryPage;
import parquet.column.statistics.Statistics;
import parquet.hadoop.metadata.ColumnPath;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.metadata.FileMetaData;
import parquet.hadoop.metadata.GlobalMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.io.ParquetEncodingException;
import parquet.schema.MessageType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
/**
* Internal implementation of the Parquet file writer as a block container
*
* @author Julien Le Dem
*
*/
public class ParquetFileWriter {
private static final Log LOG = Log.getLog(ParquetFileWriter.class);
public static final String PARQUET_METADATA_FILE = "_metadata";
public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
public static final byte[] MAGIC = "PAR1".getBytes(Charset.forName("ASCII"));
public static final int CURRENT_VERSION = 1;
// File creation modes
public static enum Mode {
CREATE,
OVERWRITE
}
private static final ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
private final MessageType schema;
private final FSDataOutputStream out;
private BlockMetaData currentBlock;
private ColumnChunkMetaData currentColumn;
private long currentRecordCount;
private List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
private long uncompressedLength;
private long compressedLength;
private Set<parquet.column.Encoding> currentEncodings;
private CompressionCodecName currentChunkCodec;
private ColumnPath currentChunkPath;
private PrimitiveTypeName currentChunkType;
private long currentChunkFirstDataPage;
private long currentChunkDictionaryPageOffset;
private long currentChunkValueCount;
private Statistics currentStatistics;
/**
* Captures the order in which methods should be called
*
* @author Julien Le Dem
*
*/
private enum STATE {
NOT_STARTED {
STATE start() {
return STARTED;
}
},
STARTED {
STATE startBlock() {
return BLOCK;
}
STATE end() {
return ENDED;
}
},
BLOCK {
STATE startColumn() {
return COLUMN;
}
STATE endBlock() {
return STARTED;
}
},
COLUMN {
STATE endColumn() {
return BLOCK;
};
STATE write() {
return this;
}
},
ENDED;
STATE start() throws IOException { return error(); }
STATE startBlock() throws IOException { return error(); }
STATE startColumn() throws IOException { return error(); }
STATE write() throws IOException { return error(); }
STATE endColumn() throws IOException { return error(); }
STATE endBlock() throws IOException { return error(); }
STATE end() throws IOException { return error(); }
private final STATE error() throws IOException {
throw new IOException("The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: " + this.name());
}
}
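// Legal call order enforced by the state machine above (a sketch derived from
// the transitions, not additional behavior): start(); then for each block:
// startBlock(n); { startColumn(...); write*(...); endColumn(); }*; endBlock();
// and finally end(extraMetaData).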
private STATE state = STATE.NOT_STARTED;
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @throws IOException if the file can not be created
*/
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file) throws IOException {
this(configuration, schema, file, Mode.CREATE);
}
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param mode file creation mode
* @throws IOException if the file can not be created
*/
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, Mode mode) throws IOException {
super();
this.schema = schema;
FileSystem fs = file.getFileSystem(configuration);
boolean overwriteFlag = (mode == Mode.OVERWRITE);
this.out = fs.create(file, overwriteFlag);
}
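// Minimal usage sketch (hypothetical local names such as conf, descriptor,
// pageBytes, stats and codec are assumptions for illustration; the call
// sequence follows the state machine above):
//
//   ParquetFileWriter w = new ParquetFileWriter(conf, schema, file);
//   w.start();
//   w.startBlock(recordCount);
//   w.startColumn(descriptor, valueCount, codec);
//   w.writeDataPage(valueCount, uncompressedSize, pageBytes, stats, rlEnc, dlEnc, vEnc);
//   w.endColumn();
//   w.endBlock();
//   w.end(extraMetaData);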
/**
* start the file
* @throws IOException
*/
public void start() throws IOException {
state = state.start();
if (DEBUG) LOG.debug(out.getPos() + ": start");
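// a Parquet file starts (and ends) with the 4-byte magic "PAR1"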
out.write(MAGIC);
}
/**
* start a block
* @param recordCount the record count in this block
* @throws IOException
*/
public void startBlock(long recordCount) throws IOException {
state = state.startBlock();
if (DEBUG) LOG.debug(out.getPos() + ": start block");
// out.write(MAGIC); // TODO: add a magic delimiter
currentBlock = new BlockMetaData();
currentRecordCount = recordCount;
}
/**
* start a column inside a block
* @param descriptor the column descriptor
* @param valueCount the value count in this column
* @param compressionCodecName the compression codec to use for this column's pages
* @throws IOException
*/
public void startColumn(ColumnDescriptor descriptor,
long valueCount,
CompressionCodecName compressionCodecName) throws IOException {
state = state.startColumn();
if (DEBUG) LOG.debug(out.getPos() + ": start column: " + descriptor + " count=" + valueCount);
currentEncodings = new HashSet<parquet.column.Encoding>();
currentChunkPath = ColumnPath.get(descriptor.getPath());
currentChunkType = descriptor.getType();
currentChunkCodec = compressionCodecName;
currentChunkValueCount = valueCount;
currentChunkFirstDataPage = out.getPos();
compressedLength = 0;
uncompressedLength = 0;
// need to know what type of stats to initialize to
// better way to do this?
currentStatistics = Statistics.getStatsBasedOnType(currentChunkType);
}
/**
* writes a dictionary page
* @param dictionaryPage the dictionary page
*/
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
state = state.write();
if (DEBUG) LOG.debug(out.getPos() + ": write dictionary page: " + dictionaryPage.getDictionarySize() + " values");
currentChunkDictionaryPageOffset = out.getPos();
int uncompressedSize = dictionaryPage.getUncompressedSize();
int compressedPageSize = (int)dictionaryPage.getBytes().size(); // TODO: fix casts
metadataConverter.writeDictionaryPageHeader(
uncompressedSize,
compressedPageSize,
dictionaryPage.getDictionarySize(),
dictionaryPage.getEncoding(),
out);
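// the page header was written straight to the stream, so its size is the
// position delta; header bytes are counted in both length totals below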
long headerSize = out.getPos() - currentChunkDictionaryPageOffset;
this.uncompressedLength += uncompressedSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
if (DEBUG) LOG.debug(out.getPos() + ": write dictionary page content " + compressedPageSize);
dictionaryPage.getBytes().writeAllTo(out);
currentEncodings.add(dictionaryPage.getEncoding());
}
/**
* writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
*/
@Deprecated
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
parquet.column.Encoding rlEncoding,
parquet.column.Encoding dlEncoding,
parquet.column.Encoding valuesEncoding) throws IOException {
state = state.write();
long beforeHeader = out.getPos();
if (DEBUG) LOG.debug(beforeHeader + ": write data page: " + valueCount + " values");
int compressedPageSize = (int)bytes.size();
metadataConverter.writeDataPageHeader(
uncompressedPageSize, compressedPageSize,
valueCount,
rlEncoding,
dlEncoding,
valuesEncoding,
out);
long headerSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedPageSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
if (DEBUG) LOG.debug(out.getPos() + ": write data page content " + compressedPageSize);
bytes.writeAllTo(out);
currentEncodings.add(rlEncoding);
currentEncodings.add(dlEncoding);
currentEncodings.add(valuesEncoding);
}
/**
* writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param statistics the statistics of the page
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
*/
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Statistics statistics,
parquet.column.Encoding rlEncoding,
parquet.column.Encoding dlEncoding,
parquet.column.Encoding valuesEncoding) throws IOException {
state = state.write();
long beforeHeader = out.getPos();
if (DEBUG) LOG.debug(beforeHeader + ": write data page: " + valueCount + " values");
int compressedPageSize = (int)bytes.size();
metadataConverter.writeDataPageHeader(
uncompressedPageSize, compressedPageSize,
valueCount,
statistics,
rlEncoding,
dlEncoding,
valuesEncoding,
out);
long headerSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedPageSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
if (DEBUG) LOG.debug(out.getPos() + ": write data page content " + compressedPageSize);
bytes.writeAllTo(out);
currentStatistics.mergeStatistics(statistics);
currentEncodings.add(rlEncoding);
currentEncodings.add(dlEncoding);
currentEncodings.add(valuesEncoding);
}
/**
* writes a number of pages at once
* @param bytes bytes to be written including page headers
* @param uncompressedTotalPageSize total uncompressed size (without page headers)
* @param compressedTotalPageSize total compressed size (without page headers)
* @param totalStats accumulated statistics for the written pages
* @param encodings the encodings used in these pages
* @throws IOException
*/
void writeDataPages(BytesInput bytes,
long uncompressedTotalPageSize,
long compressedTotalPageSize,
Statistics totalStats,
List<parquet.column.Encoding> encodings) throws IOException {
state = state.write();
if (DEBUG) LOG.debug(out.getPos() + ": write data pages");
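// "bytes" already contains the page headers, so the header bytes are whatever
// exceeds the compressed payload total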
long headersSize = bytes.size() - compressedTotalPageSize;
this.uncompressedLength += uncompressedTotalPageSize + headersSize;
this.compressedLength += compressedTotalPageSize + headersSize;
if (DEBUG) LOG.debug(out.getPos() + ": write data pages content");
bytes.writeAllTo(out);
currentEncodings.addAll(encodings);
currentStatistics = totalStats;
}
/**
* end a column (once all rep, def and data have been written)
* @throws IOException
*/
public void endColumn() throws IOException {
state = state.endColumn();
if (DEBUG) LOG.debug(out.getPos() + ": end column");
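// record the accumulated chunk metadata (path, type, codec, encodings,
// stats, offsets, sizes and value count) in the current block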
currentBlock.addColumn(ColumnChunkMetaData.get(
currentChunkPath,
currentChunkType,
currentChunkCodec,
currentEncodings,
currentStatistics,
currentChunkFirstDataPage,
currentChunkDictionaryPageOffset,
currentChunkValueCount,
compressedLength,
uncompressedLength));
if (DEBUG) LOG.info("ended Column chunk: " + currentColumn);
currentColumn = null;
this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
this.uncompressedLength = 0;
this.compressedLength = 0;
}
/**
* ends a block once all column chunks have been written
* @throws IOException
*/
public void endBlock() throws IOException {
state = state.endBlock();
if (DEBUG) LOG.debug(out.getPos() + ": end block");
currentBlock.setRowCount(currentRecordCount);
blocks.add(currentBlock);
currentBlock = null;
}
/**
* ends a file once all blocks have been written.
* closes the file.
* @param extraMetaData the extra meta data to write in the footer
* @throws IOException
*/
public void end(Map<String, String> extraMetaData) throws IOException {
state = state.end();
if (DEBUG) LOG.debug(out.getPos() + ": end");
ParquetMetadata footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
serializeFooter(footer, out);
out.close();
}
private static void serializeFooter(ParquetMetadata footer, FSDataOutputStream out) throws IOException {
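// footer layout: serialized FileMetaData, then a 4-byte little-endian
// footer length, then the "PAR1" magic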
long footerIndex = out.getPos();
parquet.format.FileMetaData parquetMetadata = new ParquetMetadataConverter().toParquetMetadata(CURRENT_VERSION, footer);
writeFileMetaData(parquetMetadata, out);
if (DEBUG) LOG.debug(out.getPos() + ": footer length = " + (out.getPos() - footerIndex));
BytesUtils.writeIntLittleEndian(out, (int)(out.getPos() - footerIndex));
out.write(MAGIC);
}
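// A reader can locate the footer by reading the last 8 bytes of the file
// (footer length + magic) and seeking back by that length.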
/**
* writes a _metadata and _common_metadata file
* @param configuration the configuration to use to get the FileSystem
* @param outputPath the directory to write the _metadata file to
* @param footers the list of footers to merge
* @throws IOException
*/
public static void writeMetadataFile(Configuration configuration, Path outputPath, List