/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.hadoop;
import static org.apache.parquet.format.Util.writeFileMetaData;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Strings;
import org.apache.parquet.Version;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.GlobalMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopStreams;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.TypeUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Internal implementation of the Parquet file writer as a block container
*
* @author Julien Le Dem
*
*/
public class ParquetFileWriter {
private static final Logger LOG = LoggerFactory.getLogger(ParquetFileWriter.class);
private static ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
public static final String PARQUET_METADATA_FILE = "_metadata";
public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
public static final byte[] MAGIC = "PAR1".getBytes(Charset.forName("ASCII"));
public static final int CURRENT_VERSION = 1;
// need to supply a buffer size when setting block size. this is the default
// for hadoop 1 to present. copying it avoids loading DFSConfigKeys.
private static final int DFS_BUFFER_SIZE_DEFAULT = 4096;
// visible for testing
static final Set<String> BLOCK_FS_SCHEMES = new HashSet<String>();
static {
BLOCK_FS_SCHEMES.add("hdfs");
BLOCK_FS_SCHEMES.add("webhdfs");
BLOCK_FS_SCHEMES.add("viewfs");
}
private static boolean supportsBlockSize(FileSystem fs) {
return BLOCK_FS_SCHEMES.contains(fs.getUri().getScheme());
}
// File creation modes
public static enum Mode {
CREATE,
OVERWRITE
}
private final MessageType schema;
private final FSDataOutputStream out;
private final AlignmentStrategy alignment;
// file data
private List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
// row group data
private BlockMetaData currentBlock; // appended to by endColumn
// row group data set at the start of a row group
private long currentRecordCount; // set in startBlock
// column chunk data accumulated as pages are written
private EncodingStats.Builder encodingStatsBuilder;
private Set<Encoding> currentEncodings;
private long uncompressedLength;
private long compressedLength;
private Statistics currentStatistics; // accumulated in writePage(s)
// column chunk data set at the start of a column
private CompressionCodecName currentChunkCodec; // set in startColumn
private ColumnPath currentChunkPath; // set in startColumn
private PrimitiveTypeName currentChunkType; // set in startColumn
private long currentChunkValueCount; // set in startColumn
private long currentChunkFirstDataPage; // set in startColumn (out.pos())
private long currentChunkDictionaryPageOffset; // set in writeDictionaryPage
/**
* Captures the order in which methods should be called
*
* @author Julien Le Dem
*
*/
private enum STATE {
NOT_STARTED {
STATE start() {
return STARTED;
}
},
STARTED {
STATE startBlock() {
return BLOCK;
}
STATE end() {
return ENDED;
}
},
BLOCK {
STATE startColumn() {
return COLUMN;
}
STATE endBlock() {
return STARTED;
}
},
COLUMN {
STATE endColumn() {
return BLOCK;
};
STATE write() {
return this;
}
},
ENDED;
STATE start() throws IOException { return error(); }
STATE startBlock() throws IOException { return error(); }
STATE startColumn() throws IOException { return error(); }
STATE write() throws IOException { return error(); }
STATE endColumn() throws IOException { return error(); }
STATE endBlock() throws IOException { return error(); }
STATE end() throws IOException { return error(); }
private final STATE error() throws IOException {
throw new IOException("The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: " + this.name());
}
}
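/*
* Typical call sequence enforced by this state machine (a minimal sketch; the path,
* schema, counts, page bytes, statistics and encodings below are hypothetical
* placeholders, and blocks/columns may of course be repeated):
*
*   ParquetFileWriter writer = new ParquetFileWriter(conf, schema, new Path("/tmp/example.parquet"));
*   writer.start();
*   writer.startBlock(recordCount);
*   writer.startColumn(schema.getColumns().get(0), valueCount, CompressionCodecName.UNCOMPRESSED);
*   writer.writeDataPage(valueCount, uncompressedSize, pageBytes, pageStats, rlEncoding, dlEncoding, valuesEncoding);
*   writer.endColumn();
*   writer.endBlock();
*   writer.end(new HashMap<String, String>());
*/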
private STATE state = STATE.NOT_STARTED;
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @throws IOException if the file can not be created
*/
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file) throws IOException {
this(configuration, schema, file, Mode.CREATE, DEFAULT_BLOCK_SIZE,
MAX_PADDING_SIZE_DEFAULT);
}
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param mode file creation mode
* @throws IOException if the file can not be created
*/
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, Mode mode) throws IOException {
this(configuration, schema, file, mode, DEFAULT_BLOCK_SIZE,
MAX_PADDING_SIZE_DEFAULT);
}
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param mode file creation mode
* @param rowGroupSize the row group size
* @param maxPaddingSize the maximum number of bytes of padding used to align row groups
* @throws IOException if the file can not be created
*/
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, Mode mode, long rowGroupSize,
int maxPaddingSize)
throws IOException {
TypeUtil.checkValidWriteSchema(schema);
this.schema = schema;
FileSystem fs = file.getFileSystem(configuration);
boolean overwriteFlag = (mode == Mode.OVERWRITE);
if (supportsBlockSize(fs)) {
// use the default block size, unless row group size is larger
long dfsBlockSize = Math.max(fs.getDefaultBlockSize(file), rowGroupSize);
this.alignment = PaddingAlignment.get(
dfsBlockSize, rowGroupSize, maxPaddingSize);
this.out = fs.create(file, overwriteFlag, DFS_BUFFER_SIZE_DEFAULT,
fs.getDefaultReplication(file), dfsBlockSize);
} else {
this.alignment = NoAlignment.get(rowGroupSize);
this.out = fs.create(file, overwriteFlag);
}
this.encodingStatsBuilder = new EncodingStats.Builder();
}
/**
* FOR TESTING ONLY.
*
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param rowAndBlockSize the row group size, also used as the file system block size
* @param maxPaddingSize the maximum number of bytes of padding used to align row groups
* @throws IOException if the file can not be created
*/
ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, long rowAndBlockSize, int maxPaddingSize)
throws IOException {
FileSystem fs = file.getFileSystem(configuration);
this.schema = schema;
this.alignment = PaddingAlignment.get(
rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
this.out = fs.create(file, true, DFS_BUFFER_SIZE_DEFAULT,
fs.getDefaultReplication(file), rowAndBlockSize);
this.encodingStatsBuilder = new EncodingStats.Builder();
}
/**
* start the file
* @throws IOException
*/
public void start() throws IOException {
state = state.start();
LOG.debug("{}: start", out.getPos());
out.write(MAGIC);
}
/**
* start a block
* @param recordCount the record count in this block
* @throws IOException
*/
public void startBlock(long recordCount) throws IOException {
state = state.startBlock();
LOG.debug("{}: start block", out.getPos());
// out.write(MAGIC); // TODO: add a magic delimiter
alignment.alignForRowGroup(out);
currentBlock = new BlockMetaData();
currentRecordCount = recordCount;
}
/**
* start a column inside a block
* @param descriptor the column descriptor
* @param valueCount the value count in this column
* @param compressionCodecName the codec used to compress this column's pages
* @throws IOException
*/
public void startColumn(ColumnDescriptor descriptor,
long valueCount,
CompressionCodecName compressionCodecName) throws IOException {
state = state.startColumn();
encodingStatsBuilder.clear();
currentEncodings = new HashSet<Encoding>();
currentChunkPath = ColumnPath.get(descriptor.getPath());
currentChunkType = descriptor.getType();
currentChunkCodec = compressionCodecName;
currentChunkValueCount = valueCount;
currentChunkFirstDataPage = out.getPos();
compressedLength = 0;
uncompressedLength = 0;
// need to know what type of stats to initialize to
// better way to do this?
currentStatistics = Statistics.getStatsBasedOnType(currentChunkType);
}
/**
* writes a dictionary page
* @param dictionaryPage the dictionary page
* @throws IOException
*/
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
state = state.write();
LOG.debug("{}: write dictionary page: {} values", out.getPos(), dictionaryPage.getDictionarySize());
currentChunkDictionaryPageOffset = out.getPos();
int uncompressedSize = dictionaryPage.getUncompressedSize();
int compressedPageSize = (int)dictionaryPage.getBytes().size(); // TODO: fix casts
metadataConverter.writeDictionaryPageHeader(
uncompressedSize,
compressedPageSize,
dictionaryPage.getDictionarySize(),
dictionaryPage.getEncoding(),
out);
long headerSize = out.getPos() - currentChunkDictionaryPageOffset;
this.uncompressedLength += uncompressedSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write dictionary page content {}", out.getPos(), compressedPageSize);
dictionaryPage.getBytes().writeAllTo(out);
encodingStatsBuilder.addDictEncoding(dictionaryPage.getEncoding());
currentEncodings.add(dictionaryPage.getEncoding());
}
/**
* writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
*/
@Deprecated
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Encoding rlEncoding,
Encoding dlEncoding,
Encoding valuesEncoding) throws IOException {
state = state.write();
long beforeHeader = out.getPos();
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int)bytes.size();
metadataConverter.writeDataPageHeader(
uncompressedPageSize, compressedPageSize,
valueCount,
rlEncoding,
dlEncoding,
valuesEncoding,
out);
long headerSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedPageSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
bytes.writeAllTo(out);
encodingStatsBuilder.addDataEncoding(valuesEncoding);
currentEncodings.add(rlEncoding);
currentEncodings.add(dlEncoding);
currentEncodings.add(valuesEncoding);
}
/**
* writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param statistics the statistics of the page
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
*/
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Statistics statistics,
Encoding rlEncoding,
Encoding dlEncoding,
Encoding valuesEncoding) throws IOException {
state = state.write();
long beforeHeader = out.getPos();
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int)bytes.size();
metadataConverter.writeDataPageHeader(
uncompressedPageSize, compressedPageSize,
valueCount,
statistics,
rlEncoding,
dlEncoding,
valuesEncoding,
out);
long headerSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedPageSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
bytes.writeAllTo(out);
currentStatistics.mergeStatistics(statistics);
encodingStatsBuilder.addDataEncoding(valuesEncoding);
currentEncodings.add(rlEncoding);
currentEncodings.add(dlEncoding);
currentEncodings.add(valuesEncoding);
}
/**
* writes a number of pages at once
* @param bytes bytes to be written including page headers
* @param uncompressedTotalPageSize total uncompressed size (without page headers)
* @param compressedTotalPageSize total compressed size (without page headers)
* @throws IOException
*/
void writeDataPages(BytesInput bytes,
long uncompressedTotalPageSize,
long compressedTotalPageSize,
Statistics totalStats,
Set<Encoding> rlEncodings,
Set<Encoding> dlEncodings,
List<Encoding> dataEncodings) throws IOException {
state = state.write();
LOG.debug("{}: write data pages", out.getPos());
long headersSize = bytes.size() - compressedTotalPageSize;
this.uncompressedLength += uncompressedTotalPageSize + headersSize;
this.compressedLength += compressedTotalPageSize + headersSize;
LOG.debug("{}: write data pages content", out.getPos());
bytes.writeAllTo(out);
encodingStatsBuilder.addDataEncodings(dataEncodings);
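// v2 data pages always RLE-encode repetition/definition levels and therefore report
// no repetition-level encodings; an empty set is used here as the signal for v2 pages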
if (rlEncodings.isEmpty()) {
encodingStatsBuilder.withV2Pages();
}
currentEncodings.addAll(rlEncodings);
currentEncodings.addAll(dlEncodings);
currentEncodings.addAll(dataEncodings);
currentStatistics = totalStats;
}
/**
* end a column (once all rep, def and data have been written)
* @throws IOException
*/
public void endColumn() throws IOException {
state = state.endColumn();
LOG.debug("{}: end column", out.getPos());
currentBlock.addColumn(ColumnChunkMetaData.get(
currentChunkPath,
currentChunkType,
currentChunkCodec,
encodingStatsBuilder.build(),
currentEncodings,
currentStatistics,
currentChunkFirstDataPage,
currentChunkDictionaryPageOffset,
currentChunkValueCount,
compressedLength,
uncompressedLength));
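// totalByteSize accumulates the uncompressed size of the row group's column chunks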
this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
this.uncompressedLength = 0;
this.compressedLength = 0;
}
/**
* ends a block once all column chunks have been written
* @throws IOException
*/
public void endBlock() throws IOException {
state = state.endBlock();
LOG.debug("{}: end block", out.getPos());
currentBlock.setRowCount(currentRecordCount);
blocks.add(currentBlock);
currentBlock = null;
}
public void appendFile(Configuration conf, Path file) throws IOException {
ParquetFileReader.open(conf, file).appendTo(this);
}
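// Example (a sketch; "other.parquet" is a hypothetical path whose schema is assumed
// to match this writer's schema): after start(), an existing file's row groups can be
// merged into this one with
//   writer.appendFile(conf, new Path("other.parquet"));
// before closing the file with end(...).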
public void appendRowGroups(FSDataInputStream file,
List<BlockMetaData> rowGroups,
boolean dropColumns) throws IOException {
appendRowGroups(HadoopStreams.wrap(file), rowGroups, dropColumns);
}
public void appendRowGroups(SeekableInputStream file,
List<BlockMetaData> rowGroups,
boolean dropColumns) throws IOException {
for (BlockMetaData block : rowGroups) {
appendRowGroup(file, block, dropColumns);
}
}
public void appendRowGroup(FSDataInputStream from, BlockMetaData rowGroup,
boolean dropColumns) throws IOException {
// wrap the Hadoop stream; calling the same overload again would recurse forever
appendRowGroup(HadoopStreams.wrap(from), rowGroup, dropColumns);
}
public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup,
boolean dropColumns) throws IOException {
startBlock(rowGroup.getRowCount());
Map<String, ColumnChunkMetaData> columnsToCopy =
new HashMap<String, ColumnChunkMetaData>();
for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
columnsToCopy.put(chunk.getPath().toDotString(), chunk);
}
List<ColumnChunkMetaData> columnsInOrder =
new ArrayList<ColumnChunkMetaData>();
for (ColumnDescriptor descriptor : schema.getColumns()) {
String path = ColumnPath.get(descriptor.getPath()).toDotString();
ColumnChunkMetaData chunk = columnsToCopy.remove(path);
if (chunk != null) {
columnsInOrder.add(chunk);
} else {
throw new IllegalArgumentException(String.format(
"Missing column '%s', cannot copy row group: %s", path, rowGroup));
}
}
// complain if some columns would be dropped and that's not okay
if (!dropColumns && !columnsToCopy.isEmpty()) {
throw new IllegalArgumentException(String.format(
"Columns cannot be copied (missing from target schema): %s",
Strings.join(columnsToCopy.keySet(), ", ")));
}
// copy the data for all chunks
long start = -1;
long length = 0;
long blockCompressedSize = 0;
for (int i = 0; i < columnsInOrder.size(); i += 1) {
ColumnChunkMetaData chunk = columnsInOrder.get(i);
// get this chunk's start position in the new file
long newChunkStart = out.getPos() + length;
// add this chunk to be copied with any previous chunks
if (start < 0) {
// no previous chunk included, start at this chunk's starting pos
start = chunk.getStartingPos();
}
length += chunk.getTotalSize();
if ((i + 1) == columnsInOrder.size() ||
columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
// not contiguous. do the copy now.
copy(from, out, start, length);
// reset to start at the next column chunk
start = -1;
length = 0;
}
currentBlock.addColumn(ColumnChunkMetaData.get(
chunk.getPath(),
chunk.getType(),
chunk.getCodec(),
chunk.getEncodingStats(),
chunk.getEncodings(),
chunk.getStatistics(),
newChunkStart,
newChunkStart,
chunk.getValueCount(),
chunk.getTotalSize(),
chunk.getTotalUncompressedSize()));
blockCompressedSize += chunk.getTotalSize();
}
currentBlock.setTotalByteSize(blockCompressedSize);
endBlock();
}
// Buffers for the copy function.
private static final ThreadLocal<byte[]> COPY_BUFFER =
new ThreadLocal<byte[]>() {
@Override
protected byte[] initialValue() {
return new byte[8192];
}
};
/**
* Copy from a FS input stream to an output stream. Thread-safe
*
* @param from a {@link SeekableInputStream}
* @param to any {@link OutputStream}
* @param start where in the from stream to start copying
* @param length the number of bytes to copy
* @throws IOException
*/
private static void copy(SeekableInputStream from, FSDataOutputStream to,
long start, long length) throws IOException {
LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
from.seek(start);
long bytesCopied = 0;
byte[] buffer = COPY_BUFFER.get();
while (bytesCopied < length) {
long bytesLeft = length - bytesCopied;
int bytesRead = from.read(buffer, 0,
(buffer.length < bytesLeft ? buffer.length : (int) bytesLeft));
if (bytesRead < 0) {
throw new IllegalArgumentException(
"Unexpected end of input file at " + (start + bytesCopied));
}
to.write(buffer, 0, bytesRead);
bytesCopied += bytesRead;
}
}
/**
* ends a file once all blocks have been written.
* closes the file.
* @param extraMetaData the extra meta data to write in the footer
* @throws IOException
*/
public void end(Map<String, String> extraMetaData) throws IOException {
state = state.end();
LOG.debug("{}: end", out.getPos());
ParquetMetadata footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
serializeFooter(footer, out);
out.close();
}
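// file tail layout written by serializeFooter: the Thrift-serialized FileMetaData,
// followed by its length as a 4-byte little-endian int and the 4-byte magic "PAR1"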
private static void serializeFooter(ParquetMetadata footer, FSDataOutputStream out) throws IOException {
long footerIndex = out.getPos();
org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
writeFileMetaData(parquetMetadata, out);
LOG.debug("{}: footer length = {}" , out.getPos(), (out.getPos() - footerIndex));
BytesUtils.writeIntLittleEndian(out, (int)(out.getPos() - footerIndex));
out.write(MAGIC);
}
/**
* writes a _metadata and _common_metadata file
* @param configuration the configuration to use to get the FileSystem
* @param outputPath the directory to write the _metadata file to
* @param footers the list of footers to merge
* @throws IOException
*/
public static void writeMetadataFile(Configuration configuration, Path outputPath, List