Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.hadoop;
import static org.apache.parquet.format.Util.writeFileMetaData;
import static org.apache.parquet.hadoop.ParquetWriter.DEFAULT_BLOCK_SIZE;
import static org.apache.parquet.hadoop.ParquetWriter.MAX_PADDING_SIZE_DEFAULT;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Preconditions;
import org.apache.parquet.Strings;
import org.apache.parquet.Version;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.GlobalMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.hadoop.util.HadoopStreams;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.PositionOutputStream;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.TypeUtil;
import com.facebook.presto.hive.$internal.org.slf4j.Logger;
import com.facebook.presto.hive.$internal.org.slf4j.LoggerFactory;
/**
* Internal implementation of the Parquet file writer as a block container
*/
public class ParquetFileWriter {
private static final Logger LOG = LoggerFactory.getLogger(ParquetFileWriter.class);
private static ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
public static final String PARQUET_METADATA_FILE = "_metadata";
public static final String MAGIC_STR = "PAR1";
public static final byte[] MAGIC = MAGIC_STR.getBytes(Charset.forName("ASCII"));
public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
public static final int CURRENT_VERSION = 1;
// File creation modes
public static enum Mode {
CREATE,
OVERWRITE
}
private final MessageType schema;
private final PositionOutputStream out;
private final AlignmentStrategy alignment;
// file data
private List blocks = new ArrayList();
// row group data
private BlockMetaData currentBlock; // appended to by endColumn
// row group data set at the start of a row group
private long currentRecordCount; // set in startBlock
// column chunk data accumulated as pages are written
private EncodingStats.Builder encodingStatsBuilder;
private Set currentEncodings;
private long uncompressedLength;
private long compressedLength;
private Statistics currentStatistics; // accumulated in writePage(s)
// column chunk data set at the start of a column
private CompressionCodecName currentChunkCodec; // set in startColumn
private ColumnPath currentChunkPath; // set in startColumn
private PrimitiveType currentChunkType; // set in startColumn
private long currentChunkValueCount; // set in startColumn
private long currentChunkFirstDataPage; // set in startColumn (out.pos())
private long currentChunkDictionaryPageOffset; // set in writeDictionaryPage
// set when end is called
private ParquetMetadata footer = null;
/**
* Captures the order in which methods should be called
*/
private enum STATE {
NOT_STARTED {
STATE start() {
return STARTED;
}
},
STARTED {
STATE startBlock() {
return BLOCK;
}
STATE end() {
return ENDED;
}
},
BLOCK {
STATE startColumn() {
return COLUMN;
}
STATE endBlock() {
return STARTED;
}
},
COLUMN {
STATE endColumn() {
return BLOCK;
};
STATE write() {
return this;
}
},
ENDED;
STATE start() throws IOException { return error(); }
STATE startBlock() throws IOException { return error(); }
STATE startColumn() throws IOException { return error(); }
STATE write() throws IOException { return error(); }
STATE endColumn() throws IOException { return error(); }
STATE endBlock() throws IOException { return error(); }
STATE end() throws IOException { return error(); }
private final STATE error() throws IOException {
throw new IOException("The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: " + this.name());
}
}
private STATE state = STATE.NOT_STARTED;
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @throws IOException if the file can not be created
* @deprecated will be removed in 2.0.0
*/
@Deprecated
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file) throws IOException {
this(HadoopOutputFile.fromPath(file, configuration),
schema, Mode.CREATE, DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT);
}
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param mode file creation mode
* @throws IOException if the file can not be created
* @deprecated will be removed in 2.0.0
*/
@Deprecated
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, Mode mode) throws IOException {
this(HadoopOutputFile.fromPath(file, configuration),
schema, mode, DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT);
}
/**
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param mode file creation mode
* @param rowGroupSize the row group size
* @param maxPaddingSize the maximum padding
* @throws IOException if the file can not be created
* @deprecated will be removed in 2.0.0
*/
@Deprecated
public ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, Mode mode, long rowGroupSize,
int maxPaddingSize)
throws IOException {
this(HadoopOutputFile.fromPath(file, configuration),
schema, mode, rowGroupSize, maxPaddingSize);
}
/**
* @param file OutputFile to create or overwrite
* @param schema the schema of the data
* @param mode file creation mode
* @param rowGroupSize the row group size
* @param maxPaddingSize the maximum padding
* @throws IOException if the file can not be created
*/
public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode,
long rowGroupSize, int maxPaddingSize)
throws IOException {
TypeUtil.checkValidWriteSchema(schema);
this.schema = schema;
long blockSize = rowGroupSize;
if (file.supportsBlockSize()) {
blockSize = Math.max(file.defaultBlockSize(), rowGroupSize);
this.alignment = PaddingAlignment.get(blockSize, rowGroupSize, maxPaddingSize);
} else {
this.alignment = NoAlignment.get(rowGroupSize);
}
if (mode == Mode.OVERWRITE) {
this.out = file.createOrOverwrite(blockSize);
} else {
this.out = file.create(blockSize);
}
this.encodingStatsBuilder = new EncodingStats.Builder();
}
/**
* FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
*
* @param configuration Hadoop configuration
* @param schema the schema of the data
* @param file the file to write to
* @param rowAndBlockSize the row group size
* @param maxPaddingSize the maximum padding
* @throws IOException if the file can not be created
*/
ParquetFileWriter(Configuration configuration, MessageType schema,
Path file, long rowAndBlockSize, int maxPaddingSize)
throws IOException {
FileSystem fs = file.getFileSystem(configuration);
this.schema = schema;
this.alignment = PaddingAlignment.get(
rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
this.out = HadoopStreams.wrap(
fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
this.encodingStatsBuilder = new EncodingStats.Builder();
}
/**
* start the file
* @throws IOException if there is an error while writing
*/
public void start() throws IOException {
state = state.start();
LOG.debug("{}: start", out.getPos());
out.write(MAGIC);
}
/**
* start a block
* @param recordCount the record count in this block
* @throws IOException if there is an error while writing
*/
public void startBlock(long recordCount) throws IOException {
state = state.startBlock();
LOG.debug("{}: start block", out.getPos());
// out.write(MAGIC); // TODO: add a magic delimiter
alignment.alignForRowGroup(out);
currentBlock = new BlockMetaData();
currentRecordCount = recordCount;
}
/**
* start a column inside a block
* @param descriptor the column descriptor
* @param valueCount the value count in this column
* @param compressionCodecName a compression codec name
* @throws IOException if there is an error while writing
*/
public void startColumn(ColumnDescriptor descriptor,
long valueCount,
CompressionCodecName compressionCodecName) throws IOException {
state = state.startColumn();
encodingStatsBuilder.clear();
currentEncodings = new HashSet();
currentChunkPath = ColumnPath.get(descriptor.getPath());
currentChunkType = descriptor.getPrimitiveType();
currentChunkCodec = compressionCodecName;
currentChunkValueCount = valueCount;
currentChunkFirstDataPage = out.getPos();
compressedLength = 0;
uncompressedLength = 0;
// The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one
currentStatistics = null;
}
/**
* writes a dictionary page page
* @param dictionaryPage the dictionary page
* @throws IOException if there is an error while writing
*/
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
state = state.write();
LOG.debug("{}: write dictionary page: {} values", out.getPos(), dictionaryPage.getDictionarySize());
currentChunkDictionaryPageOffset = out.getPos();
int uncompressedSize = dictionaryPage.getUncompressedSize();
int compressedPageSize = (int)dictionaryPage.getBytes().size(); // TODO: fix casts
metadataConverter.writeDictionaryPageHeader(
uncompressedSize,
compressedPageSize,
dictionaryPage.getDictionarySize(),
dictionaryPage.getEncoding(),
out);
long headerSize = out.getPos() - currentChunkDictionaryPageOffset;
this.uncompressedLength += uncompressedSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write dictionary page content {}", out.getPos(), compressedPageSize);
dictionaryPage.getBytes().writeAllTo(out);
encodingStatsBuilder.addDictEncoding(dictionaryPage.getEncoding());
currentEncodings.add(dictionaryPage.getEncoding());
}
/**
* writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
* @throws IOException if there is an error while writing
*/
@Deprecated
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Encoding rlEncoding,
Encoding dlEncoding,
Encoding valuesEncoding) throws IOException {
state = state.write();
long beforeHeader = out.getPos();
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int)bytes.size();
metadataConverter.writeDataPageHeader(
uncompressedPageSize, compressedPageSize,
valueCount,
rlEncoding,
dlEncoding,
valuesEncoding,
out);
long headerSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedPageSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
bytes.writeAllTo(out);
encodingStatsBuilder.addDataEncoding(valuesEncoding);
currentEncodings.add(rlEncoding);
currentEncodings.add(dlEncoding);
currentEncodings.add(valuesEncoding);
}
/**
* writes a single page
* @param valueCount count of values
* @param uncompressedPageSize the size of the data once uncompressed
* @param bytes the compressed data for the page without header
* @param statistics statistics for the page
* @param rlEncoding encoding of the repetition level
* @param dlEncoding encoding of the definition level
* @param valuesEncoding encoding of values
* @throws IOException if there is an error while writing
*/
public void writeDataPage(
int valueCount, int uncompressedPageSize,
BytesInput bytes,
Statistics statistics,
Encoding rlEncoding,
Encoding dlEncoding,
Encoding valuesEncoding) throws IOException {
state = state.write();
long beforeHeader = out.getPos();
LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
int compressedPageSize = (int)bytes.size();
metadataConverter.writeDataPageHeader(
uncompressedPageSize, compressedPageSize,
valueCount,
statistics,
rlEncoding,
dlEncoding,
valuesEncoding,
out);
long headerSize = out.getPos() - beforeHeader;
this.uncompressedLength += uncompressedPageSize + headerSize;
this.compressedLength += compressedPageSize + headerSize;
LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
bytes.writeAllTo(out);
// Copying the statistics if it is not initialized yet so we have the correct typed one
if (currentStatistics == null) {
currentStatistics = statistics.copy();
} else {
currentStatistics.mergeStatistics(statistics);
}
encodingStatsBuilder.addDataEncoding(valuesEncoding);
currentEncodings.add(rlEncoding);
currentEncodings.add(dlEncoding);
currentEncodings.add(valuesEncoding);
}
/**
* writes a number of pages at once
* @param bytes bytes to be written including page headers
* @param uncompressedTotalPageSize total uncompressed size (without page headers)
* @param compressedTotalPageSize total compressed size (without page headers)
* @throws IOException if there is an error while writing
*/
void writeDataPages(BytesInput bytes,
long uncompressedTotalPageSize,
long compressedTotalPageSize,
Statistics totalStats,
Set rlEncodings,
Set dlEncodings,
List dataEncodings) throws IOException {
state = state.write();
LOG.debug("{}: write data pages", out.getPos());
long headersSize = bytes.size() - compressedTotalPageSize;
this.uncompressedLength += uncompressedTotalPageSize + headersSize;
this.compressedLength += compressedTotalPageSize + headersSize;
LOG.debug("{}: write data pages content", out.getPos());
bytes.writeAllTo(out);
encodingStatsBuilder.addDataEncodings(dataEncodings);
if (rlEncodings.isEmpty()) {
encodingStatsBuilder.withV2Pages();
}
currentEncodings.addAll(rlEncodings);
currentEncodings.addAll(dlEncodings);
currentEncodings.addAll(dataEncodings);
currentStatistics = totalStats;
}
/**
* end a column (once all rep, def and data have been written)
* @throws IOException if there is an error while writing
*/
public void endColumn() throws IOException {
state = state.endColumn();
LOG.debug("{}: end column", out.getPos());
currentBlock.addColumn(ColumnChunkMetaData.get(
currentChunkPath,
currentChunkType,
currentChunkCodec,
encodingStatsBuilder.build(),
currentEncodings,
currentStatistics,
currentChunkFirstDataPage,
currentChunkDictionaryPageOffset,
currentChunkValueCount,
compressedLength,
uncompressedLength));
this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
this.uncompressedLength = 0;
this.compressedLength = 0;
}
/**
* ends a block once all column chunks have been written
* @throws IOException if there is an error while writing
*/
public void endBlock() throws IOException {
state = state.endBlock();
LOG.debug("{}: end block", out.getPos());
currentBlock.setRowCount(currentRecordCount);
blocks.add(currentBlock);
currentBlock = null;
}
/**
* @param conf a configuration
* @param file a file path to append the contents of to this file
* @throws IOException if there is an error while reading or writing
* @deprecated will be removed in 2.0.0; use {@link #appendFile(InputFile)} instead
*/
@Deprecated
public void appendFile(Configuration conf, Path file) throws IOException {
ParquetFileReader.open(conf, file).appendTo(this);
}
public void appendFile(InputFile file) throws IOException {
ParquetFileReader.open(file).appendTo(this);
}
/**
* @param file a file stream to read from
* @param rowGroups row groups to copy
* @param dropColumns whether to drop columns from the file that are not in this file's schema
* @throws IOException if there is an error while reading or writing
* @deprecated will be removed in 2.0.0;
* use {@link #appendRowGroups(SeekableInputStream,List,boolean)} instead
*/
@Deprecated
public void appendRowGroups(FSDataInputStream file,
List rowGroups,
boolean dropColumns) throws IOException {
appendRowGroups(HadoopStreams.wrap(file), rowGroups, dropColumns);
}
public void appendRowGroups(SeekableInputStream file,
List rowGroups,
boolean dropColumns) throws IOException {
for (BlockMetaData block : rowGroups) {
appendRowGroup(file, block, dropColumns);
}
}
/**
* @param from a file stream to read from
* @param rowGroup row group to copy
* @param dropColumns whether to drop columns from the file that are not in this file's schema
* @throws IOException if there is an error while reading or writing
* @deprecated will be removed in 2.0.0;
* use {@link #appendRowGroup(SeekableInputStream,BlockMetaData,boolean)} instead
*/
@Deprecated
public void appendRowGroup(FSDataInputStream from, BlockMetaData rowGroup,
boolean dropColumns) throws IOException {
appendRowGroup(HadoopStreams.wrap(from), rowGroup, dropColumns);
}
public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup,
boolean dropColumns) throws IOException {
startBlock(rowGroup.getRowCount());
Map columnsToCopy =
new HashMap();
for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
columnsToCopy.put(chunk.getPath().toDotString(), chunk);
}
List columnsInOrder =
new ArrayList();
for (ColumnDescriptor descriptor : schema.getColumns()) {
String path = ColumnPath.get(descriptor.getPath()).toDotString();
ColumnChunkMetaData chunk = columnsToCopy.remove(path);
if (chunk != null) {
columnsInOrder.add(chunk);
} else {
throw new IllegalArgumentException(String.format(
"Missing column '%s', cannot copy row group: %s", path, rowGroup));
}
}
// complain if some columns would be dropped and that's not okay
if (!dropColumns && !columnsToCopy.isEmpty()) {
throw new IllegalArgumentException(String.format(
"Columns cannot be copied (missing from target schema): %s",
Strings.join(columnsToCopy.keySet(), ", ")));
}
// copy the data for all chunks
long start = -1;
long length = 0;
long blockCompressedSize = 0;
for (int i = 0; i < columnsInOrder.size(); i += 1) {
ColumnChunkMetaData chunk = columnsInOrder.get(i);
// get this chunk's start position in the new file
long newChunkStart = out.getPos() + length;
// add this chunk to be copied with any previous chunks
if (start < 0) {
// no previous chunk included, start at this chunk's starting pos
start = chunk.getStartingPos();
}
length += chunk.getTotalSize();
if ((i + 1) == columnsInOrder.size() ||
columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
// not contiguous. do the copy now.
copy(from, out, start, length);
// reset to start at the next column chunk
start = -1;
length = 0;
}
currentBlock.addColumn(ColumnChunkMetaData.get(
chunk.getPath(),
chunk.getPrimitiveType(),
chunk.getCodec(),
chunk.getEncodingStats(),
chunk.getEncodings(),
chunk.getStatistics(),
newChunkStart,
newChunkStart,
chunk.getValueCount(),
chunk.getTotalSize(),
chunk.getTotalUncompressedSize()));
blockCompressedSize += chunk.getTotalSize();
}
currentBlock.setTotalByteSize(blockCompressedSize);
endBlock();
}
// Buffers for the copy function.
private static final ThreadLocal COPY_BUFFER =
new ThreadLocal() {
@Override
protected byte[] initialValue() {
return new byte[8192];
}
};
/**
* Copy from a FS input stream to an output stream. Thread-safe
*
* @param from a {@link SeekableInputStream}
* @param to any {@link PositionOutputStream}
* @param start where in the from stream to start copying
* @param length the number of bytes to copy
* @throws IOException if there is an error while reading or writing
*/
private static void copy(SeekableInputStream from, PositionOutputStream to,
long start, long length) throws IOException{
LOG.debug("Copying {} bytes at {} to {}" ,length , start , to.getPos());
from.seek(start);
long bytesCopied = 0;
byte[] buffer = COPY_BUFFER.get();
while (bytesCopied < length) {
long bytesLeft = length - bytesCopied;
int bytesRead = from.read(buffer, 0,
(buffer.length < bytesLeft ? buffer.length : (int) bytesLeft));
if (bytesRead < 0) {
throw new IllegalArgumentException(
"Unexpected end of input file at " + start + bytesCopied);
}
to.write(buffer, 0, bytesRead);
bytesCopied += bytesRead;
}
}
/**
* ends a file once all blocks have been written.
* closes the file.
* @param extraMetaData the extra meta data to write in the footer
* @throws IOException if there is an error while writing
*/
public void end(Map extraMetaData) throws IOException {
state = state.end();
LOG.debug("{}: end", out.getPos());
this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
serializeFooter(footer, out);
out.close();
}
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
long footerIndex = out.getPos();
org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
writeFileMetaData(parquetMetadata, out);
LOG.debug("{}: footer length = {}" , out.getPos(), (out.getPos() - footerIndex));
BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
out.write(MAGIC);
}
public ParquetMetadata getFooter() {
Preconditions.checkState(state == STATE.ENDED, "Cannot return unfinished footer.");
return footer;
}
/**
* Given a list of metadata files, merge them into a single ParquetMetadata
* Requires that the schemas be compatible, and the extraMetadata be exactly equal.
* @param files a list of files to merge metadata from
* @param conf a configuration
* @return merged parquet metadata for the files
* @throws IOException if there is an error while writing
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List files, Configuration conf) throws IOException {
Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
GlobalMetaData globalMetaData = null;
List blocks = new ArrayList();
for (Path p : files) {
ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
FileMetaData fmd = pmd.getFileMetaData();
globalMetaData = mergeInto(fmd, globalMetaData, true);
blocks.addAll(pmd.getBlocks());
}
// collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
return new ParquetMetadata(globalMetaData.merge(), blocks);
}
/**
* Given a list of metadata files, merge them into a single metadata file.
* Requires that the schemas be compatible, and the extraMetaData be exactly equal.
* This is useful when merging 2 directories of parquet files into a single directory, as long
* as both directories were written with compatible schemas and equal extraMetaData.
* @param files a list of files to merge metadata from
* @param outputPath path to write merged metadata to
* @param conf a configuration
* @throws IOException if there is an error while reading or writing
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
public static void writeMergedMetadataFile(List files, Path outputPath, Configuration conf) throws IOException {
ParquetMetadata merged = mergeMetadataFiles(files, conf);
writeMetadataFile(outputPath, merged, outputPath.getFileSystem(conf));
}
/**
* writes a _metadata and _common_metadata file
* @param configuration the configuration to use to get the FileSystem
* @param outputPath the directory to write the _metadata file to
* @param footers the list of footers to merge
* @throws IOException if there is an error while writing
* @deprecated metadata files are not recommended and will be removed in 2.0.0
*/
@Deprecated
public static void writeMetadataFile(Configuration configuration, Path outputPath, List