// Source listing: org.apache.iceberg.ManifestReader (from the iceberg-core artifact).
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.avro.AvroIterable;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableGroup;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.types.Types;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.iceberg.expressions.Expressions.alwaysTrue;
/**
* Reader for manifest files.
*
* Readers are created using the builder from {@link #read(InputFile, Function)}.
*/
public class ManifestReader extends CloseableGroup implements Filterable {
private static final Logger LOG = LoggerFactory.getLogger(ManifestReader.class);
private static final List ALL_COLUMNS = ImmutableList.of("*");
static final List CHANGE_COLUMNS = ImmutableList.of(
"file_path", "file_format", "partition", "record_count", "file_size_in_bytes");
static final List CHANGE_WITH_STATS_COLUMNS = ImmutableList.builder()
.addAll(CHANGE_COLUMNS)
.add("value_counts", "null_value_counts", "lower_bounds", "upper_bounds")
.build();
/**
* Returns a new {@link ManifestReader} for an {@link InputFile}.
*
* Note: Most callers should use {@link #read(InputFile, Function)} to ensure that the
* schema used by filters is the latest table schema. This should be used only when reading a
* manifest without filters.
*
* @param file an InputFile
* @return a manifest reader
*/
public static ManifestReader read(InputFile file) {
return new ManifestReader(file, null);
}
/**
* Returns a new {@link ManifestReader} for an {@link InputFile}.
*
* @param file an InputFile
* @param specLookup a function to look up the manifest's partition spec by ID
* @return a manifest reader
*/
public static ManifestReader read(InputFile file, Function specLookup) {
return new ManifestReader(file, specLookup);
}
private final InputFile file;
private final Map metadata;
private final PartitionSpec spec;
private final Schema fileSchema;
// lazily initialized
private List cachedAdds = null;
private List cachedDeletes = null;
private ManifestReader(InputFile file, Function specLookup) {
this.file = file;
try {
try (AvroIterable headerReader = Avro.read(file)
.project(ManifestEntry.getSchema(Types.StructType.of()).select("status"))
.build()) {
this.metadata = headerReader.getMetadata();
}
} catch (IOException e) {
throw new RuntimeIOException(e);
}
int specId = TableMetadata.INITIAL_SPEC_ID;
String specProperty = metadata.get("partition-spec-id");
if (specProperty != null) {
specId = Integer.parseInt(specProperty);
}
if (specLookup != null) {
this.spec = specLookup.apply(specId);
} else {
Schema schema = SchemaParser.fromJson(metadata.get("schema"));
this.spec = PartitionSpecParser.fromJsonFields(schema, specId, metadata.get("partition-spec"));
}
this.fileSchema = new Schema(DataFile.getType(spec.partitionType()).fields());
}
public InputFile file() {
return file;
}
public Schema schema() {
return fileSchema;
}
public PartitionSpec spec() {
return spec;
}
@Override
public FilteredManifest select(Collection columns) {
return new FilteredManifest(this, alwaysTrue(), alwaysTrue(), fileSchema, columns, true);
}
@Override
public FilteredManifest project(Schema fileProjection) {
return new FilteredManifest(this, alwaysTrue(), alwaysTrue(), fileProjection, ALL_COLUMNS, true);
}
@Override
public FilteredManifest filterPartitions(Expression expr) {
return new FilteredManifest(this, expr, alwaysTrue(), fileSchema, ALL_COLUMNS, true);
}
@Override
public FilteredManifest filterRows(Expression expr) {
return new FilteredManifest(this, alwaysTrue(), expr, fileSchema, ALL_COLUMNS, true);
}
public FilteredManifest caseSensitive(boolean caseSensitive) {
return new FilteredManifest(this, alwaysTrue(), alwaysTrue(), fileSchema, ALL_COLUMNS, caseSensitive);
}
public List addedFiles() {
if (cachedAdds == null) {
cacheChanges();
}
return cachedAdds;
}
public List deletedFiles() {
if (cachedDeletes == null) {
cacheChanges();
}
return cachedDeletes;
}
private void cacheChanges() {
List adds = Lists.newArrayList();
List deletes = Lists.newArrayList();
try (CloseableIterable entries = entries(fileSchema.select(CHANGE_COLUMNS))) {
for (ManifestEntry entry : entries) {
switch (entry.status()) {
case ADDED:
adds.add(entry.copyWithoutStats());
break;
case DELETED:
deletes.add(entry.copyWithoutStats());
break;
default:
}
}
} catch (IOException e) {
throw new RuntimeIOException(e, "Failed to close manifest entries");
}
this.cachedAdds = adds;
this.cachedDeletes = deletes;
}
CloseableIterable entries() {
return entries(fileSchema);
}
CloseableIterable entries(Schema fileProjection) {
FileFormat format = FileFormat.fromFileName(file.location());
Preconditions.checkArgument(format != null, "Unable to determine format of manifest: %s", file);
switch (format) {
case AVRO:
AvroIterable reader = Avro.read(file)
.project(ManifestEntry.wrapFileSchema(fileProjection.asStruct()))
.rename("manifest_entry", ManifestEntry.class.getName())
.rename("partition", PartitionData.class.getName())
.rename("r102", PartitionData.class.getName())
.rename("data_file", GenericDataFile.class.getName())
.rename("r2", GenericDataFile.class.getName())
.reuseContainers()
.build();
addCloseable(reader);
return reader;
default:
throw new UnsupportedOperationException("Invalid format for manifest file: " + format);
}
}
@Override
public Iterator iterator() {
return iterator(alwaysTrue(), fileSchema);
}
// visible for use by PartialManifest
Iterator iterator(Expression partFilter, Schema fileProjection) {
return Iterables.transform(Iterables.filter(
entries(fileProjection),
entry -> entry.status() != ManifestEntry.Status.DELETED),
ManifestEntry::file).iterator();
}
}