// org.apache.parquet.column.EncodingStats Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.column;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY;
import static org.apache.parquet.column.Encoding.RLE_DICTIONARY;
/**
 * EncodingStats track dictionary and data page encodings for a single column within a row group.
 * These are used when filtering row groups. For example, to filter a row group based on a column's
 * dictionary, all of the data pages in that column must be dictionary-encoded. This class provides
 * convenience methods for those checks, like {@link #hasNonDictionaryEncodedPages()}.
 * <p>
 * Instances are created through {@link Builder}. The maps held by an instance are unmodifiable
 * snapshots taken at {@link Builder#build()} time, keyed by encoding in the order in which the
 * encodings were first added to the builder.
 */
public class EncodingStats {
  /** Number of dictionary pages per encoding. */
  final Map<Encoding, Number> dictStats;
  /** Number of data pages per encoding. */
  final Map<Encoding, Number> dataStats;
  private final boolean usesV2Pages;

  private EncodingStats(Map<Encoding, Number> dictStats,
                        Map<Encoding, Number> dataStats,
                        boolean usesV2Pages) {
    this.dictStats = dictStats;
    this.dataStats = dataStats;
    this.usesV2Pages = usesV2Pages;
  }

  /**
   * @return the encodings that were recorded for dictionary pages
   */
  public Set<Encoding> getDictionaryEncodings() {
    return dictStats.keySet();
  }

  /**
   * @return the encodings that were recorded for data pages
   */
  public Set<Encoding> getDataEncodings() {
    return dataStats.keySet();
  }

  /**
   * @param enc the encoding to look up
   * @return the number of dictionary pages recorded with {@code enc}, or 0 if none were recorded
   */
  public int getNumDictionaryPagesEncodedAs(Encoding enc) {
    final Number pageCount = dictStats.get(enc);
    return (pageCount == null) ? 0 : pageCount.intValue();
  }

  /**
   * @param enc the encoding to look up
   * @return the number of data pages recorded with {@code enc}, or 0 if none were recorded
   */
  public int getNumDataPagesEncodedAs(Encoding enc) {
    final Number pageCount = dataStats.get(enc);
    return (pageCount == null) ? 0 : pageCount.intValue();
  }

  /**
   * @return true if at least one dictionary page encoding was recorded
   */
  public boolean hasDictionaryPages() {
    return !dictStats.isEmpty();
  }

  /**
   * @return true if at least one data page encoding recorded is {@code RLE_DICTIONARY} or
   *         {@code PLAIN_DICTIONARY}
   */
  public boolean hasDictionaryEncodedPages() {
    return dataStats.containsKey(RLE_DICTIONARY) || dataStats.containsKey(PLAIN_DICTIONARY);
  }

  /**
   * @return true if at least one data page encoding recorded is neither {@code RLE_DICTIONARY}
   *         nor {@code PLAIN_DICTIONARY}; false if no data page encodings were recorded or if
   *         all of them are dictionary encodings
   */
  public boolean hasNonDictionaryEncodedPages() {
    // Scan the keys directly instead of removing the dictionary encodings from a copy: a
    // short-circuited pair of remove() calls leaves PLAIN_DICTIONARY behind when both
    // dictionary encodings are present, which would wrongly report a non-dictionary page.
    for (Encoding encoding : dataStats.keySet()) {
      if (!isDictionaryEncoding(encoding)) {
        return true; // at least one non-dictionary encoding is present
      }
    }
    return false; // no pages, or every page is dictionary encoded
  }

  /**
   * @return true if the builder that produced these stats was marked with
   *         {@link Builder#withV2Pages()}
   */
  public boolean usesV2Pages() {
    return usesV2Pages;
  }

  // True for the two encodings that this class treats as dictionary encodings of data pages.
  private static boolean isDictionaryEncoding(Encoding encoding) {
    return RLE_DICTIONARY.equals(encoding) || PLAIN_DICTIONARY.equals(encoding);
  }

  /**
   * Used to build {@link EncodingStats} from metadata or to accumulate stats as pages are written.
   */
  public static class Builder {
    private final Map<Encoding, AtomicInteger> dictStats = new LinkedHashMap<>();
    private final Map<Encoding, AtomicInteger> dataStats = new LinkedHashMap<>();
    private boolean usesV2Pages = false;

    /**
     * Resets this builder: drops all recorded encodings and unsets the V2 pages flag.
     *
     * @return this builder
     */
    public Builder clear() {
      this.usesV2Pages = false;
      dictStats.clear();
      dataStats.clear();
      return this;
    }

    /**
     * Marks the stats being built as using V2 pages.
     *
     * @return this builder
     */
    public Builder withV2Pages() {
      this.usesV2Pages = true;
      return this;
    }

    /**
     * Records one dictionary page with the given encoding.
     *
     * @param encoding the dictionary page encoding
     * @return this builder
     */
    public Builder addDictEncoding(Encoding encoding) {
      return addDictEncoding(encoding, 1);
    }

    /**
     * Adds {@code numPages} to the dictionary page count of the given encoding.
     *
     * @param encoding the dictionary page encoding
     * @param numPages the number of pages to add to the count
     * @return this builder
     */
    public Builder addDictEncoding(Encoding encoding, int numPages) {
      dictStats.computeIfAbsent(encoding, enc -> new AtomicInteger(0))
          .addAndGet(numPages);
      return this;
    }

    /**
     * Records one data page for each element of {@code encodings}.
     *
     * @param encodings the data page encodings to record
     * @return this builder
     */
    public Builder addDataEncodings(Collection<Encoding> encodings) {
      for (Encoding encoding : encodings) {
        addDataEncoding(encoding);
      }
      return this;
    }

    /**
     * Records one data page with the given encoding.
     *
     * @param encoding the data page encoding
     * @return this builder
     */
    public Builder addDataEncoding(Encoding encoding) {
      return addDataEncoding(encoding, 1);
    }

    /**
     * Adds {@code numPages} to the data page count of the given encoding.
     *
     * @param encoding the data page encoding
     * @param numPages the number of pages to add to the count
     * @return this builder
     */
    public Builder addDataEncoding(Encoding encoding, int numPages) {
      dataStats.computeIfAbsent(encoding, enc -> new AtomicInteger(0))
          .addAndGet(numPages);
      return this;
    }

    /**
     * Creates an {@link EncodingStats} from the counts accumulated so far. The result holds a
     * snapshot: further calls on this builder do not change stats that were already built.
     *
     * @return a new {@link EncodingStats}
     */
    public EncodingStats build() {
      return new EncodingStats(snapshot(dictStats), snapshot(dataStats), usesV2Pages);
    }

    // Copies the current counter values into an unmodifiable, insertion-ordered map. The values
    // are copied as Integers so the mutable AtomicInteger counters are not shared with the result.
    private static Map<Encoding, Number> snapshot(Map<Encoding, AtomicInteger> counters) {
      final Map<Encoding, Number> copy = new LinkedHashMap<>();
      for (Map.Entry<Encoding, AtomicInteger> entry : counters.entrySet()) {
        copy.put(entry.getKey(), Integer.valueOf(entry.getValue().get()));
      }
      return Collections.unmodifiableMap(copy);
    }
  }
}