// uk.gov.gchq.gaffer.sparkaccumulo.operation.rfilereaderrdd.RFileReaderIterator Maven / Gradle / Ivy
/*
* Copyright 2017-2020 Crown Copyright
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package uk.gov.gchq.gaffer.sparkaccumulo.operation.rfilereaderrdd;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat;
import org.apache.accumulo.core.client.mapreduce.lib.impl.InputConfigurator;
import org.apache.accumulo.core.client.sample.SamplerConfiguration;
import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.SiteConfiguration;
import org.apache.accumulo.core.data.ArrayByteSequence;
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile;
import org.apache.accumulo.core.file.rfile.RFile;
import org.apache.accumulo.core.iterators.IteratorEnvironment;
import org.apache.accumulo.core.iterators.IteratorUtil;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.system.MultiIterator;
import org.apache.accumulo.core.iterators.system.VisibilityFilter;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.Pair;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.Partition;
import org.apache.spark.TaskContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
* A {@code RFileReaderIterator} is a {@link java.util.Iterator} formed by merging iterators over
* a set of RFiles.
*/
public class RFileReaderIterator implements java.util.Iterator> {
private static final Logger LOGGER = LoggerFactory.getLogger(RFileReaderIterator.class);
private final Partition partition;
private final TaskContext taskContext;
private final List> iterators = new ArrayList<>();
private SortedKeyValueIterator mergedIterator = null;
private SortedKeyValueIterator iteratorAfterIterators = null;
private Configuration configuration;
private Set auths;
public RFileReaderIterator(final Partition partition,
final TaskContext taskContext,
final Configuration configuration,
final Set auths) {
this.partition = partition;
this.taskContext = taskContext;
this.configuration = configuration;
this.auths = auths;
try {
init();
} catch (final IOException e) {
throw new RuntimeException("IOException initialising RFileReaderIterator", e);
}
}
@Override
public boolean hasNext() {
return iteratorAfterIterators.hasTop();
}
@Override
public Map.Entry next() {
final Map.Entry next = new AbstractMap.SimpleEntry<>(new Key(iteratorAfterIterators.getTopKey()),
new Value(iteratorAfterIterators.getTopValue()));
try {
iteratorAfterIterators.next();
} catch (final IOException e) {
// Swallow
}
return next;
}
private void init() throws IOException {
final AccumuloTablet accumuloTablet = (AccumuloTablet) partition;
LOGGER.info("Initialising RFileReaderIterator for files {}", StringUtils.join(accumuloTablet.getFiles(), ','));
final AccumuloConfiguration accumuloConfiguration = SiteConfiguration.getInstance();
// Required column families according to the configuration
final Set requiredColumnFamilies = InputConfigurator
.getFetchedColumns(AccumuloInputFormat.class, configuration)
.stream()
.map(Pair::getFirst)
.map(c -> new ArrayByteSequence(c.toString()))
.collect(Collectors.toSet());
LOGGER.info("RFileReaderIterator will read column families of {}", StringUtils.join(requiredColumnFamilies, ','));
// Column families
final List> iterators = new ArrayList<>();
for (final String filename : accumuloTablet.getFiles()) {
final Path path = new Path(filename);
final FileSystem fs = path.getFileSystem(configuration);
final RFile.Reader rFileReader = new RFile.Reader(
new CachableBlockFile.Reader(fs, path, configuration, null, null, accumuloConfiguration));
iterators.add(rFileReader);
}
mergedIterator = new MultiIterator(iterators, true);
// Apply visibility filtering iterator
if (null != auths) {
final Authorizations authorizations = new Authorizations(auths.toArray(new String[auths.size()]));
final SortedKeyValueIterator visibilityFilter = VisibilityFilter.wrap(mergedIterator, authorizations, new byte[]{});
final IteratorSetting visibilityIteratorSetting = new IteratorSetting(1, "auth", VisibilityFilter.class);
visibilityFilter.init(mergedIterator, visibilityIteratorSetting.getOptions(), null);
iteratorAfterIterators = visibilityFilter;
LOGGER.info("Set authorizations to {}", authorizations);
} else {
iteratorAfterIterators = mergedIterator;
}
// Apply iterator stack
final List iteratorSettings = getIteratorSettings();
iteratorSettings.sort(Comparator.comparingInt(IteratorSetting::getPriority));
for (final IteratorSetting is : iteratorSettings) {
iteratorAfterIterators = applyIterator(iteratorAfterIterators, is);
}
taskContext.addTaskCompletionListener(context -> close());
final Range range = new Range(accumuloTablet.getStartRow(), true, accumuloTablet.getEndRow(), false);
iteratorAfterIterators.seek(range, requiredColumnFamilies, true);
LOGGER.info("Initialised iterator");
}
private SortedKeyValueIterator applyIterator(final SortedKeyValueIterator source,
final IteratorSetting is) {
try {
SortedKeyValueIterator result = Class.forName(is.getIteratorClass())
.asSubclass(SortedKeyValueIterator.class).newInstance();
result.init(source, is.getOptions(), new IteratorEnvironment() {
@Override
public SortedKeyValueIterator reserveMapFileReader(final String mapFileName) {
return null;
}
@Override
public AccumuloConfiguration getConfig() {
return null;
}
@Override
public IteratorUtil.IteratorScope getIteratorScope() {
return IteratorUtil.IteratorScope.majc;
}
@Override
public boolean isFullMajorCompaction() {
return false;
}
@Override
public void registerSideChannel(final SortedKeyValueIterator iter) {
}
@Override
public Authorizations getAuthorizations() {
return null;
}
@Override
public IteratorEnvironment cloneWithSamplingEnabled() {
return null;
}
@Override
public boolean isSamplingEnabled() {
return false;
}
@Override
public SamplerConfiguration getSamplerConfiguration() {
return null;
}
});
return result;
} catch (final IOException | InstantiationException | IllegalAccessException | ClassNotFoundException e) {
throw new RuntimeException("Exception creating iterator of class " + is.getIteratorClass());
}
}
private List getIteratorSettings() {
return InputConfigurator.getIterators(AccumuloInputFormat.class, configuration);
}
private void close() {
for (final SortedKeyValueIterator iterator : iterators) {
RFile.Reader reader = null;
try {
reader = (RFile.Reader) iterator;
LOGGER.debug("Closing RFile.Reader {}", reader);
reader.close();
} catch (final IOException e) {
LOGGER.error("IOException closing reader {}", reader);
}
}
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy