com.hazelcast.jet.hadoop.impl.ReadHadoopNewApiP

/*
 * Copyright 2024 Hazelcast Inc.
 *
 * Licensed under the Hazelcast Community License (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://hazelcast.com/hazelcast-community-license
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.jet.hadoop.impl;

import com.hazelcast.cluster.Address;
import com.hazelcast.function.BiFunctionEx;
import com.hazelcast.function.ConsumerEx;
import com.hazelcast.internal.serialization.InternalSerializationService;
import com.hazelcast.jet.JetException;
import com.hazelcast.jet.Traverser;
import com.hazelcast.jet.core.AbstractProcessor;
import com.hazelcast.jet.core.Processor;
import com.hazelcast.jet.core.ProcessorSupplier;
import com.hazelcast.jet.hadoop.HadoopSources;
import com.hazelcast.jet.impl.execution.init.Contexts.ProcCtx;
import com.hazelcast.jet.impl.util.Util;
import com.hazelcast.jet.pipeline.file.impl.FileTraverser;
import com.hazelcast.logging.ILogger;
import com.hazelcast.logging.Logger;
import com.hazelcast.security.PermissionsUtil;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.InvalidInputException;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.io.Serial;
import java.lang.reflect.Constructor;
import java.security.Permission;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

import static com.hazelcast.jet.Traversers.traverseIterable;
import static com.hazelcast.jet.hadoop.HadoopSources.COPY_ON_READ;
import static com.hazelcast.internal.util.ExceptionUtil.sneakyThrow;
import static com.hazelcast.jet.impl.util.Util.uncheckCall;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;
import static org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR;
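
/*
 * Usage sketch (illustrative, not part of this source file): instances of this
 * processor are normally created through the public HadoopSources.inputFormat
 * factory rather than directly. The configuration object "jobConf" below is an
 * assumed example value.
 *
 *   Pipeline p = Pipeline.create();
 *   p.readFrom(HadoopSources.<LongWritable, Text, String>inputFormat(
 *           jobConf, (key, value) -> value.toString()))
 *    .writeTo(Sinks.logger());
 */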

/**
 * See {@link HadoopSources#inputFormat}.
 */
public final class ReadHadoopNewApiP<K, V, R> extends AbstractProcessor {

    private static final Class<?>[] EMPTY_ARRAY = new Class[0];

    private final Configuration configuration;
    private final List<InputSplit> splits;
    private final BiFunctionEx<K, V, R> projectionFn;

    private HadoopFileTraverser<K, V, R> traverser;

    private ReadHadoopNewApiP(
            @Nonnull Configuration configuration,
            @Nonnull List<InputSplit> splits,
            @Nonnull BiFunctionEx<K, V, R> projectionFn
    ) {
        this.configuration = configuration;
        this.splits = splits;
        this.projectionFn = projectionFn;
    }

    @Override
    protected void init(@Nonnull Context context) {
        InternalSerializationService serializationService = ((ProcCtx) context).serializationService();

        // If configured so (the default), we copy the projected record, because some
        // record readers return the same object from `reader.getCurrentKey()` and
        // `reader.getCurrentValue()` and mutate it on each `reader.nextKeyValue()` call.
        BiFunctionEx<K, V, R> projectionFn = this.projectionFn;
        if (configuration.getBoolean(COPY_ON_READ, true)) {
            BiFunctionEx<K, V, R> actualProjectionFn = projectionFn;
            projectionFn = (key, value) -> {
                R result = actualProjectionFn.apply(key, value);
                return result == null ? null : serializationService.toObject(serializationService.toData(result));
            };
        }
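        // Illustrative note: TextInputFormat's record reader, for example, reuses a
        // single Text instance for every value, so without the serialize/deserialize
        // round-trip above a projection that returns the value as-is would emit the
        // same mutated object for every record.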

        traverser = new HadoopFileTraverser<>(configuration, splits, projectionFn);
    }

    @Override
    public boolean isCooperative() {
        return false;
    }

    @Override
    public boolean complete() {
        return emitFromTraverser(traverser);
    }

    @Override
    public void close() throws Exception {
        if (traverser != null) {
            traverser.close();
        }
    }

    @SuppressWarnings("unchecked")
    private static <K, V> InputFormat<K, V> extractInputFormat(Configuration configuration) throws Exception {
        Class<?> inputFormatClass = configuration.getClass(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
        Constructor<?> constructor = inputFormatClass.getDeclaredConstructor(EMPTY_ARRAY);
        constructor.setAccessible(true);

        InputFormat<K, V> inputFormat = (InputFormat<K, V>) constructor.newInstance();
        ReflectionUtils.setConf(inputFormat, configuration);
        return inputFormat;
    }

    private static <K, V> List<InputSplit> getSplits(Configuration configuration) throws Exception {
        InputFormat<K, V> inputFormat = extractInputFormat(configuration);
        Job job = Job.getInstance(configuration);
        try {
            return inputFormat.getSplits(job);
        } catch (InvalidInputException e) {
            String directory = configuration.get(INPUT_DIR, "");
            boolean ignoreFileNotFound = configuration.getBoolean(HadoopSources.IGNORE_FILE_NOT_FOUND, true);
            if (ignoreFileNotFound) {
                ILogger logger = Logger.getLogger(ReadHadoopNewApiP.class);
                logger.fine("The directory '" + directory + "' does not exist. This source will emit 0 items.");
                return emptyList();
            } else {
                throw new JetException("The input " + directory + " matches no files");
            }
        }
    }
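
    /*
     * Configuration sketch (the property names are public constants on
     * HadoopSources, referenced above; the values shown are illustrative):
     *
     *   Configuration conf = new Configuration();
     *   conf.setBoolean(HadoopSources.COPY_ON_READ, false);          // skip the defensive copy in init()
     *   conf.setBoolean(HadoopSources.IGNORE_FILE_NOT_FOUND, false); // fail instead of emitting 0 items
     */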

    public static class MetaSupplier<K, V, R> extends ReadHdfsMetaSupplierBase<R> {
        @Serial
        private static final long serialVersionUID = 1L;

        /**
         * The instance is either {@link SerializableConfiguration} or {@link
         * SerializableJobConf}, which are serializable.
         */
        @SuppressFBWarnings("SE_BAD_FIELD")
        private final Configuration configuration;
        private final ConsumerEx<Configuration> configureFn;
        private final BiFunctionEx<K, V, R> projectionFn;
        private final Permission permission;

        private transient Map<Address, List<IndexedInputSplit>> assigned;

        public MetaSupplier(
                @Nullable Permission permission,
                @Nonnull Configuration configuration,
                @Nonnull ConsumerEx<Configuration> configureFn,
                @Nonnull BiFunctionEx<K, V, R> projectionFn) {
            this.permission = permission;
            this.configuration = configuration;
            this.configureFn = configureFn;
            this.projectionFn = projectionFn;
        }

        @Override
        public void init(@Nonnull Context context) throws Exception {
            super.init(context);
            PermissionsUtil.checkPermission(configureFn, context);
            updateConfiguration();

            if (shouldSplitOnMembers(configuration)) {
                assigned = new HashMap<>();
            } else {
                List<InputSplit> splits = getSplits(configuration);
                IndexedInputSplit[] indexedInputSplits = new IndexedInputSplit[splits.size()];
                Arrays.setAll(indexedInputSplits, i -> new IndexedInputSplit(i, splits.get(i)));
                Address[] addresses = context.partitionAssignment().keySet().toArray(Address[]::new);
                assigned = assignSplitsToMembers(indexedInputSplits, addresses);
                printAssignments(assigned);
            }
        }

        @Nonnull
        @Override
        public Function<? super Address, ? extends ProcessorSupplier> get(@Nonnull List<Address> addresses) {
            return address -> new Supplier<>(
                    configuration,
                    assigned.getOrDefault(address, emptyList()),
                    projectionFn);
        }

        @Override
        public FileTraverser<R> traverser() throws Exception {
            updateConfiguration();
            return new HadoopFileTraverser<>(configuration, getSplits(configuration), projectionFn);
        }

        private void updateConfiguration() {
            configureFn.accept(configuration);
        }

        @Override
        public Permission getRequiredPermission() {
            return permission;
        }

        @Override
        public boolean closeIsCooperative() {
            return true;
        }
    }

    private static final class Supplier<K, V, R> implements ProcessorSupplier {
        @Serial
        private static final long serialVersionUID = 1L;

        /**
         * The instance is either {@link SerializableConfiguration} or {@link
         * SerializableJobConf}, which are serializable.
         */
        @SuppressFBWarnings("SE_BAD_FIELD")
        private final Configuration configuration;
        private final BiFunctionEx<K, V, R> projectionFn;
        private final List<IndexedInputSplit> assignedSplits;

        private Supplier(
                @Nonnull Configuration configuration,
                @Nonnull List<IndexedInputSplit> assignedSplits,
                @Nonnull BiFunctionEx<K, V, R> projectionFn
        ) {
            this.configuration = configuration;
            this.projectionFn = projectionFn;
            this.assignedSplits = assignedSplits;
        }

        @Nonnull
        @Override
        public List<Processor> get(int count) {
            List<InputSplit> inputSplits;
            if (shouldSplitOnMembers(configuration)) {
                inputSplits = uncheckCall(() -> getSplits(configuration));
            } else {
                inputSplits = assignedSplits.stream().map(IndexedInputSplit::getNewSplit).collect(toList());
            }
            return Util.distributeObjects(count, inputSplits)
                       .values().stream()
                       .map(splits -> new ReadHadoopNewApiP<>(configuration, splits, projectionFn))
                       .collect(toList());
        }
    }

    /**
     * If all the input paths are of LocalFileSystem and not marked as shared
     * (see {@link HadoopSources#SHARED_LOCAL_FS}), split the input paths on
     * members.
     */
    private static boolean shouldSplitOnMembers(Configuration configuration) {
        // If the local file system is marked as shared, don't split on members
        if (configuration.getBoolean(HadoopSources.SHARED_LOCAL_FS, false)) {
            return false;
        }
        // The local file system is not marked as shared; throw an exception if
        // the inputs mix local and remote file systems.
        Job job = uncheckCall(() -> Job.getInstance(configuration));
        Path[] inputPaths = FileInputFormat.getInputPaths(job);
        boolean hasLocalFileSystem = false;
        boolean hasRemoteFileSystem = false;
        for (Path inputPath : inputPaths) {
            if (isLocalFileSystem(inputPath, configuration)) {
                hasLocalFileSystem = true;
            } else {
                hasRemoteFileSystem = true;
            }
        }
        if (hasLocalFileSystem && hasRemoteFileSystem) {
            throw new IllegalArgumentException(
                    "LocalFileSystem should be marked as shared when used with other remote file systems");
        }
        return hasLocalFileSystem;
    }

    private static boolean isLocalFileSystem(Path inputPath, Configuration configuration) {
        FileSystem fileSystem = uncheckCall(() -> inputPath.getFileSystem(configuration));
        return fileSystem instanceof LocalFileSystem || fileSystem instanceof RawLocalFileSystem;
    }

    private static final class HadoopFileTraverser<K, V, R> implements FileTraverser<R> {

        private final Configuration configuration;
        private final InputFormat<K, V> inputFormat;
        private final BiFunctionEx<K, V, R> projectionFn;
        private final Traverser<R> delegate;

        private RecordReader<K, V> reader;

        private HadoopFileTraverser(
                Configuration configuration,
                List<InputSplit> splits,
                BiFunctionEx<K, V, R> projectionFn
        ) {
            this.configuration = configuration;
            this.inputFormat = uncheckCall(() -> extractInputFormat(configuration));
            this.projectionFn = projectionFn;
            this.delegate = traverseIterable(splits).flatMap(this::traverseSplit);
        }

        private Traverser<R> traverseSplit(InputSplit split) {
            try {
                TaskAttemptContextImpl attemptContext = new TaskAttemptContextImpl(configuration, new TaskAttemptID());
                reader = inputFormat.createRecordReader(split, attemptContext);
                reader.initialize(split, attemptContext);
            } catch (IOException | InterruptedException e) {
                throw sneakyThrow(e);
            }
            return () -> {
                try {
                    while (reader.nextKeyValue()) {
                        R projectedRecord = projectionFn.apply(reader.getCurrentKey(), reader.getCurrentValue());
                        if (projectedRecord != null) {
                            return projectedRecord;
                        }
                    }
                    reader.close();
                    return null;
                } catch (Exception e) {
                    throw sneakyThrow(e);
                }
            };
        }

        @Override
        public R next() {
            return delegate.next();
        }

        @Override
        public void close() throws IOException {
            if (reader != null) {
                reader.close();
            }
        }
    }
}



