org.apache.pinot.plugin.ingestion.batch.spark.SparkSegmentTarPushJobRunner Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.plugin.ingestion.batch.spark;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.pinot.segment.local.utils.SegmentPushUtils;
import org.apache.pinot.spi.env.PinotConfiguration;
import org.apache.pinot.spi.filesystem.PinotFS;
import org.apache.pinot.spi.filesystem.PinotFSFactory;
import org.apache.pinot.spi.ingestion.batch.runner.IngestionJobRunner;
import org.apache.pinot.spi.ingestion.batch.spec.Constants;
import org.apache.pinot.spi.ingestion.batch.spec.PinotFSSpec;
import org.apache.pinot.spi.ingestion.batch.spec.SegmentGenerationJobSpec;
import org.apache.pinot.spi.plugin.PluginManager;
import org.apache.pinot.spi.utils.retry.AttemptsExceededException;
import org.apache.pinot.spi.utils.retry.RetriableOperationException;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
public class SparkSegmentTarPushJobRunner implements IngestionJobRunner, Serializable {
private SegmentGenerationJobSpec _spec;
public SparkSegmentTarPushJobRunner() {
}
public SparkSegmentTarPushJobRunner(SegmentGenerationJobSpec spec) {
init(spec);
}
@Override
public void init(SegmentGenerationJobSpec spec) {
_spec = spec;
}
@Override
public void run() {
//init all file systems
List pinotFSSpecs = _spec.getPinotFSSpecs();
for (PinotFSSpec pinotFSSpec : pinotFSSpecs) {
PinotFSFactory.register(pinotFSSpec.getScheme(), pinotFSSpec.getClassName(), new PinotConfiguration(pinotFSSpec));
}
//Get outputFS for writing output pinot segments
URI outputDirURI;
try {
outputDirURI = new URI(_spec.getOutputDirURI());
if (outputDirURI.getScheme() == null) {
outputDirURI = new File(_spec.getOutputDirURI()).toURI();
}
} catch (URISyntaxException e) {
throw new RuntimeException("outputDirURI is not valid - '" + _spec.getOutputDirURI() + "'");
}
PinotFS outputDirFS = PinotFSFactory.create(outputDirURI.getScheme());
//Get list of files to process
String[] files;
try {
files = outputDirFS.listFiles(outputDirURI, true);
} catch (IOException e) {
throw new RuntimeException("Unable to list all files under outputDirURI - '" + outputDirURI + "'");
}
List segmentsToPush = new ArrayList<>();
for (String file : files) {
if (file.endsWith(Constants.TAR_GZ_FILE_EXT)) {
segmentsToPush.add(file);
}
}
int pushParallelism = _spec.getPushJobSpec().getPushParallelism();
if (pushParallelism < 1) {
pushParallelism = segmentsToPush.size();
}
if (pushParallelism == 1) {
// Push from driver
try {
SegmentPushUtils.pushSegments(_spec, outputDirFS, segmentsToPush);
} catch (RetriableOperationException | AttemptsExceededException e) {
throw new RuntimeException(e);
}
} else {
JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate());
JavaRDD pathRDD = sparkContext.parallelize(segmentsToPush, pushParallelism);
URI finalOutputDirURI = outputDirURI;
// Prevent using lambda expression in Spark to avoid potential serialization exceptions, use inner function
// instead.
pathRDD.foreach(new VoidFunction() {
@Override
public void call(String segmentTarPath)
throws Exception {
PluginManager.get().init();
for (PinotFSSpec pinotFSSpec : pinotFSSpecs) {
PinotFSFactory
.register(pinotFSSpec.getScheme(), pinotFSSpec.getClassName(), new PinotConfiguration(pinotFSSpec));
}
try {
SegmentPushUtils.pushSegments(_spec, PinotFSFactory.create(finalOutputDirURI.getScheme()),
Arrays.asList(segmentTarPath));
} catch (RetriableOperationException | AttemptsExceededException e) {
throw new RuntimeException(e);
}
}
});
}
}
}