
gobblin.source.extractor.hadoop.OldApiHadoopFileInputSource

/*
 * Copyright (C) 2014-2016 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package gobblin.source.extractor.hadoop;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.extract.AbstractSource;
import gobblin.source.workunit.Extract;
import gobblin.source.workunit.WorkUnit;
import gobblin.util.HadoopUtils;


/**
 * An implementation of {@link gobblin.source.Source} that uses a Hadoop {@link FileInputFormat} to get a
 * {@link FileSplit} per {@link Extractor} returned by {@link #getExtractor(WorkUnitState)} and a
 * {@link RecordReader} to read the {@link FileSplit}.
 *
 * <p>
 *   This class is equivalent to {@link HadoopFileInputSource} in terms of functionality except that it uses
 *   the old Hadoop API.
 * </p>
 *
 * <p>
 *   This class can read either keys of type {@code K} or values of type {@code V} supported by the
 *   given {@link FileInputFormat}, configurable by {@link HadoopFileInputSource#FILE_INPUT_READ_KEYS_KEY}.
 *   It will read keys if the property is set to {@code true}, otherwise it will read values. By default,
 *   it will read values through the given {@link FileInputFormat}.
 * </p>
 *
 * <p>
 *   A concrete implementation of this class should implement {@link #getFileInputFormat(State, JobConf)}
 *   and {@link #getExtractor(WorkUnitState, RecordReader, FileSplit, boolean)}, which returns a
 *   {@link OldApiHadoopFileInputExtractor} that needs a concrete implementation.
 * </p>
 *
 * @param <S> output schema type
 * @param <D> output data record type
 * @param <K> key type expected by the {@link FileInputFormat}
 * @param <V> value type expected by the {@link FileInputFormat}
 *
 * @author Yinan Li
 */
public abstract class OldApiHadoopFileInputSource<S, D, K, V> extends AbstractSource<S, D> {

  @Override
  public List<WorkUnit> getWorkunits(SourceState state) {
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : state.getPropertyNames()) {
      jobConf.set(key, state.getProp(key));
    }

    if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
      for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
        FileInputFormat.addInputPath(jobConf, new Path(inputPath));
      }
    }

    try {
      FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, jobConf);
      InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf, state.getPropAsInt(
          HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY, HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
      if (fileSplits == null || fileSplits.length == 0) {
        return ImmutableList.of();
      }

      Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY) ?
          Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()) : null;
      String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
      String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);

      List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.length);
      for (InputSplit inputSplit : fileSplits) {
        // Create one WorkUnit per InputSplit
        FileSplit fileSplit = (FileSplit) inputSplit;
        Extract extract = createExtract(tableType, tableNamespace, tableName);
        WorkUnit workUnit = WorkUnit.create(extract);
        workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
        workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
        workUnits.add(workUnit);
      }

      return workUnits;
    } catch (IOException ioe) {
      throw new RuntimeException("Failed to get workunits", ioe);
    }
  }

  @Override
  public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
    if (!workUnitState.contains(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY)) {
      throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
    }

    JobConf jobConf = new JobConf(new Configuration());
    for (String key : workUnitState.getPropertyNames()) {
      jobConf.set(key, workUnitState.getProp(key));
    }

    String fileSplitBytesStr = workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY);
    FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, jobConf);
    RecordReader<K, V> recordReader = fileInputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
    boolean readKeys = workUnitState.getPropAsBoolean(
        HadoopFileInputSource.FILE_INPUT_READ_KEYS_KEY, HadoopFileInputSource.DEFAULT_FILE_INPUT_READ_KEYS);
    return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
  }

  @Override
  public void shutdown(SourceState state) {
    // Nothing to shut down
  }

  /**
   * Get a {@link FileInputFormat} instance used to get {@link FileSplit}s and a {@link RecordReader}
   * for every {@link FileSplit}.
   *
   * <p>
   *   This default implementation simply creates a new instance of a {@link FileInputFormat} class
   *   specified using the configuration property {@link HadoopFileInputSource#FILE_INPUT_FORMAT_CLASS_KEY}.
   * </p>
   *
   * @param state a {@link State} object carrying configuration properties
   * @param jobConf a Hadoop {@link JobConf} object carrying Hadoop configurations
   * @return a {@link FileInputFormat} instance
   */
  @SuppressWarnings("unchecked")
  protected FileInputFormat<K, V> getFileInputFormat(State state, JobConf jobConf) {
    Preconditions.checkArgument(state.contains(HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY));
    try {
      return (FileInputFormat<K, V>) ReflectionUtils.newInstance(
          Class.forName(state.getProp(HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY)), new Configuration());
    } catch (ClassNotFoundException cnfe) {
      throw new RuntimeException(cnfe);
    }
  }

  /**
   * Get a {@link OldApiHadoopFileInputExtractor} instance.
   *
   * @param workUnitState a {@link WorkUnitState} object carrying Gobblin configuration properties
   * @param recordReader a Hadoop {@link RecordReader} object used to read input records
   * @param fileSplit the {@link FileSplit} to read input records from
   * @param readKeys whether the {@link OldApiHadoopFileInputExtractor} should read keys of type {@code K};
   *                 by default, values of type {@code V} are read
   * @return a {@link OldApiHadoopFileInputExtractor} instance
   */
  protected abstract OldApiHadoopFileInputExtractor<S, D, K, V> getExtractor(WorkUnitState workUnitState,
      RecordReader<K, V> recordReader, FileSplit fileSplit, boolean readKeys);
}
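
For illustration, a minimal concrete subclass might bind the old-API TextInputFormat key/value types (LongWritable keys, Text values) and hand each RecordReader to a user-written extractor. The sketch below is hypothetical and not part of Gobblin: TextFileInputSource and TextValueExtractor are assumed names, and the TextValueExtractor constructor is an assumed signature of an OldApiHadoopFileInputExtractor subclass the user would supply.

package gobblin.example;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;

import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.hadoop.OldApiHadoopFileInputExtractor;
import gobblin.source.extractor.hadoop.OldApiHadoopFileInputSource;

/**
 * Hypothetical example: emits each line of the input files as a String record.
 * It relies on the default getFileInputFormat(State, JobConf), so the job configuration must set
 * HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY to org.apache.hadoop.mapred.TextInputFormat
 * and list the input paths under HadoopFileInputSource.FILE_INPUT_PATHS_KEY.
 */
public class TextFileInputSource extends OldApiHadoopFileInputSource<String, String, LongWritable, Text> {

  @Override
  protected OldApiHadoopFileInputExtractor<String, String, LongWritable, Text> getExtractor(
      WorkUnitState workUnitState, RecordReader<LongWritable, Text> recordReader, FileSplit fileSplit,
      boolean readKeys) {
    // TextValueExtractor is a user-written OldApiHadoopFileInputExtractor subclass (hypothetical here,
    // with an assumed constructor) that turns each Text value read by the RecordReader into a String record.
    return new TextValueExtractor(recordReader, readKeys);
  }
}

Because getWorkunits(SourceState) serializes one FileSplit into each WorkUnit and getExtractor(WorkUnitState) deserializes it back, such a subclass needs no split handling of its own; it only decides which FileInputFormat to use (via the reflective default or an override of getFileInputFormat) and how to turn the keys or values produced by the RecordReader into output records.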



