gobblin.source.extractor.filebased.FileBasedSource Maven / Gradle / Ivy
                 Go to download
                
        
                    Show more of this group  Show more artifacts with this name
Show all versions of gobblin-core Show documentation
                Show all versions of gobblin-core Show documentation
Gobblin Ingestion Framework
                
            /*
 * Copyright (C) 2014-2016 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package gobblin.source.extractor.filebased;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import gobblin.source.extractor.extract.AbstractSource;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.source.workunit.Extract;
import gobblin.source.workunit.WorkUnit;
import gobblin.source.workunit.Extract.TableType;
/**
 * Base class for file based sources. It provides default functionality for keeping track of
 * which files have already been pulled by the framework and for determining which files need
 * to be pulled in this run.
 *
 * <p>This class never assigns {@link #fsHelper} itself, so implementations of
 * {@link #initFileSystemHelper(State)} must do so before the snapshot is taken.
 *
 * @author stakiar
 */
public abstract class FileBasedSource extends AbstractSource {
  private static final Logger log = LoggerFactory.getLogger(FileBasedSource.class);

  /** Helper used to list files and to read their modification times. */
  protected TimestampAwareFileBasedHelper fsHelper;

  /** Separator between the file path and its modification time inside a snapshot entry. */
  private final String splitPattern = ":::";

  /**
   * Initialize the logger by storing the source entity name, wrapped in square brackets, in the
   * logging MDC under the key {@code sourceInfo}. A missing entity name is logged as {@code []}.
   *
   * @param state Source state
   */
  protected void initLogger(SourceState state) {
    String entity = Strings.nullToEmpty(state.getProp(ConfigurationKeys.SOURCE_ENTITY));
    MDC.put("sourceInfo", "[" + entity + "]");
  }

  /**
   * This method takes the snapshot seen in the previous run, and compares it to the list
   * of files currently in the source - it then decides which files it needs to pull
   * and distributes those files across the workunits. Snapshot entries carry both the file path
   * and its modification time, so an entry is considered new when either of them differs from
   * what the previous run recorded.
   *
   * @param state is the source state
   * @return a list of workunits for the framework to run; work units returned by
   *         {@code getPreviousWorkUnitsForRetry} are appended after the new ones
   * @throws IllegalArgumentException if the extract table type is missing or is not a valid
   *         {@link TableType}, or if the computed number of partitions is not positive
   * @throws RuntimeException if the file system helper cannot be initialized, or if file names
   *         must be preserved but a work unit would receive more than one file
   */
  @Override
  public List<WorkUnit> getWorkunits(SourceState state) {
    initLogger(state);
    try {
      initFileSystemHelper(state);
    } catch (FileBasedHelperException e) {
      // propagate() always throws; the explicit throw makes that visible to readers and the compiler
      throw Throwables.propagate(e);
    }

    log.info("Getting work units");
    String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    String entityName = state.getProp(ConfigurationKeys.SOURCE_ENTITY);

    // The extract table name overrides the entity name; fall back to the entity name when absent
    String extractTableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
    if (Strings.isNullOrEmpty(extractTableName)) {
      extractTableName = entityName;
    }

    // Fail with a descriptive message instead of a bare NullPointerException
    String tableTypeName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY);
    if (tableTypeName == null) {
      throw new IllegalArgumentException(
          "Missing required property: " + ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY);
    }
    TableType tableType = TableType.valueOf(tableTypeName.toUpperCase());

    // Get list of files seen in the previous run; only the first previous work unit is consulted
    List<WorkUnitState> previousWorkunits = Lists.newArrayList(state.getPreviousWorkUnitStates());
    List<String> prevFsSnapshot = Lists.newArrayList();
    if (!previousWorkunits.isEmpty()
        && previousWorkunits.get(0).getWorkunit().contains(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT)) {
      prevFsSnapshot =
          previousWorkunits.get(0).getWorkunit().getPropAsList(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT);
    }

    // Get list of files that need to be pulled: entries in the current snapshot that the
    // previous snapshot did not contain
    List<String> currentFsSnapshot = this.getcurrentFsSnapshot(state);
    HashSet<String> filesWithTimeToPull = new HashSet<>(currentFsSnapshot);
    filesWithTimeToPull.removeAll(prevFsSnapshot);

    // Strip the modification time so that only the file path is handed to the work units
    List<String> filesToPull = new ArrayList<>(filesWithTimeToPull.size());
    for (String fileWithTime : filesWithTimeToPull) {
      filesToPull.add(fileWithTime.split(this.splitPattern)[0]);
    }

    List<WorkUnit> workUnits = Lists.newArrayList();
    if (!filesToPull.isEmpty()) {
      log.info("Will pull the following files in this run: " + Arrays.toString(filesToPull.toArray()));

      // Never create more partitions than there are files
      int numFiles = filesToPull.size();
      int numPartitions = numFiles;
      if (state.contains(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS)) {
        numPartitions = Math.min(state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS), numFiles);
      }
      if (numPartitions <= 0) {
        throw new IllegalArgumentException("The number of partitions should be positive");
      }

      // Ceiling division: the last partition may receive fewer files than the others
      int filesPerPartition = numFiles / numPartitions + (numFiles % numPartitions == 0 ? 0 : 1);

      // Both values are identical for every partition, so compute them once
      String fsSnapshot = StringUtils.join(currentFsSnapshot, ",");
      boolean preserveFileName =
          state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, false);

      // Distribute the files across the workunits
      for (int fileOffset = 0; fileOffset < numFiles; fileOffset += filesPerPartition) {
        SourceState partitionState = new SourceState();
        partitionState.addAll(state);

        // Eventually these setters should be integrated with framework support for generalized watermark handling
        partitionState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT, fsSnapshot);

        List<String> partitionFilesToPull =
            filesToPull.subList(fileOffset, Math.min(fileOffset + filesPerPartition, numFiles));
        partitionState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
            StringUtils.join(partitionFilesToPull, ","));

        if (preserveFileName) {
          if (partitionFilesToPull.size() != 1) {
            throw new RuntimeException("Cannot preserve the file name if a workunit is given multiple files");
          }
          // With exactly one file, the files-to-pull property is that file's path
          partitionState.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR,
              partitionState.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
        }

        // Use extract table name to create extract
        Extract extract = partitionState.createExtract(tableType, nameSpaceName, extractTableName);
        workUnits.add(partitionState.createWorkUnit(extract));
      }
      log.info("Total number of work units for the current run: " + workUnits.size());
    }

    List<WorkUnit> previousWorkUnits = this.getPreviousWorkUnitsForRetry(state);
    log.info("Total number of work units from the previous failed runs: " + previousWorkUnits.size());
    workUnits.addAll(previousWorkUnits);
    return workUnits;
  }

  /**
   * This method is responsible for connecting to the source and taking
   * a snapshot of the folder where the data is present. Files are listed with the pattern
   * {@code <data directory>/*<source entity>*}, and each returned entry has the form
   * {@code <data directory>/<listed name>:::<modification time>}.
   *
   * <p>If listing the files or reading any modification time fails, the error is logged and an
   * empty list is returned, so that no files are pulled from an incomplete snapshot.
   *
   * @param state is used to connect to the source
   * @return a list of snapshot entries for the files present on the external data
   * directory, or an empty list if the snapshot could not be taken
   */
  public List<String> getcurrentFsSnapshot(State state) {
    String dataDirectory = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY);
    String path = dataDirectory + "/*" + state.getProp(ConfigurationKeys.SOURCE_ENTITY) + "*";
    try {
      log.info("Running ls command with input " + path);
      List<String> fileNames = this.fsHelper.ls(path);

      // Build a separate list so that a failure part-way through cannot leak a mix of
      // converted and unconverted entries to the caller
      List<String> snapshot = new ArrayList<>(fileNames.size());
      for (String fileName : fileNames) {
        String filePath = dataDirectory + "/" + fileName;
        snapshot.add(filePath + this.splitPattern + this.fsHelper.getFileMTime(filePath));
      }
      return snapshot;
    } catch (FileBasedHelperException e) {
      log.error("Not able to fetch the filename/file modified time to " + e.getMessage() + " will not pull any files",
          e);
      return new ArrayList<>();
    }
  }

  /**
   * Closes the file system helper if one was created. A failure to close is logged and
   * otherwise ignored.
   *
   * @param state is the source state (not used by this implementation)
   */
  @Override
  public void shutdown(SourceState state) {
    if (this.fsHelper != null) {
      log.info("Shutting down the FileSystemHelper connection");
      try {
        this.fsHelper.close();
      } catch (FileBasedHelperException e) {
        log.error("Unable to shutdown FileSystemHelper", e);
      }
    }
  }

  /**
   * Prepares the file system helper for the concrete source. It is called by
   * {@link #getWorkunits(SourceState)} before the file system snapshot is taken.
   *
   * @param state is the state used to configure the helper
   * @throws FileBasedHelperException if the helper cannot be initialized
   */
  public abstract void initFileSystemHelper(State state) throws FileBasedHelperException;
}
                © 2015 - 2025 Weber Informatics LLC | Privacy Policy