All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.scalding.commons.tap.VersionedTap Maven / Gradle / Ivy

package com.twitter.scalding.commons.tap;

import java.io.IOException;

import com.twitter.scalding.commons.datastores.VersionedStore;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.tap.hadoop.Hfs;

public class VersionedTap extends Hfs {
  public static enum TapMode {SOURCE, SINK}

  public Long version = null;

  // a sane default for the number of versions of your data to keep around
  private int versionsToKeep = 3;

  // source-specific
  public TapMode mode;

  // sink-specific
  private String newVersionPath;

  public VersionedTap(String dir, Scheme scheme, TapMode mode)
      throws IOException {
    super(scheme, dir);
    this.mode = mode;
  }


  public VersionedTap setVersion(long version) {
    this.version = version;
    return this;
  }

  /**
    * Sets the number of versions of your data to keep. Unneeded versions are cleaned up on creation
    * of a new one. Pass a negative number to keep all versions.
    */
  public VersionedTap setVersionsToKeep(int versionsToKeep) {
    this.versionsToKeep = versionsToKeep;
    return this;
  }

  public int getVersionsToKeep() {
    return this.versionsToKeep;
  }

  public String getOutputDirectory() {
    return getPath().toString();
  }

  public VersionedStore getStore(Configuration conf) throws IOException {
    return new VersionedStore(getPath().getFileSystem(conf), getOutputDirectory());
  }

  public String getSourcePath(Configuration conf) {
    VersionedStore store;
    try {
      store = getStore(conf);
      String sourcePath = (version != null) ? store.versionPath(version) : store.mostRecentVersionPath();
      if (sourcePath == null) {
        throw new RuntimeException("Could not find valid source path for VersionTap with root: " + store.getRoot());
      }
      return sourcePath;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  public String getSinkPath(Configuration conf) {
    try {
      VersionedStore store = getStore(conf);
      String sinkPath = (version == null) ? store.createVersion() : store.createVersion(version);
      if (sinkPath == null) {
        throw new RuntimeException("Could not find valid sink path for VersionTap with root: " + store.getRoot());
      }
      return sinkPath;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void sourceConfInit(FlowProcess process, Configuration conf) {
    super.sourceConfInit(process, conf);
    conf.unset("mapred.input.dir"); // need this to unset any paths set in super.sourceConfInit
    Path fullyQualifiedPath = getFileSystem(conf).makeQualified(new Path(getSourcePath(conf)));
    HadoopUtil.addInputPath(conf, fullyQualifiedPath);
  }

  @Override
  public void sinkConfInit(FlowProcess process, Configuration conf) {
    super.sinkConfInit(process, conf);

    if (newVersionPath == null)
      newVersionPath = getSinkPath(conf);

    Path fullyQualifiedPath = getFileSystem(conf).makeQualified(new Path(newVersionPath));
    HadoopUtil.setOutputPath(conf, fullyQualifiedPath);
  }

  @Override
  public boolean resourceExists(Configuration jc) throws IOException {
    return getStore(jc).mostRecentVersion() != null;
  }

  @Override
  public boolean createResource(Configuration jc) throws IOException {
    throw new UnsupportedOperationException("Not supported yet.");
  }

  @Override
  public boolean deleteResource(Configuration jc) throws IOException {
    throw new UnsupportedOperationException("Not supported yet.");
  }

  @Override
  public String getIdentifier() {
    String outDir = getOutputDirectory();
    String versionString = (version == null) ? "LATEST" : version.toString();
    return outDir + Path.SEPARATOR
           + ((mode == TapMode.SINK) ? "sink" : "source")
           + Path.SEPARATOR + versionString;
  }

  @Override
  public long getModifiedTime(Configuration conf) throws IOException {
    VersionedStore store = getStore(conf);
    return (mode == TapMode.SINK) ? 0 : store.mostRecentVersion();
  }

  @Override
  public boolean commitResource(Configuration conf) throws IOException {
    VersionedStore store = getStore(conf);

    if (newVersionPath != null) {
      store.succeedVersion(newVersionPath);
      markSuccessfulOutputDir(new Path(newVersionPath), conf);
      newVersionPath = null;
      store.cleanup(getVersionsToKeep());
    }

    return true;
  }

  private static void markSuccessfulOutputDir(Path path, Configuration conf) throws IOException {
      FileSystem fs = path.getFileSystem(conf);
      // create a file in the folder to mark it
      if (fs.exists(path)) {
          Path filePath = new Path(path, VersionedStore.HADOOP_SUCCESS_FLAG);
          fs.create(filePath).close();
      }
  }

  @Override
  public boolean rollbackResource(Configuration conf) throws IOException {
    if (newVersionPath != null) {
      getStore(conf).failVersion(newVersionPath);
      newVersionPath = null;
    }

    return true;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy