 
                        
        
                        
// org.apache.hadoop.tools.rumen.Folder Maven / Gradle / Ivy
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.tools.rumen;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Random;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Folder extends Configured implements Tool {
  // ---- Options filled in from the command line by initialize() ----

  /** Value of -output-duration in ticks; replaced by one hour when not positive. */
  private long outputDuration = -1;
  /** Value of -input-cycle in ticks; initialize() rejects a non-positive value. */
  private long inputCycle = -1;
  /** Value of -concentration; scales the transcription probability in run(). */
  private double concentration = 1.0;
  private long randomSeed = 0; // irrelevant if seeded == false
  /** True once -seed was given, or once -debug has fixed a seed. */
  private boolean seeded = false;
  /** Set by -debug: extra logging, and the temporary segment files are kept. */
  private boolean debug = false;
  /** Set by -allow-missorting; its negation is handed to the trace reader. */
  private boolean allowMissorting = false;
  /** Value of -skew-buffer-length, handed to the trace reader. */
  private int skewBufferLength = 0;
  /** Value of -starts-after in ticks; -1 means no jobs are skipped. */
  private long startsAfter = -1;

  private static final Log LOG = LogFactory.getLog(Folder.class);

  // ---- Working state ----

  private DeskewedJobTraceReader reader = null;
  private Outputter outGen = null;

  /** Temporary segment files, in the order in which run() created them. */
  private List<Path> tempPaths = new LinkedList<Path>();

  private Path tempDir = null;

  private long firstJobSubmitTime;

  /** outputDuration divided by inputCycle. */
  private double timeDilation;

  /** Fractional part of the transcription probability. */
  private double transcriptionRateFraction;

  /** Whole part of the transcription probability. */
  private int transcriptionRateInteger;

  private Random random;

  private static final long TICKS_PER_SECOND = 1000L;

  // error return codes
  private static final int NON_EXISTENT_FILES = 1;
  private static final int NO_INPUT_CYCLE_LENGTH = 2;
  private static final int EMPTY_JOB_TRACE = 3;
  private static final int OUT_OF_ORDER_JOBS = 4;
  private static final int ALL_JOBS_SIMULTANEOUS = 5;
  private static final int IO_ERROR = 6;
  private static final int OTHER_ERROR = 7;

  /** Readers opened on the temporary segments; closed when run() finishes. */
  private Set<Closeable> closees = new HashSet<Closeable>();
  /** Temporary files that run() deletes on exit unless -debug is set. */
  private Set<Path> deletees = new HashSet<Path>();
  /**
   * Converts a duration written as a positive whole number followed by a
   * one-letter unit into ticks.
   *
   * <p>Accepted units, in either case: {@code d} (days), {@code h} (hours),
   * {@code m} (minutes) and {@code s} (seconds). For example {@code "90s"}
   * yields {@code 90 * TICKS_PER_SECOND}.
   *
   * @param durationString the text to parse, e.g. {@code "2h"}
   * @return the duration in ticks
   * @throws IllegalArgumentException if the text is null or too short, the
   *           number is not positive, the unit letter is unknown, or the
   *           result does not fit in a {@code long}
   * @throws NumberFormatException if the numeric part is not a valid number
   */
  static long parseDuration(String durationString) {
    // One character of number plus one of unit is the shortest legal form.
    if (durationString == null || durationString.length() < 2) {
      throw new IllegalArgumentException(
          "A duration is a number followed by one of d, h, m or s, but was: "
              + durationString);
    }

    final int codeIndex = durationString.length() - 1;
    // Parsed as a long: the result type is long, so there is no reason to
    // limit the count to the int range.
    final long count = Long.parseLong(durationString.substring(0, codeIndex));
    if (count <= 0) {
      throw new IllegalArgumentException(
          "Durations must be positive: " + durationString);
    }

    final long ticksPerUnit;
    switch (durationString.charAt(codeIndex)) {
    case 'D':
    case 'd':
      ticksPerUnit = 24L * 60L * 60L * TICKS_PER_SECOND;
      break;
    case 'H':
    case 'h':
      ticksPerUnit = 60L * 60L * TICKS_PER_SECOND;
      break;
    case 'M':
    case 'm':
      ticksPerUnit = 60L * TICKS_PER_SECOND;
      break;
    case 'S':
    case 's':
      ticksPerUnit = TICKS_PER_SECOND;
      break;
    default:
      throw new IllegalArgumentException("Missing or invalid duration code");
    }

    // Refuse values whose tick count would silently wrap around.
    if (count > Long.MAX_VALUE / ticksPerUnit) {
      throw new IllegalArgumentException(
          "Duration is too large: " + durationString);
    }
    return ticksPerUnit * count;
  }
  /**
   * Parses the command line and opens the input trace and the output file.
   *
   * <p>Switches are matched case-insensitively and must come before the two
   * positional arguments, which are the input path followed by the output
   * path. The output path must be the last argument.
   *
   * @param args the command-line arguments
   * @return 0 on success, {@code NO_INPUT_CYCLE_LENGTH} when no positive
   *         -input-cycle was given, or {@code NON_EXISTENT_FILES} when
   *         set-up failed with an {@link IOException}
   * @throws IllegalArgumentException if a switch is unknown, a switch or the
   *           input path has no following value, a value cannot be parsed,
   *           or the input/output paths are missing
   */
  private int initialize(String[] args) throws IllegalArgumentException {
    String tempDirName = null;
    String inputPathName = null;
    String outputPathName = null;

    for (int i = 0; i < args.length; ++i) {
      String thisArg = args[i];
      if (thisArg.equalsIgnoreCase("-starts-after")) {
        startsAfter = parseDuration(switchValue(args, ++i));
      } else if (thisArg.equalsIgnoreCase("-output-duration")) {
        outputDuration = parseDuration(switchValue(args, ++i));
      } else if (thisArg.equalsIgnoreCase("-input-cycle")) {
        inputCycle = parseDuration(switchValue(args, ++i));
      } else if (thisArg.equalsIgnoreCase("-concentration")) {
        concentration = Double.parseDouble(switchValue(args, ++i));
      } else if (thisArg.equalsIgnoreCase("-debug")) {
        debug = true;
      } else if (thisArg.equalsIgnoreCase("-allow-missorting")) {
        allowMissorting = true;
      } else if (thisArg.equalsIgnoreCase("-seed")) {
        seeded = true;
        randomSeed = Long.parseLong(switchValue(args, ++i));
      } else if (thisArg.equalsIgnoreCase("-skew-buffer-length")) {
        skewBufferLength = Integer.parseInt(switchValue(args, ++i));
      } else if (thisArg.equalsIgnoreCase("-temp-directory")) {
        tempDirName = switchValue(args, ++i);
      } else if (thisArg.equals("") || thisArg.startsWith("-")) {
        throw new IllegalArgumentException("Illegal switch argument, "
            + thisArg + " at position " + i);
      } else {
        // First non-switch argument: input path, then output path, then end.
        inputPathName = thisArg;
        if (i + 1 >= args.length) {
          throw new IllegalArgumentException(
              "Missing output path after input path " + thisArg);
        }
        outputPathName = args[++i];
        if (i != args.length - 1) {
          throw new IllegalArgumentException("Too many non-switch arguments");
        }
      }
    }

    if (inputPathName == null || outputPathName == null) {
      throw new IllegalArgumentException(
          "An input path and an output path are required");
    }

    // Checked before anything is opened, so that this failure neither
    // creates the output file nor leaves the trace reader open.
    if (inputCycle <= 0) {
      LOG.error("You must have an input cycle length.");
      return NO_INPUT_CYCLE_LENGTH;
    }

    try {
      Configuration conf = getConf();
      Path inPath = new Path(inputPathName);
      reader =
          new DeskewedJobTraceReader(new JobTraceReader(inPath, conf),
              skewBufferLength, !allowMissorting);
      Path outPath = new Path(outputPathName);
      outGen = new DefaultOutputter();
      outGen.init(outPath, conf);

      // Segments go next to the output file unless -temp-directory was given.
      tempDir =
          tempDirName == null ? outPath.getParent() : new Path(tempDirName);
      FileSystem fs = tempDir.getFileSystem(conf);
      if (!fs.getFileStatus(tempDir).isDirectory()) {
        throw new IOException("Your temp directory is not a directory");
      }

      if (outputDuration <= 0) {
        // Default output length: one hour.
        outputDuration = 60L * 60L * TICKS_PER_SECOND;
      }
      timeDilation = (double) outputDuration / (double) inputCycle;

      random = seeded ? new Random(randomSeed) : new Random();
      if (debug) {
        // Pin and report a seed so that a debug run can be repeated.
        randomSeed = random.nextLong();
        LOG.warn("This run effectively has a -seed of " + randomSeed);
        random = new Random(randomSeed);
        seeded = true;
      }
    } catch (IOException e) {
      e.printStackTrace(System.err);
      // run() is not reached after a failure, so release the reader here.
      IOUtils.cleanup(null, reader);
      reader = null;
      return NON_EXISTENT_FILES;
    }
    return 0;
  }

  /**
   * Returns the value that follows a switch on the command line.
   *
   * @param args the command-line arguments
   * @param valueIndex index at which the value is expected; the switch
   *          itself is at {@code valueIndex - 1}
   * @return {@code args[valueIndex]}
   * @throws IllegalArgumentException if the switch is the last argument
   */
  private static String switchValue(String[] args, int valueIndex) {
    if (valueIndex >= args.length) {
      throw new IllegalArgumentException("Missing value for switch "
          + args[valueIndex - 1]);
    }
    return args[valueIndex];
  }
  /**
   * Tool entry point: sets the instance up from {@code args} and, if that
   * succeeded, performs the folding.
   *
   * @param args the command-line arguments
   * @return the non-zero code from {@code initialize} if set-up failed,
   *         otherwise the result of {@link #run()}
   * @throws IOException if the folding itself fails with an I/O error
   */
  @Override
  public int run(String[] args) throws IOException {
    final int setupStatus = initialize(args);
    return setupStatus == 0 ? run() : setupStatus;
  }
  public int run() throws IOException {
    class JobEntryComparator implements
        Comparator> {
      public int compare(Pair p1,
          Pair p2) {
        LoggedJob j1 = p1.first();
        LoggedJob j2 = p2.first();
        return (j1.getSubmitTime() < j2.getSubmitTime()) ? -1 : (j1
            .getSubmitTime() == j2.getSubmitTime()) ? 0 : 1;
      }
    }
    // we initialize an empty heap so if we take an error before establishing
    // a real one the finally code goes through
    Queue> heap =
        new PriorityQueue>();
    try {
      LoggedJob job = reader.nextJob();
      if (job == null) {
        LOG.error("The job trace is empty");
        return EMPTY_JOB_TRACE;
      }
      
      // If starts-after time is specified, skip the number of jobs till we reach
      // the starting time limit.
      if (startsAfter > 0) {
        LOG.info("starts-after time is specified. Initial job submit time : " 
                 + job.getSubmitTime());
        long approximateTime = job.getSubmitTime() + startsAfter;
        job = reader.nextJob();
        long skippedCount = 0;
        while (job != null && job.getSubmitTime() < approximateTime) {
          job = reader.nextJob();
          skippedCount++;
        }
        LOG.debug("Considering jobs with submit time greater than " 
                  + startsAfter + " ms. Skipped " + skippedCount + " jobs.");
        if (job == null) {
          LOG.error("No more jobs to process in the trace with 'starts-after'"+
                    " set to " + startsAfter + "ms.");
          return EMPTY_JOB_TRACE;
        }
        LOG.info("The first job has a submit time of " + job.getSubmitTime());
      }
      firstJobSubmitTime = job.getSubmitTime();
      long lastJobSubmitTime = firstJobSubmitTime;
      int numberJobs = 0;
      long currentIntervalEnd = Long.MIN_VALUE;
      Path nextSegment = null;
      Outputter tempGen = null;
      if (debug) {
        LOG.debug("The first job has a submit time of " + firstJobSubmitTime);
      }
      final Configuration conf = getConf();
      try {
        // At the top of this loop, skewBuffer has at most
        // skewBufferLength entries.
        while (job != null) {
          final Random tempNameGenerator = new Random();
          lastJobSubmitTime = job.getSubmitTime();
          ++numberJobs;
          if (job.getSubmitTime() >= currentIntervalEnd) {
            if (tempGen != null) {
              tempGen.close();
            }
            
            nextSegment = null;
            for (int i = 0; i < 3 && nextSegment == null; ++i) {
              try {
                nextSegment =
                    new Path(tempDir, "segment-" + tempNameGenerator.nextLong()
                        + ".json.gz");
                if (debug) {
                  LOG.debug("The next segment name is " + nextSegment);
                }
                FileSystem fs = nextSegment.getFileSystem(conf);
                try {
                  if (!fs.exists(nextSegment)) {
                    break;
                  }
                  continue;
                } catch (IOException e) {
                  // no code -- file did not already exist
                }
              } catch (IOException e) {
                // no code -- file exists now, or directory bad. We try three
                // times.
              }
            }
            if (nextSegment == null) {
              throw new RuntimeException("Failed to create a new file!");
            }
            
            if (debug) {
              LOG.debug("Creating " + nextSegment
                  + " for a job with a submit time of " + job.getSubmitTime());
            }
            deletees.add(nextSegment);
            tempPaths.add(nextSegment);
            tempGen = new DefaultOutputter();
            tempGen.init(nextSegment, conf);
            long currentIntervalNumber =
                (job.getSubmitTime() - firstJobSubmitTime) / inputCycle;
            currentIntervalEnd =
                firstJobSubmitTime + ((currentIntervalNumber + 1) * inputCycle);
          }
          // the temp files contain UDadjusted times, but each temp file's
          // content is in the same input cycle interval.
          if (tempGen != null) {
            tempGen.output(job);
          }
          job = reader.nextJob();
        }
      } catch (DeskewedJobTraceReader.OutOfOrderException e) {
        return OUT_OF_ORDER_JOBS;
      } finally {
        if (tempGen != null) {
          tempGen.close();
        }
      }
      if (lastJobSubmitTime <= firstJobSubmitTime) {
        LOG.error("All of your job[s] have the same submit time."
            + "  Please just use your input file.");
        return ALL_JOBS_SIMULTANEOUS;
      }
      double submitTimeSpan = lastJobSubmitTime - firstJobSubmitTime;
      LOG.warn("Your input trace spans "
          + (lastJobSubmitTime - firstJobSubmitTime) + " ticks.");
      double foldingRatio =
          submitTimeSpan * (numberJobs + 1) / numberJobs / inputCycle;
      if (debug) {
        LOG.warn("run: submitTimeSpan = " + submitTimeSpan + ", numberJobs = "
            + numberJobs + ", inputCycle = " + inputCycle);
      }
      if (reader.neededSkewBufferSize() > 0) {
        LOG.warn("You needed a -skew-buffer-length of "
            + reader.neededSkewBufferSize() + " but no more, for this input.");
      }
      double tProbability = timeDilation * concentration / foldingRatio;
      if (debug) {
        LOG.warn("run: timeDilation = " + timeDilation + ", concentration = "
            + concentration + ", foldingRatio = " + foldingRatio);
        LOG.warn("The transcription probability is " + tProbability);
      }
      transcriptionRateInteger = (int) Math.floor(tProbability);
      transcriptionRateFraction = tProbability - Math.floor(tProbability);
      // Now read all the inputs in parallel
      heap =
          new PriorityQueue>(tempPaths.size(),
              new JobEntryComparator());
      for (Path tempPath : tempPaths) {
        JobTraceReader thisReader = new JobTraceReader(tempPath, conf);
        closees.add(thisReader);
        LoggedJob streamFirstJob = thisReader.getNext();
        long thisIndex =
            (streamFirstJob.getSubmitTime() - firstJobSubmitTime) / inputCycle;
        if (debug) {
          LOG.debug("A job with submit time of "
              + streamFirstJob.getSubmitTime() + " is in interval # "
              + thisIndex);
        }
        adjustJobTimes(streamFirstJob);
        if (debug) {
          LOG.debug("That job's submit time is adjusted to "
              + streamFirstJob.getSubmitTime());
        }
        heap
            .add(new Pair(streamFirstJob, thisReader));
      }
      Pair next = heap.poll();
      while (next != null) {
        maybeOutput(next.first());
        if (debug) {
          LOG.debug("The most recent job has an adjusted submit time of "
              + next.first().getSubmitTime());
          LOG.debug(" Its replacement in the heap will come from input engine "
              + next.second());
        }
        LoggedJob replacement = next.second().getNext();
        if (replacement == null) {
          next.second().close();
          if (debug) {
            LOG.debug("That input engine is depleted.");
          }
        } else {
          adjustJobTimes(replacement);
          if (debug) {
            LOG.debug("The replacement has an adjusted submit time of "
                + replacement.getSubmitTime());
          }
          heap.add(new Pair(replacement, next
              .second()));
        }
        next = heap.poll();
      }
    } finally {
      IOUtils.cleanup(null, reader);
      if (outGen != null) {
        outGen.close();
      }
      for (Pair heapEntry : heap) {
        heapEntry.second().close();
      }
      for (Closeable closee : closees) {
        closee.close();
      }
      if (!debug) {
        Configuration conf = getConf();
        for (Path deletee : deletees) {
          FileSystem fs = deletee.getFileSystem(conf);
          try {
            fs.delete(deletee, false);
          } catch (IOException e) {
            // no code
          }
        }
      }
    }
    return 0;
  }
  /**
   * Writes {@code job} to the output {@code transcriptionRateInteger} times,
   * and then once more if a fresh random draw falls below
   * {@code transcriptionRateFraction}.
   *
   * @param job the job to transcribe
   * @throws IOException if writing to the output fails
   */
  private void maybeOutput(LoggedJob job) throws IOException {
    // The guaranteed copies.
    for (int remaining = transcriptionRateInteger; remaining > 0; --remaining) {
      outGen.output(job);
    }

    // One random draw per job decides on the extra copy.
    final boolean extraCopy = random.nextDouble() < transcriptionRateFraction;
    if (extraCopy) {
      outGen.output(job);
    }
  }
  /**
   * Moves a job into the output time frame. Its offset within its input
   * cycle is scaled by {@code timeDilation} and re-based on
   * {@code firstJobSubmitTime}; the resulting difference from the current
   * submit time is passed to {@code adjustTimes}.
   *
   * @param adjustee the job whose times are shifted in place
   */
  private void adjustJobTimes(LoggedJob adjustee) {
    final long submitTime = adjustee.getSubmitTime();
    final long cycleOffset = (submitTime - firstJobSubmitTime) % inputCycle;
    final long dilatedOffset = (long) (cycleOffset * timeDilation);
    adjustee.adjustTimes(firstJobSubmitTime + dilatedOffset - submitTime);
  }
  /**
   * Command-line entry point. Runs a {@code Folder} through
   * {@link ToolRunner} and ends the JVM with a non-zero status on failure:
   * the tool's own result code, {@code IO_ERROR} for an {@link IOException},
   * or {@code OTHER_ERROR} for any other exception. Exceptions have their
   * stack trace printed to standard error first. On success the method
   * simply returns.
   *
   * @param args the command-line arguments
   */
  public static void main(String[] args) {
    int exitCode;
    try {
      exitCode = ToolRunner.run(new Folder(), args);
    } catch (IOException e) {
      e.printStackTrace(System.err);
      exitCode = IO_ERROR;
    } catch (Exception e) {
      e.printStackTrace(System.err);
      exitCode = OTHER_ERROR;
    }

    if (exitCode != 0) {
      System.exit(exitCode);
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy