All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.datatorrent.lib.io.fs.FileStitcher Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.datatorrent.lib.io.fs;

import java.io.IOException;
import java.io.OutputStream;
import java.util.Queue;

import javax.validation.constraints.NotNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import com.google.common.collect.Queues;

import com.datatorrent.api.Context;
import com.datatorrent.api.Context.DAGContext;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.annotation.OutputPortFieldAnnotation;
import com.datatorrent.lib.io.block.BlockWriter;
import com.datatorrent.lib.io.fs.Synchronizer.StitchBlock;
import com.datatorrent.lib.io.fs.Synchronizer.StitchedFileMetaData;

/**
 * This is generic File Stitcher which can be used to merge data from one or
 * more files into single stitched file. StitchedFileMetaData defines
 * constituents of the stitched file.
 * 
 * This class uses Reconciler to
 *
 * @since 3.4.0
 */
public class FileStitcher extends AbstractReconciler
{
  /**
   * Filesystem on which application is running
   */
  protected transient FileSystem appFS;

  /**
   * Destination file system
   */
  protected transient FileSystem outputFS;

  /**
   * Path for destination directory
   */
  @NotNull
  protected String filePath;

  /**
   * Path for blocks directory
   */
  protected transient String blocksDirectoryPath;
  
  /**
   * Directory under application directory where blocks gets stored
   */
  private String blocksDirectory = BlockWriter.DEFAULT_BLOCKS_DIR;

  protected static final String PART_FILE_EXTENTION = "._COPYING_";

  /**
   * Queue maintaining successful files
   */
  protected Queue successfulFiles = Queues.newLinkedBlockingQueue();

  /**
   * Queue maintaining skipped files
   */
  protected Queue skippedFiles = Queues.newLinkedBlockingQueue();

  /**
   * Queue maintaining failed files
   */
  protected Queue failedFiles = Queues.newLinkedBlockingQueue();

  /**
   * Output port for emitting completed stitched files metadata
   */
  @OutputPortFieldAnnotation(optional = true)
  public final transient DefaultOutputPort completedFilesMetaOutput = new DefaultOutputPort();

  private boolean writeChecksum = true;
  protected transient Path tempOutFilePath;

  @Override
  public void setup(Context.OperatorContext context)
  {

    blocksDirectoryPath = context.getValue(DAGContext.APPLICATION_PATH) + Path.SEPARATOR + blocksDirectory;

    try {
      outputFS = getOutputFSInstance();
      outputFS.setWriteChecksum(writeChecksum);
    } catch (IOException ex) {
      throw new RuntimeException("Exception in getting output file system.", ex);
    }
    try {
      appFS = getAppFSInstance();
    } catch (IOException ex) {
      try {
        outputFS.close();
      } catch (IOException e) {
        throw new RuntimeException("Exception in closing output file system.", e);
      }
      throw new RuntimeException("Exception in getting application file system.", ex);
    }

    super.setup(context); // Calling it at the end as the reconciler thread uses resources allocated above.
  }

  /* 
   * Calls super.endWindow() and sets counters 
   * @see com.datatorrent.api.BaseOperator#endWindow()
   */
  @Override
  public void endWindow()
  {
    T stitchedFileMetaData;
    int size = doneTuples.size();
    for (int i = 0; i < size; i++) {
      stitchedFileMetaData = doneTuples.peek();
      // If a tuple is present in doneTuples, it has to be also present in successful/failed/skipped
      // as processCommittedData adds tuple in successful/failed/skipped
      // and then reconciler thread add that in doneTuples 
      if (successfulFiles.contains(stitchedFileMetaData)) {
        successfulFiles.remove(stitchedFileMetaData);
        LOG.debug("File copy successful: {}", stitchedFileMetaData.getStitchedFileRelativePath());
      } else if (skippedFiles.contains(stitchedFileMetaData)) {
        skippedFiles.remove(stitchedFileMetaData);
        LOG.debug("File copy skipped: {}", stitchedFileMetaData.getStitchedFileRelativePath());
      } else if (failedFiles.contains(stitchedFileMetaData)) {
        failedFiles.remove(stitchedFileMetaData);
        LOG.debug("File copy failed: {}", stitchedFileMetaData.getStitchedFileRelativePath());
      } else {
        throw new RuntimeException("Tuple present in doneTuples but not in sucessful /skipped/ failed files: "
            + stitchedFileMetaData.getStitchedFileRelativePath());
      }
      completedFilesMetaOutput.emit(stitchedFileMetaData);
      committedTuples.remove(stitchedFileMetaData);
      doneTuples.poll();
    }
  }

  /**
   * 
   * @return Application FileSystem instance
   * @throws IOException
   */
  protected FileSystem getAppFSInstance() throws IOException
  {
    return FileSystem.newInstance((new Path(blocksDirectoryPath)).toUri(), new Configuration());
  }

  /**
   * 
   * @return Destination FileSystem instance
   * @throws IOException
   */
  protected FileSystem getOutputFSInstance() throws IOException
  {
    return FileSystem.newInstance((new Path(filePath)).toUri(), new Configuration());
  }

  @Override
  public void teardown()
  {
    super.teardown();

    boolean gotException = false;
    try {
      if (appFS != null) {
        appFS.close();
        appFS = null;
      }
    } catch (IOException e) {
      gotException = true;
    }

    try {
      if (outputFS != null) {
        outputFS.close();
        outputFS = null;
      }
    } catch (IOException e) {
      gotException = true;
    }
    if (gotException) {
      throw new RuntimeException("Exception while closing file systems.");
    }
  }

  /**
   * Enques incoming data for for processing
   */
  @Override
  protected void processTuple(T stitchedFileMetaData)
  {
    LOG.debug("stitchedFileMetaData: {}", stitchedFileMetaData);
    enqueueForProcessing(stitchedFileMetaData);
  }

  /**
   * Stitches the output file when all blocks for that file are commited
   */
  @Override
  protected void processCommittedData(T stitchedFileMetaData)
  {
    try {
      mergeOutputFile(stitchedFileMetaData);
    } catch (IOException e) {
      throw new RuntimeException("Unable to merge file: " + stitchedFileMetaData.getStitchedFileRelativePath(), e);
    }
  }

  /**
   * Read data from block files and write to output file. Information about
   * which block files should be read is specified in outFileMetadata
   * 
   * @param stitchedFileMetaData
   * @throws IOException
   */

  protected void mergeOutputFile(T stitchedFileMetaData) throws IOException
  {
    mergeBlocks(stitchedFileMetaData);
    successfulFiles.add(stitchedFileMetaData);
    LOG.debug("Completed processing file: {} ", stitchedFileMetaData.getStitchedFileRelativePath());
  }

  protected void mergeBlocks(T stitchedFileMetaData) throws IOException
  {
    //when writing to tmp files there can be vagrant tmp files which we have to clean
    final Path dst = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath());
    PathFilter tempFileFilter = new PathFilter()
    {
      @Override
      public boolean accept(Path path)
      {
        return path.getName().startsWith(dst.getName()) && path.getName().endsWith(PART_FILE_EXTENTION);
      }
    };
    if (outputFS.exists(dst.getParent())) {
      FileStatus[] statuses = outputFS.listStatus(dst.getParent(), tempFileFilter);
      for (FileStatus status : statuses) {
        String statusName = status.getPath().getName();
        LOG.debug("deleting vagrant file {}", statusName);
        outputFS.delete(status.getPath(), true);
      }
    }
    tempOutFilePath = new Path(filePath,
        stitchedFileMetaData.getStitchedFileRelativePath() + '.' + System.currentTimeMillis() + PART_FILE_EXTENTION);
    try {
      writeTempOutputFile(stitchedFileMetaData);
      moveToFinalFile(stitchedFileMetaData);
    } catch (BlockNotFoundException e) {
      LOG.warn("Block file {} not found. Assuming recovery mode for file {}. ", e.getBlockPath(),
          stitchedFileMetaData.getStitchedFileRelativePath());
      //Remove temp output file
      outputFS.delete(tempOutFilePath, false);
    }
  }

  /**
   * Writing all Stitch blocks to temporary file
   * 
   * @param stitchedFileMetaData
   * @throws IOException
   * @throws BlockNotFoundException
   */
  protected OutputStream writeTempOutputFile(T stitchedFileMetaData) throws IOException, BlockNotFoundException
  {
    OutputStream outputStream = getOutputStream(tempOutFilePath);
    try {
      for (StitchBlock outputBlock : stitchedFileMetaData.getStitchBlocksList()) {
        outputBlock.writeTo(appFS, blocksDirectoryPath, outputStream);
      }
    } finally {
      outputStream.close();
    }
    return outputStream;
  }

  protected OutputStream getOutputStream(Path partFilePath) throws IOException
  {
    return outputFS.create(partFilePath);
  }

  /**
   * Moving temp output file to final file
   * 
   * @param stitchedFileMetaData
   * @throws IOException
   */
  protected void moveToFinalFile(T stitchedFileMetaData) throws IOException
  {
    Path destination = new Path(filePath, stitchedFileMetaData.getStitchedFileRelativePath());
    moveToFinalFile(tempOutFilePath, destination);
  }

  /**
   * Moving temp output file to final file
   * 
   * @param tempOutFilePath
   *          Temporary output file
   * @param destination
   *          Destination directory path
   * @throws IOException
   */
  protected void moveToFinalFile(Path tempOutFilePath, Path destination) throws IOException
  {
    Path src = Path.getPathWithoutSchemeAndAuthority(tempOutFilePath);
    Path dst = Path.getPathWithoutSchemeAndAuthority(destination);

    boolean moveSuccessful = false;
    if (!outputFS.exists(dst.getParent())) {
      outputFS.mkdirs(dst.getParent());
    }
    if (outputFS.exists(dst)) {
      outputFS.delete(dst, false);
    }
    moveSuccessful = outputFS.rename(src, dst);

    if (moveSuccessful) {
      LOG.debug("File {} moved successfully to destination folder.", dst);
    } else {
      throw new RuntimeException("Unable to move file from " + src + " to " + dst);
    }
  }
  
  /**
   * Directory under application directory where blocks gets stored
   * @return blocks directory
   */
  public String getBlocksDirectory()
  {
    return blocksDirectory;
  }
  
  /**
   * Directory under application directory where blocks gets stored
   * @param blocksDirectory blocks directory
   */
  public void setBlocksDirectory(String blocksDirectory)
  {
    this.blocksDirectory = blocksDirectory;
  }

  /**
   * Path for destination directory
   * @return path for destination directory
   */
  public String getFilePath()
  {
    return filePath;
  }

  /**
   * Path for destination directory
   * @param filePath path for destination directory
   */
  public void setFilePath(String filePath)
  {
    this.filePath = filePath;
  }

  /**
   * Flag to control writing checksum
   * @return write checksum flag status
   */
  public boolean isWriteChecksum()
  {
    return writeChecksum;
  }

  /**
   * Flag to control writing checksum
   * @param writeChecksum write checksum flag status
   */
  public void setWriteChecksum(boolean writeChecksum)
  {
    this.writeChecksum = writeChecksum;
  }

  protected static final Logger LOG = LoggerFactory.getLogger(FileStitcher.class);

  /**
   * Defining new type of exception for missing block. Currently, methods
   * catching this exception assumes that block is missing because of explicit
   * deletion by File output module (for completed files)
   *
   */
  public static class BlockNotFoundException extends Exception
  {

    private static final long serialVersionUID = -7409415466834194798L;

    Path blockPath;

    /**
     * @param blockPath
     */
    public BlockNotFoundException(Path blockPath)
    {
      super();
      this.blockPath = blockPath;
    }

    /**
     * @return the blockPath
     */
    public Path getBlockPath()
    {
      return blockPath;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy