All downloads are free. Search and download functionality uses the official Maven repository.

com.datatorrent.lib.io.fs.AbstractFileSplitter Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.lib.io.fs;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;

import javax.annotation.Nullable;
import javax.validation.constraints.Min;
import javax.validation.constraints.NotNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;

import com.google.common.base.Preconditions;
import com.datatorrent.api.AutoMetric;
import com.datatorrent.api.Context;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.common.util.BaseOperator;
import com.datatorrent.lib.io.block.BlockMetadata;

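/**
 * An abstract file splitter. For every file supplied by a concrete subclass it emits the file's
 * metadata followed by the metadata of each of the file's blocks, while limiting the number of
 * blocks emitted per window.
 *
 * @since 3.2.0
 */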
@org.apache.hadoop.classification.InterfaceStability.Evolving
public abstract class AbstractFileSplitter extends BaseOperator
{
  /**
   * Block size used to split files. May be left {@code null}, in which case
   * {@link #setup(Context.OperatorContext)} replaces it with {@link #getDefaultBlockSize()}.
   */
  protected Long blockSize;

  /** Running counter that supplies the low 32 bits of every generated block id. */
  private int sequenceNo;

  /**
   * This is a threshold on the no. of blocks emitted per window. A lot of blocks emitted
   * per window can overwhelm the downstream operators. This setting helps to control that.
   */
  @Min(1)
  protected int blocksThreshold;

  /** Block counter for the current window; reset to zero in {@link #beginWindow(long)}. */
  protected transient long blockCount;

  /**
   * Iterator over the blocks of the file currently being split. It is assigned when a file starts
   * being split and cleared once all of that file's blocks have been emitted.
   */
  protected BlockMetadataIterator blockMetadataIterator;

  /** Id of this operator, read from the context in setup; forms the high 32 bits of block ids. */
  protected transient int operatorId;
  /** Operator context captured in setup. */
  protected transient Context.OperatorContext context;
  /** Id of the current window; seeded from the activation window id in setup. */
  protected transient long currentWindowId;

  /** Number of file metadata tuples emitted in the current window; reset in beginWindow. */
  @AutoMetric
  protected int filesProcessed;

  /** Port on which the metadata of every processed file is emitted. */
  public final transient DefaultOutputPort<FileMetadata> filesMetadataOutput = new DefaultOutputPort<>();
  /** Port on which the metadata of every block is emitted. */
  public final transient DefaultOutputPort<BlockMetadata.FileBlockMetadata> blocksMetadataOutput =
      new DefaultOutputPort<>();

  /**
   * Creates a splitter whose per-window block threshold starts at {@link Integer#MAX_VALUE},
   * i.e. effectively unlimited until {@link #setBlocksThreshold(int)} is called.
   */
  public AbstractFileSplitter()
  {
    this.blocksThreshold = Integer.MAX_VALUE;
  }

  /**
   * Validates the configured block size, captures the operator context and id, seeds the current
   * window id from the activation window id and falls back to {@link #getDefaultBlockSize()} when no
   * block size was configured.
   *
   * @param context operator context
   * @throws IllegalArgumentException if a block size was configured and it is not positive
   */
  @Override
  public void setup(Context.OperatorContext context)
  {
    final boolean validBlockSize = blockSize == null || blockSize > 0;
    Preconditions.checkArgument(validBlockSize, "invalid block size");

    this.context = context;
    this.operatorId = context.getId();
    this.currentWindowId = context.getValue(Context.OperatorContext.ACTIVATION_WINDOW_ID);

    if (blockSize == null) {
      // nothing configured by the user: let the concrete splitter decide
      blockSize = getDefaultBlockSize();
    }
  }

  /**
   * Records the new window id and resets the per-window counters (files processed and blocks
   * counted).
   *
   * @param windowId id of the window that is starting
   */
  @Override
  public void beginWindow(long windowId)
  {
    currentWindowId = windowId;
    blockCount = 0;
    filesProcessed = 0;
  }

  /**
   * Emits as much metadata as the per-window block threshold allows.
   * <p>
   * Blocks left over from a previously started file are emitted first. After that, files are pulled
   * from {@link #getFileInfo()} and processed until the threshold is reached, no file is returned,
   * or {@link #processFileInfo(FileInfo)} reports that it could not finish a file.
   */
  protected void process()
  {
    final boolean hasPendingBlocks = blockMetadataIterator != null;
    if (hasPendingBlocks && blockCount < blocksThreshold) {
      emitBlockMetadata();
    }

    while (blockCount < blocksThreshold) {
      final FileInfo nextFile = getFileInfo();
      if (nextFile == null || !processFileInfo(nextFile)) {
        return;
      }
    }
  }

  /**
   * Supplies the next file to be split.
   * <p>
   * {@link #process()} calls this repeatedly while the per-window block threshold has not been
   * reached, and stops pulling files as soon as {@code null} is returned.
   *
   * @return {@link FileInfo} of the next file, or {@code null} to end the current
   *         {@link #process()} call
   */
  protected abstract FileInfo getFileInfo();

  /**
   * Builds and emits the metadata of one file and, unless it is a directory, starts emitting the
   * metadata of its blocks.
   *
   * @param fileInfo file info
   * @return false if the blocks threshold was reached before all blocks of the file were emitted;
   *         true otherwise
   * @throws RuntimeException wrapping any {@link IOException} raised while creating the metadata
   */
  protected boolean processFileInfo(FileInfo fileInfo)
  {
    try {
      final FileMetadata metadata = buildFileMetadata(fileInfo);
      filesMetadataOutput.emit(metadata);
      filesProcessed++;

      if (metadata.isDirectory()) {
        // directories carry no blocks
        return true;
      }

      blockMetadataIterator = new BlockMetadataIterator(this, metadata, blockSize);
      // false here means the block threshold was reached part-way through this file
      return emitBlockMetadata();
    } catch (IOException e) {
      throw new RuntimeException("creating metadata", e);
    }
  }

  /**
   * Emits the remaining blocks of the current file until either the file is exhausted or the
   * per-window block threshold is reached.
   * <p>
   * {@code blockCount} is incremented only for blocks that are actually emitted, so it never exceeds
   * the number of blocks sent downstream in the window. When every block has been emitted the
   * iterator is cleared; otherwise it is kept so that emission can resume later.
   *
   * @return true if all the blocks were emitted; false otherwise
   */
  protected boolean emitBlockMetadata()
  {
    while (blockMetadataIterator.hasNext()) {
      if (blockCount >= blocksThreshold) {
        // threshold reached: keep the iterator so the rest is emitted in a later call
        return false;
      }
      blockCount++;
      this.blocksMetadataOutput.emit(blockMetadataIterator.next());
    }
    blockMetadataIterator = null;
    return true;
  }

  /**
   * Builds the metadata of a single block of a file.
   *
   * @param pos                 offset of the block
   * @param lengthOfFileInBlock value stored as the block's length; the block iterator passes the file
   *                            position at which the block ends
   * @param blockNumber         1-based block number, used to look up the block id in the file metadata
   * @param fileMetadata        file metadata
   * @param isLast              last block of the file
   * @return block file metadata; its previous block id is -1 for the first block
   */
  protected BlockMetadata.FileBlockMetadata buildBlockMetadata(long pos, long lengthOfFileInBlock, int blockNumber,
      FileMetadata fileMetadata, boolean isLast)
  {
    final BlockMetadata.FileBlockMetadata block = createBlockMetadata(fileMetadata);
    final int index = blockNumber - 1;

    block.setBlockId(fileMetadata.getBlockIds()[index]);
    block.setOffset(pos);
    block.setLength(lengthOfFileInBlock);
    block.setLastBlock(isLast);

    final long previousBlockId;
    if (blockNumber == 1) {
      previousBlockId = -1;
    } else {
      previousBlockId = fileMetadata.getBlockIds()[index - 1];
    }
    block.setPreviousBlockId(previousBlockId);

    return block;
  }

  /**
   * Instantiates the block metadata for a file. Can be overridden for creating block metadata of a
   * type that extends {@link BlockMetadata.FileBlockMetadata}.
   *
   * @param fileMetadata metadata of the file the block belongs to
   * @return new block metadata created from the file's path
   */
  protected BlockMetadata.FileBlockMetadata createBlockMetadata(FileMetadata fileMetadata)
  {
    final String filePath = fileMetadata.getFilePath();
    return new BlockMetadata.FileBlockMetadata(filePath);
  }

  /**
   * Creates file-metadata and populates no. of blocks in the metadata.
   * <p>
   * File name, directory flag, length and relative path are filled in from the file's status. For
   * regular files the number of blocks is the file length divided by the block size, rounded up,
   * or zero when the data offset is not smaller than the file length; block ids are then assigned.
   *
   * @param fileInfo file information
   * @return file-metadata
   * @throws IOException if {@link #getFileStatus(Path)} fails
   */
  protected FileMetadata buildFileMetadata(FileInfo fileInfo) throws IOException
  {
    LOG.debug("file {}", fileInfo.getFilePath());
    final FileMetadata metadata = createFileMetadata(fileInfo);
    LOG.debug("fileMetadata {}", metadata);

    final Path filePath = new Path(fileInfo.getFilePath());
    metadata.setFileName(filePath.getName());

    final FileStatus status = getFileStatus(filePath);
    final boolean directory = status.isDirectory();
    final long length = status.getLen();
    metadata.setDirectory(directory);
    metadata.setFileLength(length);

    final String relativePath;
    if (fileInfo.getDirectoryPath() != null) {
      relativePath = getRelativePathWithFolderName(fileInfo);
    } else {
      // Direct filename is given as input.
      relativePath = status.getPath().getName();
    }
    metadata.setRelativePath(relativePath);

    if (directory) {
      return metadata;
    }

    final long wholeBlocks = length / blockSize;
    final boolean hasPartialBlock = (length % blockSize) != 0;
    int noOfBlocks = (int)(wholeBlocks + (hasPartialBlock ? 1 : 0));
    if (metadata.getDataOffset() >= length) {
      // nothing to read past the data offset
      noOfBlocks = 0;
    }
    metadata.setNumberOfBlocks(noOfBlocks);
    populateBlockIds(metadata);
    return metadata;
  }

  /*
   * A folder was given as input, so the folder's own name is prefixed to the relative path of each
   * item found under it.
   *
   * NOTE(review): File.separator is platform dependent whereas Hadoop paths use '/'; confirm this is
   * intended when running on a platform whose separator differs.
   */
  private String getRelativePathWithFolderName(FileInfo fileInfo)
  {
    final Path directory = new Path(fileInfo.getDirectoryPath());
    final StringBuilder relative = new StringBuilder();
    relative.append(directory.getName());
    relative.append(File.separator);
    relative.append(fileInfo.getRelativeFilePath());
    return relative.toString();
  }

  /**
   * Instantiates the metadata object for a file. This can be over-ridden to create file metadata of
   * a type that extends {@link FileMetadata}.
   *
   * @param fileInfo file information
   * @return file-metadata created from the file's full path
   */
  protected FileMetadata createFileMetadata(FileInfo fileInfo)
  {
    final String filePath = fileInfo.getFilePath();
    return new FileMetadata(filePath);
  }

  /**
   * Assigns one id to every block of the file and stores them in the metadata.
   * <p>
   * Each id is composed of the operator id in the upper 32 bits and the next value of this
   * operator's sequence counter in the lower 32 bits.
   *
   * @param fileMetadata file metadata whose number of blocks is already set
   */
  protected void populateBlockIds(FileMetadata fileMetadata)
  {
    final int count = fileMetadata.getNumberOfBlocks();
    final long operatorBits = ((long)operatorId) << 32;
    final long[] ids = new long[count];

    int index = 0;
    while (index < count) {
      // mask after widening so a negative sequence number cannot spill into the upper half
      final long sequenceBits = sequenceNo & 0xFFFFFFFFL;
      sequenceNo++;
      ids[index++] = operatorBits | sequenceBits;
    }
    fileMetadata.setBlockIds(ids);
  }

  /**
   * Get default block size which is used when the user hasn't specified block size.
   * <p>
   * Called from {@link #setup(Context.OperatorContext)} only when the block size is still
   * {@code null}.
   *
   * @return default block size.
   */
  protected abstract long getDefaultBlockSize();

  /**
   * Get status of a file.
   * <p>
   * The returned status supplies the directory flag, the length and the path used while building
   * the file metadata.
   *
   * @param path path of a file
   * @return file status
   * @throws IOException declared so that implementations can report a failed lookup
   */
  protected abstract FileStatus getFileStatus(Path path) throws IOException;

  /**
   * Sets the block size used to split files.
   *
   * @param blockSize block size; when {@code null}, setup falls back to the default block size.
   *                  Non-positive values are rejected in setup.
   */
  public void setBlockSize(Long blockSize)
  {
    this.blockSize = blockSize;
  }

  /**
   * @return the configured block size; {@code null} if none has been set
   */
  public Long getBlockSize()
  {
    return blockSize;
  }

  /**
   * Sets the threshold on the number of blocks emitted per window.
   *
   * @param threshold maximum number of blocks per window
   */
  public void setBlocksThreshold(int threshold)
  {
    this.blocksThreshold = threshold;
  }

  /**
   * @return the threshold on the number of blocks emitted per window
   */
  public int getBlocksThreshold()
  {
    return blocksThreshold;
  }

  /**
   * An {@link Iterator} for Block-Metadatas of a file.
   * <p>
   * Iteration starts at the file's data offset. Block boundaries lie at multiples of the block
   * size, and the last block ends at the file length. The metadata objects themselves are built by
   * the owning splitter's {@code buildBlockMetadata}.
   */
  protected static class BlockMetadataIterator implements Iterator<BlockMetadata.FileBlockMetadata>
  {
    private final FileMetadata fileMetadata;
    private final long blockSize;

    // position in the file at which the next block starts
    private long pos;
    // 1-based number of the block returned most recently; 0 before the first call to next()
    private int blockNumber;

    private final AbstractFileSplitter splitter;

    protected BlockMetadataIterator()
    {
      //for kryo
      fileMetadata = null;
      blockSize = -1;
      splitter = null;
    }

    /**
     * @param splitter     splitter used to build the block metadata
     * @param fileMetadata metadata of the file to iterate over
     * @param blockSize    block size
     */
    protected BlockMetadataIterator(AbstractFileSplitter splitter, FileMetadata fileMetadata, long blockSize)
    {
      this.splitter = splitter;
      this.fileMetadata = fileMetadata;
      this.blockSize = blockSize;
      this.pos = fileMetadata.getDataOffset();
      this.blockNumber = 0;
    }

    /**
     * @return true while the current position is before the end of the file
     */
    @Override
    public boolean hasNext()
    {
      return pos < fileMetadata.getFileLength();
    }

    /**
     * Returns the metadata of the next block and advances the position to the end of that block.
     *
     * @return metadata of the next block
     * @throws java.util.NoSuchElementException if the file has no more blocks
     */
    @SuppressWarnings("StatementWithEmptyBody")
    @Override
    public BlockMetadata.FileBlockMetadata next()
    {
      if (!hasNext()) {
        // honour the Iterator contract instead of failing later on a block id lookup
        throw new java.util.NoSuchElementException("no more blocks for " + fileMetadata.getFilePath());
      }
      long length;
      // advance to the first block boundary that lies beyond the current position
      while ((length = blockSize * ++blockNumber) <= pos) {
      }
      boolean isLast = length >= fileMetadata.getFileLength();
      long lengthOfFileInBlock = isLast ? fileMetadata.getFileLength() : length;
      BlockMetadata.FileBlockMetadata fileBlock = splitter.buildBlockMetadata(pos, lengthOfFileInBlock, blockNumber,
          fileMetadata, isLast);
      pos = lengthOfFileInBlock;
      return fileBlock;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove()
    {
      throw new UnsupportedOperationException("remove not supported");
    }
  }

  /**
   * Represents the file metadata - file path, name, no. of blocks, etc.
   */
  public static class FileMetadata
  {
    @NotNull
    private String filePath;
    private String fileName;
    private int numberOfBlocks;
    private long dataOffset;
    private long fileLength;
    private long discoverTime;
    private long[] blockIds;
    private boolean isDirectory;
    private String relativePath;

    /**
     * No-argument constructor; leaves the file path unset and stamps the discover time with the
     * current time.
     */
    @SuppressWarnings("unused")
    protected FileMetadata()
    {
      //for kryo
      filePath = null;
      discoverTime = System.currentTimeMillis();
    }

    /**
     * Constructs file metadata and stamps the discover time with the current time.
     *
     * @param filePath file path
     */
    public FileMetadata(@NotNull String filePath)
    {
      this.filePath = filePath;
      discoverTime = System.currentTimeMillis();
    }
    
    /**
     * Copy constructor. Every field, including the discover time, is taken from the given metadata;
     * the block id array is shared with the source rather than cloned.
     *
     * @param fileMetadata metadata to copy from
     */
    protected FileMetadata(FileMetadata fileMetadata)
    {
      this();
      filePath = fileMetadata.filePath;
      fileName = fileMetadata.fileName;
      numberOfBlocks = fileMetadata.numberOfBlocks;
      dataOffset = fileMetadata.dataOffset;
      fileLength = fileMetadata.fileLength;
      discoverTime = fileMetadata.discoverTime;
      blockIds = fileMetadata.blockIds;
      isDirectory = fileMetadata.isDirectory;
      relativePath = fileMetadata.relativePath;
    }

    /**
     * Returns the total number of blocks.
     */
    public int getNumberOfBlocks()
    {
      return numberOfBlocks;
    }

    /**
     * Sets the total number of blocks.
     */
    public void setNumberOfBlocks(int numberOfBlocks)
    {
      this.numberOfBlocks = numberOfBlocks;
    }

    /**
     * Returns the file name.
     */
    public String getFileName()
    {
      return fileName;
    }

    /**
     * Sets the file name.
     */
    public void setFileName(String fileName)
    {
      this.fileName = fileName;
    }

    /**
     * Sets the file path.
     */
    public void setFilePath(String filePath)
    {
      this.filePath = filePath;
    }

    /**
     * Returns the file path.
     */
    public String getFilePath()
    {
      return filePath;
    }

    /**
     * Returns the data offset.
     */
    public long getDataOffset()
    {
      return dataOffset;
    }

    /**
     * Sets the data offset.
     */
    public void setDataOffset(long offset)
    {
      this.dataOffset = offset;
    }

    /**
     * Returns the file length.
     */
    public long getFileLength()
    {
      return fileLength;
    }

    /**
     * Sets the file length.
     */
    public void setFileLength(long fileLength)
    {
      this.fileLength = fileLength;
    }

    /**
     * Returns the file discover time.
     */
    public long getDiscoverTime()
    {
      return discoverTime;
    }

    /**
     * Sets the discover time.
     */
    public void setDiscoverTime(long discoverTime)
    {
      this.discoverTime = discoverTime;
    }

    /**
     * Returns the block ids associated with the file. The internal array itself is returned, not a
     * copy.
     */
    public long[] getBlockIds()
    {
      return blockIds;
    }

    /**
     * Sets the blocks ids of the file. The given array is stored as is, not copied.
     */
    public void setBlockIds(long[] blockIds)
    {
      this.blockIds = blockIds;
    }

    /**
     * Sets whether the file metadata is a directory.
     */
    public void setDirectory(boolean isDirectory)
    {
      this.isDirectory = isDirectory;
    }

    /**
     * @return true if it is a directory; false otherwise.
     */
    public boolean isDirectory()
    {
      return isDirectory;
    }

    /**
     * Returns the relative file path.
     *
     * @return relativePath
     */
    public String getRelativePath()
    {
      return relativePath;
    }

    /**
     * Sets the relative file path.
     *
     * @param relativePath relative file path
     */
    public void setRelativePath(String relativePath)
    {
      this.relativePath = relativePath;
    }

    /**
     * @return a description made of the file name, number of blocks, directory flag and relative path
     */
    @Override
    public String toString()
    {
      return "FileMetadata [fileName=" + fileName + ", numberOfBlocks=" + numberOfBlocks + ", isDirectory=" + isDirectory + ", relativePath=" + relativePath + "]";
    }

  }

  /**
   * A class that encapsulates file path: an optional directory plus a path relative to that
   * directory.
   */
  public static class FileInfo
  {
    protected final String directoryPath;
    protected final String relativeFilePath;

    /**
     * Creates an instance with neither a directory nor a relative path.
     */
    protected FileInfo()
    {
      this(null, null);
    }

    /**
     * @param directoryPath    directory the file was found under; {@code null} when the file was
     *                         given directly
     * @param relativeFilePath path of the file relative to the directory, or the path itself when
     *                         there is no directory
     */
    public FileInfo(@Nullable String directoryPath, @NotNull String relativeFilePath)
    {
      this.relativeFilePath = relativeFilePath;
      this.directoryPath = directoryPath;
    }

    /**
     * @return directory path
     */
    public String getDirectoryPath()
    {
      return this.directoryPath;
    }

    /**
     * @return path relative to directory
     */
    public String getRelativeFilePath()
    {
      return this.relativeFilePath;
    }

    /**
     * @return full path of the file: the relative path as is when there is no directory, otherwise
     *         the path part of the URI obtained by resolving the relative path against the directory
     */
    public String getFilePath()
    {
      return directoryPath == null
          ? relativeFilePath
          : new Path(directoryPath, relativeFilePath).toUri().getPath();
    }
  }

  // Logger shared by all instances; used for debug output while building file metadata.
  private static final Logger LOG = LoggerFactory.getLogger(AbstractFileSplitter.class);
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy