gobblin.hive.policy.HiveSnapshotRegistrationPolicy Maven / Gradle / Ivy
 The newest version!
        
        /*
 * Copyright (C) 2014-2016 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */
package gobblin.hive.policy;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import com.google.common.base.Optional;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import gobblin.annotation.Alpha;
import gobblin.configuration.State;
import gobblin.hive.HiveTable;
import gobblin.hive.spec.HiveSpec;
import gobblin.hive.spec.SimpleHiveSpec;
/**
 * A {@link gobblin.hive.policy.HiveRegistrationPolicy} for registering snapshots.
 *
 * <p>
 *   Given the root directory of a set of snapshots, only the latest snapshot (see
 *   {@link #getLatestSnapshot(Path)}) is used to determine the tables to register.
 * </p>
 *
 * @author Ziyang Liu
 */
@Alpha
public class HiveSnapshotRegistrationPolicy extends HiveRegistrationPolicyBase {

  /**
   * Optional property holding a regular expression. When set, only sub-directories whose full path
   * matches it are considered snapshots.
   */
  public static final String SNAPSHOT_PATH_PATTERN = "snapshot.path.pattern";

  /** Compiled {@link #SNAPSHOT_PATH_PATTERN}, or absent if the property is not set. */
  protected final Optional<Pattern> snapshotPathPattern;

  /**
   * @param props job/task properties; {@link #SNAPSHOT_PATH_PATTERN} is read from it if present.
   * @throws IOException if the parent constructor fails.
   */
  protected HiveSnapshotRegistrationPolicy(State props) throws IOException {
    super(props);
    this.snapshotPathPattern = props.contains(SNAPSHOT_PATH_PATTERN)
        ? Optional.of(Pattern.compile(props.getProp(SNAPSHOT_PATH_PATTERN))) : Optional.<Pattern> absent();
  }

  /**
   * Build one {@link HiveSpec} per table returned by {@link #getTables(Path)}.
   *
   * @param path The root directory of snapshots. This directory may contain zero or more snapshots.
   * @return the specs to register; empty if no table was found (e.g. there is no snapshot).
   */
  @Override
  public Collection<HiveSpec> getHiveSpecs(Path path) throws IOException {
    List<HiveTable> tables = getTables(path);
    if (tables.isEmpty()) {
      return ImmutableList.<HiveSpec> of();
    }
    Collection<HiveSpec> specs = Lists.newArrayList();
    for (HiveTable table : tables) {
      // The spec and the partition lookup are keyed on the snapshot root that was passed in,
      // not on the latest snapshot directory.
      specs.add(new SimpleHiveSpec.Builder<>(path).withTable(table).withPartition(getPartition(path, table)).build());
    }
    return specs;
  }

  /**
   * Get {@link HiveTable}s using the latest snapshot (returned by {@link #getLatestSnapshot(Path)}).
   *
   * @param path The root directory of snapshots.
   * @return the tables derived from the latest snapshot, or an empty list if there is no snapshot.
   */
  @Override
  protected List<HiveTable> getTables(Path path) throws IOException {
    Path latestSnapshot = getLatestSnapshot(path);
    if (latestSnapshot == null) {
      return ImmutableList.<HiveTable> of();
    }
    return super.getTables(latestSnapshot);
  }

  /**
   * Get the latest snapshot in the given {@link Path}.
   *
   * <p>
   *   The latest snapshot is a sub-directory of the input {@link Path} that has the largest folder
   *   name alphabetically. If property {@link #SNAPSHOT_PATH_PATTERN} is set, only those sub-directories
   *   whose full path matches the given pattern are considered.
   * </p>
   *
   * @param path The root directory of snapshots.
   * @return the path of the latest snapshot, or {@code null} if no sub-directory qualifies.
   */
  protected Path getLatestSnapshot(Path path) throws IOException {
    FileStatus[] statuses = this.fs.listStatus(path, new PathFilter() {
      @Override
      public boolean accept(Path p) {
        try {
          if (!HiveSnapshotRegistrationPolicy.this.fs.isDirectory(p)) {
            return false;
          }
        } catch (IOException e) {
          // PathFilter#accept cannot throw a checked exception, so rethrow unchecked.
          throw Throwables.propagate(e);
        }
        return !HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.isPresent()
            || HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.get().matcher(p.toString()).matches();
      }
    });

    if (statuses.length == 0) {
      return null;
    }

    // Sort by folder name in descending order so that the largest name ends up at index 0.
    Arrays.sort(statuses, new Comparator<FileStatus>() {
      @Override
      public int compare(FileStatus o1, FileStatus o2) {
        return o2.getPath().getName().compareTo(o1.getPath().getName());
      }
    });

    return statuses[0].getPath();
  }
}
             © 2015 - 2025 Weber Informatics LLC | Privacy Policy