All downloads are free. Search and download functionality uses the official Maven repository.

gobblin.hive.policy.HiveSnapshotRegistrationPolicy Maven / Gradle / Ivy

There is a newer version: 0.8.0
Show newest version
/*
 * Copyright (C) 2014-2016 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package gobblin.hive.policy;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import com.google.common.base.Optional;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;

import gobblin.annotation.Alpha;
import gobblin.configuration.State;
import gobblin.hive.HiveTable;
import gobblin.hive.spec.HiveSpec;
import gobblin.hive.spec.SimpleHiveSpec;


/**
 * A {@link gobblin.hive.policy.HiveRegistrationPolicy} for registering snapshots.
 *
 * <p>
 * Given a root directory that may contain several snapshot sub-directories, only the latest
 * snapshot (see {@link #getLatestSnapshot(Path)}) is used to derive the {@link HiveTable}s to register.
 * </p>
 *
 * @author ziliu
 */
@Alpha
public class HiveSnapshotRegistrationPolicy extends HiveRegistrationPolicyBase {

  /** Property key holding an optional regular expression that snapshot paths must fully match. */
  public static final String SNAPSHOT_PATH_PATTERN = "snapshot.path.pattern";

  /** Compiled value of {@link #SNAPSHOT_PATH_PATTERN}; absent when the property is not set. */
  protected final Optional<Pattern> snapshotPathPattern;

  /**
   * @param props job/task properties; {@link #SNAPSHOT_PATH_PATTERN} is read from here if present.
   * @throws IOException propagated from the superclass constructor.
   */
  protected HiveSnapshotRegistrationPolicy(State props) throws IOException {
    super(props);
    this.snapshotPathPattern = props.contains(SNAPSHOT_PATH_PATTERN)
        ? Optional.of(Pattern.compile(props.getProp(SNAPSHOT_PATH_PATTERN)))
        : Optional.<Pattern>absent();
  }

  /**
   * Build one {@link HiveSpec} per table found in the latest snapshot under the given root.
   *
   * @param path The root directory of snapshots. This directory may contain zero or more snapshots.
   * @return the specs to register; empty if no tables were found.
   */
  @Override
  public Collection<HiveSpec> getHiveSpecs(Path path) throws IOException {
    List<HiveTable> tables = getTables(path);

    if (tables.isEmpty()) {
      return ImmutableList.<HiveSpec>of();
    }
    Collection<HiveSpec> specs = Lists.newArrayList();
    for (HiveTable table : tables) {
      // The spec and the partition lookup are keyed on the snapshot root, not on the latest snapshot itself.
      specs.add(new SimpleHiveSpec.Builder<>(path).withTable(table).withPartition(getPartition(path, table)).build());
    }
    return specs;
  }

  /**
   * Get {@link HiveTable}s using the latest snapshot (returned by {@link #getLatestSnapshot(Path)}).
   *
   * @param path The root directory of snapshots.
   * @return the tables derived from the latest snapshot, or an empty list if there is no snapshot.
   */
  @Override
  protected List<HiveTable> getTables(Path path) throws IOException {
    Path latestSnapshot = getLatestSnapshot(path);
    if (latestSnapshot == null) {
      return ImmutableList.<HiveTable>of();
    }

    return super.getTables(latestSnapshot);
  }

  /**
   * Get the latest snapshot in the given {@link Path}.
   *
   * <p>
   * The latest snapshot is a sub-directory of the input {@link Path} that has the largest folder
   * name alphabetically. If property {@link #SNAPSHOT_PATH_PATTERN} is set, only those sub-directories
   * whose full path matches the given pattern are considered.
   * </p>
   *
   * @param path The root directory of snapshots.
   * @return the path of the latest snapshot, or {@code null} if no sub-directory qualifies.
   */
  protected Path getLatestSnapshot(Path path) throws IOException {
    FileStatus[] statuses = this.fs.listStatus(path, new PathFilter() {

      @Override
      public boolean accept(Path p) {
        try {
          if (!HiveSnapshotRegistrationPolicy.this.fs.isDirectory(p)) {
            return false;
          }
        } catch (IOException e) {
          // PathFilter#accept cannot throw checked exceptions, so rethrow unchecked.
          throw Throwables.propagate(e);
        }
        return !HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.isPresent()
            || HiveSnapshotRegistrationPolicy.this.snapshotPathPattern.get().matcher(p.toString()).matches();
      }
    });

    // Defensive: treat a null listing the same as an empty one instead of failing with an NPE.
    if (statuses == null || statuses.length == 0) {
      return null;
    }

    // Sort by folder name in descending order so that the latest snapshot comes first.
    Arrays.sort(statuses, new Comparator<FileStatus>() {

      @Override
      public int compare(FileStatus o1, FileStatus o2) {
        return o2.getPath().getName().compareTo(o1.getPath().getName());
      }
    });

    return statuses[0].getPath();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy