All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.gobblin.compaction.suite.CompactionAvroSuite Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.compaction.suite;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.compaction.action.CompactionCompleteAction;
import org.apache.gobblin.compaction.action.CompactionCompleteFileOperationAction;
import org.apache.gobblin.compaction.action.CompactionHiveRegistrationAction;
import org.apache.gobblin.compaction.action.CompactionMarkDirectoryAction;
import org.apache.gobblin.compaction.mapreduce.CompactionAvroJobConfigurator;
import org.apache.gobblin.compaction.verify.CompactionAuditCountVerifier;
import org.apache.gobblin.compaction.verify.CompactionThresholdVerifier;
import org.apache.gobblin.compaction.verify.CompactionTimeRangeVerifier;
import org.apache.gobblin.compaction.verify.CompactionVerifier;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.dataset.FileSystemDataset;

/**
 * A type of {@link CompactionSuite} which implements all components needed for avro file compaction.
 */
@Slf4j
public class CompactionAvroSuite implements CompactionSuite {
  public static final String SERIALIZE_COMPACTION_FILE_PATH_NAME = "compaction-file-path-name";
  private State state;
  private CompactionAvroJobConfigurator configurator = null;

  /**
   * Constructor
   */
  public CompactionAvroSuite (State state) {
    this.state = state;
  }

  /**
   * Implementation of {@link CompactionSuite#getDatasetsFinderVerifiers()}
   * @return A list of {@link CompactionVerifier} instances which will be verified after
   *         {@link FileSystemDataset} is found but before a {@link org.apache.gobblin.source.workunit.WorkUnit}
   *         is created.
   */
  public List> getDatasetsFinderVerifiers() {
    List> list = new LinkedList<>();
    list.add(new CompactionTimeRangeVerifier(state));
    list.add(new CompactionThresholdVerifier(state));
    list.add(new CompactionAuditCountVerifier(state));
    return list;
  }

  /**
   * Implementation of {@link CompactionSuite#getMapReduceVerifiers()}
   * @return A list of {@link CompactionVerifier} instances which will be verified before
   *         {@link org.apache.gobblin.compaction.mapreduce.MRCompactionTask} starts the map-reduce job
   */
  public List> getMapReduceVerifiers() {
    List> list = new ArrayList<>();
    return list;
  }

  /**
   * Serialize a dataset {@link FileSystemDataset} to a {@link State}
   * @param dataset A dataset needs serialization
   * @param state   A state that is used to save {@link org.apache.gobblin.dataset.Dataset}
   */
  public void save (FileSystemDataset dataset, State state) {
    state.setProp(SERIALIZE_COMPACTION_FILE_PATH_NAME, dataset.datasetURN());
  }

  /**
   * Deserialize a new {@link FileSystemDataset} from a given {@link State}
   *
   * @param state a type of {@link org.apache.gobblin.runtime.TaskState}
   * @return A new instance of {@link FileSystemDataset}
   */
  public FileSystemDataset load (final State state) {
    return new FileSystemDataset() {
      @Override
      public Path datasetRoot() {
        return new Path(state.getProp(SERIALIZE_COMPACTION_FILE_PATH_NAME));
      }

      @Override
      public String datasetURN() {
        return state.getProp(SERIALIZE_COMPACTION_FILE_PATH_NAME);
      }
    };
  }

  /**
   * Some post actions are required after compaction job (map-reduce) is finished.
   *
   * @return  A list of {@link CompactionCompleteAction}s which needs to be executed after
   *          map-reduce is done.
   */
  public List> getCompactionCompleteActions() {
    ArrayList> array = new ArrayList<>();
    array.add(new CompactionCompleteFileOperationAction(state, configurator));
    array.add(new CompactionHiveRegistrationAction(state));
    array.add(new CompactionMarkDirectoryAction(state, configurator));
    return array;
  }

  /**
   * Constructs a map-reduce job suitable for avro compaction. The detailed configuration
   * work is delegated to {@link CompactionAvroJobConfigurator#createJob(FileSystemDataset)}
   *
   * @param  dataset a top level input path which contains all avro files those need to be compacted
   * @return a map-reduce job which will compact avro files against {@link org.apache.gobblin.dataset.Dataset}
   */
  public Job createJob (FileSystemDataset dataset) throws IOException {
    configurator = new CompactionAvroJobConfigurator(this.state);
    return configurator.createJob(dataset);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy