// org.apache.gobblin.compaction.verify.CompactionTimeRangeVerifier (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.compaction.verify;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Period;
import org.joda.time.format.PeriodFormatter;
import org.joda.time.format.PeriodFormatterBuilder;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Maps;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.gobblin.compaction.dataset.TimeBasedSubDirDatasetsFinder;
import org.apache.gobblin.compaction.mapreduce.MRCompactor;
import org.apache.gobblin.compaction.parser.CompactionPathParser;
import org.apache.gobblin.compaction.source.CompactionSource;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.dataset.FileSystemDataset;
/**
 * A simple verifier which checks that the current dataset belongs to a specific time range.
 * Compaction is skipped if the dataset is not in the correct time range.
 *
 * <p>The range is derived from the compaction start time read from
 * {@link CompactionSource#COMPACTION_INIT_TIME}: it starts at "start time minus max-time-ago" and
 * ends at "start time minus min-time-ago". The dataset's folder time must lie strictly between
 * those two instants.
 */
@Slf4j
@AllArgsConstructor
// NOTE(review): CompactionVerifier is used as a raw type here; if the interface declares a type
// parameter, this should presumably be CompactionVerifier<FileSystemDataset> -- confirm against
// the interface before changing.
public class CompactionTimeRangeVerifier implements CompactionVerifier {
  public final static String COMPACTION_VERIFIER_TIME_RANGE = COMPACTION_VERIFIER_PREFIX + "time-range";

  /**
   * Parses look-back strings such as {@code 1m2d3h} (months, days, hours). Built formatters are
   * immutable and thread-safe, so a single shared instance replaces the per-call construction.
   */
  private static final PeriodFormatter LOOKBACK_PERIOD_FORMATTER =
      new PeriodFormatterBuilder().appendMonths().appendSuffix("m").appendDays().appendSuffix("d").appendHours()
          .appendSuffix("h").toFormatter();

  /** Splits a look-back list into its {@code ;}-separated entries, trimming and dropping empty ones. */
  private static final Splitter LOOKBACK_ENTRY_SPLITTER = Splitter.on(";").trimResults().omitEmptyStrings();

  /** Splits a single entry into its {@code :}-separated parts, trimming and dropping empty ones. */
  private static final Splitter LOOKBACK_PAIR_SPLITTER = Splitter.on(":").trimResults().omitEmptyStrings();

  protected State state;

  /**
   * Verifies that the time partition of {@code dataset} lies inside the configured time range.
   *
   * @param dataset the dataset whose path is parsed for its dataset name and folder time
   * @return a successful {@link Result} when the folder time is strictly after the earliest and
   *         strictly before the latest allowed time; otherwise a failed {@link Result} whose
   *         message is either the out-of-range description or the string form of the exception
   *         raised while evaluating the range
   */
  public Result verify (FileSystemDataset dataset) {
    final DateTime earliest;
    final DateTime latest;

    try {
      CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
      DateTime folderTime = result.getTime();
      DateTimeZone timeZone = DateTimeZone.forID(this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
      DateTime compactionStartTime = new DateTime(this.state.getPropAsLong(CompactionSource.COMPACTION_INIT_TIME), timeZone);

      // Dataset name is like 'Identity/MemberAccount' or 'PageViewEvent'
      String datasetName = result.getDatasetName();

      // Earliest allowed time: compaction start time minus the maximum look-back.
      String maxTimeAgoStrList = this.state.getProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MAX_TIME_AGO, TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO);
      String maxTimeAgoStr = getMachedLookbackTime(datasetName, maxTimeAgoStrList, TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO);
      Period maxTimeAgo = LOOKBACK_PERIOD_FORMATTER.parsePeriod(maxTimeAgoStr);
      earliest = compactionStartTime.minus(maxTimeAgo);

      // Latest allowed time: compaction start time minus the minimum look-back.
      String minTimeAgoStrList = this.state.getProp(TimeBasedSubDirDatasetsFinder.COMPACTION_TIMEBASED_MIN_TIME_AGO, TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO);
      String minTimeAgoStr = getMachedLookbackTime(datasetName, minTimeAgoStrList, TimeBasedSubDirDatasetsFinder.DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO);
      Period minTimeAgo = LOOKBACK_PERIOD_FORMATTER.parsePeriod(minTimeAgoStr);
      latest = compactionStartTime.minus(minTimeAgo);

      // Both comparisons are strict, so a folder time equal to either bound is rejected.
      if (earliest.isBefore(folderTime) && latest.isAfter(folderTime)) {
        log.debug("{} falls in the user defined time range", dataset.datasetRoot());
        return new Result(true, "");
      }
    } catch (Exception e) {
      // Any failure (path parsing, bad time zone, bad period or regex) is reported as a failed
      // verification rather than propagated to the caller.
      log.error("{} cannot be verified because of {}", dataset.datasetRoot(), ExceptionUtils.getFullStackTrace(e));
      return new Result(false, e.toString());
    }

    return new Result(false, dataset.datasetRoot() + " is not in between " + earliest + " and " + latest);
  }

  /**
   * @return the name of this verifier, {@link #COMPACTION_VERIFIER_TIME_RANGE}
   */
  public String getName() {
    return COMPACTION_VERIFIER_TIME_RANGE;
  }

  /**
   * @return always {@code false}
   */
  public boolean isRetriable () {
    return false;
  }

  /**
   * Finds the look-back time for a given dataset.
   *
   * <p>Entries are evaluated from left to right. The first {@code regex:lookback} entry whose regex
   * is found anywhere in {@code datasetName} wins and its look-back is returned immediately. An
   * entry without a regex prefix sets the default look-back; if several such entries are seen
   * before the scan ends, the last one is used. Entries with more than two {@code :}-separated
   * parts (or none) are logged as invalid and ignored.
   *
   * @param datasetName A description of the dataset without time partition information. Example:
   *                    'Identity/MemberAccount' or 'PageViewEvent'
   * @param datasetsAndLookBacks Look-back string for multiple datasets. Datasets are represented by
   *                             regex patterns. Multiple 'dataset and look-back' pairs are joined
   *                             by semicolons. A default look-back time can be given without any
   *                             regex prefix. If nothing is found, {@code sysDefaultLookback} is
   *                             used. A {@code null} value is treated as "nothing specified".
   *
   *                             Example format: [Regex1]:[T1];[Regex2]:[T2];[DEFAULT_T];[Regex3]:[T3]
   *                             Ex. Identity.*:1d2h;22h;BizProfile.BizCompany:3h (22h is the default look-back time)
   * @param sysDefaultLookback If the user specifies no look-back time for {@code datasetName} and
   *                           there is no default look-back time inside
   *                           {@code datasetsAndLookBacks}, this system default is returned.
   * @return The look-back time matched with the given dataset.
   */
  public static String getMachedLookbackTime (String datasetName, String datasetsAndLookBacks, String sysDefaultLookback) {
    // Splitter rejects null input; without a list there is nothing to match against.
    if (datasetsAndLookBacks == null) {
      return sysDefaultLookback;
    }

    String defaultLookback = sysDefaultLookback;

    for (String entry : LOOKBACK_ENTRY_SPLITTER.splitToList(datasetsAndLookBacks)) {
      List<String> datasetAndLookbackTime = LOOKBACK_PAIR_SPLITTER.splitToList(entry);
      if (datasetAndLookbackTime.size() == 1) {
        // A bare look-back value overrides the default but does not end the scan.
        defaultLookback = datasetAndLookbackTime.get(0);
      } else if (datasetAndLookbackTime.size() == 2) {
        String regex = datasetAndLookbackTime.get(0);
        // find() matches the regex anywhere in the dataset name, not only the full name.
        if (Pattern.compile(regex).matcher(datasetName).find()) {
          return datasetAndLookbackTime.get(1);
        }
      } else {
        log.error("Invalid format in {}, {} cannot find its lookback time", datasetsAndLookBacks, datasetName);
      }
    }
    return defaultLookback;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy