com.expedia.dsp.data.squeeze.mappers.AvroCompactionMapper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of datasqueeze Show documentation
Show all versions of datasqueeze Show documentation
Hadoop utility to compact small files
/**
* Copyright (C) 2018 Expedia Group
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.expedia.dsp.data.squeeze.mappers;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.FileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.AvroFSInput;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.hotels.avro.compatibility.Compatibility;
import lombok.extern.slf4j.Slf4j;
/**
* Mapper class for AVRO input file format.
*
* @author Samarth Kulkarni
*/
@Slf4j
public class AvroCompactionMapper extends Mapper, Object,
Text, AvroValue> {
// Helper created from the task context in setup(); map() asks it for the output key of a file path.
private BaseMapper baseMapper;
// Cache of the isValidData verdict per input file, keyed by the file path string.
private Map<String, Boolean> compatibityMap;
// Job configuration captured from the task context in setup().
private Configuration configuration;
/**
 * {@inheritDoc}
 *
 * <p>Initialises the per-task state: the {@link BaseMapper} helper built from the task context,
 * an empty per-file validity cache, and the job {@link Configuration}.
 *
 * @param context task context supplied by the framework
 * @throws IOException as declared by the overridden method
 * @throws InterruptedException as declared by the overridden method
 */
@Override
protected void setup(final Context context) throws IOException, InterruptedException {
    baseMapper = new BaseMapper(context);
    compatibityMap = new HashMap<>();
    configuration = context.getConfiguration();
}
/**
 * Emits the datum carried by {@code key}, wrapped in an {@link AvroValue}, under the output key
 * that {@code baseMapper} derives from the path of the file backing the current input split.
 * Records for which {@code isValidData} returns {@code false} are dropped.
 *
 * @param key     Avro key holding the record datum
 * @param value   not used by this mapper
 * @param context task context; provides the input split and receives the output pair
 * @throws IOException if the validity check or the write fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
protected void map(final AvroKey key, final Object value, final Context context) throws IOException, InterruptedException {
    final FileSplit fileSplit = (FileSplit) context.getInputSplit();
    final Path filePath = fileSplit.getPath();
    if (!isValidData(key, filePath)) {
        return;
    }
    final Text mapperKey = baseMapper.getKey(filePath.toString());
    // Wrap the datum only for records that are actually emitted.
    context.write(mapperKey, new AvroValue<>(key.datum()));
}
/**
 * Decides whether a record read from {@code filePath} should be emitted, caching the decision
 * per file path.
 *
 * <p>On the first call for a path: if {@code getHashOfData(key)} equals
 * {@code getHashOfDefaultObject()}, the verdict is the result of {@code isSchemaCompatible} for
 * the record's schema; otherwise the verdict is {@code true}. Later calls for the same path
 * return the cached verdict without looking at the record.
 *
 * @param key      Avro key holding the record datum
 * @param filePath path of the file the record was read from
 * @return the verdict cached or computed for {@code filePath}
 * @throws IOException if the schema compatibility check fails
 */
private boolean isValidData(AvroKey key, Path filePath) throws IOException {
    final String fileKey = filePath.toString();
    final Boolean cachedVerdict = compatibityMap.get(fileKey);
    if (cachedVerdict != null) {
        return cachedVerdict;
    }

    boolean valid = true;
    // NOTE(review): presumably a hash match means the record holds only default values; confirm
    // against getHashOfData / getHashOfDefaultObject.
    if (getHashOfData(key).equals(getHashOfDefaultObject())) {
        valid = isSchemaCompatible(key.datum().getSchema(), filePath);
        if (!valid) {
            log.info("File {} is not compatible with input schema", fileKey);
        }
    }
    compatibityMap.put(fileKey, valid);
    return valid;
}
private boolean isSchemaCompatible(Schema inputSchema, Path filePath) throws IOException {
GenericDatumReader