All downloads are free. The search and download functionality uses the official Maven repository.

datafu.hourglass.schemas.PartitionCollapsingSchemas Maven / Gradle / Ivy

The newest version!
/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.schemas;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.mapred.Pair;
import org.apache.commons.lang.NullArgumentException;

/**
 * Generates the Avro schemas used by {@link datafu.hourglass.jobs.AbstractPartitionCollapsingIncrementalJob} and its derivations.
 * 
 * <p>
 * The key, intermediate value, output value and input schemas are stored as JSON strings
 * so that the object can be serialized.  The corresponding {@link Schema} objects, and the
 * schemas derived from them, are held in transient fields and are (re)built lazily the first
 * time each getter is called.  The lazy initialization is not synchronized.
 * </p>
 * 
 * @author "Matthew Hayes"
 *
 */
public class PartitionCollapsingSchemas implements Serializable
{  
  private static final String DATED_INTERMEDIATE_VALUE_SCHEMA_NAME = "DatedMapValue";
  private static final String KEY_SCHEMA = "key.schema";
  private static final String INTERMEDIATE_VALUE_SCHEMA = "intermediate.value.schema";
  private static final String OUTPUT_VALUE_SCHEMA = "output.value.schema";
  
  private final String _outputSchemaName;
  private final String _outputSchemaNamespace;
  
  // parsed lazily from the JSON stored in conf
  private transient Schema _keySchema;
  private transient Schema _intermediateValueSchema;
  private transient Schema _outputValueSchema;
  
  // generated schemas
  private transient Schema _mapOutputSchema;
  private transient Schema _dateIntermediateValueSchema;
  private transient Schema _mapOutputValueSchema;
  private transient Schema _reduceOutputSchema;
  private transient Map<String,Schema> _mapInputSchemas;
  
  //schemas are stored here so the object can be serialized
  private Map<String,String> conf;

  // input name -> input schema JSON, kept as strings for the same reason as conf
  private Map<String,String> _inputSchemas;
  
  /**
   * Captures the schemas for a partition collapsing job.
   * 
   * @param schemas provides the key, intermediate value and output value schemas
   * @param inputSchemas input schemas keyed by input name
   * @param outputSchemaName name of the record schema produced by the reducer
   * @param outputSchemaNamespace namespace of the generated record schemas
   * @throws NullArgumentException if any argument is null
   */
  public PartitionCollapsingSchemas(TaskSchemas schemas, Map<String,Schema> inputSchemas, String outputSchemaName, String outputSchemaNamespace)
  {
    if (schemas == null)
    {
      throw new NullArgumentException("schemas");
    }
    if (inputSchemas == null)
    {
      throw new NullArgumentException("inputSchemas");
    }
    if (outputSchemaName == null)
    {
      throw new NullArgumentException("outputSchemaName");
    }
    // Previously compared the name against the namespace by reference, which never
    // detected a null namespace and rejected callers passing the same String for both.
    if (outputSchemaNamespace == null)
    {
      throw new NullArgumentException("outputSchemaNamespace");
    }
    _outputSchemaName = outputSchemaName;
    _outputSchemaNamespace = outputSchemaNamespace;
    
    conf = new HashMap<String,String>();
    conf.put(KEY_SCHEMA, schemas.getKeySchema().toString());
    conf.put(INTERMEDIATE_VALUE_SCHEMA, schemas.getIntermediateValueSchema().toString());
    conf.put(OUTPUT_VALUE_SCHEMA, schemas.getOutputValueSchema().toString());
    
    _inputSchemas = new HashMap<String,String>();
    for (Entry<String,Schema> schema : inputSchemas.entrySet())
    {
      _inputSchemas.put(schema.getKey(), schema.getValue().toString());
    }
  }
    
  /**
   * Gets the mapper input schemas, keyed by input name.  Each value is a union of the
   * input's own schema (flattened if it is itself a union) and the reduce output schema,
   * since previous output may be fed back in as input.
   * 
   * @return unmodifiable map from input name to mapper input schema
   */
  public Map<String,Schema> getMapInputSchemas()
  {    
    if (_mapInputSchemas == null)
    {
      // Build into a local so a failure part way through does not leave a partial map cached.
      Map<String,Schema> result = new HashMap<String,Schema>();
      
      for (Entry<String,String> schemaPair : _inputSchemas.entrySet())
      {
        Schema schema = parseSchema(schemaPair.getValue());
        
        List<Schema> mapInputSchemas = new ArrayList<Schema>();
        
        if (schema.getType() == Type.UNION)
        {
          // unions may not be nested, so add the member types individually
          mapInputSchemas.addAll(schema.getTypes());
        }
        else
        {
          mapInputSchemas.add(schema);
        }
        
        // feedback from output (optional)
        mapInputSchemas.add(getReduceOutputSchema());
        
        result.put(schemaPair.getKey(), Schema.createUnion(mapInputSchemas));
      }
      
      _mapInputSchemas = result;
    }
    return Collections.unmodifiableMap(_mapInputSchemas);
  }
    
  /**
   * Gets the mapper output schema, a pair of the map output key and map output value schemas.
   * 
   * @return mapper output schema
   */
  public Schema getMapOutputSchema()
  {
    if (_mapOutputSchema == null)
    {
      _mapOutputSchema = Pair.getPairSchema(getMapOutputKeySchema(), 
                                            getMapOutputValueSchema());
    }
    return _mapOutputSchema;
  }
  
  /**
   * Gets the key schema, parsed from the JSON captured at construction.
   * 
   * @return key schema
   */
  public Schema getKeySchema()
  {
    if (_keySchema == null)
    {
      _keySchema = parseSchema(conf.get(KEY_SCHEMA));
    }
    return _keySchema;
  }
      
  /**
   * Gets the mapper output key schema, which is the same as the key schema.
   * 
   * @return mapper output key schema
   */
  public Schema getMapOutputKeySchema()
  {
    return getKeySchema();
  }  
  
  /**
   * Gets the reducer output schema: a record, named by the output schema name and namespace,
   * with a "key" field and a "value" field.
   * 
   * @return reducer output schema
   */
  public Schema getReduceOutputSchema()
  {
    if (_reduceOutputSchema == null)
    {
      // Only cache the record once its fields have been set successfully.
      Schema record = Schema.createRecord(_outputSchemaName, null, _outputSchemaNamespace, false);            
      List<Field> fields = Arrays.asList(new Field("key",getKeySchema(), null, null),
                                         new Field("value", getOutputValueSchema(), null, null));    
      record.setFields(fields);
      _reduceOutputSchema = record;
    }
    return _reduceOutputSchema;
  }
    
  /**
   * Gets the schema for an intermediate value tagged with a time: a record with a
   * "value" field of the intermediate value schema and a long "time" field.
   * 
   * @return dated intermediate value schema
   */
  public Schema getDatedIntermediateValueSchema()
  {
    if (_dateIntermediateValueSchema == null)
    {
      // Only cache the record once its fields have been set successfully.
      Schema record = Schema.createRecord(DATED_INTERMEDIATE_VALUE_SCHEMA_NAME, null, _outputSchemaNamespace, false);
      List<Field> intermediateValueFields = Arrays.asList(new Field("value", getIntermediateValueSchema(), null, null),
                                                         new Field("time", Schema.create(Type.LONG), null, null));
      record.setFields(intermediateValueFields);
      _dateIntermediateValueSchema = record;
    }
    return _dateIntermediateValueSchema;
  }
  
  /**
   * Gets the output value schema, parsed from the JSON captured at construction.
   * 
   * @return output value schema
   */
  public Schema getOutputValueSchema()
  {
    if (_outputValueSchema == null)
    {
      _outputValueSchema = parseSchema(conf.get(OUTPUT_VALUE_SCHEMA));
    }
    return _outputValueSchema;
  }
  
  /**
   * Gets the intermediate value schema, parsed from the JSON captured at construction.
   * 
   * @return intermediate value schema
   */
  public Schema getIntermediateValueSchema()
  {
    if (_intermediateValueSchema == null)
    {
      _intermediateValueSchema = parseSchema(conf.get(INTERMEDIATE_VALUE_SCHEMA));
    }
    return _intermediateValueSchema;
  }
    
  /**
   * Gets the mapper output value schema: a union of the intermediate value schema,
   * the dated intermediate value schema, and the output value schema.  The output value
   * schema is left out when it equals a schema already in the union.
   * 
   * @return mapper output value schema
   */
  public Schema getMapOutputValueSchema()
  {    
    if (_mapOutputValueSchema == null)
    {
      List<Schema> unionSchemas = new ArrayList<Schema>();
      
      unionSchemas.add(getIntermediateValueSchema());
      
      // intermediate values tagged with the date
      unionSchemas.add(getDatedIntermediateValueSchema());
      
      // feedback from output of second pass
      Schema outputValueSchema = getOutputValueSchema();
      if (!unionSchemas.contains(outputValueSchema))
      {
        unionSchemas.add(outputValueSchema);
      }
      
      _mapOutputValueSchema = Schema.createUnion(unionSchemas);
    }
    return _mapOutputValueSchema;
  }
  
  /**
   * Parses a schema from its JSON form.  A new parser is used for every call so that
   * named types seen while parsing one schema do not affect the parsing of another.
   */
  private static Schema parseSchema(String json)
  {
    return new Schema.Parser().parse(json);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy