All downloads are free. Search and download functionality uses the official Maven repository.

eu.dicodeproject.analysis.generic.GenericTableReducer Maven / Gradle / Ivy

Go to download

The examples module provides glue code for extracting common phrases, keyword distributions, and more from tweets stored on HDFS/HBase. It builds on Mahout for more sophisticated analysis.

The newest version!
/**
 * Copyright (C) 2010, 2011 Neofonie GmbH
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.dicodeproject.analysis.generic;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import twitter4j.internal.org.json.JSONException;
import twitter4j.internal.org.json.JSONObject;

/**
 * Sums up counts for hashtags (or more general: words) and writes the counts to
 * HBase.
 * 
 */

public class GenericTableReducer extends TableReducer {

  private static final String COLUMN = "result";
  private static final String FAMILY = "d";
  private static final String RESULT_ID = "resultId";
  
  private static final int LIMIT = 100; // TODO: make configurable
  private int count = 0;
  private boolean flushed = false;
  private String dateKey;
  private JSONObject j;
  private List> entries;

  /**
   * Creates the Row Key from current date and query/topic
   */
  protected void setup(Context context) throws IOException, InterruptedException {
    SimpleDateFormat df = new SimpleDateFormat("yyyyMMdd");
    dateKey = context.getConfiguration().get(RESULT_ID).toLowerCase() + "_";
    dateKey += df.format(new Date());
    j = new JSONObject();
    entries = new ArrayList>();

  }

  /**
   * Aggregates top Hashtags in JSON format write to HBase
   */
  public void reduce(IntWritable key, Iterable values, Context context) throws IOException,
      InterruptedException {

    long ct = key.get();

    for (Text val : values) {

      if (count < LIMIT) {
        Map entry = new HashMap();
        entry.put(val.toString(), Integer.valueOf((int) ct * -1));
        entries.add(entry);
        count++;
      } else {
        break;
      }
    }
    if (count == LIMIT && !flushed) {
      this.writeData(context);
    }
  }

  /**
   * Write result to HBase
   * 
   * @param context
   * @throws IOException
   * @throws InterruptedException
   */
  private void writeData(Context context) throws IOException, InterruptedException {

    try {
      j.put("result", entries);
      ImmutableBytesWritable dummy = new ImmutableBytesWritable(Bytes.toBytes(dateKey));
      Put put = new Put(Bytes.toBytes(dateKey));
      put.add(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUMN), Bytes.toBytes(j.toString()));
      context.write(dummy, put);
      flushed = true;
    } catch (JSONException e) {
      // TODO: exception handling!
    }
  }

  /**
   * Write data if < Limit
   */
  protected void cleanup(Context context) throws IOException, InterruptedException {
    if (!flushed) {
      this.writeData(context);
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy