// org.infinispan.hadoop.flink.sample.WordFrequency (Maven / Gradle / Ivy artifact listing; go to download)

package org.infinispan.hadoop.flink.sample;

import java.util.List;
import java.util.stream.IntStream;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.hadoopcompatibility.HadoopInputs;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.infinispan.hadoop.InfinispanConfiguration;
import org.infinispan.hadoop.InfinispanInputFormat;

/**
 * Apache Flink job that calculates a histogram of the number of words per phrase.
 * <p>
 * This demonstrates connecting Flink with Infinispan by using {@link org.infinispan.hadoop.InfinispanInputFormat} and
 * {@link org.infinispan.hadoop.InfinispanOutputFormat}.
 * <p>
 * Data is located in the cache "phrases" of the Infinispan server with types {@code <Long,String>}, and each entry
 * has a phrase with 1 or more words.
 * <p>
 * Output should be:
 * <pre>
 * HISTOGRAM:
 *
 * 2 word phrases: ****** (6)
 * 3 word phrases: ************ (12)
 * 4 word phrases: ***************************************** (41)
 * 5 word phrases: ********** (10)
 * </pre>
 */
public class WordFrequency {

   public static void main(String[] args) throws Exception {
      if (args.length < 1) {
         System.err.println("Usage: WordFrequency ");
         System.exit(1);
      }

      // Configure the Infinispan InputFormat wrapping it in a Hadoop Job class.
      // In this sample, no writing to Infinispan will happen, so no need to configure
      // an InfinispanOutputFormat
      Configuration configuration = new Configuration();
      configuration.set(InfinispanConfiguration.INPUT_REMOTE_CACHE_SERVER_LIST, args[0]);
      configuration.set(InfinispanConfiguration.INPUT_REMOTE_CACHE_NAME, "phrases");
      Job job = Job.getInstance(configuration, "Infinispan Integration");
      InfinispanInputFormat infinispanInputFormat = new InfinispanInputFormat<>();

      // Obtain the Execution environment from Flink
      final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

      // Create a DataSource that reads data using the InfinispanInputFormat

      DataSource> infinispanDS = env.createInput(
            HadoopInputs.createHadoopInput(infinispanInputFormat, Integer.class, String.class, job)
      );

      // Count entries
      long count = infinispanDS.count();

      // Obtain the values from entries
      DataSet values = infinispanDS.map(entry -> entry.f1).returns(String.class);

      // Obtain phrase lengths (length,1) for the values
      DataSet> lengthsCount = values.flatMap(new FlatMapFunction>() {
         @Override
         public void flatMap(String s, Collector> collector) throws Exception {
            collector.collect(new Tuple2<>(s.split(" ").length, 1));
         }
      });

      // Create the histogram
      List> results = lengthsCount.groupBy(0).sum(1).collect();

      // Format results and print to stdout
      printResults(count, results);
   }

   private static void printResults(long entries, List> results) {
      System.out.printf("TOTAL PHRASES ANALYZED: %d. HISTOGRAM:\n", entries);
      results.forEach(t -> {
         Integer wordNumber = t.f0;
         Integer count = t.f1;
         System.out.printf("%-3d word phrases:", wordNumber);
         IntStream.range(1, count).boxed().forEach(c -> System.out.print("*"));
         System.out.printf("(%d)\n", count);
      });
   }

}