// org.infinispan.hadoop.flink.sample.WordFrequency

package org.infinispan.hadoop.flink.sample;

import java.util.List;
import java.util.stream.IntStream;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.hadoopcompatibility.HadoopInputs;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.infinispan.hadoop.InfinispanConfiguration;
import org.infinispan.hadoop.InfinispanInputFormat;

/**
 * Apache Flink job to calculate histograms with number of words on phrases.
 * 

* This demonstrates connecting Flink with Infinispan by using {@link org.infinispan.hadoop.InfinispanInputFormat} and * {@link org.infinispan.hadoop.InfinispanOutputFormat}. *

Data is located in the cache "phrases" of Infinispan server with types <Long,String> and each entry has a * phrase with 1 or more words
* Output should be:

* HISTOGRAM: *

* 2 word phrases: ****** (6)
* 3 word phrases: ************ (12)
* 4 word phrases: ***************************************** (41)
* 5 word phrases: ********** (10)
*

*/ public class WordFrequency { public static void main(String[] args) throws Exception { if (args.length < 1) { System.err.println("Usage: WordFrequency "); System.exit(1); } // Configure the Infinispan InputFormat wrapping it in a Hadoop Job class. // In this sample, no writing to Infinispan will happen, so no need to configure // an InfinispanOutputFormat Configuration configuration = new Configuration(); configuration.set(InfinispanConfiguration.INPUT_REMOTE_CACHE_SERVER_LIST, args[0]); configuration.set(InfinispanConfiguration.INPUT_REMOTE_CACHE_NAME, "phrases"); Job job = Job.getInstance(configuration, "Infinispan Integration"); InfinispanInputFormat infinispanInputFormat = new InfinispanInputFormat<>(); // Obtain the Execution environment from Flink final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // Create a DataSource that reads data using the InfinispanInputFormat DataSource> infinispanDS = env.createInput( HadoopInputs.createHadoopInput(infinispanInputFormat, Integer.class, String.class, job) ); // Count entries long count = infinispanDS.count(); // Obtain the values from entries DataSet values = infinispanDS.map(entry -> entry.f1).returns(String.class); // Obtain phrase lengths (length,1) for the values DataSet> lengthsCount = values.flatMap(new FlatMapFunction>() { @Override public void flatMap(String s, Collector> collector) throws Exception { collector.collect(new Tuple2<>(s.split(" ").length, 1)); } }); // Create the histogram List> results = lengthsCount.groupBy(0).sum(1).collect(); // Format results and print to stdout printResults(count, results); } private static void printResults(long entries, List> results) { System.out.printf("TOTAL PHRASES ANALYZED: %d. 
HISTOGRAM:\n", entries); results.forEach(t -> { Integer wordNumber = t.f0; Integer count = t.f1; System.out.printf("%-3d word phrases:", wordNumber); IntStream.range(1, count).boxed().forEach(c -> System.out.print("*")); System.out.printf("(%d)\n", count); }); } }




// © 2015 - 2024 Weber Informatics LLC