// org.apache.flink.streaming.scala.examples.wordcount.WordCount.scala (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.scala.examples.wordcount
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.examples.wordcount.util.WordCountData
/**
 * Implements the "WordCount" program that computes a simple word occurrence
 * histogram over text files in a streaming fashion.
 *
 * The input is a plain text file with lines separated by newline characters.
 *
 * Usage:
 * {{{
 * WordCount --input <path> --output <path>
 * }}}
 *
 * If no parameters are provided, the program is run with default data from
 * [[WordCountData]].
 *
 * This example shows how to:
 *
 *  - write a simple Flink Streaming program,
 *  - use tuple data types,
 *  - write and use transformation functions.
 */
object WordCount {

  /**
   * Program entry point: assembles the streaming word-count pipeline and executes it.
   *
   * @param args command-line arguments parsed with `ParameterTool.fromArgs`.
   *             `--input <path>` selects a text file to read; without it the lines in
   *             `WordCountData.WORDS` are used. `--output <path>` writes the counts as
   *             text to that path; without it the counts are printed.
   */
  def main(args: Array[String]): Unit = {

    // Checking input parameters
    val params = ParameterTool.fromArgs(args)

    // set up the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // make parameters available in the web interface
    env.getConfig.setGlobalJobParameters(params)

    // get input data
    val text: DataStream[String] =
      if (params.has("input")) {
        // read the text file from given input path
        env.readTextFile(params.get("input"))
      } else {
        println("Executing WordCount example with default inputs data set.")
        println("Use --input to specify file input.")
        // get default test text data
        env.fromElements(WordCountData.WORDS: _*)
      }

    val counts: DataStream[(String, Int)] = text
      // Split each line into lower-cased words. Locale.ROOT keeps the lower-casing
      // independent of the JVM's default locale, so the same input always yields
      // the same words regardless of where the job runs.
      .flatMap(_.toLowerCase(java.util.Locale.ROOT).split("\\W+"))
      // split() can yield an empty leading token (e.g. a line starting with punctuation)
      .filter(_.nonEmpty)
      // pair every word with an initial count: (word, 1)
      .map((_, 1))
      // Group by the word itself using a typed key selector (the positional
      // `keyBy(0)` form is deprecated in newer Flink releases), then sum up
      // tuple field "1", the count.
      .keyBy(_._1)
      .sum(1)

    // emit result
    if (params.has("output")) {
      counts.writeAsText(params.get("output"))
    } else {
      println("Printing result to stdout. Use --output to specify output path.")
      counts.print()
    }

    // execute program
    env.execute("Streaming WordCount")
  }
}