All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.dataartisans.flink.cascading.example.WordCount Maven / Gradle / Ivy

/*
 * Copyright 2015 data Artisans GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dataartisans.flink.cascading.example;

import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowDef;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.pipe.assembly.AggregateBy;
import cascading.pipe.assembly.CountBy;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import com.dataartisans.flink.cascading.FlinkConnector;


public class WordCount {
	public static void main(String[] args) {

		if (args.length < 2) {
			throw new IllegalArgumentException("Please specify input and ouput paths as arguments.");
		}

		Fields token = new Fields( "token", String.class );
		Fields text = new Fields( "text" );
		RegexSplitGenerator splitter = new RegexSplitGenerator( token, "\\s+" );
		// only returns "token"
		Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

		Pipe wcPipe = new Pipe( "wc", docPipe );
		wcPipe = new AggregateBy( wcPipe, token, new CountBy(new Fields("count")));

		Tap inTap = new Hfs(new TextDelimited(text, "\n" ), args[0]);
		Tap outTap = new Hfs(new TextDelimited(false, "\n"), args[1], SinkMode.REPLACE);

		FlowDef flowDef = FlowDef.flowDef().setName( "wc" )
				.addSource( docPipe, inTap )
				.addTailSink( wcPipe, outTap );

		FlowConnector flowConnector = new FlinkConnector();

		Flow wcFlow = flowConnector.connect( flowDef );

		wcFlow.complete();
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy