All downloads are free. Search and download functionality uses the official Maven repository.

com.dataartisans.flinktraining.exercises.datastream_java.connectors.PopularPlacesToES Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2015 data Artisans GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dataartisans.flinktraining.exercises.datastream_java.connectors;

import com.dataartisans.flinktraining.exercises.datastream_java.datatypes.TaxiRide;
import com.dataartisans.flinktraining.exercises.datastream_java.basics.RideCleansing;
import com.dataartisans.flinktraining.exercises.datastream_java.sources.TaxiRideSource;
import com.dataartisans.flinktraining.exercises.datastream_java.utils.GeoUtils;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.elasticsearch2.ElasticsearchSink;
import org.apache.flink.streaming.connectors.elasticsearch2.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch2.RequestIndexer;
import org.apache.flink.util.Collector;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;

import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Java reference implementation for the "Popular Places" exercise of the Flink training
 * (http://dataartisans.github.io/flink-training).
 *
 * The task of the exercise is to identify every five minutes popular areas where many taxi rides
 * arrived or departed in the last 15 minutes.
 * The results are written into an Elasticsearch index.
 *
 * Parameters:
 * -input path-to-input-file
 *
 */
public class PopularPlacesToES {

	public static void main(String[] args) throws Exception {

		// read parameters
		ParameterTool params = ParameterTool.fromArgs(args);
		String input = params.getRequired("input");

		final int popThreshold = 20; // threshold for popular places
		final int maxEventDelay = 60; // events are out of order by max 60 seconds
		final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

		// set up streaming execution environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

		// start the data generator
		DataStream rides = env.addSource(
				new TaxiRideSource(input, maxEventDelay, servingSpeedFactor));

		// find popular places
		DataStream> popularPlaces = rides
				// remove all rides which are not within NYC
				.filter(new RideCleansing.NYCFilter())
				// match ride to grid cell and event type (start or end)
				.map(new GridCellMatcher())
				// partition by cell id and event type
				.keyBy(0, 1)
				// build sliding window
				.timeWindow(Time.minutes(15), Time.minutes(5))
				// count ride events in window
				.apply(new RideCounter())
				// filter by popularity threshold
				.filter(new FilterFunction>() {
					@Override
					public boolean filter(Tuple4 count) throws Exception {
						return count.f3 >= popThreshold;
					}
				})
		 		// map grid cell to coordinates
				.map(new GridToCoordinates());

		Map config = new HashMap<>();
		// This instructs the sink to emit after every element, otherwise they would be buffered
		config.put("bulk.flush.max.actions", "10");
		config.put("cluster.name", "elasticsearch");

		List transports = new ArrayList<>();
		transports.add(new InetSocketAddress(InetAddress.getByName("localhost"), 9300));

		popularPlaces.addSink(new ElasticsearchSink<>(
				config,
				transports,
				new PopularPlaceInserter()));

		// execute the transformation pipeline
		env.execute("Popular Places to Elasticsearch");
	}

	/**
	 * Inserts popular places into the "nyc-places" index.
	 */
	public static class PopularPlaceInserter
			implements ElasticsearchSinkFunction> {

		// construct index request
		@Override
		public void process(
				Tuple5 record,
				RuntimeContext ctx,
				RequestIndexer indexer) {

			// construct JSON document to index
			Map json = new HashMap<>();
			json.put("time", record.f2.toString());         // timestamp
			json.put("location", record.f1+","+record.f0);  // lat,lon pair
			json.put("isStart", record.f3.toString());      // isStart
			json.put("cnt", record.f4.toString());          // count

			IndexRequest rqst = Requests.indexRequest()
					.index("nyc-places")        // index name
					.type("popular-locations")  // mapping name
					.source(json);

			indexer.add(rqst);
		}
	}

	/**
	 * Maps taxi ride to grid cell and event type.
	 * Start records use departure location, end record use arrival location.
	 */
	public static class GridCellMatcher implements MapFunction> {

		@Override
		public Tuple2 map(TaxiRide taxiRide) throws Exception {
			return new Tuple2<>(
					GeoUtils.mapToGridCell(taxiRide.startLon, taxiRide.startLat),
					taxiRide.isStart
			);
		}
	}

	/**
	 * Counts the number of rides arriving or departing.
	 */
	public static class RideCounter implements WindowFunction<
			Tuple2,                // input type
			Tuple4, // output type
			Tuple,                                   // key type
			TimeWindow>                              // window type
	{

		@SuppressWarnings("unchecked")
		@Override
		public void apply(
				Tuple key,
				TimeWindow window,
				Iterable> gridCells,
				Collector> out) throws Exception {

			int cellId = ((Tuple2)key).f0;
			boolean isStart = ((Tuple2)key).f1;
			long windowTime = window.getEnd();

			int cnt = 0;
			for(Tuple2 c : gridCells) {
				cnt += 1;
			}

			out.collect(new Tuple4<>(cellId, windowTime, isStart, cnt));
		}
	}

	/**
	 * Maps the grid cell id back to longitude and latitude coordinates.
	 */
	public static class GridToCoordinates implements
			MapFunction, Tuple5> {

		@Override
		public Tuple5 map(
				Tuple4 cellCount) throws Exception {

			return new Tuple5<>(
					GeoUtils.getGridCellCenterLon(cellCount.f0),
					GeoUtils.getGridCellCenterLat(cellCount.f0),
					cellCount.f1,
					cellCount.f2,
					cellCount.f3);
		}
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy