com.inin.analytics.elasticsearch.IndexingPostProcessor

package com.inin.analytics.elasticsearch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.inin.analytics.elasticsearch.transport.BaseTransport;
import com.inin.analytics.elasticsearch.transport.SnapshotTransportStrategy;

public class IndexingPostProcessor {
	private static final Logger logger = LoggerFactory.getLogger(IndexingPostProcessor.class);
	
	/**
	 * The job output in HDFS is just a manifest of the indices generated by the job. Why? S3 is eventually consistent in some
	 * regions, which means that listing the indices this job just generated might miss some. Instead, the job spits out tiny
	 * manifests. This method merges them together, de-dupes them, and if any shards didn't get generated because they had no
	 * data, it puts a placeholder empty shard in their place to satisfy Elasticsearch.
	 * 
	 * @param jobOutput HDFS directory holding the per-reducer manifest files
	 * @param manifestFile HDFS path where the merged, de-duplicated manifest is written
	 * @throws IOException
	 * @throws IllegalAccessException 
	 * @throws InstantiationException 
	 */
	public void execute(Path jobOutput, Path manifestFile, String scratchDir, int numShardsPerIndex, Configuration conf, Class<? extends BaseESReducer> reducerClass) throws IOException, InstantiationException, IllegalAccessException {
		FileSystem fs = FileSystem.get(conf);
		ESEmbededContainer esEmbededContainer = null;
		boolean rootManifestUploaded = false;
		try{
			Map<String, Integer> numShardsGenerated = new HashMap<>();

			// Each reducer spits out its own manifest file; merge them all into one file
			FileUtil.copyMerge(fs, jobOutput, fs, manifestFile, false, conf, "");

			// Read the merged file, de-duping entries as it reads
			BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(manifestFile)));
			Set<String> indices = new HashSet<>();
			String line = br.readLine();
			while (line != null) {
				indices.add(line);
				int count = numShardsGenerated.containsKey(line) ? numShardsGenerated.get(line) : 0;
				numShardsGenerated.put(line, count + 1);
				line = br.readLine();
			}
			br.close();

			File scratch = new File(scratchDir);
			if(!scratch.exists()) {
				// Make the dir if it doesn't exist
				scratch.mkdirs();	
			} else {
				FileUtils.deleteDirectory(scratch);
				scratch.mkdirs();
			}
			
			esEmbededContainer = getESEmbededContainer(conf, reducerClass);

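			// The corrected manifest is staged on local disk first, then copied back over the HDFS manifest below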
			String scratchFile = scratchDir + "manifest";
			PrintWriter writer = new PrintWriter(scratchFile, "UTF-8");
			
			// Create all the indexes
			for(String index : indices) {
				esEmbededContainer.getNode().client().admin().indices().prepareCreate(index).get();
			}

			// Snapshot it
			List<String> indexesToSnapshot = new ArrayList<>(indices);
			esEmbededContainer.snapshot(indexesToSnapshot, BaseESReducer.SNAPSHOT_NAME, conf.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString()), null);
			
			for(String index : indices) {
				try{
					placeMissingIndexes(BaseESReducer.SNAPSHOT_NAME, esEmbededContainer, conf, index, !rootManifestUploaded);
					// The root-level manifest is the same for each index, so it only needs to be uploaded once
					rootManifestUploaded = true;
				} catch (FileNotFoundException e) {
					logger.error("Unable to include index " + index + " in the manifest because missing shards could not be generated", e);
					continue;
				}

				// Re-write the manifest to local disk
				writer.println(index);	
			}
			
			// Clean up the indexes from the embedded instance
			for(String index : indices) {
				esEmbededContainer.getNode().client().admin().indices().prepareDelete(index).execute();	
			}

			writer.close();

			// Move the manifest onto HDFS
			fs.copyFromLocalFile(new Path(scratchFile), manifestFile);
		} finally {
			if(esEmbededContainer != null) {
				esEmbededContainer.getNode().close();
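				// Spin until the embedded node reports that it has fully shut down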
				while(!esEmbededContainer.getNode().isClosed());
			}
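			// Remove the local snapshot working directory now that its contents have been shipped off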
			FileUtils.deleteDirectory(new File(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString())));
		}
	}

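	/**
	 * Places empty placeholder shards, via the configured snapshot transport, for any shards of the given index
	 * that produced no data, so a restore sees the full shard count; optionally includes the root-level manifest.
	 */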
	public void placeMissingIndexes(String snapshotName, ESEmbededContainer esEmbededContainer, Configuration conf, String index, boolean includeRootManifest) throws IOException {
		BaseTransport transport = SnapshotTransportStrategy.get(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()), conf.get(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString()));
		transport.placeMissingShards(snapshotName, index, conf.getInt(ConfigParams.NUM_SHARDS_PER_INDEX.toString(), 5), includeRootManifest);			
	}

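	/**
	 * Builds an embedded Elasticsearch container configured from the job settings, pre-loading the index
	 * template (if any) supplied by the reducer class.
	 */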
	private ESEmbededContainer getESEmbededContainer(Configuration conf, Class<? extends BaseESReducer> reducerClass) throws IOException, InstantiationException, IllegalAccessException {
		ESEmbededContainer esEmbededContainer = null;
		BaseESReducer red = reducerClass.newInstance();
		String templateName = red.getTemplateName();
		String templateJson = red.getTemplate();
		red.close();
		
		ESEmbededContainer.Builder builder = new ESEmbededContainer.Builder()
				.withNodeName("embededESTempLoaderNode")
				.withInMemoryBackedIndexes(true)
				.withWorkingDir(conf.get(ConfigParams.ES_WORKING_DIR.toString()))
				.withClusterName("bulkLoadPartition")
				.withNumShardsPerIndex(conf.getInt(ConfigParams.NUM_SHARDS_PER_INDEX.toString(), 5))
				.withSnapshotWorkingLocation(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()))
				.withSnapshotRepoName(conf.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString()));
		
		if(templateName != null && templateJson != null) {
			builder.withTemplate(templateName, templateJson);	
		}
		
		esEmbededContainer = builder.build();
		return esEmbededContainer;
	}
}
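
For context, the sketch below shows how a job driver might hand off to IndexingPostProcessor once the reduce phase has written its per-reducer manifests. It is a minimal, hypothetical example: MyESReducer (a BaseESReducer subclass), the paths, bucket, and repository names are placeholders, not part of this artifact.

package com.inin.analytics.elasticsearch;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

// Hypothetical driver sketch, not part of this artifact.
public class BulkLoadDriverSketch {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		// Settings read by IndexingPostProcessor and ESEmbededContainer via ConfigParams
		conf.set(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString(), "/tmp/es_snapshot_working/");
		conf.set(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString(), "s3://my-bucket/elasticsearch/snapshots/");
		conf.set(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString(), "bulkload_repo");
		conf.set(ConfigParams.ES_WORKING_DIR.toString(), "/tmp/es_working/");
		conf.setInt(ConfigParams.NUM_SHARDS_PER_INDEX.toString(), 5);

		// ... run the MapReduce job whose reducers (MyESReducer) write tiny manifest files to jobOutput ...

		Path jobOutput = new Path("/user/etl/es-bulkload/output/");
		Path manifestFile = new Path("/user/etl/es-bulkload/manifest");

		// Merge the per-reducer manifests, back-fill empty shards, and publish the final manifest
		new IndexingPostProcessor().execute(jobOutput, manifestFile, "/tmp/es_scratch/", 5, conf, MyESReducer.class);
	}
}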



