/*
 * elasticsearch-lambda: Framework for Lambda Architecture on Elasticsearch
 * com.inin.analytics.elasticsearch.IndexingPostProcessor
 */
package com.inin.analytics.elasticsearch;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.inin.analytics.elasticsearch.transport.BaseTransport;
import com.inin.analytics.elasticsearch.transport.SnapshotTransportStrategy;
public class IndexingPostProcessor {
    private static final Logger logger = LoggerFactory.getLogger(IndexingPostProcessor.class);
    /**
     * The job output in HDFS is just a manifest of the indices generated by the job. Why? S3 is eventually
     * consistent in some zones, which means that if you list the indices this job just generated, you might
     * miss some. Instead, the job spits out tiny manifests. This method merges them together, de-dupes them,
     * and, for any shard that didn't get generated because it had no data, puts an empty placeholder shard
     * in its place to satisfy Elasticsearch.
     *
     * @param jobOutput directory in HDFS holding the per-reducer manifest fragments
     * @param manifestFile destination for the merged, de-duplicated manifest
     * @param scratchDir local scratch directory used while building the manifest
     * @param numShardsPerIndex number of shards expected per index
     * @param conf Hadoop job configuration
     * @param reducerClass reducer implementation that supplies the index template
     * @throws IOException
     * @throws IllegalAccessException
     * @throws InstantiationException
     */
    public void execute(Path jobOutput, Path manifestFile, String scratchDir, int numShardsPerIndex, Configuration conf, Class<? extends BaseESReducer> reducerClass) throws IOException, InstantiationException, IllegalAccessException {
        FileSystem fs = FileSystem.get(conf);
        ESEmbededContainer esEmbededContainer = null;
        boolean rootManifestUploaded = false;
        try {
            // Tally how many manifest entries were seen for each index
            Map<String, Integer> numShardsGenerated = new HashMap<>();

            // Each reducer spits out its own manifest file; merge them all together into one file
            FileUtil.copyMerge(fs, jobOutput, fs, manifestFile, false, conf, "");

            // Read the merged file, de-duping entries as it reads
            Set<String> indices = new HashSet<>();
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(manifestFile)))) {
                String line = br.readLine();
                while (line != null) {
                    indices.add(line);
                    int count = numShardsGenerated.containsKey(line) ? numShardsGenerated.get(line) : 0;
                    numShardsGenerated.put(line, count + 1);
                    line = br.readLine();
                }
            }

            // Start from an empty scratch dir
            File scratch = new File(scratchDir);
            if (scratch.exists()) {
                FileUtils.deleteDirectory(scratch);
            }
            scratch.mkdirs();

            esEmbededContainer = getESEmbededContainer(conf, reducerClass);

            // NOTE: assumes scratchDir ends with a path separator
            String scratchFile = scratchDir + "manifest";
            PrintWriter writer = new PrintWriter(scratchFile, "UTF-8");

            // Create all the indexes
            for (String index : indices) {
                esEmbededContainer.getNode().client().admin().indices().prepareCreate(index).get();
            }

            // Snapshot them
            List<String> indexesToSnapshot = new ArrayList<>(indices);
            esEmbededContainer.snapshot(indexesToSnapshot, BaseESReducer.SNAPSHOT_NAME, conf.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString()), null);

            for (String index : indices) {
                try {
                    placeMissingIndexes(BaseESReducer.SNAPSHOT_NAME, esEmbededContainer, conf, index, !rootManifestUploaded);
                    // The root-level manifests are the same for every index, so they need only be uploaded once
                    rootManifestUploaded = true;
                } catch (FileNotFoundException e) {
                    logger.error("Unable to include index " + index + " in the manifest because missing shards could not be generated", e);
                    continue;
                }

                // Re-write the manifest to local disk
                writer.println(index);
            }

            // Clean up the indexes from the embedded instance, blocking until each delete completes
            for (String index : indices) {
                esEmbededContainer.getNode().client().admin().indices().prepareDelete(index).get();
            }
            writer.close();

            // Move the manifest onto HDFS
            fs.copyFromLocalFile(new Path(scratchFile), manifestFile);
        } finally {
            if (esEmbededContainer != null) {
                esEmbededContainer.getNode().close();
                // Spin until the embedded node has fully shut down
                while (!esEmbededContainer.getNode().isClosed());
            }
            FileUtils.deleteDirectory(new File(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString())));
        }
    }
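    /**
     * Places empty placeholder shards for any shards of the given index that produced no data, and
     * optionally uploads the root-level snapshot manifest, via the configured transport strategy.
     *
     * @param snapshotName name of the snapshot the index belongs to
     * @param esEmbededContainer the embedded Elasticsearch instance that holds the snapshot
     * @param conf Hadoop job configuration
     * @param index name of the index to fill in
     * @param includeRootManifest whether to also upload the root-level snapshot manifest
     * @throws IOException
     */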
    public void placeMissingIndexes(String snapshotName, ESEmbededContainer esEmbededContainer, Configuration conf, String index, boolean includeRootManifest) throws IOException {
        BaseTransport transport = SnapshotTransportStrategy.get(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()), conf.get(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString()));
        transport.placeMissingShards(snapshotName, index, conf.getInt(ConfigParams.NUM_SHARDS_PER_INDEX.toString(), 5), includeRootManifest);
    }
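    /**
     * Builds an embedded Elasticsearch container configured from the job settings, pre-loading the
     * index template supplied by the reducer class when one is provided.
     *
     * @param conf Hadoop job configuration
     * @param reducerClass reducer implementation that supplies the index template name and JSON
     */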
    private ESEmbededContainer getESEmbededContainer(Configuration conf, Class<? extends BaseESReducer> reducerClass) throws IOException, InstantiationException, IllegalAccessException {
        // Instantiate the reducer just long enough to pull the index template off of it
        BaseESReducer red = reducerClass.newInstance();
        String templateName = red.getTemplateName();
        String templateJson = red.getTemplate();
        red.close();

        ESEmbededContainer.Builder builder = new ESEmbededContainer.Builder()
                .withNodeName("embededESTempLoaderNode")
                .withInMemoryBackedIndexes(true)
                .withWorkingDir(conf.get(ConfigParams.ES_WORKING_DIR.toString()))
                .withClusterName("bulkLoadPartition")
                .withNumShardsPerIndex(conf.getInt(ConfigParams.NUM_SHARDS_PER_INDEX.toString(), 5))
                .withSnapshotWorkingLocation(conf.get(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString()))
                .withSnapshotRepoName(conf.get(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString()));

        if (templateName != null && templateJson != null) {
            builder.withTemplate(templateName, templateJson);
        }
        return builder.build();
    }
}
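For context, here is a minimal sketch of how a driver might invoke this post-processor once the indexing job has finished. The paths, configuration values, and the MyESReducer class below are illustrative assumptions, not part of this library:

// Hypothetical driver; paths, values, and MyESReducer are assumptions for illustration only.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class PostProcessDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Mirror the ConfigParams keys read by IndexingPostProcessor above; example values only.
        conf.set(ConfigParams.SNAPSHOT_WORKING_LOCATION_CONFIG_KEY.toString(), "/tmp/es_snapshot_working/");
        conf.set(ConfigParams.SNAPSHOT_FINAL_DESTINATION.toString(), "s3://my-bucket/snapshots/");
        conf.set(ConfigParams.SNAPSHOT_REPO_NAME_CONFIG_KEY.toString(), "snapshotRepo");
        conf.set(ConfigParams.ES_WORKING_DIR.toString(), "/tmp/es_working/");
        conf.setInt(ConfigParams.NUM_SHARDS_PER_INDEX.toString(), 5);

        new IndexingPostProcessor().execute(
                new Path("/job/output/"),            // per-reducer manifest fragments from the job
                new Path("/job/manifest"),           // merged, de-duplicated manifest destination
                "/tmp/scratch/",                     // local scratch dir (trailing separator expected)
                5,                                   // shards per index
                conf,
                MyESReducer.class);                  // a concrete BaseESReducer subclass (hypothetical)
    }
}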