com.inin.analytics.elasticsearch.index.routing.ElasticsearchRoutingStrategyV1 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch-lambda Show documentation
Framework For Lambda Architecture on Elasticsearch
There is a newer version: 1.2.1
package com.inin.analytics.elasticsearch.index.routing;

import java.util.HashMap;
import java.util.Map;

import com.google.common.base.Preconditions;
import com.inin.analytics.elasticsearch.index.rotation.ElasticSearchIndexMetadata;
import com.inin.analytics.elasticsearch.util.MurmurHash;


/**
 * This routing strategy for elasticsearch. Read up on what routing does here
 * http://www.elasticsearch.org/blog/customizing-your-document-routing/ 
 * 
 * Perhaps you have 10 shards per index and you don't wish to query every shard 
 * every time you do a search against an org. A simple sharding strategy would
 * put all the data for 1 org on 1 shard using consistant hashing on orgId. However that 
 * has the potential to hotspot some shards if an org with a lot of data comes
 * through.
 * 
 * This attempts to alleviate that by making the # subset of shards configurable. EG
 * numShards = 10, numShardsPerOrg = 3, all of an org's data will be split to one of 
 * 3 shards. Which one of the 3 is determined by hashing the conversationId.
 *  
 * Note: DO NOT CHANGE THIS CLASS. It's immutable once it's been used to generate ES indexes
 * so changing it affects data routing and will make data appear unavailable b/c its looking
 * in the wrong shard. The correct thing to do is to make a newer version of this class,
 * say ElasticsearchRoutingStrategyV2 and see to it that the hadoop jobs to rebuild
 * the ES indexes not only use it, but update zookeeper with which implementation
 * indexes were built with. That way you can evolve the routing strategy without breaking
 * anything.
 * 
 * @author drew
 *
 */
public class ElasticsearchRoutingStrategyV1 implements ElasticsearchRoutingStrategy, java.io.Serializable {
	private static final long serialVersionUID = 1L;
	private int numShardsPerOrg;
	private int numShards;
	private Map shardToRout = new HashMap<>();

	/**
	 * Adapted from DjbHashFunction & PlainOperationRouting in Elasticsearch. This is the default hashing 
	 * Algorithm for doc routing. We need this to reverse engineer routing strings that rout to
	 * the shard we want. 
	 * 
	 * @param value
	 * @return
	 */
	public int hash(String value) {
		long hash = 5381;
		for (int i = 0; i < value.length(); i++) {
			hash = ((hash << 5) + hash) + value.charAt(i);
		}

		return Math.abs((int) hash % numShards);
	}

	public void init() {
		Integer x = 0;
		while(shardToRout.size() < numShards) {
			Integer hash = hash(x.toString());
			if(shardToRout.get(x) == null) {
				shardToRout.put(x, hash);
			}
			x++;
		}
	}

	public ElasticsearchRoutingStrategyV1() {

	}

	@Override
	public void configure(ElasticSearchIndexMetadata rotatedIndexMetadata) {
		Preconditions.checkNotNull(rotatedIndexMetadata.getNumShardsPerOrg(), "Num shards per org must not be null with " + this.getClass().getSimpleName());
		Preconditions.checkNotNull(rotatedIndexMetadata.getNumShards(), "Num shards must not be null with " + this.getClass().getSimpleName());
		this.numShardsPerOrg = rotatedIndexMetadata.getNumShardsPerOrg();
		this.numShards = rotatedIndexMetadata.getNumShards();
		init();
	}

	@Override
	public int getNumShardsPerOrg() {
		return numShardsPerOrg;
	}

	@Override
	public int getNumShards() {
		return numShards;
	}
	

	public Map getShardToRout() {
		return shardToRout;
	}

	/**
	 * For an orgId & convId, get the shard routing for a document.  
	 * 
	 * Note: ES re-hashes routing values so shard 1 wont necessarily mean 
	 * your data ends up in shard 1. However, if you realize that 
	 * then you're in a bad place. 
	 * 
	 * 
	 * @param orgId
	 * @param convId
	 * @param numShards
	 * @param numShardsPerOrg
	 * @return
	 */

	@Override
	public String getRoutingHash(String orgId, String convId) {
		Preconditions.checkArgument(numShards >= numShardsPerOrg, "Misconfigured, numShards must be >= numShardsPerOrg");
		int orgIdHash = getOrgIdHash(orgId, numShards);
		int convIdHash = Math.abs(MurmurHash.getInstance().hash(convId.getBytes(), 0)) % numShardsPerOrg;

		int shard = orgIdHash + convIdHash;
		while(shard >= numShards) {
			shard -= numShards;
		}

		return shardToRout.get(shard).toString();
	}

	/**
	 * When searching data for an Org, you may desire to only search the shards
	 * which hold data for that Org. This gives you a list of possible shard routings.
	 * 
	 * @param orgId
	 * @param numShards
	 * @param numShardsPerOrg
	 * @return
	 */
	@Override
	public String[] getPossibleRoutingHashes(String orgId) {
		int orgIdHash = getOrgIdHash(orgId, numShards);
		String[] possibleShards = new String[numShardsPerOrg];
		for(int x = 0; x < numShardsPerOrg; x ++) {
			int shard = orgIdHash + x;
			while(shard >= numShards) {
				shard -= numShards;
			}
			possibleShards[x] = shardToRout.get(shard).toString();
		}
		return possibleShards;
	}

	private int getOrgIdHash(String orgId, int numShards) {
		return Math.abs(MurmurHash.getInstance().hash(orgId.getBytes(), 0)) % numShards;
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + numShards;
		result = prime * result + numShardsPerOrg;
		return result;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		ElasticsearchRoutingStrategyV1 other = (ElasticsearchRoutingStrategyV1) obj;
		if (numShards != other.numShards)
			return false;
		if (numShardsPerOrg != other.numShardsPerOrg)
			return false;
		return true;
	}

}