/*
* Copyright (C) 2010, Google Inc.
* and other copyright owners as documented in the project's IP log.
*
* This program and the accompanying materials are made available
* under the terms of the Eclipse Distribution License v1.0 which
* accompanies this distribution, is reproduced below, and is
* available at http://www.eclipse.org/org/documents/edl-v10.php
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* - Neither the name of the Eclipse Foundation, Inc. nor the
* names of its contributors may be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.eclipse.jgit.pgm.debug;
import static java.lang.Integer.valueOf;
import static java.lang.Long.valueOf;
import java.io.File;
import java.lang.reflect.Field;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import org.eclipse.jgit.diff.RawText;
import org.eclipse.jgit.diff.RawTextComparator;
import org.eclipse.jgit.errors.LargeObjectException;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.MutableObjectId;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.lib.RepositoryBuilder;
import org.eclipse.jgit.lib.RepositoryCache;
import org.eclipse.jgit.pgm.TextBuiltin;
import org.eclipse.jgit.pgm.internal.CLIText;
import org.eclipse.jgit.revwalk.RevWalk;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.eclipse.jgit.util.FS;
import org.eclipse.jgit.util.NB;
import org.kohsuke.args4j.Option;
/**
* Scan repository to compute maximum number of collisions for hash functions.
*
* This is a test suite to help benchmark the collision rate of hash functions
* when applied to file contents in a Git repository. The test scans all text
* files in the HEAD revision of the repository it is run within. For each file
* it finds the unique lines, and then inserts those lines into a hash table to
* determine collision rates under the selected hash functions.
*
* To add another hash function to the test suite, declare a new instance
* member field of type {@link Hash} and implement the hashRegion method. The
* test suite will automatically pick up the new function through reflection
* (the {@code fnv1a} field below illustrates the pattern).
*
* To add another folding function (a method of squashing a 32-bit hash code
* into the hash table's smaller array index space), declare a new instance
* field of type {@link Fold} and implement the logic. The test suite will
* automatically pick up the new function through reflection (see the
* {@code xor_fold} field below).
*/
class TextHashFunctions extends TextBuiltin {
/** Standard SHA-1 on the line, using the first 4 bytes as the hash code. */
final Hash sha1 = new Hash() {
private final MessageDigest md = Constants.newMessageDigest();
@Override
protected int hashRegion(byte[] raw, int ptr, int end) {
md.reset();
md.update(raw, ptr, end - ptr);
return NB.decodeInt32(md.digest(), 0);
}
};
/** Professor Daniel J. Bernstein's rather popular string hash. */
final Hash djb = new Hash() {
@Override
protected int hashRegion(byte[] raw, int ptr, int end) {
int hash = 5381;
for (; ptr < end; ptr++)
hash = ((hash << 5) + hash) + (raw[ptr] & 0xff);
return hash;
}
};
/** Hash function commonly used by java.lang.String. */
final Hash string_hash31 = new Hash() {
@Override
protected int hashRegion(byte[] raw, int ptr, int end) {
int hash = 0;
for (; ptr < end; ptr++)
hash = 31 * hash + (raw[ptr] & 0xff);
return hash;
}
};
/** The Rabin polynomial hash that is used by our own DeltaIndex. */
final Hash rabin_DeltaIndex = new Hash() {
private final byte[] buf16 = new byte[16];
@Override
protected int hashRegion(byte[] raw, int ptr, int end) {
if (end - ptr < 16) {
Arrays.fill(buf16, (byte) 0);
System.arraycopy(raw, ptr, buf16, 0, end - ptr);
return rabin(buf16, 0);
} else {
return rabin(raw, ptr);
}
}
private int rabin(byte[] raw, int ptr) {
int hash;
// The first 4 steps collapse out into a 4 byte big-endian decode,
// with a larger right shift as we combined shift lefts together.
//
hash = ((raw[ptr] & 0xff) << 24) //
| ((raw[ptr + 1] & 0xff) << 16) //
| ((raw[ptr + 2] & 0xff) << 8) //
| (raw[ptr + 3] & 0xff);
hash ^= T[hash >>> 31];
hash = ((hash << 8) | (raw[ptr + 4] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 5] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 6] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 7] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 8] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 9] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 10] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 11] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 12] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 13] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 14] & 0xff)) ^ T[hash >>> 23];
hash = ((hash << 8) | (raw[ptr + 15] & 0xff)) ^ T[hash >>> 23];
return hash;
}
private final int[] T = { 0x00000000, 0xd4c6b32d, 0x7d4bd577,
0xa98d665a, 0x2e5119c3, 0xfa97aaee, 0x531accb4, 0x87dc7f99,
0x5ca23386, 0x886480ab, 0x21e9e6f1, 0xf52f55dc, 0x72f32a45,
0xa6359968, 0x0fb8ff32, 0xdb7e4c1f, 0x6d82d421, 0xb944670c,
0x10c90156, 0xc40fb27b, 0x43d3cde2, 0x97157ecf, 0x3e981895,
0xea5eabb8, 0x3120e7a7, 0xe5e6548a, 0x4c6b32d0, 0x98ad81fd,
0x1f71fe64, 0xcbb74d49, 0x623a2b13, 0xb6fc983e, 0x0fc31b6f,
0xdb05a842, 0x7288ce18, 0xa64e7d35, 0x219202ac, 0xf554b181,
0x5cd9d7db, 0x881f64f6, 0x536128e9, 0x87a79bc4, 0x2e2afd9e,
0xfaec4eb3, 0x7d30312a, 0xa9f68207, 0x007be45d, 0xd4bd5770,
0x6241cf4e, 0xb6877c63, 0x1f0a1a39, 0xcbcca914, 0x4c10d68d,
0x98d665a0, 0x315b03fa, 0xe59db0d7, 0x3ee3fcc8, 0xea254fe5,
0x43a829bf, 0x976e9a92, 0x10b2e50b, 0xc4745626, 0x6df9307c,
0xb93f8351, 0x1f8636de, 0xcb4085f3, 0x62cde3a9, 0xb60b5084,
0x31d72f1d, 0xe5119c30, 0x4c9cfa6a, 0x985a4947, 0x43240558,
0x97e2b675, 0x3e6fd02f, 0xeaa96302, 0x6d751c9b, 0xb9b3afb6,
0x103ec9ec, 0xc4f87ac1, 0x7204e2ff, 0xa6c251d2, 0x0f4f3788,
0xdb8984a5, 0x5c55fb3c, 0x88934811, 0x211e2e4b, 0xf5d89d66,
0x2ea6d179, 0xfa606254, 0x53ed040e, 0x872bb723, 0x00f7c8ba,
0xd4317b97, 0x7dbc1dcd, 0xa97aaee0, 0x10452db1, 0xc4839e9c,
0x6d0ef8c6, 0xb9c84beb, 0x3e143472, 0xead2875f, 0x435fe105,
0x97995228, 0x4ce71e37, 0x9821ad1a, 0x31accb40, 0xe56a786d,
0x62b607f4, 0xb670b4d9, 0x1ffdd283, 0xcb3b61ae, 0x7dc7f990,
0xa9014abd, 0x008c2ce7, 0xd44a9fca, 0x5396e053, 0x8750537e,
0x2edd3524, 0xfa1b8609, 0x2165ca16, 0xf5a3793b, 0x5c2e1f61,
0x88e8ac4c, 0x0f34d3d5, 0xdbf260f8, 0x727f06a2, 0xa6b9b58f,
0x3f0c6dbc, 0xebcade91, 0x4247b8cb, 0x96810be6, 0x115d747f,
0xc59bc752, 0x6c16a108, 0xb8d01225, 0x63ae5e3a, 0xb768ed17,
0x1ee58b4d, 0xca233860, 0x4dff47f9, 0x9939f4d4, 0x30b4928e,
0xe47221a3, 0x528eb99d, 0x86480ab0, 0x2fc56cea, 0xfb03dfc7,
0x7cdfa05e, 0xa8191373, 0x01947529, 0xd552c604, 0x0e2c8a1b,
0xdaea3936, 0x73675f6c, 0xa7a1ec41, 0x207d93d8, 0xf4bb20f5,
0x5d3646af, 0x89f0f582, 0x30cf76d3, 0xe409c5fe, 0x4d84a3a4,
0x99421089, 0x1e9e6f10, 0xca58dc3d, 0x63d5ba67, 0xb713094a,
0x6c6d4555, 0xb8abf678, 0x11269022, 0xc5e0230f, 0x423c5c96,
0x96faefbb, 0x3f7789e1, 0xebb13acc, 0x5d4da2f2, 0x898b11df,
0x20067785, 0xf4c0c4a8, 0x731cbb31, 0xa7da081c, 0x0e576e46,
0xda91dd6b, 0x01ef9174, 0xd5292259, 0x7ca44403, 0xa862f72e,
0x2fbe88b7, 0xfb783b9a, 0x52f55dc0, 0x8633eeed, 0x208a5b62,
0xf44ce84f, 0x5dc18e15, 0x89073d38, 0x0edb42a1, 0xda1df18c,
0x739097d6, 0xa75624fb, 0x7c2868e4, 0xa8eedbc9, 0x0163bd93,
0xd5a50ebe, 0x52797127, 0x86bfc20a, 0x2f32a450, 0xfbf4177d,
0x4d088f43, 0x99ce3c6e, 0x30435a34, 0xe485e919, 0x63599680,
0xb79f25ad, 0x1e1243f7, 0xcad4f0da, 0x11aabcc5, 0xc56c0fe8,
0x6ce169b2, 0xb827da9f, 0x3ffba506, 0xeb3d162b, 0x42b07071,
0x9676c35c, 0x2f49400d, 0xfb8ff320, 0x5202957a, 0x86c42657,
0x011859ce, 0xd5deeae3, 0x7c538cb9, 0xa8953f94, 0x73eb738b,
0xa72dc0a6, 0x0ea0a6fc, 0xda6615d1, 0x5dba6a48, 0x897cd965,
0x20f1bf3f, 0xf4370c12, 0x42cb942c, 0x960d2701, 0x3f80415b,
0xeb46f276, 0x6c9a8def, 0xb85c3ec2, 0x11d15898, 0xc517ebb5,
0x1e69a7aa, 0xcaaf1487, 0x632272dd, 0xb7e4c1f0, 0x3038be69,
0xe4fe0d44, 0x4d736b1e, 0x99b5d833 };
};
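/**
 * FNV-1a, included purely as an illustration of the extension pattern
 * described in the class comment; it is not part of the original suite.
 * Like the other Hash fields, init() discovers it through reflection.
 */
final Hash fnv1a = new Hash() {
	@Override
	protected int hashRegion(byte[] raw, int ptr, int end) {
		int hash = 0x811c9dc5; // FNV offset basis (32-bit)
		for (; ptr < end; ptr++) {
			hash ^= raw[ptr] & 0xff;
			hash *= 0x01000193; // FNV prime (32-bit)
		}
		return hash;
	}
};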
/** Bitwise-and to extract only the low bits. */
final Fold truncate = new Fold() {
@Override
public int fold(int hash, int bits) {
return hash & ((1 << bits) - 1);
}
};
/** Applies the golden ratio and takes the upper bits. */
final Fold golden_ratio = new Fold() {
@Override
public int fold(int hash, int bits) {
/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
return (hash * 0x9e370001) >>> (32 - bits);
}
};
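/**
 * XOR-folding: mixes the next-higher group of bits onto the low bits before
 * masking. Included purely as an illustration of declaring a new fold
 * function (see the class comment); it is not part of the original suite.
 */
final Fold xor_fold = new Fold() {
	@Override
	public int fold(int hash, int bits) {
		return (hash ^ (hash >>> bits)) & ((1 << bits) - 1);
	}
};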
// -----------------------------------------------------------------------
//
// Implementation of the suite lives below this line.
//
//
@Option(name = "--hash", multiValued = true, metaVar = "NAME", usage = "Enable hash function(s)")
List<String> hashFunctions = new ArrayList<String>();
@Option(name = "--fold", multiValued = true, metaVar = "NAME", usage = "Enable fold function(s)")
List<String> foldFunctions = new ArrayList<String>();
@Option(name = "--text-limit", metaVar = "LIMIT", usage = "Maximum size in KiB to scan")
int textLimit = 15 * 1024; // 15 MiB as later we do * 1024.
@Option(name = "--repository", aliases = { "-r" }, multiValued = true, metaVar = "GIT_DIR", usage = "Repository to scan")
List<File> gitDirs = new ArrayList<File>();
@Override
protected boolean requiresRepository() {
return false;
}
@Override
protected void run() throws Exception {
if (gitDirs.isEmpty()) {
RepositoryBuilder rb = new RepositoryBuilder() //
.setGitDir(new File(gitdir)) //
.readEnvironment() //
.findGitDir();
if (rb.getGitDir() == null)
throw die(CLIText.get().cantFindGitDirectory);
gitDirs.add(rb.getGitDir());
}
for (File dir : gitDirs) {
RepositoryBuilder rb = new RepositoryBuilder();
if (RepositoryCache.FileKey.isGitRepository(dir, FS.DETECTED))
rb.setGitDir(dir);
else
rb.findGitDir(dir);
Repository db = rb.build();
try {
run(db);
} finally {
db.close();
}
}
}
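/**
 * Scans one repository: walks all text files reachable from HEAD, collects
 * each file's unique lines, feeds them through every enabled (hash, fold)
 * pair, and reports the worst chain length observed for each pair.
 */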
private void run(Repository db) throws Exception {
List<Function> all = init();
long fileCnt = 0;
long lineCnt = 0;
ObjectReader or = db.newObjectReader();
try {
final MutableObjectId id = new MutableObjectId();
RevWalk rw = new RevWalk(or);
TreeWalk tw = new TreeWalk(or);
tw.reset(rw.parseTree(db.resolve(Constants.HEAD)));
tw.setRecursive(true);
while (tw.next()) {
FileMode fm = tw.getFileMode(0);
if (!FileMode.REGULAR_FILE.equals(fm)
&& !FileMode.EXECUTABLE_FILE.equals(fm))
continue;
byte[] raw;
try {
tw.getObjectId(id, 0);
raw = or.open(id).getCachedBytes(textLimit * 1024);
} catch (LargeObjectException tooBig) {
continue;
}
if (RawText.isBinary(raw))
continue;
RawText txt = new RawText(raw);
int[] lines = new int[txt.size()];
int cnt = 0;
HashSet<Line> u = new HashSet<Line>();
for (int i = 0; i < txt.size(); i++) {
if (u.add(new Line(txt, i)))
lines[cnt++] = i;
}
fileCnt++;
lineCnt += cnt;
for (Function fun : all)
testOne(fun, txt, lines, cnt);
}
} finally {
or.release();
}
if (db.getDirectory() != null) {
String name = db.getDirectory().getName();
File parent = db.getDirectory().getParentFile();
if (name.equals(Constants.DOT_GIT) && parent != null)
name = parent.getName();
outw.println(name + ":"); //$NON-NLS-1$
}
outw.format(" %6d files; %5d avg. unique lines/file\n", //
valueOf(fileCnt), //
valueOf(lineCnt / fileCnt));
outw.format("%-20s %-15s %9s\n", "Hash", "Fold", "Max Len");
outw.println("-----------------------------------------------"); //$NON-NLS-1$
String lastHashName = null;
for (Function fun : all) {
String hashName = fun.hash.name;
if (hashName.equals(lastHashName))
hashName = ""; //$NON-NLS-1$
outw.format("%-20s %-15s %9d\n", // //$NON-NLS-1$
hashName, //
fun.fold.name, //
valueOf(fun.maxChainLength));
lastHashName = fun.hash.name;
}
outw.println();
outw.flush();
}
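/**
 * Simulates a hash table of 2^bits buckets: each unique line is hashed,
 * folded into a bucket index, and counted; the longest resulting bucket
 * chain is the collision measure recorded for this function.
 */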
private static void testOne(Function fun, RawText txt, int[] elements,
int cnt) {
final Hash cmp = fun.hash;
final Fold fold = fun.fold;
final int bits = tableBits(cnt);
final int[] buckets = new int[1 << bits];
for (int i = 0; i < cnt; i++)
buckets[fold.fold(cmp.hash(txt, elements[i]), bits)]++;
int maxChainLength = 0;
for (int i = 0; i < buckets.length; i++)
maxChainLength = Math.max(maxChainLength, buckets[i]);
fun.maxChainLength = Math.max(fun.maxChainLength, maxChainLength);
}
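/**
 * Uses reflection to find every Hash and Fold field declared above, names
 * each after its field name, and builds the cross product of the functions
 * selected by --hash and --fold (all of them by default).
 */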
private List<Function> init() {
List<Hash> hashes = new ArrayList<Hash>();
List<Fold> folds = new ArrayList<Fold>();
try {
for (Field f : TextHashFunctions.class.getDeclaredFields()) {
if (f.getType() == Hash.class) {
f.setAccessible(true);
Hash cmp = (Hash) f.get(this);
cmp.name = f.getName();
hashes.add(cmp);
} else if (f.getType() == Fold.class) {
f.setAccessible(true);
Fold fold = (Fold) f.get(this);
fold.name = f.getName();
folds.add(fold);
}
}
} catch (IllegalArgumentException e) {
throw new RuntimeException("Cannot determine names", e);
} catch (IllegalAccessException e) {
throw new RuntimeException("Cannot determine names", e);
}
List<Function> all = new ArrayList<Function>();
for (Hash cmp : hashes) {
if (include(cmp.name, hashFunctions)) {
for (Fold f : folds) {
if (include(f.name, foldFunctions)) {
all.add(new Function(cmp, f));
}
}
}
}
return all;
}
private static boolean include(String name, List<String> want) {
if (want.isEmpty())
return true;
for (String s : want) {
if (s.equalsIgnoreCase(name))
return true;
}
return false;
}
private static class Function {
final Hash hash;
final Fold fold;
int maxChainLength;
Function(Hash cmp, Fold fold) {
this.hash = cmp;
this.fold = fold;
}
}
/** Base class for any hashCode function to be tested. */
private static abstract class Hash extends RawTextComparator {
String name;
@Override
public boolean equals(RawText a, int ai, RawText b, int bi) {
return RawTextComparator.DEFAULT.equals(a, ai, b, bi);
}
}
/** Base class for any hashCode folding function to be tested. */
private static abstract class Fold {
String name;
/**
* Fold the given 32-bit hash code into only {@code bits} of space.
*
* @param hash
* the 32-bit hash code to be folded into a smaller value.
* @param bits
* total number of bits that can appear in the output. The
* output value must be in the range {@code [0, 1 << bits)}.
* When bits = 2, valid outputs are 0, 1, 2, 3.
* @return the folded hash, squeezed into only {@code bits}.
*/
abstract int fold(int hash, int bits);
}
/** Utility to help us identify unique lines in a file. */
private class Line {
private final RawText txt;
private final int pos;
Line(RawText txt, int pos) {
this.txt = txt;
this.pos = pos;
}
@Override
public int hashCode() {
return RawTextComparator.DEFAULT.hash(txt, pos);
}
@Override
public boolean equals(Object obj) {
if (obj instanceof Line) {
Line e = (Line) obj;
return RawTextComparator.DEFAULT.equals(txt, pos, e.txt, e.pos);
}
return false;
}
}
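/** Smallest number of bits such that {@code 1 << bits >= sz} (minimum 1). */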
private static int tableBits(final int sz) {
int bits = 31 - Integer.numberOfLeadingZeros(sz);
if (bits == 0)
bits = 1;
if (1 << bits < sz)
bits++;
return bits;
}
}