All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.hash.PerfectHash Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Smile.  If not, see .
 ******************************************************************************/

package smile.hash;

import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * A perfect hash of an array of strings to their index in the array.
 * 

* A perfect hash function for a set S is a hash function * that maps distinct elements in S to a set of integers, * with no collisions. In mathematical terms, it is an injective function. *

* Perfect hash functions may be used to implement a lookup table with * constant worst-case access time. *

* A perfect hash function for a specific set S can be found by * a randomized algorithm in a number of operations that is proportional * to the size of S. The original construction of Fredman, * Komlós and Szemerédi (1984) chooses a large prime p * (larger than the size of the universe from which S is drawn), * and a parameter k, and maps each element x of * S to the index g(x) = (kx mod p) mod n. * * @author Haifeng Li */ public class PerfectHash implements Serializable { /** The keyword set. */ private String[] keywords; /** Hash table. */ private int[] table; /** The k parameters to calculate the hash. */ private int[] kvals; /** The lowest character value shown in the keywords. */ private char min; /** The character positions in keywords used to calculate the hash. */ private int[] select; /** Constructs the perfect hash of strings. */ public PerfectHash(String... keywords) { this(null, keywords); } /** * Constructs the perfect hash of strings. * @param select The character positions in keywords used to calculate the hash. */ public PerfectHash(int[] select, String... keywords) { if (keywords.length == 0) { throw new IllegalArgumentException("Empty string set"); } if (select != null) { this.select = Arrays.copyOf(select, select.length); Arrays.sort(this.select); } this.keywords = keywords; generate(keywords); } /** * Returns the index of a string. If the string * is in the set, returns its array index. Otherwise, -1. */ public int get(String key) { int i = hash(key); if (i < 0 || i >= table.length) return -1; int idx = table[i]; if (!key.equals(keywords[idx])) return -1; return idx; } /** Returns the hash code of a string. */ private int hash(String k) { int klen = k.length(); int out = klen; if (select == null) { for (int i = 0; i < klen; i++) { int c = k.charAt(i) - min; if (c < 0) return -1; if (c >= kvals.length) return -2; out += kvals[c]; } } else { for (int i : select) { if (i >= klen) continue; int c = k.charAt(i) - min; if (c < 0) return -1; if (c >= kvals.length) return -2; out += kvals[c]; } } return out; } /** Keyword information. */ private static class Key implements Comparable { /** selected characters in the string for hash computation. */ char[] ksig; /** original key length. */ int klen; /** the frequency of each character. */ int kfreq; /** the value of map. */ int value; public Key(char[] ksig, int klen, int value) { this.ksig = ksig; this.klen = klen; this.kfreq = 0; this.value = value; } @Override public int compareTo(Key b) { // sort in descending order return Integer.compare(b.kfreq, kfreq); } } /** Sorts an array according to freq. */ private void sort(char[] arr, int[] freq) { for (int i = 1; i < arr.length; i++) { for (int j = i; j > 0; j--) { if (freq[arr[j]] < freq[arr[j - 1]]) { char tmp = arr[j]; arr[j] = arr[j - 1]; arr[j - 1] = tmp; } else { break; } } } } /** Adds a key. */ private void add(Map map, String k, int v) { char[] ksig; int klen = k.length(); if (select == null) { ksig = k.toCharArray(); } else { ksig = new char[select.length]; int idx = 0; for (int i : select) { if (i >= klen) continue; ksig[idx++] = k.charAt(i); } if (idx < ksig.length) { ksig = Arrays.copyOf(ksig, idx); } } Key prev = map.put(k, new Key(ksig, klen, v)); if (prev != null) { throw new IllegalArgumentException(String.format("Duplicate key %s at %d and %d", k, prev.value, v)); } } /** Counts the character frequency across all string keys. */ private int[] countCharacterFrequency(Map map) { int[] freq = new int[Character.MAX_VALUE]; for (Key m : map.values()) { for (char c : m.ksig) freq[c]++; } return freq; } /** Sorts keys by character frequency. */ private Key[] sortKeys(Map map, int[] freq) { Key[] keys = new Key[map.size()]; int idx = 0; for (Key key : map.values()) { keys[idx++] = key; for (char c : key.ksig) { key.kfreq += freq[c]; } sort(key.ksig, freq); } Arrays.sort(keys); return keys; } /** Find a char in a which is not in b. */ private char diff(char[] a, char[] b) { OUTER: for (char _a : a) { for (char _b : b) { if (_a == _b) continue OUTER; } return _a; } throw new IllegalArgumentException(String.format("Failed to find disjoint union of keysigs: %s and %s", Arrays.toString(a), Arrays.toString(b))); } /** Used in key generation phase. */ private int hash(Key m) { int out = m.klen; for (char c : m.ksig) out += kvals[c - min]; return out; } private void resolve(Key x, Key y, int[] kvals, char min) { char c = diff(x.ksig, y.ksig); kvals[c - min]++; } /** Generates the perfect hash. */ private void generate(String[] keywords) { Map map = new HashMap<>(); for (int i = 0; i < keywords.length; i++) { add(map, keywords[i], i); } int[] freq = countCharacterFrequency(map); Key[] keys = sortKeys(map, freq); min = Character.MAX_VALUE; char max = 0; for (char i = 0; i < freq.length; i++) { if (freq[i] > 0) { min = i; break; } } for (char i = (char)(freq.length - 1); i >= 0; i--) { if (freq[i] > 0) { max = i; break; } } if (max < min) { throw new IllegalStateException("Failed to generate perfect hash. Possibly all empty keys."); } kvals = new int[max - min + 1]; int vsize = keys.length; int tsize = vsize + (vsize >> 1); Key[] used = new Key[tsize]; LOOP: for (;;) { for (Key key : keys) { int hash = hash(key); if (hash >= used.length) { tsize = hash + 1; used = new Key[tsize]; continue LOOP; } if (used[hash] != null) { resolve(key, used[hash], kvals, min); Arrays.fill(used, null); continue LOOP; } used[hash] = key; } table = new int[tsize]; Arrays.fill(table, -1); for (Map.Entry e : map.entrySet()) { Key m = e.getValue(); int hash = hash(m); if (table[hash] != -1) { throw new IllegalStateException("Failed to generate perfect hash."); } table[hash] = m.value; } return; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy