All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.almondtools.rexlex.stringsearch.WuManber Maven / Gradle / Ivy

package com.almondtools.rexlex.stringsearch;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import com.almondtools.rexlex.io.CharProvider;

public class WuManber implements StringSearchAlgorithm {

	private static final int SHIFT_SEED = 17;
	private static final int HASH_SEED = 23;
	private static final int SHIFT_SIZE = 255;
	private static final int HASH_SIZE = 127;

	private char minChar;
	private char maxChar;
	private int minLength;
	private int block;
	private int[] shift;
	private TrieRoot[] hash;

	public WuManber(List patterns) {
		List charpatterns = toCharArray(patterns);
		this.maxChar = computeMaxChar(charpatterns);
		this.minChar = computeMinChar(charpatterns);
		this.minLength = minLength(charpatterns);
		this.block = blockSize(minLength, minChar, maxChar, charpatterns.size());
		this.shift = computeShift(charpatterns, block, minLength);
		this.hash = computeHash(charpatterns, block);
	}

	private int minLength(List patterns) {
		int len = Integer.MAX_VALUE;
		for (char[] pattern : patterns) {
			if (pattern.length < len) {
				len = pattern.length;
			}
		}
		return len;
	}

	private static char computeMinChar(List patterns) {
		char min = Character.MAX_VALUE;
		for (char[] pattern : patterns) {
			for (int i = 0; i < pattern.length; i++) {
				if (pattern[i] < min) {
					min = pattern[i];
				}
			}
		}
		return min;
	}

	private static char computeMaxChar(List patterns) {
		char max = Character.MIN_VALUE;
		for (char[] pattern : patterns) {
			for (int i = 0; i < pattern.length; i++) {
				if (pattern[i] > max) {
					max = pattern[i];
				}
			}
		}
		return max;
	}

	private List toCharArray(List patterns) {
		List charpatterns = new ArrayList(patterns.size());
		for (String pattern : patterns) {
			charpatterns.add(pattern.toCharArray());
		}
		return charpatterns;
	}

	private static int blockSize(int minLength, char minChar, char maxChar, int patterns) {
		int optSize = (int) Math.ceil(Math.log(2 * minLength * patterns) / Math.log(maxChar - minChar));
		if (optSize > minLength) {
			return minLength;
		} else {
			return optSize;
		}
	}

	private static int[] computeShift(List patterns, int block, int minLength) {
		int[] shift = new int[SHIFT_SIZE];
		for (int i = 0; i < shift.length; i++) {
			shift[i] = minLength - block + 1;
		}
		List patternStrings = new ArrayList();
		Set blocks = new HashSet();
		for (char[] pattern : patterns) {
			patternStrings.add(new String(pattern));
			for (int i = 0; i < pattern.length + 1 - block; i++) {
				blocks.add(new String(Arrays.copyOfRange(pattern, i, i + block)));
			}
		}
		for (String currentBlock : blocks) {
			int shiftKey = shiftHash(currentBlock.toCharArray());
			int shiftBy = shift[shiftKey];
			for (String pattern : patternStrings) {
				int rightMost = pattern.length() - findRightMost(pattern, currentBlock) - block;
				if (rightMost >= 0 && rightMost < shiftBy) {
					shiftBy = rightMost;
				}
			}
			shift[shiftKey] = shiftBy;
		}
		return shift;
	}

	private static int findRightMost(String pattern, String block) {
		return pattern.lastIndexOf(block);
	}

	public static int shiftHash(char[] block) {
		int result = 1;
		for (char c : block) {
			result = SHIFT_SEED * result + c;
		}
		return result % SHIFT_SIZE;
	}

	private static TrieRoot[] computeHash(List charpatterns, int block) {
		TrieRoot[] hash = new TrieRoot[HASH_SIZE];
		for (char[] pattern : charpatterns) {
			char[] lastBlock = Arrays.copyOfRange(pattern, pattern.length - block, pattern.length);
			int hashKey = hashHash(lastBlock);
			TrieRoot trie = hash[hashKey];
			if (trie == null) {
				trie = new TrieRoot();
				hash[hashKey] = trie;
			}
			trie.extendReverse(pattern);
		}
		return hash;
	}

	public static int hashHash(char[] block) {
		int result = 1;
		for (char c : block) {
			result = HASH_SEED * result + c;
		}
		return result % HASH_SIZE;
	}

	@Override
	public StringFinder createFinder(CharProvider chars) {
		return new Finder(chars);
	}

	@Override
	public int getPatternLength() {
		return minLength;
	}

	private class Finder implements StringFinder {

		private CharProvider chars;
		private List buffer;

		public Finder(CharProvider chars) {
			this.chars = chars;
			this.buffer = new LinkedList();
		}

		@Override
		public void skipTo(int pos) {
			chars.move(pos);
		}

		@Override
		public StringMatch findNext() {
			if (!buffer.isEmpty()) {
				return buffer.remove(0);
			}
			int lookahead = minLength - 1;
			while (!chars.finished(lookahead)) {
				int pos = chars.current();
				char[] lastBlock = chars.between(pos + minLength - block, pos + minLength);
				int shiftKey = shiftHash(lastBlock);
				int shiftBy = shift[shiftKey];
				if (shiftBy == 0) {
					int hashkey = hashHash(lastBlock);
					Trie node = hash[hashkey];
					int patternPointer = lookahead;
					node = node.nextNode(chars.lookahead(patternPointer));
					while (node != null) {
						if (node.isTerminal()) {
							buffer.add(createMatch(patternPointer));
						}
						patternPointer--;
						if (pos + patternPointer < 0) {
							break;
						}
						node = node.nextNode(chars.lookahead(patternPointer));
					}
					chars.next();
					if (!buffer.isEmpty()) {
						return buffer.remove(0);
					}
				} else {
					chars.forward(shiftBy);
				}
			}
			return null;
		}

		@Override
		public List findAll() {
			List matches = new ArrayList();
			while (true) {
				StringMatch match = findNext();
				if (match == null) {
					return matches;
				} else {
					matches.add(match);
				}
			}
		}

		private StringMatch createMatch(int patternPointer) {
			int start = chars.current() + patternPointer;
			int end = chars.current() + minLength;
			String s = chars.slice(start, end);
			return new StringMatch(start, end, s);
		}
	}

	public static class Factory implements MultiWordSearchAlgorithmFactory {

		@Override
		public StringSearchAlgorithm of(List patterns) {
			return new WuManber(patterns);
		}

	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy