// cdc.office.tools.MultiplyShiftHashSearcher Maven / Gradle / Ivy
package cdc.office.tools;
import java.io.File;
import java.io.IOException;
import java.util.BitSet;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import cdc.office.ss.WorkbookWriter;
import cdc.office.ss.WorkbookWriterFactory;
import cdc.office.ss.WorkbookWriterFeatures;
import cdc.office.tables.TableSection;
import cdc.util.cli.AbstractMainSupport;
import cdc.util.cli.FeatureMask;
import cdc.util.cli.MainResult;
import cdc.util.cli.OptionEnum;
import cdc.util.function.MultiplyShiftCharPredicate;
import cdc.util.time.Chronometer;
/**
 * Given a set of characters, this tool can search the combinations of multiplier and shift values
 * that can be used to configure a {@link MultiplyShiftCharPredicate}.
 *
 * Computing those values can take a very long time, depending on the size of the character set.
 *
 * @author Damien Carbonne
 */
public final class MultiplyShiftHashSearcher {
private static final Logger LOGGER = LogManager.getLogger(MultiplyShiftHashSearcher.class);
private final MainArgs margs;
private final BitSet set = new BitSet();
/** Number of solutions. */
private long count = 0L;
/** Number of solutions indexed with the maximum hash code. */
private final long[] counts;
/** Maximum possible number of entries (values different from 0) in counts. */
private final int countsMaxEntries;
/** Number of entries in counts. */
private int countsEntries = 0;
/** The set of characters for which a perfect hash function is searched. */
private final char[] chars;
/** Length of chars. */
private final int length;
/** Maximum acceptable hash code. */
private final int maxAcceptableHashCode;
/** Filler character. */
private final char filler;
private final boolean verbose;
private final boolean showAll;
private final boolean stopOnBest;
private final boolean fullCheck;
private final int bestSize;
private boolean bestSizeFound = false;
private final WorkbookWriter> writer;
/**
 * Arguments of the searcher.
 * <p>
 * Public fields are filled by the command line analysis, or directly by code.
 */
public static class MainArgs {
    /** Characters to hash. May contain escape sequences. */
    public String chars;
    /** Optional filler character. When {@code null}, the first character to hash is used. */
    public Character filler = null;
    /** Optional output spreadsheet file. */
    public File output;
    /** First multiplier to test (inclusive). Must be strictly positive. */
    public int minMultiplier = 1;
    /** Last multiplier to test (inclusive). Must be strictly positive. */
    public int maxMultiplier = Integer.MAX_VALUE;
    /** Maximum ratio between the number of accepted hash codes and the number of characters. Must be at least 1. */
    public double maxRatio = 4.0;
    /** Enabled boolean options. */
    protected final FeatureMask<Feature> features = new FeatureMask<>();

    /**
     * @param feature The feature to test.
     * @return {@code true} if {@code feature} is enabled.
     */
    public boolean isEnabled(Feature feature) {
        return features.isEnabled(feature);
    }

    /**
     * Enables or disables a feature.
     *
     * @param feature The feature.
     * @param enabled {@code true} to enable {@code feature}, {@code false} to disable it.
     */
    public void setEnabled(Feature feature,
                           boolean enabled) {
        features.setEnabled(feature, enabled);
    }

    /**
     * Checks the numeric arguments.
     *
     * @throws ParseException When a multiplier bound is not strictly positive,
     *             or when the max ratio is smaller than 1 or is not a number.
     */
    public void validate() throws ParseException {
        if (minMultiplier <= 0) {
            throw new ParseException("min multiplier too small.");
        }
        if (maxMultiplier <= 0) {
            throw new ParseException("max multiplier too small.");
        }
        // NaN compares false with everything, so it must be rejected explicitly
        if (Double.isNaN(maxRatio) || maxRatio < 1.0) {
            throw new ParseException("max ratio too small.");
        }
    }

    /**
     * Enumeration of possible boolean options.
     */
    public enum Feature implements OptionEnum {
        VERBOSE("verbose", "Print messages."),
        FULL_CHECK("full-check",
                   "If enabled, a full check is done. This is very expensive but gives a better garantee on validity of result.\n"
                           + "When the number of characters is high, it should be disabled."),
        SHOW_ALL("show-all",
                 "Show all matching (multiplier, shift) pairs. If disabled, show only one solution for each max hash code."),
        STOP_ON_BEST("stop-on-best",
                     "Stop searching when a solution whose size is the smallest power of 2 larger than the number of characters to encode has been found.");

        /** Name of the command line option. */
        private final String name;
        /** Description of the command line option. */
        private final String description;

        private Feature(String name,
                        String description) {
            this.name = name;
            this.description = description;
        }

        @Override
        public final String getName() {
            return name;
        }

        @Override
        public final String getDescription() {
            return description;
        }
    }
}
/**
 * Creates a searcher.
 * <p>
 * The characters are decoded and de-duplicated, the search bounds are computed,
 * and, when an output file is given, the output workbook is created with
 * its "Parameters" sheet filled and its "Solutions" sheet started.
 *
 * @param margs The arguments.
 * @throws IOException When the output workbook can not be created or written.
 * @throws IllegalArgumentException When there is no character to hash,
 *             or when the characters contain an invalid escape sequence.
 */
private MultiplyShiftHashSearcher(MainArgs margs) throws IOException {
    this.margs = margs;

    LOGGER.debug("original: {}", margs.chars);
    if (margs.chars == null) {
        throw new IllegalArgumentException("No characters to hash.");
    }

    // Remove duplicates, keeping the order of first occurrence
    final String decoded = decode(margs.chars);
    final Set<Character> seen = new HashSet<>();
    final StringBuilder builder = new StringBuilder();
    for (int index = 0; index < decoded.length(); index++) {
        final char c = decoded.charAt(index);
        if (seen.add(c)) {
            builder.append(c);
        }
    }
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("decoded: {}", decoded);
        LOGGER.debug("protected decoded: {}", encode(decoded));
    }

    this.chars = builder.toString().toCharArray();
    this.length = chars.length;
    if (chars.length == 0) {
        // Without this check, the default filler (chars[0]) or the counts indexing would fail later
        throw new IllegalArgumentException("No characters to hash.");
    }
    this.bestSize = 1 << (32 - Integer.numberOfLeadingZeros(this.length - 1));
    this.maxAcceptableHashCode = (int) (this.length * margs.maxRatio) - 1;
    this.filler = margs.filler == null ? chars[0] : margs.filler;
    this.counts = new long[this.maxAcceptableHashCode + 1];
    // Max hash codes of solutions are in range [length - 1, maxAcceptableHashCode]
    this.countsMaxEntries = this.maxAcceptableHashCode - this.length + 2;
    this.verbose = margs.isEnabled(MainArgs.Feature.VERBOSE);
    this.showAll = margs.isEnabled(MainArgs.Feature.SHOW_ALL);
    this.stopOnBest = margs.isEnabled(MainArgs.Feature.STOP_ON_BEST);
    this.fullCheck = margs.isEnabled(MainArgs.Feature.FULL_CHECK);

    if (fullCheck) {
        log("Full check is enabled: performances will be degraded.");
    } else {
        log("Full check is disabled: results may be invalid.");
    }
    log("number of chars: " + length);
    log("best size: " + bestSize);
    log("max acceptable hashcode: " + maxAcceptableHashCode);
    log("min multiplier: " + margs.minMultiplier);
    log("max multiplier: " + margs.maxMultiplier);

    if (margs.output == null) {
        this.writer = null;
    } else {
        final WorkbookWriterFactory factory = new WorkbookWriterFactory();
        this.writer = factory.create(margs.output, WorkbookWriterFeatures.STANDARD_BEST);

        // First sheet: the parameters of the search
        this.writer.beginSheet("Parameters");
        this.writer.addRow(TableSection.HEADER, "Parameter", "Value");
        this.writer.addRow(TableSection.DATA, "Chars", encode(decoded));
        this.writer.addRow(TableSection.DATA, "Chars Length", length);
        this.writer.addRow(TableSection.DATA, "Best Size", bestSize);
        this.writer.addRow(TableSection.DATA, "Max Ratio", margs.maxRatio);
        this.writer.addRow(TableSection.DATA, "Max Hash Code", maxAcceptableHashCode);
        this.writer.addRow(TableSection.DATA, "Filler", filler);
        for (final MainArgs.Feature feature : MainArgs.Feature.values()) {
            this.writer.addRow(TableSection.DATA, feature, margs.isEnabled(feature));
        }

        // Second sheet: one row per shown solution, added while searching
        this.writer.beginSheet("Solutions");
        this.writer.addRow(TableSection.HEADER,
                           "Multiplier",
                           "Shift",
                           "Min Hash Code",
                           "Max Hash Code",
                           "Hash Table Size",
                           "Hash Table");
    }
}
/**
 * Logs a message at info level.
 * <p>
 * The message is dropped when an output file is written and verbose mode is off.
 *
 * @param message The message.
 */
private void log(String message) {
    final boolean silenced = writer != null && !verbose;
    if (silenced) {
        return;
    }
    LOGGER.info(message);
}
/**
 * Encodes a character as a backslash, the letter u and 4 lower case hexadecimal digits.
 *
 * @param c The character.
 * @return The escaped form of {@code c}.
 */
private static String encode(char c) {
    final int code = c;
    return String.format("\\u%04x", code);
}

/**
 * Encodes a string so that it can be displayed without ambiguity.
 * <p>
 * Carriage return, line feed, tab, form feed, backspace, backslash and double quote
 * are replaced by their usual 2 characters escape sequence.
 * Other white spaces, control characters and a few invisible characters
 * are replaced by their hexadecimal escape sequence.
 * All other characters are kept as they are.
 *
 * @param s The string.
 * @return The encoded form of {@code s}.
 */
private static String encode(String s) {
    final StringBuilder out = new StringBuilder();
    for (final char c : s.toCharArray()) {
        switch (c) {
        case '\r':
            out.append("\\r");
            break;
        case '\n':
            out.append("\\n");
            break;
        case '\t':
            out.append("\\t");
            break;
        case '\f':
            out.append("\\f");
            break;
        case '\b':
            out.append("\\b");
            break;
        case '\\':
            out.append("\\\\");
            break;
        case '"':
            out.append("\\\"");
            break;
        default:
            final boolean invisible = Character.isWhitespace(c)
                    || Character.isISOControl(c)
                    || c == '\u202F'
                    || c == '\u00A0'
                    || c == '\u303F'
                    || c == '\uFEFF'
                    || c == '\u180E'
                    || (c >= '\u2000' && c <= '\u200B');
            if (invisible) {
                out.append(encode(c));
            } else {
                out.append(c);
            }
            break;
        }
    }
    return out.toString();
}
/**
 * Decodes a string that may contain escape sequences.
 * <p>
 * Supported sequences are a backslash followed by one of: backslash, n, t, r, f, b,
 * double quote, or the letter u and exactly 4 hexadecimal digits.
 * All other characters are copied as they are.
 *
 * @param s The string to decode.
 * @return The decoded string.
 * @throws IllegalArgumentException When {@code s} ends with a lonely backslash,
 *             contains an unsupported escape sequence,
 *             or contains an incomplete or invalid hexadecimal escape sequence.
 */
private static String decode(String s) {
    final StringBuilder builder = new StringBuilder(s.length());
    int index = 0;
    while (index < s.length()) {
        final char c = s.charAt(index);
        if (c != '\\') {
            builder.append(c);
            index++;
            continue;
        }

        // c starts an escape sequence: at least one more character is needed
        if (index + 1 >= s.length()) {
            throw new IllegalArgumentException("Incomplete escape sequence at end of '" + s + "'");
        }
        final char next = s.charAt(index + 1);
        switch (next) {
        case '\\':
            builder.append('\\');
            break;
        case 'n':
            builder.append('\n');
            break;
        case 't':
            builder.append('\t');
            break;
        case 'r':
            builder.append('\r');
            break;
        case 'f':
            builder.append('\f');
            break;
        case 'b':
            builder.append('\b');
            break;
        case '"':
            builder.append('"');
            break;
        case 'u': {
            // Expect exactly 4 hexadecimal digits after the letter u
            final int end = index + 6;
            if (end > s.length()) {
                throw new IllegalArgumentException("Incomplete \\u escape sequence at index " + index
                        + " of '" + s + "'");
            }
            int code = 0;
            for (int pos = index + 2; pos < end; pos++) {
                final int digit = Character.digit(s.charAt(pos), 16);
                if (digit < 0) {
                    throw new IllegalArgumentException("Invalid hexadecimal digit at index " + pos
                            + " of '" + s + "'");
                }
                code = (code << 4) | digit;
            }
            builder.append((char) code);
            // Skip the 4 digits; backslash and letter are skipped below
            index += 4;
            break;
        }
        default:
            throw new IllegalArgumentException("Unsupported escape sequence '\\" + next
                    + "' at index " + index + " of '" + s + "'");
        }
        // Skip the backslash and the character that follows it
        index += 2;
    }
    return builder.toString();
}
/**
 * Tests a multiplier and shift and returns {@code true} if they are usable.
 * <p>
 * A pair is usable when all searched characters have acceptable and different hash codes and,
 * if full check is enabled, when all non-zero char values also have an acceptable hash code.
 * <p>
 * When the pair is usable, counters are updated and the solution is logged and/or written
 * to the output workbook, depending on options.
 *
 * @param multiplier The multiplier.
 * @param shift The shift.
 * @return {@code true} if {@code multiplier} and {@code shift} are usable.
 * @throws IOException When an IO error occurs.
 */
private boolean test(int multiplier,
                     int shift) throws IOException {
    // Hash the searched characters to check that they all have a different hash code
    set.clear();
    for (final char c : chars) {
        final int hash = (c * multiplier) >>> shift;
        if (hash < 0 || hash > maxAcceptableHashCode) {
            return false;
        }
        set.set(hash);
    }
    if (set.cardinality() != length) {
        // At least 2 characters share the same hash code
        return false;
    }

    // All characters have been hashed with a valid and different hash code
    if (fullCheck) {
        // Now hash all char values (presumably 0 is skipped because it always hashes to 0).
        // After this loop, set holds the hash codes of all those values,
        // so the min and max computed below cover them.
        set.clear();
        for (int c = 65535; c > 0; c--) {
            final int hash = (c * multiplier) >>> shift;
            if (hash < 0 || hash > maxAcceptableHashCode) {
                return false;
            }
            set.set(hash);
        }
    }

    // Count the solution only now, so that candidates rejected by the full check are not counted
    count++;

    // Compute the minimum and maximum hash codes
    final int minHashCode = set.nextSetBit(0);
    final int maxHashCode = set.length() - 1;
    if (maxHashCode < bestSize) {
        bestSizeFound = true;
    }

    final boolean isFirst = counts[maxHashCode] == 0;
    if (isFirst) {
        countsEntries++;
    }
    counts[maxHashCode]++;

    if (isFirst || showAll) {
        // Compute the hash table (its size must be a power of 2)
        final int size = 1 << (32 - Integer.numberOfLeadingZeros(maxHashCode));
        final char[] buffer = new char[size];
        for (int index = 0; index < size; index++) {
            buffer[index] = filler;
        }
        for (final char c : chars) {
            final int hash = (c * multiplier) >>> shift;
            buffer[hash] = c;
        }
        final String hashTable = new String(buffer);

        if (writer == null || isFirst) {
            log(String.format("multiplier: %9d shift: %2d min: %3d max: %3d size: %3d table: %s",
                              multiplier,
                              shift,
                              minHashCode,
                              maxHashCode,
                              size,
                              encode(hashTable)));
        }
        if (writer != null) {
            writer.beginRow(TableSection.DATA);
            writer.addCell(multiplier);
            writer.addCell(shift);
            writer.addCell(minHashCode);
            writer.addCell(maxHashCode);
            writer.addCell(size);
            writer.addCell(encode(hashTable));
        }
    }
    return true;
}
/**
 * Tests one (multiplier, shift) pair on a set of characters, without writing any file.
 * <p>
 * A message is logged when the pair is not usable.
 * An {@link IOException} is caught and logged.
 *
 * @param chars The characters to hash. May contain escape sequences.
 * @param filler The optional filler character.
 * @param multiplier The multiplier.
 * @param shift The shift.
 */
public static void test(String chars,
                        Character filler,
                        int multiplier,
                        int shift) {
    final MainArgs args = new MainArgs();
    args.output = null;
    args.filler = filler;
    args.chars = chars;
    try {
        final MultiplyShiftHashSearcher searcher = new MultiplyShiftHashSearcher(args);
        if (searcher.test(multiplier, shift)) {
            return;
        }
        LOGGER.info("Failed with multiplier: {} shift: {}", multiplier, shift);
    } catch (final IOException e) {
        LOGGER.catching(e);
    }
}
/**
 * Runs the search.
 * <p>
 * All multipliers between the min and max multipliers are tested with all shifts
 * between 0 and the number of leading zeros of {@code length - 1}.
 * Unless all solutions must be shown, the search stops when a solution has been found
 * for each possible maximum hash code, or when a best size solution has been found
 * and stopping on best is enabled.
 * <p>
 * The output workbook, if any, is closed when this method returns, even on failure.
 *
 * @throws IOException When an IO error occurs.
 */
private void execute() throws IOException {
    try {
        final Chronometer chrono = new Chronometer();
        chrono.start();
        final int maxShift = Integer.numberOfLeadingZeros(length - 1);
        boolean finished = false;
        // multiplier > 0 stops the loop when multiplier overflows after Integer.MAX_VALUE
        for (int multiplier = margs.minMultiplier;
                !finished && multiplier > 0 && multiplier <= margs.maxMultiplier;
                multiplier++) {
            // Periodically show progress and flush what has been written
            if (multiplier % 1_000_000 == 0
                    || multiplier == margs.minMultiplier
                    || multiplier == margs.maxMultiplier) {
                log(String.format("%d %d %d/%d",
                                  multiplier,
                                  count,
                                  countsEntries,
                                  countsMaxEntries));
                if (writer != null) {
                    writer.flush();
                }
            }
            for (int shift = 0; !finished && shift <= maxShift; shift++) {
                test(multiplier, shift);
                finished = !showAll && (countsEntries == countsMaxEntries || (bestSizeFound && stopOnBest));
            }
        }
        chrono.suspend();
        log("Finished in " + chrono);
        if (bestSizeFound) {
            log("Found solution(s) for best size (" + bestSize + ")");
        }

        // Show the number of solutions found for each maximum hash code
        // The smallest possible maximum hash code is length - 1
        log("Number of solutions for max hash code between " + (length - 1) + " and " + maxAcceptableHashCode);
        for (int index = length - 1; index < counts.length; index++) {
            log(String.format(" %3d: %19d", index, counts[index]));
        }
    } finally {
        // Always release the output file, even when the search fails
        if (writer != null) {
            writer.close();
        }
    }
}
/**
 * Creates a searcher with the given arguments and runs it.
 *
 * @param margs The arguments.
 * @throws IOException When an IO error occurs.
 */
public static void execute(MainArgs margs) throws IOException {
    new MultiplyShiftHashSearcher(margs).execute();
}
/**
 * Runs the tool with command line arguments, without exiting the JVM.
 *
 * @param args The command line arguments.
 * @return The result obtained from the main support after it has run.
 */
public static MainResult exec(String... args) {
    final MainSupport runner = new MainSupport();
    runner.main(args);
    final MainResult outcome = runner.getResult();
    return outcome;
}
/**
 * Program entry point: runs the tool and exits the JVM with the result code.
 *
 * @param args The command line arguments.
 */
public static void main(String... args) {
    final MainResult result = exec(args);
    System.exit(result.getCode());
}
/**
 * Command line support: declares the options, converts the command line
 * to {@link MainArgs} and launches the search.
 */
private static class MainSupport extends AbstractMainSupport<MainArgs, Void> {
    // Long names of the options specific to this tool.
    // OUTPUT is not declared here: presumably inherited from AbstractMainSupport.
    private static final String CHARS = "chars";
    private static final String FILLER = "filler";
    private static final String MIN_MULTIPLIER = "min-multiplier";
    private static final String MAX_MULTIPLIER = "max-multiplier";
    private static final String MAX_RATIO = "max-ratio";

    public MainSupport() {
        super(MultiplyShiftHashSearcher.class, LOGGER);
    }

    @Override
    protected String getVersion() {
        return Config.VERSION;
    }

    @Override
    protected boolean addArgsFileOption(Options options) {
        return true;
    }

    /**
     * Declares the options of this tool: output, chars (mandatory), filler,
     * min and max multipliers, max ratio, and one option without argument per {@link MainArgs.Feature}.
     *
     * @param options The options to complete.
     */
    @Override
    protected void addSpecificOptions(Options options) {
        options.addOption(Option.builder()
                                .longOpt(OUTPUT)
                                .desc("Optional name of the output spreadsheet file (must end with a CSV, XLS, XLSX compliant extension).\n"
                                        + "Warning: only CSV format supports unlimited number of rows.")
                                .hasArg()
                                .build());
        options.addOption(Option.builder()
                                .longOpt(CHARS)
                                .desc("Characters to hash. Escaping characters is possible: \\r \\n \\t \\b \\f \\\\ and \\uXXXX.")
                                .hasArg()
                                .required()
                                .build());
        options.addOption(Option.builder()
                                .longOpt(FILLER)
                                .desc("Optional filler character.")
                                .hasArg()
                                .build());
        options.addOption(Option.builder()
                                .longOpt(MIN_MULTIPLIER)
                                .desc("Optional min multiplier (default to 1).")
                                .hasArg()
                                .build());
        options.addOption(Option.builder()
                                .longOpt(MAX_MULTIPLIER)
                                .desc("Optional max multiplier (default to " + Integer.MAX_VALUE + ").")
                                .hasArg()
                                .build());
        options.addOption(Option.builder()
                                .longOpt(MAX_RATIO)
                                .desc("Optional max ratio (default to 4.0).")
                                .hasArg()
                                .build());

        addNoArgOptions(options, MainArgs.Feature.class);
    }

    /**
     * Converts the command line to validated arguments.
     *
     * @param cl The command line.
     * @return The arguments.
     * @throws ParseException When the command line or the arguments are invalid.
     */
    @Override
    protected MainArgs analyze(CommandLine cl) throws ParseException {
        final MainArgs margs = new MainArgs();
        margs.output = getValueAsFile(cl, OUTPUT);
        margs.chars = getValueAsString(cl, CHARS, "");
        margs.filler = getValueAsChar(cl, FILLER, null);
        // A min multiplier smaller than 1 is silently replaced by 1
        margs.minMultiplier = Math.max(getValueAsInt(cl, MIN_MULTIPLIER, 1), 1);
        margs.maxMultiplier = getValueAsInt(cl, MAX_MULTIPLIER, Integer.MAX_VALUE);
        margs.maxRatio = getValueAsDouble(cl, MAX_RATIO, 4.0);
        setMask(cl, MainArgs.Feature.class, margs.features::setEnabled);
        margs.validate();
        return margs;
    }

    /**
     * Runs the search.
     *
     * @param margs The arguments.
     * @return Always {@code null}.
     * @throws IOException When an IO error occurs.
     */
    @Override
    protected Void execute(MainArgs margs) throws IOException {
        MultiplyShiftHashSearcher.execute(margs);
        return null;
    }
}
}