me.lemire.integercompression.benchmarktools.BenchmarkCSV Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of JavaFastPFOR Show documentation
Show all versions of JavaFastPFOR Show documentation
JavaFastPFOR is a library that compresses and decompresses arrays of integers
very quickly. It assumes that most (but not all) values in
your array use fewer than 32 bits.
package me.lemire.integercompression.benchmarktools;
import me.lemire.integercompression.*;
import me.lemire.integercompression.differential.IntegratedBinaryPacking;
import me.lemire.integercompression.differential.IntegratedByteIntegerCODEC;
import me.lemire.integercompression.differential.IntegratedComposition;
import me.lemire.integercompression.differential.IntegratedIntegerCODEC;
import me.lemire.integercompression.differential.IntegratedVariableByte;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
/**
 * Runs decompression benchmarks using a set of posting lists stored as CSV
 * files.
 * <p>
 * Command-line arguments starting with {@code -} are flags
 * ({@code --onearrayperfile}, {@code --oneintperline}, {@code --nodelta});
 * every other argument is treated as the name of an input file. For each
 * codec the program compresses every array, checks that decompression
 * restores the input, and then times decompression of all compressed arrays.
 *
 * @author lemire
 *
 */
public class BenchmarkCSV {
    /**
     * Separators accepted between integers on one line (comma or semicolon).
     * Compiled once instead of once per input line.
     */
    private static final java.util.regex.Pattern SEPARATORS =
            java.util.regex.Pattern.compile("[,;]");

    /** Integer-output codecs benchmarked when differential coding is enabled. */
    static IntegratedIntegerCODEC[] codecs = {
            new IntegratedComposition(new IntegratedBinaryPacking(),
                    new IntegratedVariableByte()) };

    /** Byte-output codecs benchmarked when differential coding is enabled. */
    static IntegratedByteIntegerCODEC[] bcodecs = { new IntegratedVariableByte() };

    /** Integer-output codecs benchmarked in "as is" mode ({@code --nodelta}). */
    static IntegerCODEC[] regcodecs = {
            new Composition(new FastPFOR128(), new VariableByte()),
            new Composition(new FastPFOR(), new VariableByte()),
            new Composition(new BinaryPacking(), new VariableByte()) };

    /** Byte-output codecs benchmarked in "as is" mode ({@code --nodelta}). */
    static ByteIntegerCODEC[] regbcodecs = { new VariableByte() };

    /** How the integers are laid out inside an input file. */
    private enum Format {
        ONEARRAYPERLINE, ONEARRAYPERFILE, ONEINTPERLINE
    }

    /** Whether arrays are compressed as given or with differential coding. */
    private enum CompressionMode {
        AS_IS, DELTA
    }

    /**
     * Reads the integer arrays stored in a text file.
     * <p>
     * With {@link Format#ONEARRAYPERLINE} every line yields one array; with
     * the other two formats the whole file yields a single array. Tokens that
     * are not valid integers are dropped in every format; the first parse
     * error is printed to {@code System.err} and the total number of dropped
     * tokens is printed to {@code System.out}.
     *
     * @param filename path of the file to read
     * @param f        layout of the integers inside the file
     * @return the arrays found in the file, in file order
     * @throws IOException if the file cannot be opened or read
     */
    private static ArrayList<int[]> loadIntegers(final String filename, final Format f)
            throws IOException {
        // One entry per line read; concatenated below for whole-file formats.
        final ArrayList<int[]> rows = new ArrayList<int[]>();
        int misparsed = 0;
        final BufferedReader br = new BufferedReader(new FileReader(filename));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                final String[] tokens = (f == Format.ONEINTPERLINE)
                        ? new String[] { line }
                        : SEPARATORS.split(line);
                final int[] row = parseTokens(tokens, misparsed == 0);
                // Every token that did not make it into the row was a parse failure.
                misparsed += tokens.length - row.length;
                rows.add(row);
            }
        } finally {
            // Release the file handle even when readLine() throws.
            br.close();
        }
        if (misparsed > 0) {
            System.out.println("Failed to parse " + misparsed + " entries");
        }
        if (f == Format.ONEARRAYPERLINE) {
            return rows;
        }
        final ArrayList<int[]> wrap = new ArrayList<int[]>(1);
        wrap.add(concatenate(rows));
        return wrap;
    }

    /**
     * Parses the given tokens as decimal integers, skipping invalid ones.
     *
     * @param tokens             raw tokens; each one is trimmed before parsing
     * @param reportFirstFailure whether the first parse error encountered
     *                           should be printed to {@code System.err}
     * @return the successfully parsed values, in token order
     */
    private static int[] parseTokens(final String[] tokens, final boolean reportFirstFailure) {
        final int[] values = new int[tokens.length];
        int parsed = 0;
        boolean report = reportFirstFailure;
        for (String token : tokens) {
            try {
                values[parsed] = Integer.parseInt(token.trim());
                ++parsed;
            } catch (NumberFormatException nfe) {
                if (report) {
                    System.err.println(nfe);
                    report = false;
                }
            }
        }
        return parsed == values.length ? values : Arrays.copyOf(values, parsed);
    }

    /**
     * Joins the given arrays, in order, into one newly allocated array.
     *
     * @param chunks arrays to join
     * @return a single array holding all values of all chunks
     */
    private static int[] concatenate(final ArrayList<int[]> chunks) {
        int total = 0;
        for (int[] chunk : chunks) {
            total += chunk.length;
        }
        final int[] joined = new int[total];
        int offset = 0;
        for (int[] chunk : chunks) {
            System.arraycopy(chunk, 0, joined, offset, chunk.length);
            offset += chunk.length;
        }
        return joined;
    }

    /**
     * Returns the length of the longest array, or 0 when there is none.
     *
     * @param postings arrays to inspect
     * @return the maximal array length
     */
    private static int maxLength(final ArrayList<int[]> postings) {
        int maxlength = 0;
        for (int[] x : postings) {
            if (maxlength < x.length) {
                maxlength = x.length;
            }
        }
        return maxlength;
    }

    /**
     * Parses the command line, loads the arrays and runs the benchmarks.
     *
     * @param args command-line arguments
     * @throws IOException when some IO error occurs
     * @throws IllegalArgumentException when a flag is not recognized
     */
    public static void main(final String[] args) throws IOException {
        Format myformat = Format.ONEARRAYPERLINE;
        CompressionMode cm = CompressionMode.DELTA;
        final ArrayList<String> files = new ArrayList<String>();
        for (String s : args) {
            if (s.startsWith("-")) { // it is a flag
                if (s.equals("--onearrayperfile")) {
                    myformat = Format.ONEARRAYPERFILE;
                } else if (s.equals("--nodelta")) {
                    cm = CompressionMode.AS_IS;
                } else if (s.equals("--oneintperline")) {
                    myformat = Format.ONEINTPERLINE;
                } else {
                    throw new IllegalArgumentException("I don't understand: " + s);
                }
            } else { // it is a filename
                files.add(s);
            }
        }
        if (myformat == Format.ONEARRAYPERFILE) {
            System.out.println("Treating each file as one array.");
        } else if (myformat == Format.ONEARRAYPERLINE) {
            System.out
                    .println("Each line of each file is an array: use --onearrayperfile or --oneintperline to change.");
        } else if (myformat == Format.ONEINTPERLINE) {
            System.out
                    .println("Treating each file as one array, with one integer per line.");
        }
        if (cm == CompressionMode.AS_IS) {
            System.out
                    .println("Compressing the integers 'as is' (no differential coding)");
        } else {
            System.out
                    .println("Using differential coding (arrays will be sorted): use --nodelta to prevent sorting");
        }
        final ArrayList<int[]> data = new ArrayList<int[]>();
        for (String fn : files) {
            data.addAll(loadIntegers(fn, myformat));
        }
        System.out.println("Loaded " + data.size() + " array(s)");
        if (cm == CompressionMode.DELTA) {
            System.out
                    .println("Sorting the arrray(s) because you are using differential coding");
            for (int[] x : data) {
                Arrays.sort(x);
            }
        }
        // Each benchmark runs twice silently before the reported run
        // (presumably to let the JIT warm up).
        bench(data, cm, false);
        bench(data, cm, false);
        bench(data, cm, true);
        bytebench(data, cm, false);
        bytebench(data, cm, false);
        bytebench(data, cm, true);
    }

    /**
     * Benchmarks the codecs that compress integers into an {@code int[]}.
     * <p>
     * For every codec, each array is compressed, decompressed and compared
     * with the input; then decompression of all compressed arrays is timed.
     * When {@code verbose} is set, one line per codec reports the bits used
     * per integer and the decompression speed in millions of integers per
     * second.
     *
     * @param postings arrays to compress
     * @param cm       selects the differential or the regular codecs
     * @param verbose  whether results are printed
     * @throws RuntimeException if a codec does not round-trip its input
     */
    private static void bench(ArrayList<int[]> postings, CompressionMode cm,
            boolean verbose) {
        final int maxlength = maxLength(postings);
        if (verbose) {
            System.out.println("Max array length: " + maxlength);
        }
        final int[] compbuffer = new int[2 * maxlength + 1024];
        final int[] decompbuffer = new int[maxlength];
        if (verbose) {
            System.out.println("Scheme -- bits/int -- speed (mis)");
        }
        for (IntegerCODEC c : (cm == CompressionMode.DELTA ? codecs
                : regcodecs)) {
            long volumein = 0;
            long volumeout = 0;
            final int[][] compdata = new int[postings.size()][];
            // Untimed pass: compress, verify the round trip, keep the output.
            for (int k = 0; k < postings.size(); ++k) {
                final int[] in = postings.get(k);
                IntWrapper inpos = new IntWrapper(0);
                IntWrapper outpos = new IntWrapper(0);
                c.compress(in, inpos, in.length, compbuffer, outpos);
                final int clength = outpos.get();
                inpos = new IntWrapper(0);
                outpos = new IntWrapper(0);
                c.uncompress(compbuffer, inpos, clength, decompbuffer, outpos);
                volumein += in.length;
                volumeout += clength;
                if (outpos.get() != in.length) {
                    throw new RuntimeException("bug");
                }
                for (int z = 0; z < in.length; ++z) {
                    if (in[z] != decompbuffer[z]) {
                        throw new RuntimeException("bug");
                    }
                }
                compdata[k] = Arrays.copyOf(compbuffer, clength);
            }
            // Timed pass: decompression only.
            final long bef = System.nanoTime();
            for (int[] cin : compdata) {
                final IntWrapper inpos = new IntWrapper(0);
                final IntWrapper outpos = new IntWrapper(0);
                c.uncompress(cin, inpos, cin.length, decompbuffer, outpos);
                if (inpos.get() != cin.length) {
                    throw new RuntimeException("bug");
                }
            }
            final long aft = System.nanoTime();
            final long decomptime = aft - bef;
            // Each compressed unit is a 32-bit int.
            final double bitsPerInt = volumeout * 32.0 / volumein;
            // Integers per nanosecond times 1000 = millions of integers per second.
            final double decompressSpeed = volumein * 1000.0 / (decomptime);
            if (verbose) {
                System.out.println(c.toString()
                        + "\t"
                        + String.format("\t%1$.2f\t%2$.2f",
                                bitsPerInt, decompressSpeed));
            }
        }
    }

    /**
     * Benchmarks the codecs that compress integers into a {@code byte[]}.
     * <p>
     * Same procedure and output as {@link #bench}, except that the
     * compressed size is counted in bytes (8 bits per unit).
     *
     * @param postings arrays to compress
     * @param cm       selects the differential or the regular codecs
     * @param verbose  whether results are printed
     * @throws RuntimeException if a codec does not round-trip its input
     */
    private static void bytebench(ArrayList<int[]> postings,
            CompressionMode cm, boolean verbose) {
        final int maxlength = maxLength(postings);
        if (verbose) {
            System.out.println("Max array length: " + maxlength);
        }
        final byte[] compbuffer = new byte[6 * (maxlength + 1024)];
        final int[] decompbuffer = new int[maxlength];
        if (verbose) {
            System.out.println("Scheme -- bits/int -- speed (mis)");
        }
        for (ByteIntegerCODEC c : (cm == CompressionMode.DELTA ? bcodecs
                : regbcodecs)) {
            long volumein = 0;
            long volumeout = 0;
            final byte[][] compdata = new byte[postings.size()][];
            // Untimed pass: compress, verify the round trip, keep the output.
            for (int k = 0; k < postings.size(); ++k) {
                final int[] in = postings.get(k);
                IntWrapper inpos = new IntWrapper(0);
                IntWrapper outpos = new IntWrapper(0);
                c.compress(in, inpos, in.length, compbuffer, outpos);
                final int clength = outpos.get();
                inpos = new IntWrapper(0);
                outpos = new IntWrapper(0);
                c.uncompress(compbuffer, inpos, clength, decompbuffer, outpos);
                volumein += in.length;
                volumeout += clength;
                if (outpos.get() != in.length) {
                    throw new RuntimeException("bug");
                }
                for (int z = 0; z < in.length; ++z) {
                    if (in[z] != decompbuffer[z]) {
                        throw new RuntimeException("bug");
                    }
                }
                compdata[k] = Arrays.copyOf(compbuffer, clength);
            }
            // Timed pass: decompression only.
            final long bef = System.nanoTime();
            for (byte[] cin : compdata) {
                final IntWrapper inpos = new IntWrapper(0);
                final IntWrapper outpos = new IntWrapper(0);
                c.uncompress(cin, inpos, cin.length, decompbuffer, outpos);
                if (inpos.get() != cin.length) {
                    throw new RuntimeException("bug");
                }
            }
            final long aft = System.nanoTime();
            final long decomptime = aft - bef;
            // Each compressed unit is an 8-bit byte.
            final double bitsPerInt = volumeout * 8.0 / volumein;
            // Integers per nanosecond times 1000 = millions of integers per second.
            final double decompressSpeed = volumein * 1000.0 / (decomptime);
            if (verbose) {
                System.out.println(c.toString()
                        + "\t"
                        + String.format("\t%1$.2f\t%2$.2f",
                                bitsPerInt, decompressSpeed));
            }
        }
    }
}