All downloads are free. Search and download functionality uses the official Maven repository.

me.lemire.integercompression.benchmarktools.BenchmarkCSV Maven / Gradle / Ivy

Go to download

It is a library to compress and uncompress arrays of integers very fast. The assumption is that most (but not all) values in your array use less than 32 bits.

There is a newer version: 0.2.1
Show newest version
package me.lemire.integercompression.benchmarktools;

import me.lemire.integercompression.*;
import me.lemire.integercompression.differential.IntegratedBinaryPacking;
import me.lemire.integercompression.differential.IntegratedByteIntegerCODEC;
import me.lemire.integercompression.differential.IntegratedComposition;
import me.lemire.integercompression.differential.IntegratedIntegerCODEC;
import me.lemire.integercompression.differential.IntegratedVariableByte;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;

/**
 * This will run benchmarks using a set of posting lists stored as CSV files.
 * <p>
 * Every file named on the command line is parsed into one or more integer
 * arrays. Each array is compressed, decompressed and compared with the
 * original; afterwards the decompression of all compressed arrays is timed.
 * The codecs working on {@code int[]} output are exercised first, then the
 * ones working on {@code byte[]} output.
 * <p>
 * Recognised flags: {@code --onearrayperfile}, {@code --oneintperline} and
 * {@code --nodelta}. Any other argument starting with {@code -} is rejected;
 * every remaining argument is treated as a file name.
 * 
 * @author lemire
 * 
 */
public class BenchmarkCSV {
        /** Codecs with {@code int[]} output used when differential coding is on. */
        static IntegratedIntegerCODEC[] codecs = {
                new IntegratedComposition(new IntegratedBinaryPacking(),
                        new IntegratedVariableByte()) };
        /** Codecs with {@code byte[]} output used when differential coding is on. */
        static IntegratedByteIntegerCODEC[] bcodecs = { new IntegratedVariableByte() };
        /** Codecs with {@code int[]} output used with {@code --nodelta}. */
        static IntegerCODEC[] regcodecs = {
                new Composition(new FastPFOR128(), new VariableByte()),
                new Composition(new FastPFOR(), new VariableByte()),
                new Composition(new BinaryPacking(), new VariableByte()) };
        /** Codecs with {@code byte[]} output used with {@code --nodelta}. */
        static ByteIntegerCODEC[] regbcodecs = { new VariableByte() };

        /**
         * Separator between the integers of a line (comma or semicolon).
         * Compiled once so that the expression is not recompiled for every
         * input line.
         */
        private static final java.util.regex.Pattern SEPARATORS = java.util.regex.Pattern
                .compile("[,;;]");

        /**
         * Reads the integers stored in a text file.
         * <p>
         * With {@code ONEARRAYPERLINE} every line yields its own array; a
         * token that cannot be parsed leaves a 0 at its position in that
         * array. With {@code ONEARRAYPERFILE} all tokens of all lines are
         * gathered into a single array, and with any other format each whole
         * line is parsed as a single integer and all of them are gathered
         * into a single array; in these two cases unparsable tokens are
         * omitted.
         * <p>
         * The first parse failure is printed to {@code System.err}; the total
         * number of failures, if any, is printed to {@code System.out}.
         * 
         * @param filename path of the file to read
         * @param f layout of the file
         * @return the arrays found in the file (exactly one array unless the
         *         format is {@code ONEARRAYPERLINE})
         * @throws IOException if the file cannot be opened or read
         */
        private static ArrayList<int[]> loadIntegers(final String filename, final Format f)
                throws IOException {
                final boolean arrayPerLine = (f == Format.ONEARRAYPERLINE);
                // Only these two formats hold several integers per line.
                final boolean splitLines = arrayPerLine
                        || f == Format.ONEARRAYPERFILE;
                final ArrayList<int[]> answer = new ArrayList<int[]>();
                // Collects the values when the whole file is a single array.
                final ArrayList<Integer> flat = new ArrayList<Integer>();
                int misparsed = 0;
                final BufferedReader br = new BufferedReader(new FileReader(
                        filename));
                try {
                        String s;
                        while ((s = br.readLine()) != null) {
                                final String[] numbers = splitLines ? SEPARATORS
                                        .split(s) : new String[] { s };
                                final int[] row = arrayPerLine ? new int[numbers.length]
                                        : null;
                                for (int k = 0; k < numbers.length; ++k) {
                                        try {
                                                final int value = Integer
                                                        .parseInt(numbers[k]
                                                                .trim());
                                                if (row != null) {
                                                        row[k] = value;
                                                } else {
                                                        flat.add(value);
                                                }
                                        } catch (java.lang.NumberFormatException nfe) {
                                                // Report only the first failure in detail.
                                                if (misparsed == 0) {
                                                        System.err.println(nfe);
                                                }
                                                ++misparsed;
                                        }
                                }
                                if (row != null) {
                                        answer.add(row);
                                }
                        }
                } finally {
                        // Release the file even when reading fails half-way.
                        br.close();
                }
                if (!arrayPerLine) {
                        final int[] actualanswer = new int[flat.size()];
                        for (int i = 0; i < actualanswer.length; ++i) {
                                actualanswer[i] = flat.get(i);
                        }
                        answer.add(actualanswer);
                }
                if (misparsed > 0) {
                        System.out.println("Failed to parse "
                                + misparsed + " entries");
                }
                return answer;
        }

        /** Layout of the input files. */
        private enum Format {
                ONEARRAYPERLINE, ONEARRAYPERFILE, ONEINTPERLINE
        }

        /** Whether the codecs with differential coding are used (DELTA) or not (AS_IS). */
        private enum CompressionMode {
                AS_IS, DELTA
        }

        /**
         * Parses the command line, loads the files and runs the benchmarks.
         * Each benchmark is run three times and only the third run prints its
         * results (the first two are presumably warm-up runs).
         * 
         * @param args command-line arguments
         * @throws IOException when some IO error occurs
         */
        public static void main(final String[] args) throws IOException {
                Format myformat = Format.ONEARRAYPERLINE;
                CompressionMode cm = CompressionMode.DELTA;
                final ArrayList<String> files = new ArrayList<String>();
                for (String s : args) {
                        if (s.startsWith("-")) {// it is a flag
                                if (s.equals("--onearrayperfile")) {
                                        myformat = Format.ONEARRAYPERFILE;
                                } else if (s.equals("--nodelta")) {
                                        cm = CompressionMode.AS_IS;
                                } else if (s.equals("--oneintperline")) {
                                        myformat = Format.ONEINTPERLINE;
                                } else {
                                        throw new RuntimeException(
                                                "I don't understand: " + s);
                                }
                        } else {// it is a filename
                                files.add(s);
                        }
                }
                if (myformat == Format.ONEARRAYPERFILE) {
                        System.out.println("Treating each file as one array.");
                } else if (myformat == Format.ONEARRAYPERLINE) {
                        System.out
                                .println("Each line of each file is an array: use --onearrayperfile or --oneintperline to change.");
                } else if (myformat == Format.ONEINTPERLINE) {
                        System.out
                                .println("Treating each file as one array, with one integer per line.");
                }
                if (cm == CompressionMode.AS_IS) {
                        System.out
                                .println("Compressing the integers 'as is' (no differential coding)");
                } else {
                        System.out
                                .println("Using differential coding (arrays will be sorted): use --nodelta to prevent sorting");
                }
                final ArrayList<int[]> data = new ArrayList<int[]>();
                for (String fn : files) {
                        data.addAll(loadIntegers(fn, myformat));
                }
                System.out.println("Loaded " + data.size() + " array(s)");
                if (cm == CompressionMode.DELTA) {
                        System.out
                                .println("Sorting the arrray(s) because you are using differential coding");
                        for (int[] x : data) {
                                Arrays.sort(x);
                        }
                }
                bench(data, cm, false);
                bench(data, cm, false);
                bench(data, cm, true);
                bytebench(data, cm, false);
                bytebench(data, cm, false);
                bytebench(data, cm, true);
        }

        /**
         * Benchmarks the codecs that compress to {@code int[]}.
         * <p>
         * For every codec each array is compressed, decompressed and checked
         * against the original; then the decompression of all the compressed
         * arrays is timed with {@link System#nanoTime()}.
         * 
         * @param postings arrays to compress
         * @param cm selects {@code codecs} (DELTA) or {@code regcodecs}
         * @param verbose whether to print the results
         * @throws RuntimeException if a decompressed array differs from its
         *         original or a codec does not consume its whole input
         */
        private static void bench(ArrayList<int[]> postings, CompressionMode cm,
                boolean verbose) {
                final int maxlength = maxLength(postings);
                if (verbose) {
                        System.out.println("Max array length: " + maxlength);
                }
                // Shared scratch buffers, reused for every array and codec.
                final int[] compbuffer = new int[2 * maxlength + 1024];
                final int[] decompbuffer = new int[maxlength];
                if (verbose) {
                        System.out.println("Scheme -- bits/int -- speed (mis)");
                }
                for (IntegerCODEC c : (cm == CompressionMode.DELTA ? codecs
                        : regcodecs)) {
                        long volumein = 0;
                        long volumeout = 0;
                        final int[][] compdata = new int[postings.size()][];
                        for (int k = 0; k < postings.size(); ++k) {
                                final int[] in = postings.get(k);
                                IntWrapper inpos = new IntWrapper(0);
                                IntWrapper outpos = new IntWrapper(0);
                                c.compress(in, inpos, in.length, compbuffer,
                                        outpos);
                                // The output position is taken as the compressed length.
                                final int clength = outpos.get();
                                inpos = new IntWrapper(0);
                                outpos = new IntWrapper(0);
                                c.uncompress(compbuffer, inpos, clength,
                                        decompbuffer, outpos);
                                volumein += in.length;
                                volumeout += clength;
                                checkRoundTrip(in, decompbuffer, outpos.get());
                                compdata[k] = Arrays
                                        .copyOf(compbuffer, clength);
                        }
                        final long bef = System.nanoTime();
                        for (int[] cin : compdata) {
                                final IntWrapper inpos = new IntWrapper(0);
                                final IntWrapper outpos = new IntWrapper(0);
                                c.uncompress(cin, inpos, cin.length,
                                        decompbuffer, outpos);
                                if (inpos.get() != cin.length) {
                                        throw new RuntimeException("bug");
                                }
                        }
                        final long decomptime = System.nanoTime() - bef;
                        if (verbose) {
                                // Each compressed unit is a 32-bit int.
                                printResult(c.toString(), volumeout * 32.0
                                        / volumein, volumein * 1000.0
                                        / decomptime);
                        }
                }
        }

        /**
         * Benchmarks the codecs that compress to {@code byte[]}. Same
         * procedure as {@link #bench}, with a byte buffer as the compressed
         * representation.
         * 
         * @param postings arrays to compress
         * @param cm selects {@code bcodecs} (DELTA) or {@code regbcodecs}
         * @param verbose whether to print the results
         * @throws RuntimeException if a decompressed array differs from its
         *         original or a codec does not consume its whole input
         */
        private static void bytebench(ArrayList<int[]> postings,
                CompressionMode cm, boolean verbose) {
                final int maxlength = maxLength(postings);
                if (verbose) {
                        System.out.println("Max array length: " + maxlength);
                }
                // Shared scratch buffers, reused for every array and codec.
                final byte[] compbuffer = new byte[6 * (maxlength + 1024)];
                final int[] decompbuffer = new int[maxlength];
                if (verbose) {
                        System.out.println("Scheme -- bits/int -- speed (mis)");
                }
                for (ByteIntegerCODEC c : (cm == CompressionMode.DELTA ? bcodecs
                        : regbcodecs)) {
                        long volumein = 0;
                        long volumeout = 0;
                        final byte[][] compdata = new byte[postings.size()][];
                        for (int k = 0; k < postings.size(); ++k) {
                                final int[] in = postings.get(k);
                                IntWrapper inpos = new IntWrapper(0);
                                IntWrapper outpos = new IntWrapper(0);
                                c.compress(in, inpos, in.length, compbuffer,
                                        outpos);
                                // The output position is taken as the compressed length.
                                final int clength = outpos.get();
                                inpos = new IntWrapper(0);
                                outpos = new IntWrapper(0);
                                c.uncompress(compbuffer, inpos, clength,
                                        decompbuffer, outpos);
                                volumein += in.length;
                                volumeout += clength;
                                checkRoundTrip(in, decompbuffer, outpos.get());
                                compdata[k] = Arrays
                                        .copyOf(compbuffer, clength);
                        }
                        final long bef = System.nanoTime();
                        for (byte[] cin : compdata) {
                                final IntWrapper inpos = new IntWrapper(0);
                                final IntWrapper outpos = new IntWrapper(0);
                                c.uncompress(cin, inpos, cin.length,
                                        decompbuffer, outpos);
                                if (inpos.get() != cin.length) {
                                        throw new RuntimeException("bug");
                                }
                        }
                        final long decomptime = System.nanoTime() - bef;
                        if (verbose) {
                                // Each compressed unit is an 8-bit byte.
                                printResult(c.toString(), volumeout * 8.0
                                        / volumein, volumein * 1000.0
                                        / decomptime);
                        }
                }
        }

        /**
         * Returns the length of the longest array, or 0 when there is none.
         * 
         * @param postings arrays to inspect
         * @return the largest array length
         */
        private static int maxLength(ArrayList<int[]> postings) {
                int maxlength = 0;
                for (int[] x : postings) {
                        if (maxlength < x.length) {
                                maxlength = x.length;
                        }
                }
                return maxlength;
        }

        /**
         * Checks that decompression produced exactly the original values.
         * 
         * @param in original array
         * @param decompbuffer buffer holding the decompressed values
         * @param outlength number of values reported as decompressed
         * @throws RuntimeException if the count or any value differs
         */
        private static void checkRoundTrip(int[] in, int[] decompbuffer,
                int outlength) {
                if (outlength != in.length) {
                        throw new RuntimeException("bug");
                }
                for (int z = 0; z < in.length; ++z) {
                        if (in[z] != decompbuffer[z]) {
                                throw new RuntimeException("bug");
                        }
                }
        }

        /**
         * Prints one result line: scheme name, bits per integer and speed.
         * 
         * @param scheme textual name of the codec
         * @param bitsPerInt compressed bits per input integer
         * @param decompressSpeed integers per microsecond, that is, millions
         *        of integers per second
         */
        private static void printResult(String scheme, double bitsPerInt,
                double decompressSpeed) {
                System.out.println(scheme
                        + "\t"
                        + String.format("\t%1$.2f\t%2$.2f",
                                bitsPerInt, decompressSpeed));
        }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy