org.mapdb20.Pump Maven / Gradle / Ivy

Go to download
/*
 *  Copyright (c) 2012 Jan Kotek
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.mapdb20;

import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Executor;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Data Pump moves data from one source to another.
 * It can be used to import data from a text file, or to copy a store from memory to disk.
 */
public final class Pump {


    /** Logger named after this class; {@code merge} uses it to report an interrupted feeder task. */
    private static final Logger LOG = Logger.getLogger(Pump.class.getName());

    /**
     * Sorts large data set by given {@code Comparator}. Data are sorted with in-memory cache and temporary files.
     *
     * Items are read from {@code source} into an on-heap buffer of {@code batchSize} slots.
     * Every time the buffer fills up it is sorted and written to a temporary file.
     * If no temporary file was needed, the sorted buffer itself is returned as an iterator;
     * otherwise the per-file iterators and the remaining on-heap items are combined by
     * {@link #sort(Comparator, boolean, Iterator[])}.
     *
     * NOTE(review): generic type parameters and part of the method body (around the
     * construction of the per-file iterators) are missing from this copy of the file;
     * restore them from the upstream source before compiling.
     *
     * @param source iterator over unsorted data; {@code null} is replaced by {@code Fun.emptyIterator()}
     * @param mergeDuplicates should be duplicate keys merged into single one? Only passed on to the
     *                        merging sort; it is not consulted when everything fits into one batch
     * @param batchSize how many items can fit into heap memory; must be greater than zero
     * @param comparator used to sort data; {@code null} is replaced by {@code Fun.comparator()}
     * @param serializer used to store data in temporary files
     * @param executor handed to {@code arraySort}, which may use a parallel sort when it is not {@code null}
     * @return iterator over sorted data set
     * @throws IllegalArgumentException if {@code batchSize} is zero or negative
     * @throws IOError if reading or writing a temporary file fails
     */
    public static  Iterator sort(Iterator source, boolean mergeDuplicates, final int batchSize,
            Comparator comparator, final Serializer serializer, Executor executor){
        if(batchSize<=0) throw new IllegalArgumentException();
        if(comparator==null)
            comparator=Fun.comparator();
        if(source==null)
            source = Fun.emptyIterator();

        //number of valid items currently held in `presort`
        int counter = 0;
        final Object[] presort = new Object[batchSize];
        //temporary files, one per flushed batch, and the number of items written to each
        final List presortFiles = new ArrayList();
        final List presortCount2 = new ArrayList();

        try{
            while(source.hasNext()){
                presort[counter]=source.next();
                counter++;

                if(counter>=batchSize){
                    //buffer is full: sort all items
                    arraySort(presort, presort.length, comparator ,executor);

                    //flush presort into temporary file
                    File f = File.createTempFile("mapdb","sort");
                    f.deleteOnExit();
                    presortFiles.add(f);
                    //NOTE(review): `out` is not closed if serialize() throws
                    DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(f)));
                    for(Object e:presort){
                        serializer.serialize(out,(E)e);
                    }
                    out.close();
                    presortCount2.add(counter);
                    //overwrite every slot (with a boxed 0), presumably so the flushed items are no longer referenced
                    Arrays.fill(presort,0);
                    counter = 0;
                }
            }
            //now all records from source have been fetched
            if(presortFiles.isEmpty()){
                //no presort files were created, so on-heap sorting is enough
                //NOTE(review): mergeDuplicates is not applied on this path
                arraySort(presort, counter, comparator, executor);
                return arrayIterator(presort,0, counter);
            }

            final int[] presortCount = new int[presortFiles.size()];
            //NOTE(review): the next line is garbled in this copy. The text that declared `ins`,
            //`iterators` and `pos`, and that opened the anonymous per-file Iterator, is missing.
            for(int i=0;i0;
                    }

                    @Override public Object next() {
                        try {
                            //NOTE(review): meaning of the -1 argument depends on Serializer.deserialize - confirm
                            Object ret =  serializer.deserialize(ins[pos],-1);
                            //last item of this file was read: close its stream and remove the file
                            if(--presortCount[pos]==0){
                                ins[pos].close();
                                presortFiles.get(pos).delete();
                            }
                            return ret;
                        } catch (IOException e) {
                            throw new IOError(e);
                        }
                    }

                    @Override public void remove() {
                        //ignored
                    }

                };
            }

            //and add iterator over data on-heap (the last, partially filled batch)
            arraySort(presort, counter, comparator, executor);
            iterators[iterators.length-1] = arrayIterator(presort,0,counter);

            //and finally sort presorted iterators and return iterators over them
            return sort(comparator, mergeDuplicates, iterators);

        }catch(IOException e){
            throw new IOError(e);
        }finally{
            //NOTE(review): this runs as soon as the method returns, i.e. before the returned
            //iterator has been consumed - confirm that deleting the files at this point is intended
            for(File f:presortFiles) f.delete();
        }
    }

    /**
     * Reflective handle to {@link Arrays#parallelSort(Object[], int, int, Comparator)},
     * or {@code null} when the running JRE does not offer that method (Java 6 and 7).
     * The method is looked up reflectively instead of being called directly,
     * to keep compatibility with runtimes older than Java 8.
     */
    static private Method parallelSortMethod;
    static{
        Method found;
        try {
            found = Arrays.class.getMethod("parallelSort", Object[].class, int.class, int.class, Comparator.class);
        } catch (NoSuchMethodException e) {
            //java 6 & 7
            found = null;
        }
        parallelSortMethod = found;
    }

    /**
     * Sorts the first {@code arrayLen} elements of {@code array} in place.
     *
     * When an executor is supplied and {@code Arrays.parallelSort} is available, the parallel
     * sort is used; otherwise the range is sorted with {@link Arrays#sort(Object[], int, int, Comparator)}.
     * The range is sorted exactly once.
     *
     * @param array array to sort; elements at index {@code arrayLen} and above are left untouched
     * @param arrayLen number of leading elements to sort
     * @param comparator ordering to apply
     * @param executor only checked for {@code null}: a non-null value requests the parallel sort
     *                 (which runs on the common pool, not on this executor)
     * @throws RuntimeException wrapping the reflective failure if the parallel sort could not be
     *                          invoked or threw an exception
     */
    protected static void arraySort(Object[] array, int arrayLen, Comparator comparator,  Executor executor) {
        //if executor is specified, try to use parallel method in java 8
        if(executor!=null && parallelSortMethod!=null){
            //TODO this uses common pool, but perhaps we should use Executor instead
            try {
                parallelSortMethod.invoke(null, array, 0, arrayLen, comparator);
                //the range is sorted now; do not fall through and sort it a second time
                return;
            } catch (IllegalAccessException e) {
                throw new RuntimeException(e);
            } catch (InvocationTargetException e) {
                throw new RuntimeException(e); //TODO exception hierarchy here?
            }
        }
        Arrays.sort(array, 0, arrayLen, comparator);
    }


    /**
     * Merge presorted iterators into single sorted iterator.
     *
     * The returned iterator keeps a sorted set of pending entries. Each entry is a
     * two-element array holding an element and the index of the iterator it came from.
     * After an entry is consumed, the next element of the same source iterator (if any) is queued.
     * {@code remove()} is not supported on the returned iterator.
     *
     * NOTE(review): generic type parameters and part of the anonymous class body (the
     * initializer loop, {@code hasNext()} and the beginning of {@code next()}) are missing
     * from this copy of the file; restore them from the upstream source before compiling.
     *
     * @param comparator used to compare data; {@code null} is replaced by {@code Fun.COMPARATOR}
     * @param mergeDuplicates if duplicate keys should be merged into single one
     * @param iterators array of already sorted iterators
     * @return sorted iterator
     */
    public static  Iterator sort(Comparator comparator, final boolean mergeDuplicates, final Iterator... iterators) {
        final Comparator comparator2 = comparator==null?Fun.COMPARATOR:comparator;
        return new Iterator(){

            //pending entries {element, index of source iterator}; the comparator array pairs
            //comparator2 with Fun.COMPARATOR - presumably one comparator per array position, TODO confirm
            final NavigableSet items = new TreeSet(
                    new Fun.ArrayComparator(new Comparator[]{comparator2,Fun.COMPARATOR}));

            Object next = this; //is initialized with this so first `next()` will not throw NoSuchElementException

            {
                //NOTE(review): the next line is garbled in this copy. The rest of this initializer,
                //hasNext() and the start of next() (where `lo` and `oldNext` are declared) are missing.
                for(int i=0;i0){
                    throw new IllegalArgumentException("One of the iterators is not sorted");
                }

                //refill from the source iterator that supplied the entry `lo`
                Iterator iter = iterators[(Integer)lo[1]];
                if(iter.hasNext()){
                    items.add(new Object[]{iter.next(),lo[1]});
                }

                if(mergeDuplicates){
                    //repeat until Fun.filter(items,next) yields nothing; presumably it selects the
                    //pending entries whose element equals `next` - confirm against Fun.filter
                    while(true){
                        Iterator subset = Fun.filter(items,next).iterator();
                        if(!subset.hasNext())
                            break;
                        //copy the matches first, so `items` is not modified while it is being iterated
                        List subset2 = new LinkedList();
                        while(subset.hasNext())
                            subset2.add(subset.next());
                        List toadd = new ArrayList();
                        for(Object[] t:subset2){
                            //drop the matching entry and queue the next element of its source iterator
                            items.remove(t);
                            iter = iterators[(Integer)t[1]];
                            if(iter.hasNext())
                                toadd.add(new Object[]{iter.next(),t[1]});
                        }
                        items.addAll(toadd);
                    }
                }


                return (E) oldNext;
            }

            @Override public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }


    /**
     * Merges multiple iterators into single iterator.
     * Result iterator will return entries from all iterators, one source iterator after another.
     * It does not do sorting or any other special functionality.
     * Does not allow null elements: a {@code null} returned by a source iterator ends the iteration.
     *
     * When {@code executor} is {@code null}, the returned iterator reads the source iterators
     * directly in the calling thread; its first element is fetched as soon as it is created.
     * Otherwise a task submitted to {@code executor} copies all elements into a bounded queue
     * (capacity 128) followed by a poison pill, and the returned iterator reads from that queue.
     *
     * @param executor runs the feeding task; {@code null} for single-threaded operation
     * @param iters - iterators to be merged
     * @param <E> element type of the resulting iterator
     * @return union of all iterators.
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public static <E> Iterator<E> merge(Executor executor, final Iterator... iters){
        if(iters.length==0)
            return Fun.emptyIterator();

        final Iterator<E> ret = new Iterator<E>() {
                //index of the source iterator currently being drained
                int i = 0;
                //look-ahead element; starts as `this` so the priming next() call below does not throw
                Object next = this;

                {
                    //prime the look-ahead with the first real element
                    next();
                }

                @Override
                public boolean hasNext() {
                    return next != null;
                }

                @Override
                public E next() {
                    if (next == null)
                        throw new NoSuchElementException();

                    //move to next iterator if necessary
                    while (!iters[i].hasNext()) {
                        i++;
                        if (i == iters.length) {
                            //reached end of iterators
                            Object ret = next;
                            next = null;
                            return (E) ret;
                        }
                    }

                    //take next item from iterator
                    Object ret = next;
                    next = iters[i].next();
                    return (E) ret;
                }

                @Override
                public void remove() {
                    throw new UnsupportedOperationException();
                }
            };


        if(executor == null){
            //single threaded
            return ret;
        }

        final Object poisonPill = new Object();

        //else perform merge in separate thread and use blocking queue
        //(kept raw on purpose: the queue carries both elements and the poison pill)
        final BlockingQueue q = new ArrayBlockingQueue(128);
        //feed blocking queue in separate thread
        executor.execute(new Runnable() {
            @Override
            public void run() {
                try {
                    try {
                        while (ret.hasNext())
                            q.put(ret.next());
                    } finally {
                        q.put(poisonPill); //TODO poison pill should be send in non blocking way, perhaps remove elements?
                    }
                } catch (InterruptedException e) {
                    LOG.log(Level.SEVERE, "feeder failed", e);
                    //restore the interrupt flag so the executor's worker thread can still observe it
                    Thread.currentThread().interrupt();
                }
            }
        });

        return poisonPillIterator(q,poisonPill);
    }

    /**
     * Wraps a blocking queue in an iterator that ends when a poison pill is taken from the queue.
     *
     * The iterator always holds one look-ahead element, so creating it and every call to
     * {@code next()} block on {@code q.take()} until the producer delivers an element or the pill.
     * A {@code null} look-ahead marks the end, and the pill is recognised by reference identity.
     * {@code remove()} is not supported.
     *
     * @param q queue filled by a producer; the producer must finish by putting {@code poisonPill}
     * @param poisonPill marker object that signals the end of data
     * @param <E> element type
     * @return iterator over the elements taken from {@code q}, up to but excluding the poison pill
     * @throws DBException.Interrupted if the calling thread is interrupted while waiting on the queue
     */
    public static <E> Iterator<E> poisonPillIterator(final BlockingQueue<E> q, final Object poisonPill) {

        return new Iterator<E>() {

            //look-ahead element, null once the poison pill has been seen
            E next = getNext();

            private E getNext() {
                try {
                    E ret = q.take();
                    if(ret==poisonPill)
                        return null;
                    return ret;
                } catch (InterruptedException e) {
                    throw new DBException.Interrupted(e);
                }

            }

            @Override
            public boolean hasNext() {
                return next!=null;
            }

            @Override
            public E next() {
                E ret = next;
                if(ret == null)
                    throw new NoSuchElementException();
                next = getNext();
                return ret;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Build BTreeMap (or TreeSet) from presorted data.
     * This method is much faster than usual import using {@code Map.put(key,value)} method.
     * This is because tree integrity does not have to be maintained and
     * the tree can be created in a linear way.
     *
     * This method expect data to be presorted in **reverse order** (highest to lowest).
     * There are technical reason for this requirement.
     * To sort unordered data use {@link Pump#sort(java.util.Iterator, boolean, int, java.util.Comparator, Serializer, Executor)}
     *
     * This method does not call commit. You should disable Write Ahead Log when this method is used {@link DBMaker.Maker#transactionDisable()}
     *
     *
     * @param source iterator over source data, must be reverse sorted
     * @param keyExtractor transforms items from source iterator into keys. If null source items will be used directly as keys.
     * @param valueExtractor transforms items from source iterator into values. If null BTreeMap will be constructed without values (as Set)
     * @param ignoreDuplicates should be duplicate keys merged into single one?
     * @param nodeSize maximum BTree node size before the node is split.
     * @param valuesStoredOutsideNodes if true values will not be stored as part of BTree nodes
     * @param counterRecid TODO make size counter friendly to use
     * @param keySerializer serializer for keys, use null for default value
     * @param valueSerializer serializer for value, use null for default value
     * @throws org.mapdb20.DBException.PumpSourceNotSorted if source iterator is not reverse sorted
     * @throws org.mapdb20.DBException.PumpSourceDuplicate if source iterator has duplicates
     */
    public static   long buildTreeMap(Iterator source,
                                             Engine engine,
                                             Fun.Function1 keyExtractor,
                                             Fun.Function1 valueExtractor,
                                             boolean ignoreDuplicates,
                                             int nodeSize,
                                             boolean valuesStoredOutsideNodes,
                                             long counterRecid,
                                             BTreeKeySerializer keySerializer,
                                             Serializer valueSerializer,
                                             Executor executor){

        //TODO upper levels of tree  could be created in separate thread

        if(keyExtractor==null)
            keyExtractor= (Fun.Function1) Fun.extractNoTransform();
        if(valueSerializer==null){
            //this is set
            valueSerializer = (Serializer) BTreeMap.BOOLEAN_PACKED;
            if(valueExtractor!=null)
                throw new IllegalArgumentException();
            valueExtractor = new Fun.Function1() {
                @Override
                public Object run(Object e) {
                    return Boolean.TRUE;
                }
            };
        }
        Serializer valueNodeSerializer = valuesStoredOutsideNodes ? BTreeMap.VALREF_SERIALIZER : valueSerializer;

        // update source iterator with new one, which just ignores duplicates
        if(ignoreDuplicates){
            source = ignoreDuplicatesIterator(source,keySerializer.comparator(), keyExtractor);
        }

        source = checkSortedIterator(source,keySerializer.comparator(), keyExtractor);

        final double NODE_LOAD = 0.75;
        // split if node is bigger than this
        final int maxNodeSize = (int) (nodeSize * NODE_LOAD);

        // temporary serializer for nodes
        Serializer nodeSerializer = new BTreeMap.NodeSerializer(valuesStoredOutsideNodes,keySerializer,valueNodeSerializer,0);

        //hold tree structure
        ArrayList> dirKeys = new ArrayList();
        dirKeys.add(new ArrayList());
        ArrayList> dirRecids = new ArrayList();
        dirRecids.add(arrayList(0L));

        ArrayList leafKeys = new ArrayList();
        ArrayList