All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.jena.atlas.data.DistinctDataNet Maven / Gradle / Ivy

There is a newer version: 5.1.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.atlas.data;

import java.io.File ;
import java.io.FileNotFoundException ;
import java.util.ArrayList ;
import java.util.Comparator ;
import java.util.Iterator ;
import java.util.List ;
import java.util.NoSuchElementException ;

import org.apache.jena.atlas.AtlasException ;
import org.apache.jena.atlas.iterator.Iter ;
import org.apache.jena.atlas.lib.Closeable ;
import org.apache.jena.atlas.lib.FileOps ;


/**
 * This class is like {@link DistinctDataBag} except that you are informed if the item you just
 * added was known to be distinct.  This will normally only work until the first spill.  After that,
 * the system may not be able to tell for sure, and will thus return false.  When you are finished
 * adding items, you may call {@link #netIterator()} to get any distinct items that are in the
 * spill files but were not indicated as distinct previously.
 */
public class DistinctDataNet extends DistinctDataBag
{
    protected File firstSpillFile;
    
    public DistinctDataNet(ThresholdPolicy policy, SerializationFactory serializerFactory, Comparator comparator)
    {
        super(policy, serializerFactory, comparator) ;
    }
    
    /**
     * @return true if the item added is known to be distinct.
     */
    public boolean netAdd(E item)
    {
        long s = size ;
        super.add(item) ;
        return !spilled && size > s ;
    }
    
    @Override
    protected void registerSpillFile(File spillFile)
    {
        // If this is the first time spilling, then keep this spill file separate
        if (!spilled)
        {
            firstSpillFile = spillFile;
        }
        else
        {
            super.registerSpillFile(spillFile);
        }
    }
    
    @Override
    protected void deleteSpillFiles()
    {
        super.deleteSpillFiles();
        if (null != firstSpillFile)
        {
            FileOps.delete(firstSpillFile, false);
            firstSpillFile = null;
        }
    }
    
    // Used by the .iterator() method
    @Override
    protected List getSpillFiles()
    {
        List toReturn = new ArrayList<>(super.getSpillFiles());
        if (null != firstSpillFile)
        {
            toReturn.add(firstSpillFile);
        }
        return toReturn;
    }
    
    // TODO: Will be used by the .netIterator() method
    protected List getNetSpillFiles()
    {
        return super.getSpillFiles();
    }
    
    /**
     * Returns an iterator to all additional items that are distinct but were
     * not reported to be so at the time {@link #netAdd(Object)} was invoked.
     * 

* If you do not exhaust the iterator, you should call {@link org.apache.jena.atlas.iterator.Iter#close(Iterator)} * to be sure any open file handles are closed. */ public Iterator netIterator() { // If we havn't spilled, then we have already indicated all distinct values via .netAdd() if (!spilled) { return Iter.nullIterator(); } Iterator blacklist; try { blacklist = getInputIterator(firstSpillFile); } catch ( FileNotFoundException e ) { throw new AtlasException("Cannot find the first spill file", e); } // TODO: Improve performance by making the superclass .iterator() use getNetSpillFiles() // instead of getSpillFiles() so it doesn't contain the contents of the first file Iterator rest = super.iterator(); SortedDiffIterator sdi = SortedDiffIterator.create(rest, blacklist, comparator); registerCloseableIterator(sdi); return sdi; } /** * Produces the set difference of two sorted set sequences. */ protected static class SortedDiffIterator implements Iterator, Closeable { private final Iterator grayList; private final Iterator blackList; private final Comparator comp; private boolean finished = false; private boolean blackSlotFull = false; private T white; private T black; /** * Produces the set difference of two sorted set sequences using the natural ordering of the items * (null items will always be considered less than any other items). * * @param first An Iterator<T> whose elements that are not also in second will be returned. * @param second An Iterator<T> whose elements that also occur in the first sequence will cause those elements to be removed from the returned sequence. */ public static > SortedDiffIterator create(Iterator first, Iterator second) { return create(first, second, new Comparator() { @Override public int compare(S o1, S o2) { if (null == o1 && null == o2) return 0; if (null == o1) return -1; if (null == o2) return 1; return o1.compareTo(o2); } }); } /** * Produces the set difference of two sorted set sequences using the specified comparator. 
* * @param first An Iterator<T> whose elements that are not also in second will be returned. * @param second An Iterator<T> whose elements that also occur in the first sequence will cause those elements to be removed from the returned sequence. * @param comparator The comparator used to compare the elements from each iterator. */ public static SortedDiffIterator create(Iterator first, Iterator second, Comparator comparator) { return new SortedDiffIterator<>(first, second, comparator); } private SortedDiffIterator(Iterator first, Iterator second, Comparator comparator) { this.grayList = first; this.blackList = second; this.comp = comparator; // Prime the white item fill(); } private void fill() { if (finished) return; if (!grayList.hasNext()) { close(); return; } if (!blackSlotFull) { if (!blackList.hasNext()) { white = grayList.next(); return; } black = blackList.next(); blackSlotFull = true; } // Outer loop advances white while (true) { if (!grayList.hasNext()) { close(); return; } white = grayList.next(); int cmp = comp.compare(white, black); if (cmp < 0) return; // Inner loop advances black until white is less than or equal to it while (cmp > 0) { if (!blackList.hasNext()) { black = null; blackSlotFull = false; return; } black = blackList.next(); cmp = comp.compare(white, black); if (cmp < 0) return; } } } @Override public boolean hasNext() { return !finished; } @Override public T next() { if (finished) throw new NoSuchElementException(); T toReturn = white; fill(); return toReturn; } @Override public void remove() { throw new UnsupportedOperationException("SortedDiffIterator.remove"); } @Override public void close() { finished = true; white = null; black = null; Iter.close(grayList); Iter.close(blackList); } } }