All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.sfj.ReplacementDiskSort Maven / Gradle / Ivy

/*
 * Copyright 2020 C. Schanck
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sfj;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.concurrent.atomic.AtomicInteger;

import static java.util.Collections.singletonList;

/**
 * External merge sort implementation. Mostly as Knuth describes, at least the
 * run generation phase. Uses the replacement selection method to generate runs
 * of sorted elements from an unsorted source N at a time, then merges them M
 * at a time.
 *
 * <p>The input and output is delegated to a pair of interfaces,
 * {@link ExternalIterator} and {@link ExternalAppender}. You use the class by
 * providing callbacks that make arbitrary iterators and appenders, which are
 * then used to read and write your subclass of {@link Element}. So if you have
 * binary ints, fine. Strings, fine. Just provide the appender/iterators to make
 * it happen.
 *
 * @param <E> element subclass
 *
 * @author cschanck
 */
public class ReplacementDiskSort<E extends ReplacementDiskSort.Element> {

  /**
   * Statistics recorded for one pass (pass 0 is run generation, later passes
   * are merges): the files read, the files written, the element count of each
   * written file and the elapsed wall-clock time.
   */
  public static class PassInfo {
    private final int pass;
    private final long runTimeMS;
    private final List<File> srcFiles;
    private final List<File> destFiles;
    private final List<Long> runCounts;

    public PassInfo(int pass, List<File> srcFiles, List<File> destFiles, List<Long> destCounts, long runTimeMS) {
      this.pass = pass;
      this.srcFiles = srcFiles;
      this.destFiles = destFiles;
      this.runCounts = destCounts;
      this.runTimeMS = runTimeMS;
    }

    public int getPass() {
      return pass;
    }

    public long getRunTimeMS() {
      return runTimeMS;
    }

    public List<File> getSrcFiles() {
      return srcFiles;
    }

    public List<File> getDestFiles() {
      return destFiles;
    }

    public List<Long> getRunCounts() {
      return runCounts;
    }

    @Override
    public String toString() {
      return "PassInfo{" + "pass=" + pass + ", runTimeMS=" + runTimeMS + ", srcFiles=" + srcFiles + ", destFiles="
             + destFiles + ", runCounts=" + runCounts + '}';
    }
  }

  /**
   * This is the class you need to subclass; you will provide
   * file iterators (readers) and file appenders for your subclass.
   *
   * <p>Store anything as the data object, but you need to render it
   * to and from disk.
   */
  public static class Element {
    private final Object data;
    // Run number assigned during run generation; freshly read elements start at 0.
    private int run = 0;

    public Element(Object data) {
      this.data = data;
    }

    public Object getData() {
      return data;
    }

    protected int getRun() {
      return run;
    }

    void setRun(int run) {
      this.run = run;
    }

    @Override
    public String toString() {
      return "Element{" + "data=" + data + ", run=" + run + '}';
    }
  }

  /**
   * An external iterator. Continues to provide data until
   * the next() method returns null. Providers should take care to close
   * any underlying resource when returning null.
   *
   * @param <EE> Element subclass
   */
  public interface ExternalIterator<EE extends Element> {
    EE next() throws IOException;
  }

  /**
   * An external appender. Appends Elements until the close
   * method is called explicitly.
   *
   * @param <EE> Element subclass
   */
  public interface ExternalAppender<EE extends Element> {
    void append(EE elem) throws IOException;

    void close();
  }

  /**
   * Make an iterator for a specific file.
   *
   * @param <EE> Element subclass
   */
  @FunctionalInterface
  public interface IterMaker<EE extends Element> {
    ExternalIterator<EE> make(File f) throws IOException;
  }

  /**
   * Make an appender for a specific file.
   *
   * @param <EE> Element subclass
   */
  @FunctionalInterface
  public interface AppenderMaker<EE extends Element> {
    ExternalAppender<EE> make(File f) throws IOException;
  }

  private final AppenderMaker<E> appenderMaker;
  private final Comparator<E> comp;
  private final boolean deleteFiles;
  private final IterMaker<E> iteratorMaker;
  private final AtomicInteger filenameCounter = new AtomicInteger(0);
  protected File workDirectory;
  private final List<PassInfo> runPassInfo = new ArrayList<>();
  private PrintStream verbose = System.out;

  /**
   * Constructor.
   *
   * @param iteratorMaker maker for iterator.
   * @param appenderMaker maker for appender
   * @param comp Comparator for your subclass of Element
   * @param deleteFiles delete the intermediate files as you go
   */
  public ReplacementDiskSort(IterMaker<E> iteratorMaker,
                             AppenderMaker<E> appenderMaker,
                             Comparator<E> comp,
                             boolean deleteFiles) {
    this.iteratorMaker = iteratorMaker;
    this.appenderMaker = appenderMaker;
    // Order by run number first so elements tagged for a later run sink below
    // everything in the current run; the caller's comparator breaks ties.
    this.comp = (c1, c2) -> {
      int ret = Integer.compare(c1.getRun(), c2.getRun());
      if (ret == 0) {
        return comp.compare(c1, c2);
      }
      return ret;
    };
    this.deleteFiles = deleteFiles;
  }

  /**
   * Single method to do a single run.
   *
   * @param iteratorMaker maker for iterator.
   * @param appenderMaker maker for appender
   * @param comp Comparator for your subclass of Element
   * @param deleteFiles delete the intermediate files as you go
   * @param src source file
   * @param maxElementsForRuns max number of elements to have in memory for
   * generating initial run files. This controls how much memory the sort takes,
   * as it includes maxElementsForRuns element overhead, plus 1X the buffering needed
   * for an appender, and 1X the buffering needed for an iterator. Must be at least 1.
   * @param maxElementsForMerges max number of files merged at once in a merge
   * pass. A merge pass keeps one appender and up to maxElementsForMerges
   * iterators open, so depending on the buffering you do in your iterators it
   * can have a bigger footprint if you are not careful. Must be at least 2.
   * @param dest destination file. Cannot be the same as source file.
   * @param workDir working directory
   * @param <EE> Element subclass
   * @return the disk sort object used for the sort
   * @throws IOException if a file precondition fails or reading/writing fails
   * @throws IllegalArgumentException if either size limit is too small
   */
  public static <EE extends Element> ReplacementDiskSort<EE> runOnce(IterMaker<EE> iteratorMaker,
                                                                     AppenderMaker<EE> appenderMaker,
                                                                     Comparator<EE> comp,
                                                                     boolean deleteFiles,
                                                                     File src,
                                                                     int maxElementsForRuns,
                                                                     int maxElementsForMerges,
                                                                     File dest,
                                                                     File workDir) throws IOException {
    ReplacementDiskSort<EE> ret = new ReplacementDiskSort<>(iteratorMaker, appenderMaker, comp, deleteFiles);
    ret.run(src, maxElementsForRuns, maxElementsForMerges, dest, workDir);
    return ret;
  }

  /**
   * Sets the stream progress messages are printed to.
   *
   * @param verbose target stream, or null to silence progress output
   * @return this
   */
  public ReplacementDiskSort<E> setVerbose(PrintStream verbose) {
    this.verbose = verbose;
    return this;
  }

  private void verbose(String fmt, Object... args) {
    if (verbose != null) {
      verbose.println(String.format(fmt, args));
    }
  }

  /**
   * Sorts {@code src} into {@code dest}: generates sorted runs in the working
   * directory, merges them until one file remains, then moves that file to
   * {@code dest}.
   *
   * @param src source file; must exist and be readable
   * @param maxElementsForRuns max elements held in memory during run generation (at least 1)
   * @param maxElementsForMerges max files merged at once (at least 2)
   * @param dest destination file; must not exist yet
   * @param workingDirectory existing, writable directory for intermediate files
   * @throws IOException if a file precondition fails or reading/writing fails
   * @throws IllegalArgumentException if either size limit is too small
   */
  public synchronized void run(File src,
                               int maxElementsForRuns,
                               int maxElementsForMerges,
                               File dest,
                               File workingDirectory) throws IOException {
    if (!src.exists() || !src.canRead()) {
      throw new IOException("Can't read source file: [" + src + "]");
    }
    if (dest.exists()) {
      throw new IOException("Can't write to dest file: [" + dest + "]");
    }
    if (!workingDirectory.exists() || !workingDirectory.canWrite() || !workingDirectory.isDirectory()) {
      throw new IOException(
        "Can't write to working directory/does not exist/not directory: [" + workingDirectory + "]");
    }
    if (maxElementsForRuns < 1) {
      throw new IllegalArgumentException("maxElementsForRuns must be >= 1: " + maxElementsForRuns);
    }
    // A fan-in below 2 can never reduce the number of files, so merging would loop forever.
    if (maxElementsForMerges < 2) {
      throw new IllegalArgumentException("maxElementsForMerges must be >= 2: " + maxElementsForMerges);
    }

    this.workDirectory = workingDirectory;
    List<File> current = makeRuns(src, maxElementsForRuns);
    List<File> next = new ArrayList<>();
    int pass = 1;
    while (current.size() > 1) {
      while (!current.isEmpty()) {
        List<File> subFiles;
        if (current.size() <= maxElementsForMerges) {
          // everything left fits in one merge
          subFiles = current;
        } else if (current.size() < 2L * maxElementsForMerges) {
          // fewer than two full groups left: split roughly in half to balance them
          subFiles = current.subList(0, current.size() / 2);
        } else {
          subFiles = current.subList(0, maxElementsForMerges);
        }
        File interim = passFile(pass);
        mergePass(pass++, subFiles, interim);
        // subFiles is either current itself or a view of it; clearing consumes the merged inputs
        subFiles.clear();
        next.add(interim);
      }
      current = next;
      next = new ArrayList<>();
    }
    moveIntoPlace(current.get(0), dest);
    verbose("Sort complete. %d items. Elapsed time: %dms ",
            runPassInfo.get(runPassInfo.size() - 1).getRunCounts().get(0),
            runPassInfo.stream().mapToLong(PassInfo::getRunTimeMS).sum());
  }

  /**
   * Moves the final sorted file to its destination, atomically when the file
   * system supports it and with a regular move otherwise.
   */
  private static void moveIntoPlace(File from, File to) throws IOException {
    try {
      Files.move(from.toPath(), to.toPath(), StandardCopyOption.ATOMIC_MOVE);
    } catch (java.nio.file.AtomicMoveNotSupportedException e) {
      Files.move(from.toPath(), to.toPath());
    }
  }

  private File passFile(int pass) {
    return new File(workDirectory, "pass-" + pass + "-" + filenameCounter.getAndIncrement());
  }

  /**
   * Pass 0: reads {@code src} and writes sorted run files using replacement
   * selection with a priority queue of at most {@code maxElementsForRuns}
   * elements.
   *
   * @param src source file
   * @param maxElementsForRuns queue capacity
   * @return the run files, in generation order (always at least one file)
   * @throws IOException on read/write failure
   */
  protected List<File> makeRuns(File src, int maxElementsForRuns) throws IOException {
    verbose("Pass 0: Generating Runs...");
    PriorityQueue<E> q = new PriorityQueue<>(maxElementsForRuns, this.comp);
    long msStart = System.currentTimeMillis();
    ExternalIterator<E> elements = iteratorMaker.make(src);
    List<File> files = new ArrayList<>();
    boolean doneReading = false;

    // fill the queue first. all run 0.
    for (int i = 0; i < maxElementsForRuns; i++) {
      E p = elements.next();
      if (p == null) {
        // the iterator may have released its resource; never call next() again
        doneReading = true;
        break;
      }
      q.add(p);
    }

    int currentRun = 0;
    File f = passFile(0);
    ExternalAppender<E> output = makeAppender(f);
    files.add(f);
    long count = 0;
    List<Long> runCounts = new ArrayList<>();
    try {
      while (!q.isEmpty()) {
        E val = q.poll();

        // if no more from this run, roll run file
        if (val.getRun() != currentRun) {
          runCounts.add(count);
          output.close();
          output = null; // avoid a second close if opening the next file fails
          verbose("Pass 0: Generated run %d with %d elements...", currentRun, count);
          f = passFile(0);
          ++currentRun;
          output = makeAppender(f);
          files.add(f);
          count = 0;
        }

        // write it
        output.append(val);
        count++;

        // if we have another
        if (!doneReading) {
          E newVal = elements.next();
          if (newVal != null) {
            newVal.setRun(currentRun);
            // if it is out of order wrt last written value, it belongs to a future run
            if (comp.compare(newVal, val) < 0) {
              newVal.setRun(currentRun + 1);
            }
            // add new one back in.
            q.add(newVal);
          } else {
            doneReading = true;
          }
        }
      }
    } finally {
      if (output != null) {
        output.close();
      }
    }
    verbose("Pass 0: Generated run %d with %d elements...", currentRun, count);
    runCounts.add(count);
    long tookMS = System.currentTimeMillis() - msStart;
    this.runPassInfo.add(new PassInfo(0, singletonList(src), new ArrayList<>(files), runCounts, tookMS));
    return files;
  }

  /**
   * @return read-only view of the pass statistics recorded so far
   */
  public List<PassInfo> getPassInfo() {
    return Collections.unmodifiableList(this.runPassInfo);
  }

  private ExternalAppender<E> makeAppender(File f) throws IOException {
    if (f.exists()) {
      throw new IOException("File: " + f + " exists; expected it to be missing");
    }
    return appenderMaker.make(f);
  }

  // Writes the whole buffer, looping because a single channel write may be partial.
  private void writeFully(FileChannel ch, ByteBuffer slice) throws IOException {
    while (slice.hasRemaining()) {
      ch.write(slice);
    }
  }

  /**
   * One input file of a merge together with its next unread element.
   */
  private class FileHead {
    private final ExternalIterator<E> iter;
    private E next;

    public FileHead(File f) throws IOException {
      this.iter = iteratorMaker.make(f);
      this.next = iter.next();
    }

    public boolean isDone() {
      return next == null;
    }

    public E pullElement() throws IOException {
      E ret = next;
      if (ret != null) {
        next = iter.next();
      }
      return ret;
    }
  }

  /**
   * Merges the sorted {@code inputFiles} into {@code dest} and records the
   * pass statistics. Input files are deleted afterwards when the sorter was
   * created with {@code deleteFiles}.
   *
   * @param pass pass number, used for reporting
   * @param inputFiles sorted files to merge
   * @param dest file to create; must not exist
   * @return {@code dest}
   * @throws IOException on read/write failure
   */
  protected File mergePass(int pass, List<File> inputFiles, File dest) throws IOException {
    verbose("Merge pass %d: for %s...", pass, inputFiles);
    long startMS = System.currentTimeMillis();
    PriorityQueue<FileHead> q = new PriorityQueue<>((c1, c2) -> comp.compare(c1.next, c2.next));
    for (File file : inputFiles) {
      FileHead head = new FileHead(file);
      // an empty file has no head element to compare; leaving it out avoids a null in the comparator
      if (!head.isDone()) {
        q.add(head);
      }
    }
    long cnt = 0;
    ExternalAppender<E> output = makeAppender(dest);
    try {
      while (!q.isEmpty()) {
        FileHead n = q.poll();
        E elem = n.pullElement();
        if (elem != null) {
          cnt++;
          output.append(elem);
        }
        // re-queue so the file is re-ordered by its new head element
        if (!n.isDone()) {
          q.add(n);
        }
      }
    } finally {
      output.close();
    }
    long tookMS = System.currentTimeMillis() - startMS;
    if (deleteFiles) {
      for (File file : inputFiles) {
        if (!file.delete()) {
          // best effort: a leftover intermediate file does not affect the result
          verbose("Merge pass %d: could not delete intermediate file %s", pass, file);
        }
      }
    }
    PassInfo pi = new PassInfo(pass, new ArrayList<>(inputFiles), singletonList(dest), singletonList(cnt), tookMS);
    runPassInfo.add(pi);
    verbose("Merge pass %d: completed: %s elements in %dms", pass, pi.getRunCounts().get(0), pi.getRunTimeMS());
    return dest;
  }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy