All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.campagnelab.dl.somatic.tools.Randomize Maven / Gradle / Ivy

package org.campagnelab.dl.somatic.tools;


import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.XoRoShiRo128PlusRandom;
import org.apache.commons.io.FileUtils;
import org.campagnelab.dl.framework.tools.arguments.AbstractTool;
import org.campagnelab.dl.somatic.storage.RecordWriter;
import org.campagnelab.dl.varanalysis.protobuf.BaseInformationRecords;
import org.campagnelab.dl.somatic.storage.RecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Random;

/**
 * The randomizer object iterates over a parquet file and randomizes the order of records in batches.
 * 

* Created by rct66 on 5/18/16. * * @author rct66 */ public class Randomize extends AbstractTool { static private Logger LOG = LoggerFactory.getLogger(Randomize.class); public static void main(String[] args) { Randomize tool = new Randomize(); tool.parseArguments(args, "Randomize", tool.createArguments()); tool.execute(); } @Override public void execute() { String workingDir = new File(args().outputFile).getParent(); if (workingDir == null) { workingDir = "."; } try { long totalRecords = 0; for (String filename : args().inputFiles) { RecordReader source = new RecordReader(filename); totalRecords += source.getTotalRecords(); source.close(); } int numBuckets = (int)(totalRecords / arguments.recordsPerBucket) + 1; new File(workingDir + "/tmp").mkdir(); List bucketWriters = new ObjectArrayList(numBuckets); for (int i = 0; i < numBuckets; i++) { bucketWriters.add(new RecordWriter(workingDir + "/tmp/bucket" + i, arguments.chunkSizePerWriter)); } RecordWriter allWriter = new RecordWriter(args().outputFile); Random rand = new XoRoShiRo128PlusRandom(args().randomSeed); //set up logger ProgressLogger pgRead = new ProgressLogger(LOG); pgRead.itemsName = "sites"; pgRead.expectedUpdates = totalRecords; pgRead.displayFreeMemory = true; pgRead.start(); //fill buckets randomly System.out.println("Filling " + numBuckets + " temp buckets randomly"); for (String filename : args().inputFiles) { RecordReader source = new RecordReader(filename); for (BaseInformationRecords.BaseInformation rec : source) { int bucket = rand.nextInt(numBuckets); bucketWriters.get(bucket).writeRecord(rec); pgRead.lightUpdate(); } source.close(); System.gc(); } pgRead.stop(); System.out.println("Shuffling contents of each bucket and writing to output file"); System.out.printf("There are %d buckets to shuffle\n",numBuckets); //iterate over buckets ProgressLogger pgTempBucket = new ProgressLogger(LOG); pgTempBucket.itemsName = "buckets"; pgTempBucket.expectedUpdates = numBuckets; pgTempBucket.displayFreeMemory = true; pgTempBucket.start(); int i = 0; for (RecordWriter bucketWriter : bucketWriters) { bucketWriter.close(); //put contents of bucket in a list RecordReader bucketReader = new RecordReader(workingDir + "/tmp/bucket" + i ); List records = new ObjectArrayList<>(arguments.recordsPerBucket); for (BaseInformationRecords.BaseInformation rec : bucketReader) { records.add(rec); } bucketReader.close(); //shuffle list Collections.shuffle(records, rand); //write list to final file for (BaseInformationRecords.BaseInformation rec : records) { allWriter.writeRecord(rec); } i++; pgTempBucket.update(); } pgTempBucket.stop(); allWriter.close(); //delete temp files FileUtils.deleteDirectory(new File((workingDir + "/tmp"))); } catch (IOException e) { throw new RuntimeException(e); } } @Override public RandomizerArguments createArguments() { return new RandomizerArguments(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy