edu.isi.nlp.files.SplitCorpus Maven / Gradle / Ivy
package edu.isi.nlp.files;
import static com.google.common.base.Functions.compose;
import static com.google.common.base.Preconditions.checkArgument;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Files;
import com.google.common.math.IntMath;
import edu.isi.nlp.parameters.Parameters;
import edu.isi.nlp.symbols.Symbol;
import edu.isi.nlp.symbols.SymbolUtils;
import java.io.File;
import java.io.IOException;
import java.math.RoundingMode;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Given a file list, a file map, or both, will split it into either a fixed number of chunks or
* chunks of a fixed size. This is primarily used by Corpus::split() in Corpus.pm in
* buetext/perl-modules for splitting corpora in Runjobs code.
*
* The behavior of this program is deterministic.
*
* @author Constantine Lignos, Ryan Gabbard
*/
public final class SplitCorpus {
private static final Logger log = LoggerFactory.getLogger(SplitCorpus.class);
private static final String USAGE =
"SplitCorpus paramFile\n"
+ "Parameters are:\n"
+ "\tcom.bbn.bue.splitCorpus.inputList: list of files to split. Optional.\n"
+ "\tcom.bbn.bue.splitCorpus.inputMap: docId to file map of files to split. Optional.\n"
+ "\tcom.bbn.bue.splitCorpus.outputDir: path to write output\n"
+ "\tcom.bbn.bue.splitCorpus.numChunks: the number of chunks to split the corpus into. Optional.\n"
+ "\tcom.bbn.bue.splitCorpus.chunkSize: the number of of files to put in each chunk. Optional.\n"
+ "\n"
+ "If inputList is given, output file lists will be written to outputDir/split/fileList.txt\n"
+ "\tand a list of these lists will be written to outputDir/listOfLists.txt\n"
+ "If inputMap is given, output file maps will be written to outputDir/split/fileMap.txt\n"
+ "\tand a list of these maps will be written to outputDir/listOfMaps.txt\n"
+ "At least one of inputList and inputMap must be specified.\n"
+ "Exactly one of numChunks and chunkSize may be specified.";
public static final String INPUT_LIST_PARAM = "com.bbn.bue.splitCorpus.inputList";
public static final String INPUT_MAP_PARAM = "com.bbn.bue.splitCorpus.inputMap";
public static final String OUTPUT_DIR_PARAM = "com.bbn.bue.splitCorpus.outputDir";
public static final String NUM_CHUNKS_PARAM = "com.bbn.bue.splitCorpus.numChunks";
public static final String CHUNK_SIZE_PARAM = "com.bbn.bue.splitCorpus.chunkSize";
public static void main(String[] argv) {
// we wrap the main method in this way to
// ensure a non-zero return value on failure
try {
trueMain(argv);
} catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
}
private static void trueMain(String[] argv) throws IOException {
if (argv.length != 1) {
log.info(USAGE);
System.exit(1);
}
final Parameters params = Parameters.loadSerifStyle(new File(argv[0]));
final File outputDir = params.getCreatableDirectory(OUTPUT_DIR_PARAM);
params.assertAtLeastOneDefined(INPUT_LIST_PARAM, INPUT_MAP_PARAM);
final Optional inputFileListFile = params.getOptionalExistingFile(INPUT_LIST_PARAM);
final Optional inputFileMapFile = params.getOptionalExistingFile(INPUT_MAP_PARAM);
final ImmutableMap docIdToFileMap =
loadDocIdToFileMap(inputFileListFile, inputFileMapFile);
params.assertExactlyOneDefined(NUM_CHUNKS_PARAM, CHUNK_SIZE_PARAM);
final Iterable>> chunks;
if (params.isPresent(NUM_CHUNKS_PARAM)) {
chunks = splitToNChunks(docIdToFileMap, params.getPositiveInteger(NUM_CHUNKS_PARAM));
} else {
chunks =
splitToChunksOfFixedSize(docIdToFileMap, params.getPositiveInteger(CHUNK_SIZE_PARAM));
}
final List listFiles = Lists.newArrayList();
final List mapFiles = Lists.newArrayList();
int chunkIdx = 0;
for (final List> chunk : chunks) {
final ImmutableMap chunkDocIdToFileMap = ImmutableMap.copyOf(chunk);
final ImmutableList chunkFileList = ImmutableList.copyOf(chunkDocIdToFileMap.values());
final File chunkOutputDir = new File(outputDir, Integer.toString(chunkIdx));
chunkOutputDir.mkdir();
final File chunkMapFile = new File(chunkOutputDir, "fileMap.txt");
mapFiles.add(chunkMapFile);
if (inputFileMapFile.isPresent()) {
// maps are only written if a map was given as input
FileUtils.writeSymbolToFileMap(
chunkDocIdToFileMap, Files.asCharSink(chunkMapFile, Charsets.UTF_8));
}
final File chunkListFile = new File(chunkOutputDir, "fileList.txt");
listFiles.add(chunkListFile);
if (inputFileListFile.isPresent()) {
// lists are only written if a list was given as input
FileUtils.writeFileList(chunkFileList, Files.asCharSink(chunkListFile, Charsets.UTF_8));
}
++chunkIdx;
}
// write lists pointing to output files
log.info("Split into {} chunks", chunkIdx);
if (inputFileListFile.isPresent()) {
// lists are only written if a list was given as input
final File listOfListsFile = new File(outputDir, "listOfLists.txt");
log.info("List of file lists written to {}", listOfListsFile);
FileUtils.writeFileList(listFiles, Files.asCharSink(listOfListsFile, Charsets.UTF_8));
}
if (inputFileMapFile.isPresent()) {
// maps are only written if a map was given as input
final File listOfMapsFile = new File(outputDir, "listOfMaps.txt");
log.info("List of file maps written to {}", listOfMapsFile);
FileUtils.writeFileList(mapFiles, Files.asCharSink(listOfMapsFile, Charsets.UTF_8));
}
}
private static Iterable>> splitToChunksOfFixedSize(
final ImmutableMap inputMap, int chunkSize) {
checkArgument(chunkSize > 0);
return Iterables.partition(inputMap.entrySet(), chunkSize);
}
/** If there are fewer files than chunks, fewer than numChunks will be returned. */
private static Iterable>> splitToNChunks(
final ImmutableMap inputMap, int numChunks) {
checkArgument(numChunks > 0);
final List> emptyChunk = ImmutableList.of();
if (inputMap.isEmpty()) {
return Collections.nCopies(numChunks, emptyChunk);
}
final int chunkSize = IntMath.divide(inputMap.size(), numChunks, RoundingMode.UP);
final ImmutableList>> chunks =
ImmutableList.copyOf(splitToChunksOfFixedSize(inputMap, chunkSize));
if (chunks.size() == numChunks) {
return chunks;
} else {
// there weren't enough elements to make the desired number of chunks, so we need to
// pad with empty chunks
final int shortage = numChunks - chunks.size();
final List>> padding = Collections.nCopies(shortage, emptyChunk);
return Iterables.concat(chunks, padding);
}
}
/**
* Gets a doc-id-to-file map for the input, either directly or making a fake one based on an input
* file list.
*/
private static ImmutableMap loadDocIdToFileMap(
final Optional inputFileListFile, final Optional inputFileMapFile)
throws IOException {
checkArgument(inputFileListFile.isPresent() || inputFileMapFile.isPresent());
final Optional> fileList;
if (inputFileListFile.isPresent()) {
fileList = Optional.of(FileUtils.loadFileList(inputFileListFile.get()));
} else {
fileList = Optional.absent();
}
final Optional> fileMap;
if (inputFileMapFile.isPresent()) {
fileMap = Optional.of(FileUtils.loadSymbolToFileMap(inputFileMapFile.get()));
} else {
fileMap = Optional.absent();
}
// sanity checks
if (fileList.isPresent()) {
// file list may not contain duplicates
final boolean containsDuplicates =
ImmutableSet.copyOf(fileList.get()).size() != fileList.get().size();
if (containsDuplicates) {
throw new RuntimeException("Input file list contains duplicates");
}
}
// if both a file map and a file list are given, they must be compatible.
if (fileList.isPresent() && fileMap.isPresent()) {
if (fileList.get().size() != fileMap.get().size()) {
throw new RuntimeException(
"Input file list and file map do not match in size ("
+ fileList.get().size()
+ " vs "
+ fileMap.get().size());
}
final boolean haveExactlyTheSameFiles =
ImmutableSet.copyOf(fileList.get()).equals(ImmutableSet.copyOf(fileMap.get().values()));
if (!haveExactlyTheSameFiles) {
throw new RuntimeException(
"Input file list and file map do not containe exactly the same files");
}
}
// output
if (fileMap.isPresent()) {
return fileMap.get();
} else {
// if we only had a file list as input, we make a fake doc-id-to-file-map using
// the absolute path as the document ID. This won't get output, so it doesn't matter
// that this is a little hacky
final Function fileNameAsSymbolFunction =
compose(SymbolUtils.symbolizeFunction(), FileUtils.toAbsolutePathFunction());
return Maps.uniqueIndex(fileList.get(), fileNameAsSymbolFunction);
}
}
}