All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.isi.nlp.files.ConvertFileMapToEmbeddedDB Maven / Gradle / Ivy

The newest version!
package edu.isi.nlp.files;

import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.ByteSource;
import edu.isi.nlp.parameters.Parameters;
import edu.isi.nlp.symbols.Symbol;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Converts a a key-to-file map to a PalDB-backed database.
 *
 * @author Constantine Lignos
 */
public final class ConvertFileMapToEmbeddedDB {

  private static Logger log = LoggerFactory.getLogger(ConvertFileMapToEmbeddedDB.class);

  public static void main(String[] args) {
    // We wrap the main method in this way to ensure a non-zero return value on failure
    try {
      trueMain(args);
    } catch (Exception e) {
      e.printStackTrace();
      System.exit(1);
    }
  }

  private static void trueMain(String[] args) throws IOException {
    if (args.length != 1) {
      System.out.println("No parameter file specified");
      System.exit(1);
    }
    final Parameters params = Parameters.loadSerifStyle(new File(args[0]));
    final File inputMap = params.getExistingFile("inputMap");
    final File outputFile = params.getCreatableFile("outputFile");
    final boolean useCompression = params.getOptionalBoolean("useCompression").or(true);

    // Set up timing
    long readTime = 0;
    long writeTime = 0;

    log.info("Loading files from {}", inputMap);
    final Stopwatch stopwatch = Stopwatch.createStarted();
    final ImmutableMap fileMap = FileUtils.loadSymbolToFileMap(inputMap);
    final ImmutableKeyValueSource source = KeyValueSources.fromFileMap(fileMap);
    readTime += stopwatch.elapsed(TimeUnit.MILLISECONDS);

    // To allow us to time the open/close operations, we don't use try-with-resources
    stopwatch.reset().start();
    final KeyValueSink sink = KeyValueSinks.forPalDB(outputFile, useCompression);
    writeTime += stopwatch.elapsed(TimeUnit.MILLISECONDS);

    // Process files
    for (final Symbol key : source.keys()) {
      // Read
      stopwatch.reset().start();
      byte[] value = source.getRequired(key).read();
      readTime += stopwatch.elapsed(TimeUnit.MILLISECONDS);

      // Write
      stopwatch.reset().start();
      sink.put(key, value);
      writeTime += stopwatch.elapsed(TimeUnit.MILLISECONDS);
    }

    // Time closing
    stopwatch.reset().start();
    sink.close();
    writeTime += stopwatch.elapsed(TimeUnit.MILLISECONDS);

    log.info("Wrote {} documents to {}", fileMap.size(), outputFile);
    log.info("Read time: {}", readTime);
    log.info("Write time: {}", writeTime);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy