All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bbn.bue.common.files.MergeDocIDToFileMaps Maven / Gradle / Ivy

The newest version!
package com.bbn.bue.common.files;

import com.bbn.bue.common.collections.MapUtils;
import com.bbn.bue.common.parameters.Parameters;
import com.bbn.bue.common.symbols.Symbol;
import com.bbn.bue.common.symbols.SymbolUtils;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.google.common.io.Files;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Map;

/**
 * Merges DocIDToFileMaps together.
 */
public class MergeDocIDToFileMaps {

  private static final Logger log = LoggerFactory.getLogger(MergeDocIDToFileMaps.class);

  private MergeDocIDToFileMaps() {
    throw new UnsupportedOperationException();
  }

  public static void main(String[] argv) {
    // we wrap the main method in this way to
    // ensure a non-zero return value on failure
    try {
      trueMain(argv);
    } catch (Exception e) {
      e.printStackTrace();
      System.exit(1);
    }
  }

  private static void trueMain(String[] argv) throws IOException {
    if (argv.length != 1) {
      System.err.println("usage: MergeDocIdToFileMaps paramsFile"
          + "outputMap\n"
          + "inputListOfMaps: a file with one filename per line of file maps to merge\n"
          + "allowDuplicatesAndPreferEarlierEntries: boolean controlling our behavior when we find two docIDs\n");
      System.exit(1);
    }

    final Parameters params = Parameters.loadSerifStyle(new File(argv[0]));
    final File listOfMaps = params.getExistingFile("inputListOfMaps");
    final File outputMap = params.getCreatableFile("outputMap");
    final boolean allowDuplicatesAndPreferEarlierEntries = params.getOptionalBoolean("allowDuplicatesAndPreferEarlierEntries").or(false);

    final Map mergedMap = Maps.newHashMap();
    for (final File mapFile : FileUtils
        .loadFileList(Files.asCharSource(listOfMaps, Charsets.UTF_8))) {
      final ImmutableMap mapFromFile = FileUtils.loadSymbolToFileMap(
          Files.asCharSource(mapFile, Charsets.UTF_8));
      log.info("Loaded {} file mappings from {}", mapFromFile.size(), mapFile);
      for (final Map.Entry mapEntry : mapFromFile.entrySet()) {
        final File curMapping = mergedMap.get(mapEntry.getKey());
        if (curMapping == null) {
          mergedMap.put(mapEntry.getKey(), mapEntry.getValue());
        } else if (!curMapping.equals(mapEntry.getValue()) && !allowDuplicatesAndPreferEarlierEntries) {
          throw new RuntimeException(mapEntry.getKey() + " is mapped to " + mapEntry.getValue()
              + " in " + mapFile + " but has been mapped to " + curMapping + " in a previously "
              + "processed file");
        } else {
          // do nothing - it's fine to repeat the same mapping in multiple files
        }
      }
    }
    outputMap.getParentFile().mkdirs();

    final ImmutableMap sortedMergedMap = MapUtils.copyWithKeysSortedBy(mergedMap,
        SymbolUtils.byStringOrdering());
    ;
    log.info("Wrote map of {} files to {}", sortedMergedMap.size(), outputMap);
    FileUtils.writeSymbolToFileMap(sortedMergedMap, Files.asCharSink(outputMap, Charsets.UTF_8));
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy