All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.isi.nlp.files.MergeDocIDToFileMaps Maven / Gradle / Ivy

The newest version!
package edu.isi.nlp.files;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Maps;
import com.google.common.io.Files;
import edu.isi.nlp.parameters.Parameters;
import edu.isi.nlp.symbols.Symbol;
import edu.isi.nlp.symbols.SymbolUtils;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Merges DocIDToFileMaps together.
 *
 * @author Ryan Gabbard
 */
public class MergeDocIDToFileMaps {

  private static final Logger log = LoggerFactory.getLogger(MergeDocIDToFileMaps.class);

  private MergeDocIDToFileMaps() {
    throw new UnsupportedOperationException();
  }

  public static void main(String[] argv) {
    // we wrap the main method in this way to
    // ensure a non-zero return value on failure
    try {
      trueMain(argv);
    } catch (Exception e) {
      e.printStackTrace();
      System.exit(1);
    }
  }

  private static void trueMain(String[] argv) throws IOException {
    if (argv.length != 1) {
      System.err.println(
          "usage: MergeDocIdToFileMaps paramsFile"
              + "outputMap\n"
              + "inputListOfMaps: a file with one filename per line of file maps to merge\n"
              + "allowDuplicatesAndPreferEarlierEntries: boolean controlling our behavior when we find two docIDs\n");
      System.exit(1);
    }

    final Parameters params = Parameters.loadSerifStyle(new File(argv[0]));
    final File listOfMaps = params.getExistingFile("inputListOfMaps");
    final File outputMap = params.getCreatableFile("outputMap");
    final boolean allowDuplicatesAndPreferEarlierEntries =
        params.getOptionalBoolean("allowDuplicatesAndPreferEarlierEntries").or(false);

    final Map mergedMap = Maps.newHashMap();
    for (final File mapFile :
        FileUtils.loadFileList(Files.asCharSource(listOfMaps, Charsets.UTF_8))) {
      final ImmutableMap mapFromFile =
          FileUtils.loadSymbolToFileMap(Files.asCharSource(mapFile, Charsets.UTF_8));
      log.info("Loaded {} file mappings from {}", mapFromFile.size(), mapFile);
      for (final Map.Entry mapEntry : mapFromFile.entrySet()) {
        final File curMapping = mergedMap.get(mapEntry.getKey());
        if (curMapping == null) {
          mergedMap.put(mapEntry.getKey(), mapEntry.getValue());
        } else if (!curMapping.equals(mapEntry.getValue())
            && !allowDuplicatesAndPreferEarlierEntries) {
          throw new RuntimeException(
              mapEntry.getKey()
                  + " is mapped to "
                  + mapEntry.getValue()
                  + " in "
                  + mapFile
                  + " but has been mapped to "
                  + curMapping
                  + " in a previously "
                  + "processed file");
        } else {
          // do nothing - it's fine to repeat the same mapping in multiple files
        }
      }
    }
    outputMap.getParentFile().mkdirs();

    final ImmutableMap sortedMergedMap =
        ImmutableSortedMap.copyOf(mergedMap, SymbolUtils.byStringOrdering());
    log.info("Wrote map of {} files to {}", sortedMergedMap.size(), outputMap);
    FileUtils.writeSymbolToFileMap(sortedMergedMap, Files.asCharSink(outputMap, Charsets.UTF_8));
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy