All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.julielab.genemapper.resources.IDReplacer Maven / Gradle / Ivy

Go to download

This project assembles code and files required to build the dictionaries and indexes used by the JCoRe Gene Mapper.

The newest version!
package de.julielab.genemapper.resources;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Replaces the IDs in one column of a tab-separated database (table-shaped)
 * file with the IDs they are mapped to in a two-column mapping file.
 * <p>
 * Input consists of the database file, the zero-based index of the database
 * column holding the source IDs, the mapping file and the zero-based index (0
 * or 1) of the mapping file column holding the source IDs; the other mapping
 * column holds the target IDs. The mapping file is allowed to contain
 * n-to-m-mappings. In such cases, each database record line is output once for
 * each target ID its source ID is mapped to. Database records whose source ID
 * has no mapping are omitted from the output.
 * <p>
 * All files must use the tab character as the column separator and are read
 * and written as UTF-8.
 * 
 * @author faessler
 *
 */
public class IDReplacer {

	/** Column separator of all input and output files. */
	private static final String SEPARATOR = "\t";

	private final File dbFile;
	private final int dbIndex;
	private final File mappingFile;
	private final int mappingIndex;
	private Map<String, List<String>> mapping;
	private final File output;

	/**
	 * Returns the mapping from source IDs to their target IDs.
	 * 
	 * @return the mapping read by {@link #readMapping()}, or {@code null} if
	 *         {@link #readMapping()} has not been called yet
	 */
	public Map<String, List<String>> getMapping() {
		return mapping;
	}

	/**
	 * Creates a replacer that prints the resulting records to standard output.
	 * 
	 * @param dbFile       the tab-separated database file
	 * @param dbIndex      zero-based index of the database column holding the
	 *                     source IDs
	 * @param mappingFile  the tab-separated ID mapping file
	 * @param mappingIndex zero-based index (0 or 1) of the mapping file column
	 *                     holding the source IDs
	 * @throws IOException if a given file does not exist or is not a regular
	 *                     file
	 */
	public IDReplacer(File dbFile, int dbIndex, File mappingFile, int mappingIndex) throws IOException {
		this(dbFile, dbIndex, mappingFile, mappingIndex, null);
	}

	/**
	 * Creates a replacer that writes the resulting records to {@code output}.
	 * 
	 * @param dbFile       the tab-separated database file
	 * @param dbIndex      zero-based index of the database column holding the
	 *                     source IDs
	 * @param mappingFile  the tab-separated ID mapping file
	 * @param mappingIndex zero-based index (0 or 1) of the mapping file column
	 *                     holding the source IDs
	 * @param output       the file to write to; if {@code null}, records are
	 *                     printed to standard output instead
	 * @throws IOException if a given (non-null) input file does not exist or
	 *                     is not a regular file
	 */
	public IDReplacer(File dbFile, int dbIndex, File mappingFile, int mappingIndex, File output) throws IOException {
		this.dbFile = dbFile;
		this.dbIndex = dbIndex;
		this.mappingFile = mappingFile;
		this.mappingIndex = mappingIndex;
		this.output = output;

		checkRegularFile(dbFile);
		checkRegularFile(mappingFile);
	}

	/** Rejects non-null files that do not exist or are not regular files. */
	private static void checkRegularFile(File file) throws IOException {
		if (file != null && !file.isFile()) {
			throw new IOException(
					file.getAbsolutePath() + " does not exist or is not a regular file (maybe a directory?)");
		}
	}

	/**
	 * Reads the mapping file and groups the target IDs by their source ID. The
	 * result is available through {@link #getMapping()} afterwards. Lines with
	 * fewer than two columns (e.g. blank lines) are skipped.
	 * 
	 * @throws IOException              if the mapping file cannot be read
	 * @throws IllegalArgumentException if the mapping index is neither 0 nor 1
	 */
	public void readMapping() throws IOException {
		if (mappingIndex != 0 && mappingIndex != 1) {
			throw new IllegalArgumentException("The mapping index must be 0 or 1 but was " + mappingIndex);
		}
		// The target column is always the "other" one of the first two
		// columns, i.e. 1 - mappingIndex.
		final int targetIndex = 1 - mappingIndex;
		// Files.lines keeps the file open until the stream is closed.
		try (Stream<String> lines = Files.lines(mappingFile.toPath(), StandardCharsets.UTF_8)) {
			mapping = lines.map(l -> l.split(SEPARATOR, -1)).filter(s -> s.length > 1).collect(Collectors
					.groupingBy(s -> s[mappingIndex], Collectors.mapping(s -> s[targetIndex], Collectors.toList())));
		}
	}

	/**
	 * Reads the database file and emits, for each record whose source ID has a
	 * mapping, one record per mapped target ID with the source ID replaced by
	 * that target ID. Records without a mapping, and records that have no
	 * column at the database index, are dropped. The result is written to the
	 * output file if one was given, otherwise to standard output.
	 * 
	 * @throws IOException              if the database file cannot be read or
	 *                                  the output file cannot be written
	 * @throws IllegalStateException    if {@link #readMapping()} has not been
	 *                                  called before
	 * @throws IllegalArgumentException if the database index is negative
	 */
	public void replace() throws IOException {
		if (mapping == null) {
			throw new IllegalStateException("readMapping() must be called before replace()");
		}
		if (dbIndex < 0) {
			throw new IllegalArgumentException("The database index must not be negative but was " + dbIndex);
		}
		try (Stream<String> dbLines = Files.lines(dbFile.toPath(), StandardCharsets.UTF_8)) {
			// The limit of -1 keeps trailing empty columns so that the
			// records retain their shape. flatMap lets a single input line
			// produce multiple output lines, which is required because a
			// source ID may be mapped to multiple target IDs.
			Stream<String> mappedDbLines = dbLines.map(l -> l.split(SEPARATOR, -1))
					.filter(s -> dbIndex < s.length && mapping.containsKey(s[dbIndex]))
					.flatMap(s -> mapping.get(s[dbIndex]).stream().map(id -> withReplacedId(s, id)));
			if (output != null) {
				Files.write(output.toPath(), (Iterable<String>) mappedDbLines::iterator, StandardCharsets.UTF_8);
			} else {
				mappedDbLines.forEach(System.out::println);
			}
		}
	}

	/** Joins a copy of the record in which the ID column is set to the given ID. */
	private String withReplacedId(String[] fields, String id) {
		String[] replaced = fields.clone();
		replaced[dbIndex] = id;
		return String.join(SEPARATOR, replaced);
	}

	/**
	 * Command line entry point. Expects the database file, the database ID
	 * column index, the mapping file, the mapping source ID column index and,
	 * optionally, an output file. Prints a usage message and exits with status
	 * 1 when given a wrong number of arguments.
	 * 
	 * @param args the command line arguments
	 * @throws IOException if reading the input or writing the output fails
	 */
	public static void main(String[] args) throws IOException {
		if (args.length < 4 || args.length > 5) {
			System.out.println("Usage: " + IDReplacer.class.getName()
					+ " <db file> <db ID column index> <mapping file> <mapping source ID column index> [output file]");
			System.exit(1);
		}

		File dbFile = new File(args[0]);
		int dbIndex = Integer.parseInt(args[1]);
		File mappingFile = new File(args[2]);
		int mappingIndex = Integer.parseInt(args[3]);
		File output = null;
		// A blank fifth argument is treated like an absent one.
		if (args.length > 4 && args[4].trim().length() > 0) {
			output = new File(args[4]);
		}

		IDReplacer replacer = new IDReplacer(dbFile, dbIndex, mappingFile, mappingIndex, output);
		replacer.readMapping();
		replacer.replace();
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy