net.maizegenetics.pangenome.hapcollapse.CompareAssembliesToReference Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapcollapse;
import net.maizegenetics.analysis.distance.DistanceMatrixPlugin;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.ImportUtils;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.LoggingUtils;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Simple little utility to compare assembly alignments to the reference genome
* @author edbuckler
*/
public class CompareAssembliesToReference {
public static final String localDirectory = "/Users/edbuckler/temp/chr10fastafiles/";
public static final String loggingFile = "/Users/edbuckler/temp/logging.txt";
public static final String anchorSummaryFile = "/Users/edbuckler/temp/assemDistRef_170621b.txt";
private static final double maxDistance = 0.002; //maximum distance between taxa to merge
private static final int minSites = 100; //minimum number of sites to find a match
private static final String[] assemblies={"W22Assembly", "B104Assembly", "CML247Assembly","PH207Assembly","EP1Assembly"};
public static void main(String[] args) {
try{
LoggingUtils.setupStdOutLogging();
BufferedWriter writer = Files.newBufferedWriter(Paths.get(anchorSummaryFile));
writer.write("AnchorID\tOriginalSites\t");
writer.write(Arrays.stream(assemblies).collect(Collectors.joining("\t")));
writer.newLine();
List fastaPaths = DirectoryCrawler.listPaths("glob:*.fa.gz", Paths.get(localDirectory));
Pattern pattern = Pattern.compile("Id(\\d*)");
fastaPaths.stream().forEach(fastaPath -> {
try {
System.out.println(fastaPath.toString());
final Matcher matcher = pattern.matcher(fastaPath.toString());
matcher.find();
String anchorIdString = matcher.group(1);
//System.out.println(anchorIdString);
GenotypeTable origGenotypeTable = GenotypeTableBuilder.getInstanceMaskIndels(ImportUtils.readFasta(fastaPath.toString()));
String s = String.format("%s\t%d\t", anchorIdString, origGenotypeTable.numberOfSites());
writer.write(s);
DistanceMatrix distanceMatrix = DistanceMatrixPlugin.getDistanceMatrix(origGenotypeTable);
int b73Index=distanceMatrix.getTaxaList().indexOf("B73Ref");
for (String assembly : assemblies) {
int taxaIndex = distanceMatrix.getTaxaList().indexOf(assembly);
if (taxaIndex < 0) {
writer.write(Double.NaN + "\t");
} else {
writer.write(distanceMatrix.getDistance(b73Index,taxaIndex)+"\t");
}
}
writer.newLine();
} catch (IOException e) {
e.printStackTrace();
}
});
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}