All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.hapcollapse.CompareAssembliesToReference Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.hapcollapse;

import net.maizegenetics.analysis.distance.DistanceMatrixPlugin;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.ImportUtils;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.LoggingUtils;

import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Simple little utility to compare assembly alignments to the reference genome
 * @author edbuckler
 */
public class CompareAssembliesToReference {
    public static final String localDirectory = "/Users/edbuckler/temp/chr10fastafiles/";
    public static final String loggingFile = "/Users/edbuckler/temp/logging.txt";
    public static final String anchorSummaryFile = "/Users/edbuckler/temp/assemDistRef_170621b.txt";

    private static final double maxDistance = 0.002; //maximum distance between taxa to merge
    private static final int minSites = 100; //minimum number of sites to find a match

    private static final String[] assemblies={"W22Assembly", "B104Assembly", "CML247Assembly","PH207Assembly","EP1Assembly"};

    public static void main(String[] args) {
        try{
        LoggingUtils.setupStdOutLogging();
        BufferedWriter writer = Files.newBufferedWriter(Paths.get(anchorSummaryFile));
        writer.write("AnchorID\tOriginalSites\t");
        writer.write(Arrays.stream(assemblies).collect(Collectors.joining("\t")));
        writer.newLine();
        List fastaPaths = DirectoryCrawler.listPaths("glob:*.fa.gz", Paths.get(localDirectory));
        Pattern pattern = Pattern.compile("Id(\\d*)");
        fastaPaths.stream().forEach(fastaPath -> {
            try {
                System.out.println(fastaPath.toString());
                final Matcher matcher = pattern.matcher(fastaPath.toString());
                matcher.find();
                String anchorIdString = matcher.group(1);
                //System.out.println(anchorIdString);
                GenotypeTable origGenotypeTable = GenotypeTableBuilder.getInstanceMaskIndels(ImportUtils.readFasta(fastaPath.toString()));
                String s = String.format("%s\t%d\t", anchorIdString, origGenotypeTable.numberOfSites());
                writer.write(s);
                DistanceMatrix distanceMatrix = DistanceMatrixPlugin.getDistanceMatrix(origGenotypeTable);
                int b73Index=distanceMatrix.getTaxaList().indexOf("B73Ref");
                for (String assembly : assemblies) {
                    int taxaIndex = distanceMatrix.getTaxaList().indexOf(assembly);
                    if (taxaIndex < 0) {
                        writer.write(Double.NaN + "\t");
                    } else {
                        writer.write(distanceMatrix.getDistance(b73Index,taxaIndex)+"\t");
                    }
                }
                writer.newLine();
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
        writer.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy