All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.hapcollapse.PurgeSequencesFromAlignments Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.hapcollapse;

import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
import net.maizegenetics.analysis.distance.DistanceMatrixPlugin;
import net.maizegenetics.analysis.filter.FilterSiteBuilderPlugin;
import net.maizegenetics.dna.snp.*;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Utils;

import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * Created by edbuckler on 6/19/17.
 */
public class PurgeSequencesFromAlignments {
    //    public static final String localDirectory="/Users/edbuckler/temp/exportedFastaFromV2DB_anchorLongNsRemoved_MAFFTAligned_Trimmed/";
//    public static final String localDirectoryOut="/Users/edbuckler/temp/filtered/";
    public static final String localDirectory = "/Users/edbuckler/temp/chr10fastafiles/";
    public static final String localDirectoryOut = "/Users/edbuckler/temp/chr10fastafilesFilt/";
    public static final String loggingFile = "/Users/edbuckler/temp/logging.txt";
    public static final String anchorSummaryFile = "/Users/edbuckler/temp/anchorSummary170620a.txt";

    
    public static void main(String[] args) {
        Multiset initialTaxonMultiset = ConcurrentHashMultiset.create();
        Multiset retainedTaxonMultiset = ConcurrentHashMultiset.create();

        try {
//            LoggingUtils.setupLogging();
//            LoggingUtils.setupLogging(new PrintStream(loggingFile));
            BufferedWriter writer = Files.newBufferedWriter(Paths.get(anchorSummaryFile));
            List fastaPaths = DirectoryCrawler.listPaths("glob:*.fa.gz", Paths.get(localDirectory));
            Collections.sort(fastaPaths, Comparator.comparing(Path::getFileName));
            fastaPaths.stream().limit(100).parallel().forEach(fastaPath -> {
                try {
                    GenotypeTable gt = ImportUtils.readFasta(fastaPath.toString());
                    initialTaxonMultiset.addAll(gt.taxa());
                    Optional filterGT = filterBadAlignments(gt, 0.1, 130, 0.000);
                    System.out.println(fastaPath.toString());
                    System.out.println("gt.sites = " + gt.numberOfSites() + " Taxa = " + gt.numberOfTaxa());
                    StringBuffer resultLine = new StringBuffer();
                    resultLine.append(fastaPath.getFileName().toString() + "\t");
                    resultLine.append(gt.numberOfTaxa() + "\t");
                    resultLine.append(gt.numberOfSites() + "\t");
                    if (filterGT.isPresent()) {
                        System.out.println("Filter sites = " + filterGT.get().numberOfSites() + " Taxa = " + filterGT.get().numberOfTaxa());
                        writeFasta(localDirectoryOut+fastaPath.getFileName().toString(), filterGT.get());
                        retainedTaxonMultiset.addAll(filterGT.get().taxa());
                        resultLine.append(filterGT.get().numberOfTaxa() + "\t");
                        resultLine.append(filterGT.get().numberOfSites() + "\n");
                    } else {
                        System.out.println("Filter - NO ALIGNMENT");
                        resultLine.append(0 + "\t");
                        resultLine.append(0 + "\n");
                    }
                    writer.write(resultLine.toString());
                } catch (IOException e) {
                    e.printStackTrace();
                }
            });
            for (Taxon taxon : retainedTaxonMultiset.elementSet().stream().sorted().collect(Collectors.toList())) {
                writer.write(taxon.getName() + "\t" + initialTaxonMultiset.count(taxon)+"\t" + retainedTaxonMultiset.count(taxon) + "\n");
            }
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(retainedTaxonMultiset.toString());
    }

    private static int numberOfGaps(GenotypeTable gt, int taxonId) {
        return (int) IntStream.range(0, gt.numberOfSites())
                .filter(index -> gt.genotype(taxonId, index) == NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE)
                .count();
    }

    private static String sequenceToString(GenotypeTable gt, int taxonId) {
        StringBuilder sb = new StringBuilder();
        IntStream.range(0, gt.numberOfSites())
                .forEach(index -> sb.append(gt.genotypeAsString(taxonId, index)));
        return sb.toString();
    }

    private static void writeFasta(String filePathString, GenotypeTable genotypeTable) {
        try (BufferedWriter writer = Utils.getBufferedWriter(filePathString)) {
            for (int taxaIndex = 0; taxaIndex < genotypeTable.numberOfTaxa(); taxaIndex++) {
                writer.write(">" + genotypeTable.taxa().get(taxaIndex).getName());
                writer.newLine();
                writer.write(sequenceToString(genotypeTable, taxaIndex));
                writer.newLine();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * This method eliminates high distances alignments after all GAPs have been converted to N, and then it filters
     * on coverage and minimum MAF
     *
     * @param genotypeTable
     * @param maxDist
     * @param minSiteCount
     * @param minMAF
     * @return
     */
    public static Optional filterBadAlignments(GenotypeTable genotypeTable, double maxDist, int minSiteCount, double minMAF) {
        GenotypeTable gtNoIndels = GenotypeTableBuilder.getInstanceMaskIndels(genotypeTable);
        DistanceMatrix matrixNoIndels = DistanceMatrixPlugin.getDistanceMatrix(gtNoIndels);

        TaxaList filterHaplotypeTL = IntStream.range(0, matrixNoIndels.numberOfTaxa())
                .filter(taxaIndex -> {
                    double averageDist = IntStream.range(0, matrixNoIndels.numberOfTaxa())
                            .mapToDouble(t2index -> matrixNoIndels.getDistance(taxaIndex, t2index))
                            .filter(dist -> !Double.isNaN(dist))
                            .average().orElse(1);
                    return (averageDist < maxDist);
                })
                .mapToObj(matrixNoIndels::getTaxon)
                .collect(TaxaList.collect());
        GenotypeTable lowDistanceGT = FilterGenotypeTable.getInstance(gtNoIndels, filterHaplotypeTL);
        //DistanceMatrix matrixNoIndelsLowDistance = DistanceMatrixPlugin.getDistanceMatrix(lowDistanceGT);
        DataSet filter = new FilterSiteBuilderPlugin(null, false)
                .siteMinCount(minSiteCount)
                .siteMinAlleleFreq(minMAF)
                .performFunction(DataSet.getDataSet(lowDistanceGT));
        if (filter.getSize() == 2) return Optional.ofNullable((GenotypeTable) filter.getData(0).getData());
        return Optional.empty();
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy