net.maizegenetics.pangenome.hapcollapse.PurgeSequencesFromAlignments Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapcollapse;
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
import net.maizegenetics.analysis.distance.DistanceMatrixPlugin;
import net.maizegenetics.analysis.filter.FilterSiteBuilderPlugin;
import net.maizegenetics.dna.snp.*;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.taxa.distance.DistanceMatrix;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Utils;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Created by edbuckler on 6/19/17.
*/
public class PurgeSequencesFromAlignments {
// public static final String localDirectory="/Users/edbuckler/temp/exportedFastaFromV2DB_anchorLongNsRemoved_MAFFTAligned_Trimmed/";
// public static final String localDirectoryOut="/Users/edbuckler/temp/filtered/";
public static final String localDirectory = "/Users/edbuckler/temp/chr10fastafiles/";
public static final String localDirectoryOut = "/Users/edbuckler/temp/chr10fastafilesFilt/";
public static final String loggingFile = "/Users/edbuckler/temp/logging.txt";
public static final String anchorSummaryFile = "/Users/edbuckler/temp/anchorSummary170620a.txt";
public static void main(String[] args) {
Multiset initialTaxonMultiset = ConcurrentHashMultiset.create();
Multiset retainedTaxonMultiset = ConcurrentHashMultiset.create();
try {
// LoggingUtils.setupLogging();
// LoggingUtils.setupLogging(new PrintStream(loggingFile));
BufferedWriter writer = Files.newBufferedWriter(Paths.get(anchorSummaryFile));
List fastaPaths = DirectoryCrawler.listPaths("glob:*.fa.gz", Paths.get(localDirectory));
Collections.sort(fastaPaths, Comparator.comparing(Path::getFileName));
fastaPaths.stream().limit(100).parallel().forEach(fastaPath -> {
try {
GenotypeTable gt = ImportUtils.readFasta(fastaPath.toString());
initialTaxonMultiset.addAll(gt.taxa());
Optional filterGT = filterBadAlignments(gt, 0.1, 130, 0.000);
System.out.println(fastaPath.toString());
System.out.println("gt.sites = " + gt.numberOfSites() + " Taxa = " + gt.numberOfTaxa());
StringBuffer resultLine = new StringBuffer();
resultLine.append(fastaPath.getFileName().toString() + "\t");
resultLine.append(gt.numberOfTaxa() + "\t");
resultLine.append(gt.numberOfSites() + "\t");
if (filterGT.isPresent()) {
System.out.println("Filter sites = " + filterGT.get().numberOfSites() + " Taxa = " + filterGT.get().numberOfTaxa());
writeFasta(localDirectoryOut+fastaPath.getFileName().toString(), filterGT.get());
retainedTaxonMultiset.addAll(filterGT.get().taxa());
resultLine.append(filterGT.get().numberOfTaxa() + "\t");
resultLine.append(filterGT.get().numberOfSites() + "\n");
} else {
System.out.println("Filter - NO ALIGNMENT");
resultLine.append(0 + "\t");
resultLine.append(0 + "\n");
}
writer.write(resultLine.toString());
} catch (IOException e) {
e.printStackTrace();
}
});
for (Taxon taxon : retainedTaxonMultiset.elementSet().stream().sorted().collect(Collectors.toList())) {
writer.write(taxon.getName() + "\t" + initialTaxonMultiset.count(taxon)+"\t" + retainedTaxonMultiset.count(taxon) + "\n");
}
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
System.out.println(retainedTaxonMultiset.toString());
}
private static int numberOfGaps(GenotypeTable gt, int taxonId) {
return (int) IntStream.range(0, gt.numberOfSites())
.filter(index -> gt.genotype(taxonId, index) == NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE)
.count();
}
private static String sequenceToString(GenotypeTable gt, int taxonId) {
StringBuilder sb = new StringBuilder();
IntStream.range(0, gt.numberOfSites())
.forEach(index -> sb.append(gt.genotypeAsString(taxonId, index)));
return sb.toString();
}
private static void writeFasta(String filePathString, GenotypeTable genotypeTable) {
try (BufferedWriter writer = Utils.getBufferedWriter(filePathString)) {
for (int taxaIndex = 0; taxaIndex < genotypeTable.numberOfTaxa(); taxaIndex++) {
writer.write(">" + genotypeTable.taxa().get(taxaIndex).getName());
writer.newLine();
writer.write(sequenceToString(genotypeTable, taxaIndex));
writer.newLine();
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* This method eliminates high distances alignments after all GAPs have been converted to N, and then it filters
* on coverage and minimum MAF
*
* @param genotypeTable
* @param maxDist
* @param minSiteCount
* @param minMAF
* @return
*/
public static Optional filterBadAlignments(GenotypeTable genotypeTable, double maxDist, int minSiteCount, double minMAF) {
GenotypeTable gtNoIndels = GenotypeTableBuilder.getInstanceMaskIndels(genotypeTable);
DistanceMatrix matrixNoIndels = DistanceMatrixPlugin.getDistanceMatrix(gtNoIndels);
TaxaList filterHaplotypeTL = IntStream.range(0, matrixNoIndels.numberOfTaxa())
.filter(taxaIndex -> {
double averageDist = IntStream.range(0, matrixNoIndels.numberOfTaxa())
.mapToDouble(t2index -> matrixNoIndels.getDistance(taxaIndex, t2index))
.filter(dist -> !Double.isNaN(dist))
.average().orElse(1);
return (averageDist < maxDist);
})
.mapToObj(matrixNoIndels::getTaxon)
.collect(TaxaList.collect());
GenotypeTable lowDistanceGT = FilterGenotypeTable.getInstance(gtNoIndels, filterHaplotypeTL);
//DistanceMatrix matrixNoIndelsLowDistance = DistanceMatrixPlugin.getDistanceMatrix(lowDistanceGT);
DataSet filter = new FilterSiteBuilderPlugin(null, false)
.siteMinCount(minSiteCount)
.siteMinAlleleFreq(minMAF)
.performFunction(DataSet.getDataSet(lowDistanceGT));
if (filter.getSize() == 2) return Optional.ofNullable((GenotypeTable) filter.getData(0).getData());
return Optional.empty();
}
}