All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.dna.snp.io.BuilderFromHapMap Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

There is a newer version: 5.2.94
Show newest version
/*
 *  BuilderFromHapMap
 * 
 *  Created on Aug 1, 2014
 */
package net.maizegenetics.dna.snp.io;

import com.google.common.collect.SetMultimap;

import java.io.BufferedReader;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;

import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.ProgressListener;
import net.maizegenetics.util.SuperByteMatrix;
import net.maizegenetics.util.SuperByteMatrixBuilder;
import net.maizegenetics.util.Utils;

import org.apache.log4j.Logger;

/**
 *
 * @author Ed Buckler
 * @author Terry Casstevens
 */
public class BuilderFromHapMap {

    private static final Logger myLogger = Logger.getLogger(BuilderFromHapMap.class);
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
    private static final int NUM_HAPMAP_NON_TAXA_HEADERS = 11;
    private static final int SNPID_INDEX = 0;
    private static final int VARIANT_INDEX = 1;
    private static final int CHROMOSOME_INDEX = 2;
    private static final int POSITION_INDEX = 3;
    private static final int NUM_VALUES_PROCESSED_TOGETHER = 7 << 20;

    private final String myHapmapFile;
    private boolean mySortTaxaAlphabetically = false;
    private boolean mySortPositions = false;
    private final ProgressListener myProgressListener;

    private BuilderFromHapMap(String hapmapFile, ProgressListener listener) {
        myHapmapFile = hapmapFile;
        myProgressListener = listener;
    }

    public static BuilderFromHapMap getBuilder(String hapmapFile) {
        return new BuilderFromHapMap(hapmapFile, null);
    }

    public static BuilderFromHapMap getBuilder(String hapmapFile, ProgressListener listener) {
        return new BuilderFromHapMap(hapmapFile, listener);
    }

    public GenotypeTable build() {

        ExecutorService pool = null;
        try (BufferedReader reader = Utils.getBufferedReader(myHapmapFile, 1 << 20)) {

            Map> sampAnnoBuild = new TreeMap<>();

            String currLine = reader.readLine();
            while ((currLine != null) && currLine.startsWith("##")) {

                String[] cat = currLine.split("=", 2);
                if (cat.length < 2) {
                    continue;
                }
                if (cat[0].startsWith("##SAMPLE")) {

                    SetMultimap mapOfAnno = TaxaListIOUtils.parseVCFHeadersIntoMap(cat[1]);
                    String taxaID = mapOfAnno.get("ID").iterator().next();
                    if (taxaID != null) {
                        sampAnnoBuild.put(taxaID, mapOfAnno);
                    }

                }

                currLine = reader.readLine();
            }

            TaxaListBuilder taxaList = processTaxa(currLine, sampAnnoBuild);
            int numTaxa = taxaList.numberOfTaxa();

            Map chromosomeLookup = new ConcurrentHashMap<>();

            currLine = reader.readLine();

            boolean isOneLetter = false;
            String[] tokens = WHITESPACE_PATTERN.split(currLine, NUM_HAPMAP_NON_TAXA_HEADERS + 1);
            if (tokens.length <= NUM_HAPMAP_NON_TAXA_HEADERS) {
                throw new IllegalStateException("BuilderFromHapMap: Header Incorrectly Formatted: See:\nhttps://bitbucket.org/tasseladmin/tassel-5-source/wiki/UserManual/Load/Load#markdown-header-hapmap");
            }
            double avg = (double) (tokens[NUM_HAPMAP_NON_TAXA_HEADERS].length() + 1) / (double) numTaxa;
            if ((avg > 1.99) && (avg < 2.01)) {
                isOneLetter = true;
            } else if ((avg > 2.99) && (avg < 3.01)) {
                isOneLetter = false;
            } else {
                throw new IllegalStateException("BuilderFromHapMap: Genotype coded wrong use 1 or 2 letters per genotype.  Or first site has incorrect number of values.");
            }

            int numThreads = Runtime.getRuntime().availableProcessors();
            pool = Executors.newFixedThreadPool(numThreads);
            List> futures = new ArrayList<>();

            int numSitesToProcessTogether = NUM_VALUES_PROCESSED_TOGETHER / numTaxa;
            numSitesToProcessTogether = Math.min(1 << 16, numSitesToProcessTogether);
            numSitesToProcessTogether = Math.max(512, numSitesToProcessTogether);

            ArrayList textLines = new ArrayList<>(numSitesToProcessTogether);
            int numLines = 0;
            while (currLine != null) {
                textLines.add(currLine);
                numLines++;
                if (numLines % numSitesToProcessTogether == 0) {
                    ProcessHapmapBlock processBlock = new ProcessHapmapBlock(textLines, numTaxa, chromosomeLookup, isOneLetter);
                    futures.add(pool.submit(processBlock));
                    textLines = new ArrayList<>(numSitesToProcessTogether);
                }
                currLine = reader.readLine();
            }

            if (textLines.size() > 0) {
                ProcessHapmapBlock processBlock = new ProcessHapmapBlock(textLines, numTaxa, chromosomeLookup, isOneLetter);
                futures.add(pool.submit(processBlock));
            }

            int currentSite = 0;
            PositionListBuilder positions = new PositionListBuilder();
            GenotypeCallTableBuilder genotypes = GenotypeCallTableBuilder.getUnphasedNucleotideGenotypeBuilder(numTaxa, numLines);

            int numFutures = futures.size();
            int count = 0;
            for (Future future : futures) {
                ProcessHapmapBlock pb = future.get();
                positions.addAll(pb.getPositions());
                SuperByteMatrix bgTS = pb.getGenotypes();
                for (int t = 0; t < bgTS.getNumRows(); t++) {
                    for (int s = 0; s < bgTS.getNumColumns(); s++) {
                        genotypes.setBase(t, currentSite + s, bgTS.get(t, s));
                    }
                }
                currentSite += pb.getNumberSitesProcessed();
                if (myProgressListener != null) {
                    count++;
                    myProgressListener.progress(count * 100 / numFutures, null);
                }
            }
            pool.shutdown();

            if (mySortTaxaAlphabetically) {
                taxaList.sortTaxaAlphabetically(genotypes);
            }

            if (mySortPositions) {
                positions.sortPositions(genotypes);
            }

            if (positions.validateOrdering() == false) {
                throw new IllegalStateException("BuilderFromHapMap: Ordering incorrect. HapMap must be ordered by position. Please first use SortGenotypeFilePlugin to correctly order the file.");
            }

            return GenotypeTableBuilder.getInstance(genotypes.build(), positions.build(), taxaList.build());

        } catch (Exception e) {
            if (pool != null) {
                pool.shutdown();
            }
            myLogger.debug(e.getMessage(), e);
            throw new IllegalStateException(e.getMessage());
        }

    }

    private class ProcessHapmapBlock implements Callable {

        private List myInputLines;
        private final Map myChromosomeLookup;
        private final boolean myIsOneLetter;
        private final List myPositionList;
        private final int myNumSitesToProcess;
        private final int myNumTaxa;
        private SuperByteMatrix myGenotypes;

        public ProcessHapmapBlock(List inputLines, int numTaxa, Map chromosomeLookup, boolean isOneLetter) {
            myInputLines = inputLines;
            myChromosomeLookup = chromosomeLookup;
            myNumTaxa = numTaxa;
            myIsOneLetter = isOneLetter;
            myNumSitesToProcess = inputLines.size();
            myPositionList = new ArrayList<>(myNumSitesToProcess);
        }

        @Override
        public ProcessHapmapBlock call() throws Exception {

            myGenotypes = SuperByteMatrixBuilder.getInstance(myNumTaxa, myNumSitesToProcess);
            for (int site = 0; site < myNumSitesToProcess; site++) {
                String input = myInputLines.get(site);
                try {
                    int[] tabPos = new int[NUM_HAPMAP_NON_TAXA_HEADERS];
                    int tabIndex = 0;
                    int len = input.length();
                    for (int i = 0; (tabIndex < NUM_HAPMAP_NON_TAXA_HEADERS) && (i < len); i++) {
                        if (input.charAt(i) == '\t') {
                            tabPos[tabIndex++] = i;
                        }
                    }
                    String chrName = input.substring(tabPos[CHROMOSOME_INDEX - 1] + 1, tabPos[CHROMOSOME_INDEX]);
                    Chromosome currChr = myChromosomeLookup.get(chrName);
                    if (currChr == null) {
                        currChr = new Chromosome(new String(chrName));
                        myChromosomeLookup.put(chrName, currChr);
                    }
                    String variants = input.substring(tabPos[VARIANT_INDEX - 1] + 1, tabPos[VARIANT_INDEX]);
                    int physicalPos;
                    try {
                        physicalPos = Integer.parseInt(input.substring(tabPos[POSITION_INDEX - 1] + 1, tabPos[POSITION_INDEX]));
                    } catch (Exception ex) {
                        throw new IllegalArgumentException("BuilderFromHapMap: Position must be an integer: " + input.substring(tabPos[POSITION_INDEX - 1] + 1, tabPos[POSITION_INDEX]).trim());
                    }
                    GeneralPosition.Builder apb = new GeneralPosition.Builder(currChr, physicalPos)
                            .snpName(input.substring(0, tabPos[SNPID_INDEX]))
                            .knownVariants(variants) //TODO   strand, variants,
                            ;

                    byte glbMajor = NucleotideAlignmentConstants.getNucleotideDiploidByte(variants.charAt(0));
                    apb.allele(WHICH_ALLELE.GlobalMajor, glbMajor);
                    if (variants.length() == 3) {
                        byte glbMinor = NucleotideAlignmentConstants.getNucleotideDiploidByte(variants.charAt(2));
                        apb.allele(WHICH_ALLELE.GlobalMinor, glbMinor);
                    }

                    myPositionList.add(apb.build());
                    int offset = tabPos[NUM_HAPMAP_NON_TAXA_HEADERS - 1] + 1;

                    int taxon = 0;
                    if (myIsOneLetter) {
                        for (int i = offset; i < len; i += 2) {
                            if (taxon >= myNumTaxa) {
                                throw new IllegalStateException("BuilderFromHapMap: SNP Named: " + myPositionList.get(myPositionList.size() - 1).getSNPID() + " has too many values.");
                            }
                            byte value = NucleotideAlignmentConstants.getNucleotideDiploidByte(input.charAt(i));
                            if (value == NucleotideAlignmentConstants.UNDEFINED_DIPLOID_ALLELE) {
                                throw new IllegalStateException("BuilderFromHapMap: SNP Named: " + myPositionList.get(myPositionList.size() - 1).getSNPID() + " has illegal value: " + input.charAt(i));
                            }
                            myGenotypes.set(taxon++, site, value);
                        }
                    } else {
                        for (int i = offset; i < len; i += 3) {
                            if (taxon >= myNumTaxa) {
                                throw new IllegalStateException("BuilderFromHapMap: SNP Named: " + myPositionList.get(myPositionList.size() - 1).getSNPID() + " has too many values.");
                            }
                            // there is a phasing conflict with the existing import approach
                            byte value = GenotypeTableUtils.getDiploidValue(NucleotideAlignmentConstants.getNucleotideDiploidByte(input.charAt(i + 1)),
                                    NucleotideAlignmentConstants.getNucleotideDiploidByte(input.charAt(i)));
                            if (value == NucleotideAlignmentConstants.UNDEFINED_DIPLOID_ALLELE) {
                                throw new IllegalStateException("BuilderFromHapMap: SNP Named: " + myPositionList.get(myPositionList.size() - 1).getSNPID() + " has illegal value: " + input.charAt(i) + input.charAt(i + 1));
                            }
                            myGenotypes.set(taxon++, site, value);
                        }
                    }
                    if (taxon != myNumTaxa) {
                        throw new IllegalStateException("BuilderFromHapMap: SNP Named: " + myPositionList.get(myPositionList.size() - 1).getSNPID() + " has too few values.");
                    }

                    swapSitesIfOutOfOrder(site);

                } catch (Exception e) {
                    myLogger.error("Error parsing this row " + input);
                    myLogger.debug(e.getMessage(), e);
                    throw new IllegalStateException("BuilderFromHapMap: Error Parsing Line: " + input.substring(0, Math.min(25, input.length())) + "...\n" + e.getMessage());
                }
            }
            myInputLines = null;

            return this;

        }

        // Swap adjacent misordered sites, often caused by two sites at the same positions with a different name order
        private void swapSitesIfOutOfOrder(int site) {
            if (site < 1) {
                return;
            }
            if (myPositionList.get(site - 1).compareTo(myPositionList.get(site)) > 0) {
                //swap
                Position tempP = myPositionList.get(site - 1);
                myLogger.warn("Swapping:" + tempP.toString() + " <-> " + myPositionList.get(site).toString());
                myPositionList.set(site - 1, myPositionList.get(site));
                myPositionList.set(site, tempP);
                for (int t = 0; t < myGenotypes.getNumRows(); t++) {
                    byte tempG = myGenotypes.get(t, site - 1);
                    myGenotypes.set(t, site - 1, myGenotypes.get(t, site));
                    myGenotypes.set(t, site, tempG);
                }
            }

        }

        public int getNumberSitesProcessed() {
            return myNumSitesToProcess;
        }

        public SuperByteMatrix getGenotypes() {
            return myGenotypes;
        }

        public List getPositions() {
            return myPositionList;
        }

    }

    static TaxaListBuilder processTaxa(String readLn, Map> taxaAnnotation) {

        String[] header = WHITESPACE_PATTERN.split(readLn);
        int numTaxa = header.length - NUM_HAPMAP_NON_TAXA_HEADERS;
        TaxaListBuilder tlb = new TaxaListBuilder();
        for (int i = 0; i < numTaxa; i++) {
            String taxonID = header[i + NUM_HAPMAP_NON_TAXA_HEADERS];
            Taxon.Builder at = new Taxon.Builder(taxonID);
            SetMultimap taMap = taxaAnnotation.get(taxonID);
            if (taMap != null) {
                for (Map.Entry en : taMap.entries()) {
                    if (en.getKey().equals("ID")) {
                        continue; //skip the IDs as these became the name
                    }
                    String s = en.getValue().replace("\"", "");
                    at.addAnno(en.getKey(), s);
                }
            }
            tlb.add(at.build());
        }
        return tlb;

    }

    /**
     * Set the builder so that when built it will sort the taxa alphabetically.
     */
    public BuilderFromHapMap sortTaxa() {
        mySortTaxaAlphabetically = true;
        return this;
    }

    /**
     * Set the builder so that when built it will sort positions.
     */
    public BuilderFromHapMap sortPositions() {
        mySortPositions = true;
        return this;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy