All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.db_loading.AnchorDataPHG Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
/**
 *
 */
package net.maizegenetics.pangenome.db_loading;

import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Range;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import org.apache.log4j.Logger;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.nio.charset.StandardCharsets;
import java.nio.file.NoSuchFileException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;


/**
 * This class differs from AnchorData in WGS_whatever as it
 * includes more fields to match what is stored in PHGSchema's
 * anchor_haplotypes.  Specifically it includes ave_allele_depth
 * and additional_data.
 *
 * May 26, 2017:  Class updated to include gene_start and gene_end.
 * These values are needed when trimming anchors.  The stored anchors
 * have the sequence of merged genes whose GFF start/end positions overlapped,
 * with 1000 bp of flanking sequence on each end.  This flank may be less than
 * 1000 bp if the distance between 2 genes is less than 2000 bp.
 *
 * @author lcj34
 */

public class AnchorDataPHG implements Comparable<AnchorDataPHG> {
    private static final Logger myLogger = Logger.getLogger(AnchorDataPHG.class);

    private final Range<Position> intervalCoordinates;
    // "asm" fields only valid for assembly processing.
    private final String asmContig;
    private final int asmStart;
    private final int asmEnd;
    private final String asmStrand;
    private final String sequence;
    private final int seqLen; // length of sequence, cached at construction
    private final String seqHash; // MD5 digest of sequence, as lowercase hex
    private final String gvcf; // json created from gvcf - will be missing for ref anchors
    private final int asmFileId;
    private final int gvcfFileId;

    /**
     * Creates an immutable anchor record. The sequence length and its MD5
     * checksum are computed once here and cached.
     *
     * @param interval   interval covered by this anchor
     * @param asmContig  assembly contig (only valid for assembly processing)
     * @param asmStart   assembly start (only valid for assembly processing)
     * @param asmEnd     assembly end (only valid for assembly processing)
     * @param asmStrand  assembly strand (only valid for assembly processing)
     * @param gvcf       json created from gvcf; missing for ref anchors
     * @param sequence   anchor sequence; must not be null
     * @param asmFileId  id stored as-is and returned by {@link #asmFileId()}
     * @param gvcfFileId id stored as-is and returned by {@link #gvcfFileId()}
     * @throws NullPointerException if {@code sequence} is null
     */
    public AnchorDataPHG(Range<Position> interval, String asmContig, int asmStart, int asmEnd, String asmStrand,
                         String gvcf, String sequence, int asmFileId, int gvcfFileId) {
        this.intervalCoordinates = interval;
        this.asmContig = asmContig;
        this.asmStart = asmStart;
        this.asmEnd = asmEnd;
        this.asmStrand = asmStrand;
        this.gvcf = gvcf;
        this.sequence = sequence;
        this.seqLen = sequence.length();
        this.seqHash = getChecksumForString(sequence, "MD5");
        this.asmFileId = asmFileId;
        this.gvcfFileId = gvcfFileId;
    }

    /**
     * Computes the digest of a string and returns it as lowercase hex.
     * For the PHG, the protocol should always be MD5.
     *
     * @param seq      text to digest; must not be null
     * @param protocol {@link MessageDigest} algorithm name, e.g. "MD5"
     * @return the digest as a zero-padded lowercase hexadecimal string
     * @throws IllegalStateException if {@code protocol} is not a supported algorithm
     */
    public static String getChecksumForString(String seq, String protocol) {
        // from https://www.mkyong.com/java/java-md5-hashing-example/
        try {
            MessageDigest md = MessageDigest.getInstance(protocol);
            // Explicit charset so the hash does not depend on the JVM's default
            // encoding; for ASCII input this matches any ASCII-compatible default.
            md.update(seq.getBytes(StandardCharsets.UTF_8));
            byte[] byteData = md.digest();
            // convert the bytes to hex format, two characters per byte
            StringBuilder sb = new StringBuilder(byteData.length * 2);
            for (byte b : byteData) {
                // adding 0x100 forces three hex digits; dropping the first keeps the leading zero
                sb.append(Integer.toString((b & 0xff) + 0x100, 16).substring(1));
            }
            return sb.toString();
        } catch (NoSuchAlgorithmException exc) {
            myLogger.error("getChecksumForString: problem getting checksum: " + exc.getMessage(), exc);
            throw new IllegalStateException("CheckSum: getChecksumForString: error: " + exc.getMessage(), exc);
        }
    }

    /** @return the interval passed to the constructor */
    public Range<Position> intervalCoordinates() {
        return intervalCoordinates;
    }

    /** @return the assembly contig (only valid for assembly processing) */
    public String asmContig() {
        return asmContig;
    }

    /** @return the assembly start (only valid for assembly processing) */
    public int asmStart() {
        return asmStart;
    }

    /** @return the assembly end (only valid for assembly processing) */
    public int asmEnd() {
        return asmEnd;
    }

    /** @return the assembly strand (only valid for assembly processing) */
    public String asmStrand() {
        return asmStrand;
    }

    /** @return the gvcf json passed to the constructor */
    public String gvcf() {
        return gvcf;
    }

    /** @return the anchor sequence */
    public String sequence() {
        return sequence;
    }

    /** @return the length of the anchor sequence */
    public int seqLen() {
        return seqLen;
    }

    /** @return the MD5 checksum of the anchor sequence, as lowercase hex */
    public String seqHash() {
        return seqHash;
    }

    /** @return the assembly file id passed to the constructor */
    public int asmFileId() {
        return asmFileId;
    }

    /** @return the gvcf file id passed to the constructor */
    public int gvcfFileId() {
        return gvcfFileId;
    }

    /**
     * Two anchors are equal when their interval, sequence length and sequence
     * checksum all match. The asm fields, gvcf and file ids are not compared.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }

        AnchorDataPHG that = (AnchorDataPHG) obj;
        return intervalCoordinates().equals(that.intervalCoordinates())
                && seqLen() == that.seqLen()
                && seqHash().equals(that.seqHash());
    }

    /** Hash built from the same three values that {@link #equals(Object)} compares. */
    @Override
    public int hashCode() {
        int hash = 7;
        hash = 37 * hash + this.intervalCoordinates().hashCode();
        hash = 37 * hash + this.seqHash.hashCode();
        hash = 37 * hash + this.seqLen;
        return hash;
    }

    /**
     * Orders anchors by the chromosome, then the position, of the interval's
     * lower endpoint.
     *
     * NOTE: this ordering is not consistent with {@link #equals(Object)}: anchors
     * that start at the same position compare as 0 even when their interval end,
     * sequence length or checksum differ. Sorted sets and maps will treat such
     * anchors as duplicates.
     */
    @Override
    public int compareTo(AnchorDataPHG other) {
        Position thisStart = intervalCoordinates.lowerEndpoint();
        Position otherStart = other.intervalCoordinates().lowerEndpoint();
        return ComparisonChain.start()
                .compare(thisStart.getChromosome(), otherStart.getChromosome())
                .compare(thisStart.getPosition(), otherStart.getPosition())
                .result();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy