net.maizegenetics.pangenome.db_loading.AnchorDataPHG Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Range;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.nio.charset.StandardCharsets;
import java.nio.file.NoSuchFileException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;
/**
 * Immutable holder for the data describing one anchor haplotype: the
 * reference interval it covers, optional assembly coordinates, the
 * sequence itself (with its length and checksum), the gvcf JSON and the
 * ids of the assembly / gvcf files it came from.
 *
 * This class differs from AnchorData in WGS_whatever as it
 * includes more fields to match what is stored in PHGSchema's
 * anchor_haplotypes. Specifically it includes ave_allele_depth
 * and additional_data.
 *
 * May 26, 2017: Class updated to include gene_start and gene_end.
 * These values are needed when trimming anchors. The stored anchors
 * have the sequence of merged genes whose GFF start/end positions overlapped,
 * with 1000bp flanking on each end. This 1000bp may be less if the
 * distance between 2 genes is less than 2000bps.
 *
 * NOTE(review): ave_allele_depth, additional_data, gene_start and gene_end
 * are not fields of the class as it stands today - the notes above look
 * historical; confirm and prune if so.
 *
 * Equality is based on interval, sequence length and sequence hash, while
 * the natural ordering ({@link #compareTo(AnchorDataPHG)}) only looks at the
 * chromosome and start position of the interval. Two instances can
 * therefore compare as 0 without being equal.
 *
 * @author lcj34
 */
public class AnchorDataPHG implements Comparable<AnchorDataPHG> {
    private static final Logger myLogger = Logger.getLogger(AnchorDataPHG.class);

    private final Range<Position> intervalCoordinates;

    // "asm" fields only valid for assembly processing.
    private final String asmContig;
    private final int asmStart;
    private final int asmEnd;
    private final String asmStrand;

    private final String sequence;
    private final int seqLen;
    private final String seqHash; // MD5 digest of sequence, as lower-case hex
    private final String gvcf; // json created from gvcf - will be missing for ref anchors
    private final int asmFileId;
    private final int gvcfFileId;

    /**
     * Creates an anchor record. The sequence length and its MD5 checksum are
     * derived from {@code sequence} here so they always agree with it.
     *
     * @param interval   interval covered by this anchor
     * @param asmContig  assembly contig name (only meaningful for assembly processing)
     * @param asmStart   assembly start coordinate (only meaningful for assembly processing)
     * @param asmEnd     assembly end coordinate (only meaningful for assembly processing)
     * @param asmStrand  assembly strand (only meaningful for assembly processing)
     * @param gvcf       json created from the gvcf; missing for ref anchors
     * @param sequence   the anchor sequence; must not be null
     * @param asmFileId  id of the assembly file
     * @param gvcfFileId id of the gvcf file
     * @throws NullPointerException if {@code sequence} is null
     */
    public AnchorDataPHG(Range<Position> interval, String asmContig, int asmStart, int asmEnd, String asmStrand,
                         String gvcf, String sequence, int asmFileId, int gvcfFileId) {
        this.intervalCoordinates = interval;
        this.asmContig = asmContig;
        this.asmStart = asmStart;
        this.asmEnd = asmEnd;
        this.asmStrand = asmStrand;
        this.gvcf = gvcf;
        this.sequence = sequence;
        this.seqLen = sequence.length();
        this.seqHash = getChecksumForString(sequence, "MD5");
        this.asmFileId = asmFileId;
        this.gvcfFileId = gvcfFileId;
    }

    /**
     * Computes the digest of a string and returns it as lower-case,
     * zero-padded hexadecimal. For the PHG, the protocol should always be MD5.
     *
     * The string is encoded as UTF-8 before hashing so the result does not
     * depend on the platform default charset.
     *
     * @param seq      text to hash; must not be null
     * @param protocol digest algorithm name understood by {@link MessageDigest}, e.g. "MD5"
     * @return hex encoding of the digest (two characters per digest byte)
     * @throws IllegalStateException if {@code protocol} is not an available algorithm
     */
    public static String getChecksumForString(String seq, String protocol) {
        // from https://www.mkyong.com/java/java-md5-hashing-example/
        try {
            MessageDigest md = MessageDigest.getInstance(protocol);
            byte[] digest = md.digest(seq.getBytes(StandardCharsets.UTF_8));

            // Convert each byte to exactly two hex characters: adding 0x100
            // forces a leading digit that substring(1) then strips, which
            // preserves leading zeros.
            StringBuilder hex = new StringBuilder(digest.length * 2);
            for (byte b : digest) {
                hex.append(Integer.toString((b & 0xff) + 0x100, 16).substring(1));
            }
            return hex.toString();
        } catch (NoSuchAlgorithmException exc) {
            myLogger.error("getChecksumForString: problem getting checksum: " + exc.getMessage(), exc);
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new IllegalStateException("CheckSum: getChecksumForString: error: " + exc.getMessage(), exc);
        }
    }

    /** @return the interval covered by this anchor */
    public Range<Position> intervalCoordinates() {
        return intervalCoordinates;
    }

    /** @return the assembly contig name */
    public String asmContig() {
        return asmContig;
    }

    /** @return the assembly start coordinate */
    public int asmStart() {
        return asmStart;
    }

    /** @return the assembly end coordinate */
    public int asmEnd() {
        return asmEnd;
    }

    /** @return the assembly strand */
    public String asmStrand() {
        return asmStrand;
    }

    /** @return the json created from the gvcf */
    public String gvcf() {
        return gvcf;
    }

    /** @return the anchor sequence */
    public String sequence() {
        return sequence;
    }

    /** @return length of {@link #sequence()} */
    public int seqLen() {
        return seqLen;
    }

    /** @return MD5 checksum of {@link #sequence()} as a hex string */
    public String seqHash() {
        return seqHash;
    }

    /** @return id of the assembly file */
    public int asmFileId() {
        return asmFileId;
    }

    /** @return id of the gvcf file */
    public int gvcfFileId() {
        return gvcfFileId;
    }

    /**
     * Two anchors are equal when they are of the same class and share the
     * same interval, sequence length and sequence hash. The asm* fields,
     * gvcf and file ids do not take part in equality.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        AnchorDataPHG that = (AnchorDataPHG) obj;
        return intervalCoordinates().equals(that.intervalCoordinates())
                && seqLen() == that.seqLen()
                && seqHash().equals(that.seqHash());
    }

    /** Hash built from the same three fields that {@link #equals(Object)} compares. */
    @Override
    public int hashCode() {
        int hash = 7;
        hash = 37 * hash + this.intervalCoordinates().hashCode();
        hash = 37 * hash + this.seqHash.hashCode();
        hash = 37 * hash + this.seqLen;
        return hash;
    }

    /**
     * Orders anchors by the chromosome of the interval's lower endpoint and
     * then by that endpoint's position. Sequence content is not considered,
     * so this ordering is not consistent with {@link #equals(Object)}.
     *
     * @param other the anchor to compare against
     * @return negative, zero or positive per the {@link Comparable} contract
     */
    @Override
    public int compareTo(AnchorDataPHG other) {
        Position mine = intervalCoordinates.lowerEndpoint();
        Position theirs = other.intervalCoordinates().lowerEndpoint();
        return ComparisonChain.start()
                .compare(mine.getChromosome(), theirs.getChromosome())
                .compare(mine.getPosition(), theirs.getPosition())
                .result();
    }
}