net.maizegenetics.analysis.gbs.Clusters Maven / Gradle / Ivy
/*
* Clusters
*/
package net.maizegenetics.analysis.gbs;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.TreeSet;
import net.maizegenetics.dna.tag.TagsByTaxa;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.tag.ReadsByTaxa;
/**
*
* @author fl262
*/
public class Clusters {
cluster[] cls;
public Clusters(ReadsByTaxa rbt) {
getClusters(rbt);
}
public Clusters(TagsByTaxa tbt) {
getClusters(tbt);
}
public Clusters(String infileS, boolean binary) {
readCluster(infileS, binary);
}
public void getClusters(ReadsByTaxa rbt) {
PolymorphismFinder pf = new PolymorphismFinder(rbt);
ArrayList clList = new ArrayList();
for (int i = 0; i < rbt.haplotypeNum; i++) {
long[] queryLongSeq = new long[2];
queryLongSeq[0] = rbt.haplotype[0][i];
queryLongSeq[1] = rbt.haplotype[1][i];
ArrayList hitIndex = pf.findOneMismatch(queryLongSeq);
if (hitIndex.isEmpty()) {
continue;
}
Integer[] hitIndexArray = hitIndex.toArray(new Integer[hitIndex.size()]);
for (int j = 0; j < hitIndexArray.length; j++) {
cluster cl = new cluster(i, hitIndexArray[j], hitIndexArray.length + 1, true);
clList.add(cl);
}
}
cls = clList.toArray(new cluster[clList.size()]);
}
// This constructor works for tbt file
public void getClusters(TagsByTaxa tbt) {
ArrayList clList = new ArrayList();
TagMatchFinder tmf = new TagMatchFinder(tbt);
for (int i = 0; i < tbt.getTagCount(); i++) {
long[] qTag = tbt.getTag(i);
TreeMap hitDiv = tmf.findMatchesWithIntLengthWords(qTag, 1, false);
for (Entry each : hitDiv.entrySet()) {
if (each.getValue() > 0) {
clList.add(new cluster(i, each.getKey(), hitDiv.size(), true));
}
}
}
cls = clList.toArray(new cluster[clList.size()]);
//Arrays.sort(cls);
}
public void networkFilter() {
TreeSet clSet = new TreeSet();
TreeSet threeSet = new TreeSet();
ArrayList twoClusterList = new ArrayList();
cluster[] twoCluster;
for (int i = 0; i < cls.length; i++) {
cls[i].switchQueryAndHit();
}
Arrays.sort(cls);
for (int i = 0; i < cls.length; i++) {
if (cls[i].cSize > 2) {
threeSet.add(cls[i].queryIndex);
threeSet.add(cls[i].hitIndex);
} else {
clSet.add(cls[i]);
}
}
cluster[] tempTwoCluster = clSet.toArray(new cluster[clSet.size()]);
Integer[] threeArray = threeSet.toArray(new Integer[threeSet.size()]);
Arrays.sort(threeArray);
for (int i = 0; i < tempTwoCluster.length; i++) {
int hit = Arrays.binarySearch(threeArray, tempTwoCluster[i].queryIndex);
if (hit > -1) {
continue;
}
hit = Arrays.binarySearch(threeArray, tempTwoCluster[i].hitIndex);
if (hit > -1) {
continue;
}
twoClusterList.add(tempTwoCluster[i]);
}
twoCluster = twoClusterList.toArray(new cluster[twoClusterList.size()]);
cls = twoCluster;
Arrays.sort(cls);
twoCluster = null;
tempTwoCluster = null;
threeArray = null;
clSet = null;
threeSet = null;
twoClusterList = null;
}
public void repeatLibraryFilter(ReadsByTaxa rbt, String TagSeqS, String repeatlibS) {
String[] tagSeqs = new String[this.getSnpCount()];
int[] clsIndex = new int[tagSeqs.length];
int count = 0;
for (int i = 0; i < cls.length; i++) {
if (!cls[i].ifSnp) {
continue;
}
long[] longSeq = new long[2];
longSeq[0] = rbt.haplotype[0][cls[i].queryIndex];
longSeq[1] = rbt.haplotype[1][cls[i].queryIndex];
tagSeqs[count] = BaseEncoder.getSequenceFromLong(longSeq);
clsIndex[count] = i;
count++;
}
try {
BufferedWriter bw = new BufferedWriter(new FileWriter(TagSeqS), 65536);
for (int i = 0; i < tagSeqs.length; i++) {
bw.write(">" + String.valueOf(clsIndex[i]));
bw.newLine();
bw.write(tagSeqs[i]);
bw.newLine();
}
bw.flush();
bw.close();
} catch (Exception e) {
System.out.println("Error occurred while writing " + TagSeqS);
}
String blastFileS = TagSeqS.replace("fasta", "blast");
String cmd = "blastn -query " + TagSeqS + " -db " + repeatlibS + " -out " + blastFileS + " -evalue 1e-10 -outfmt 6";
System.out.println(cmd);
try {
Runtime.getRuntime().exec(cmd).waitFor();
} catch (Exception e) {
System.out.println("Error occurred while doing blast " + e.toString());
}
TreeSet repeatIndex = new TreeSet();
try {
BufferedReader br = new BufferedReader(new FileReader(blastFileS), 65536);
String temp;
while ((temp = br.readLine()) != null) {
Integer index = Integer.valueOf(temp.split("\t")[0]);
repeatIndex.add(index);
}
} catch (Exception e) {
System.out.println("Error occurred while reading" + blastFileS);
}
Integer[] repeatIndexArray = repeatIndex.toArray(new Integer[repeatIndex.size()]);
for (int i = 0; i < repeatIndexArray.length; i++) {
cls[repeatIndexArray[i]].ifSnp = false;
}
}
public void repeatFilter(ReadsByTaxa rbt, double ratio) {
int[] tagCount = new int[2 * this.getSnpCount()];
int[] clusterIndex = new int[2 * this.getSnpCount()];
for (int i = 0; i < cls.length; i++) {
if (!cls[i].ifSnp) {
continue;
}
int totalA = 0, totalC = 0;
for (int j = 0; j < rbt.taxaNum; j++) {
totalA += rbt.hapDist[cls[i].queryIndex][j];
totalC += rbt.hapDist[cls[i].hitIndex][j];
}
tagCount[2 * i] = totalA;
clusterIndex[2 * i] = i;
tagCount[2 * i + 1] = totalC;
clusterIndex[2 * i + 1] = i;
}
for (int i = 0; i < tagCount.length - 1; i++) {
for (int j = i + 1; j < tagCount.length; j++) {
if (tagCount[i] > tagCount[j]) {
int mid = tagCount[i];
tagCount[i] = tagCount[j];
tagCount[j] = mid;
mid = clusterIndex[i];
clusterIndex[i] = clusterIndex[j];
clusterIndex[j] = mid;
}
}
}
int beginIndex = (int) Math.floor(clusterIndex.length * (1 - ratio));
for (int i = beginIndex; i < clusterIndex.length; i++) {
cls[clusterIndex[i]].ifSnp = false;
}
}
public int getSnpCount() {
int snpCount = 0;
for (int i = 0; i < cls.length; i++) {
if (cls[i].ifSnp) {
snpCount++;
}
}
return snpCount;
}
public void alleleFrequencyFileter(ReadsByTaxa rbt, double minBorderMaf, double maxBorderMaf) {
for (int i = 0; i < cls.length; i++) {
if (!cls[i].ifSnp) {
continue;
}
int totalA = 0, totalC = 0;
for (int j = 0; j < rbt.taxaNum; j++) {
totalA += rbt.hapDist[cls[i].queryIndex][j];
totalC += rbt.hapDist[cls[i].hitIndex][j];
}
int min = totalA;
if (totalC < totalA) {
min = totalC;
}
double maf = (double) min / (double) (totalA + totalC);
if (maf < minBorderMaf || maf > maxBorderMaf) {
cls[i].ifSnp = false;
}
}
}
public void heteozygoteFilter(ReadsByTaxa rbt) {
for (int i = 0; i < cls.length; i++) {
int heteoA = 0, heteoC = 0, totalA = 0, totalC = 0;
boolean flag = false;
if (cls[i].ifSnp) {
for (int j = 0; j < rbt.taxaNum; j++) {
if (rbt.hapDist[cls[i].queryIndex][j] > 0) {
totalA += rbt.hapDist[cls[i].queryIndex][j];
if (rbt.hapDist[cls[i].hitIndex][j] > 0) {
totalC += rbt.hapDist[cls[i].hitIndex][j];
heteoA += rbt.hapDist[cls[i].queryIndex][j];
heteoC += rbt.hapDist[cls[i].hitIndex][j];
boolean ifSig = chiSquareEvenDf1(rbt.hapDist[cls[i].queryIndex][j], rbt.hapDist[cls[i].hitIndex][j], 3.841);
if (ifSig) {
cls[i].ifSnp = false;
flag = true;
break;
}
}
} else {
if (rbt.hapDist[cls[i].hitIndex][j] > 0) {
totalC += rbt.hapDist[cls[i].hitIndex][j];
}
}
}
} else {
continue;
}
if (flag) {
continue;
}
if (chiSquareEvenDf1(heteoA, heteoC, 3.841)) {
cls[i].ifSnp = false;
continue;
}
/*
if (!chiSquareDf1(heteoA, heteoC, totalA, totalC, 0)) {
cls[i].ifSnp = false;
}
*
*/
}
}
public boolean chiSquareDf1(int ob1, int ob2, int ex1, int ex2, double chiValueCutoff) {
double o1 = (double) ob1 / (double) (ob1 + ob2);
double o2 = (double) ob2 / (double) (ob1 + ob2);
double e1 = (double) ex1 / (double) (ex1 + ex2);
double e2 = (double) ex2 / (double) (ex1 + ex2);
double chi = Math.pow(o1 - e1, 2) / e1 + Math.pow(o2 - e2, 2) / e2;
if (chi > chiValueCutoff) {
return true;
}
return false;
}
public boolean chiSquareEvenDf1(int ob1, int ob2, double chiValueCutoff) {
double o1 = (double) ob1;
double o2 = (double) ob2;
double ex = (o1 + o2) / 2;
double chi = Math.pow(o1 - ex, 2) / ex + Math.pow(o2 - ex, 2) / ex;
if (chi > chiValueCutoff) {
return true;
}
return false;
}
public void readCluster(String infileS, boolean binary) {
try {
DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(infileS), 65536));
cls = new cluster[dis.readInt()];
int[] temp = new int[3];
for (int i = 0; i < cls.length; i++) {
for (int j = 0; j < temp.length; j++) {
temp[j] = dis.readInt();
}
cls[i] = new cluster(temp[0], temp[1], temp[2], dis.readBoolean());
}
} catch (Exception e) {
System.out.println("erroe in reading" + infileS + e.toString());
}
}
public void hapDetail(ReadsByTaxa rbt, String outfileS, float coverRate) {
ArrayList al = new ArrayList();
String[] hapMap;
for (int i = 0; i < cls.length; i++) {
if (cls[i].ifSnp) {
String temp;
temp = getHapDetail(rbt, i, cls[i].queryIndex, cls[i].hitIndex, coverRate);
if (temp != null) {
al.add(temp);
}
}
}
hapMap = al.toArray(new String[al.size()]);
try {
BufferedWriter bw = new BufferedWriter(new FileWriter(outfileS), 65536);
bw.write("rs alleles chrom pos strand assembly center protLSID assayLSID panelLSID QCcode ");
for (int i = 0; i < rbt.getTaxaCount() - 1; i++) {
bw.write(rbt.getTaxaNames()[i] + "\t");
}
bw.write(rbt.getTaxaNames()[rbt.getTaxaCount() - 1]);
bw.newLine();
int count = 0;
for (int i = 0; i < hapMap.length; i++) {
bw.write(hapMap[i]);
bw.newLine();
}
bw.flush();
bw.close();
} catch (Exception e) {
System.out.println(e.toString());
}
}
private String getHapDetail(ReadsByTaxa rbt, int ID, int queryIndex, int hitIndex, float coverRate) {
StringBuilder sb = new StringBuilder();
sb.append(queryIndex).append("\tA/C\t").append(1).append("\t").append(hitIndex).append("\t+\tNA\tSWGDiv\tGBS\tSWGV1\tSWGPop\tQC+\t");
int countN = 0;
int totalA = 0, totalC = 0;
int hetQuery = 0, hetHit = 0;
for (int i = 0; i < rbt.taxaNum; i++) {
if (rbt.hapDist[queryIndex][i] > 0) {
totalA += rbt.hapDist[queryIndex][i];
if (rbt.hapDist[hitIndex][i] > 0) {
totalC += rbt.hapDist[hitIndex][i];
sb.append(String.valueOf(rbt.hapDist[queryIndex][i])).append("|").append(String.valueOf(rbt.hapDist[hitIndex][i])).append("\t");
hetQuery += rbt.hapDist[queryIndex][i];
hetHit += rbt.hapDist[hitIndex][i];
} else {
sb.append(String.valueOf(rbt.hapDist[queryIndex][i])).append("|").append("\t");
}
} else {
if (rbt.hapDist[hitIndex][i] > 0) {
totalC += rbt.hapDist[hitIndex][i];
sb.append("|");
sb.append(String.valueOf(rbt.hapDist[hitIndex][i])).append("\t");
} else {
sb.append("N").append("\t");
countN++;
}
}
}
if (1 - (float) countN / (float) rbt.taxaNum < coverRate) {
return null;
}
sb.append(String.valueOf(hetQuery)).append("\t");
sb.append(String.valueOf(hetHit)).append("\t");
sb.append(String.valueOf(totalA)).append("\t");
sb.append(String.valueOf(totalC));
return sb.toString();
}
public void writeHapMap(ReadsByTaxa rbt, String outfileS, float coverRate) {
String[] hapMap;
ArrayList al = new ArrayList();
for (int i = 0; i < cls.length; i++) {
String temp;
if (cls[i].ifSnp) {
temp = getHapMapRecord(rbt, i + 1, cls[i].queryIndex, cls[i].hitIndex, coverRate);
if (temp != null) {
al.add(temp);
}
}
}
hapMap = al.toArray(new String[al.size()]);
try {
BufferedWriter bw = new BufferedWriter(new FileWriter(outfileS), 65536);
bw.write("rs alleles chrom pos strand assembly center protLSID assayLSID panelLSID QCcode ");
for (int i = 0; i < rbt.getTaxaCount() - 1; i++) {
bw.write(rbt.getTaxaNames()[i] + "\t");
}
bw.write(rbt.getTaxaNames()[rbt.getTaxaCount() - 1]);
bw.newLine();
for (int i = 0; i < hapMap.length; i++) {
bw.write(hapMap[i]);
bw.newLine();
}
bw.flush();
bw.close();
} catch (Exception e) {
System.out.println(e.toString());
}
}
private String getHapMapRecord(ReadsByTaxa rbt, int ID, int queryIndex, int hitIndex, float coverRate) {
StringBuilder sb = new StringBuilder();
sb.append(queryIndex).append("|").append(hitIndex).append("\tA/C\t").append(1).append("\t").append(ID).append("\t+\tNA\tSWGDiv\tGBS\tSWGV1\tSWGPop\tQC+\t");
int countN = 0;
int totalA = 0, totalC = 0;
for (int i = 0; i < rbt.taxaNum; i++) {
if (rbt.hapDist[queryIndex][i] > 0) {
totalA += rbt.hapDist[queryIndex][i];
if (rbt.hapDist[hitIndex][i] > 0) {
totalC += rbt.hapDist[hitIndex][i];
sb.append("R").append("\t");
} else {
sb.append("A").append("\t");
}
} else {
if (rbt.hapDist[hitIndex][i] > 0) {
totalC += rbt.hapDist[hitIndex][i];
sb.append("C").append("\t");
} else {
sb.append("N").append("\t");
countN++;
}
}
}
if (1 - (float) countN / (float) rbt.taxaNum < coverRate) {
return null;
}
sb.deleteCharAt(sb.length() - 1);
return sb.toString();
}
public void writeCluster(String outfileS, boolean binary) {
int snpCount = getSnpCount();
if (binary) {
try {
DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(outfileS), 65536));
dos.writeInt(snpCount);
for (int i = 0; i < cls.length; i++) {
cls[i].writeBinary(dos);
}
dos.flush();
dos.close();
} catch (Exception e) {
System.out.println(e.toString());
}
} else {
try {
BufferedWriter bw = new BufferedWriter(new FileWriter(outfileS), 65536);
bw.write(Integer.toString(snpCount));
bw.newLine();
bw.write("queryIndex\thitIndex\tdiv\tclusterSize");
bw.newLine();
for (int i = 0; i < cls.length; i++) {
cls[i].writeTxt(bw);
}
bw.flush();
bw.close();
} catch (Exception e) {
System.out.println(e.toString());
}
}
}
public void writeFastA(ReadsByTaxa rbt, String outFastAS) {
try {
int count = 1;
BufferedWriter bw = new BufferedWriter(new FileWriter(outFastAS), 65536);
for (int i = 0; i < cls.length; i++) {
if (!cls[i].ifSnp) {
continue;
}
long longSeq[] = new long[2];
longSeq[0] = rbt.haplotype[0][cls[i].queryIndex];
longSeq[1] = rbt.haplotype[1][cls[i].queryIndex];
bw.write(">" + String.valueOf(count));
bw.newLine();
bw.write("G" + BaseEncoder.getSequenceFromLong(longSeq));
bw.newLine();
count++;
longSeq[0] = rbt.haplotype[0][cls[i].hitIndex];
longSeq[1] = rbt.haplotype[1][cls[i].hitIndex];
bw.write(">" + String.valueOf(count));
bw.newLine();
bw.write("G" + BaseEncoder.getSequenceFromLong(longSeq));
bw.newLine();
count++;
}
bw.flush();
bw.close();
} catch (Exception e) {
System.out.println("Error occurred while writing" + outFastAS);
}
}
public void prin() {
for (int i = 0; i < 100; i++) {
cls[i].screenPri();
}
System.out.println(cls.length);
}
}
class cluster implements Comparable {
int queryIndex;
int hitIndex;
int cSize;
boolean ifSnp;
public cluster(int queryIndex, int hitIndex, int cSize, boolean ifSnp) {
this.queryIndex = queryIndex;
this.hitIndex = hitIndex;
this.cSize = cSize;
this.ifSnp = ifSnp;
}
public void setIfSnp(boolean ifSnp) {
this.ifSnp = ifSnp;
}
public void writeBinary(DataOutputStream dos) {
try {
if (ifSnp) {
dos.writeInt(queryIndex);
dos.writeInt(hitIndex);
dos.writeInt(cSize);
dos.writeBoolean(ifSnp);
}
} catch (Exception e) {
System.out.println(e.toString());
}
}
public void writeTxt(BufferedWriter bw) {
try {
if (ifSnp) {
bw.write(queryIndex + "\t" + hitIndex + "\t" + cSize + "\t" + String.valueOf(ifSnp));
bw.newLine();
}
} catch (Exception e) {
System.out.println(e.toString());
}
}
public void screenPri() {
System.out.println(queryIndex + "\t" + hitIndex + "\t" + cSize);
}
public void switchQueryAndHit() {
if (queryIndex > hitIndex) {
int mid = queryIndex;
queryIndex = hitIndex;
hitIndex = mid;
}
}
public int compareTo(cluster o) {
if (queryIndex < o.queryIndex) {
return -1;
} else if (queryIndex > o.queryIndex) {
return 1;
} else {
if (hitIndex < o.hitIndex) {
return -1;
} else if (hitIndex > o.hitIndex) {
return 1;
} else {
if (cSize < o.cSize) {
return -1;
} else if (cSize > o.cSize) {
return 1;
} else {
return 0;
}
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy