
// umcg.genetica.io.probemapping.reading Maven / Gradle / Ivy
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package umcg.genetica.io.probemapping;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import umcg.genetica.containers.Pair;
import umcg.genetica.containers.Triple;
import umcg.genetica.math.matrix.DoubleMatrixDataset;
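/**
 * Static helpers that read SAM alignment files, tab-delimited annotation files,
 * GTF files and SNP annotation files into in-memory collections.
 *
 * @author MarcJan
 */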
public class reading {
/** Splits a line on tab characters. */
private static final Pattern SPLIT_ON_TAB = Pattern.compile("\t");
/** Splits on a bare semicolon (used for the entries of a SAM {@code XA:Z:} tag). */
private static final Pattern SPLIT_ON_SEMICOLON = Pattern.compile(";");
/** Splits on a semicolon followed by one space (used for the GTF attribute column). */
private static final Pattern SPLIT_ON_SEMICOLON2 = Pattern.compile("; ");
/** Splits on a single space. */
private static final Pattern SPLIT_ON_SPACE = Pattern.compile(" ");
/** Character encoding used to decode every file read by this class. */
protected static final String ENCODING = "ISO-8859-1";
/**
* Read sam files
* @param folderIn
* @param fileExtention
* @return
*/
public static HashMap>> readInMultipleSamFiles(String folderIn, String fileExtention) {
HashMap>> readAlignments = new HashMap>>();
File file = new File(folderIn);
File[] files = file.listFiles();
ArrayList vecFiles = new ArrayList();
for (int f = 0; f < files.length; f++) {
if (files[f].getAbsolutePath().endsWith(fileExtention)) {
vecFiles.add(files[f]);
}
}
for (int f = 0; f < vecFiles.size(); f++) {
File currentFile = vecFiles.get(f);
System.out.println("Processing:\t" + f + "\t" + currentFile.getAbsolutePath());
String id = vecFiles.get(f).toString();
id = id.replace(currentFile.getParent(), "");
if (id.contains(".chromosome.")) {
id = id.split(".chromosome.")[1];
id = id.replace(".sam", "");
id = "Chr" + id;
} else if (id.contains(".nchr.")) {
id = "Non Chromosomal region";
} else if (id.contains(".ncrna.")) {
id = "Non Coding RNA Transcripts";
} else if (id.contains(".cdna.")) {
id = "Transcripts";
}
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(currentFile), ENCODING), 8096);
String str;
while ((str = in.readLine()) != null) {
//System.out.println(str);
if (!str.startsWith("@")) {
String[] parts = SPLIT_ON_TAB.split(str);
if (!(parts[1].equals("4") || parts[1].equals("20"))) {
ArrayList> lines = new ArrayList>(7);
ArrayList line = new ArrayList();
line.add(id);
line.add(parts[2]);
if (parts[1].equals("16")) {
line.add("-");
} else {
line.add("+");
}
line.add(parts[3]);
line.add(String.valueOf(Integer.parseInt(parts[3]) + (parts[9].length() - 1)));
line.add(parts[5]);
for (int i = 6; i < parts.length; ++i) {
if (parts[i].startsWith("NM:")) {
line.add(parts[i].replace("NM:i:", ""));
}
if (parts[i].startsWith("XA:Z:")) {
String[] parts2 = SPLIT_ON_SEMICOLON.split(parts[i].replace("XA:Z:", ""));
for (String piece : parts2) {
ArrayList t = new ArrayList(7);
t.add(id);
String[] parts3 = piece.split(",");
for (int j = 0; j < parts3.length; ++j) {
if (j == 1) {
if (parts3[j].startsWith("+")) {
t.add("+");
t.add(parts3[j].replace("+", ""));
t.add(String.valueOf(Integer.parseInt(parts3[j].replace("+", "")) + (parts[9].length() - 1)));
} else {
t.add("-");
t.add(parts3[j].replace("-", ""));
t.add(String.valueOf(Integer.parseInt(parts3[j].replace("-", "")) + (parts[9].length() - 1)));
}
}
t.add(parts3[j]);
}
lines.add(t);
}
}
}
lines.add(line);
if (readAlignments.containsKey(parts[0])) {
readAlignments.get(parts[0]).addAll(lines);
} else {
readAlignments.put(parts[0], lines);
}
}
}
}
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
}
return (readAlignments);
}
/**
* Read sam files
* if there are more than maxDiff differences in the read the row is skipped
* @param folderIn
* @param fileExtention
* @param maxDiff
* @return
*/
public static HashMap>> readInMultipleSamFiles2(String folderIn, String fileExtention, int maxDiff) {
HashMap>> readAlignments = new HashMap>>();
File file = new File(folderIn);
File[] files = file.listFiles();
ArrayList vecFiles = new ArrayList();
for (int f = 0; f < files.length; f++) {
if (files[f].getAbsolutePath().endsWith(fileExtention)) {
vecFiles.add(files[f]);
}
}
for (int f = 0; f < vecFiles.size(); f++) {
File currentFile = vecFiles.get(f);
System.out.println("Processing:\t" + f + "\t" + currentFile.getAbsolutePath());
String id = vecFiles.get(f).toString();
id = id.replace(currentFile.getParent(), "");
if (id.contains(".chromosome.")) {
id = id.split(".chromosome.")[1];
id = id.replace(".sam", "");
id = "Chr" + id;
} else if (id.contains(".nchr.")) {
id = "Non Chromosomal region";
} else if (id.contains(".ncrna.")) {
id = "Non Coding RNA Transcripts";
} else if (id.contains(".cdna.")) {
id = "Transcripts";
}
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(currentFile), ENCODING), 8096);
String str;
while ((str = in.readLine()) != null) {
//System.out.println(str);
if (!str.startsWith("@")) {
String[] parts = SPLIT_ON_TAB.split(str);
if (!(parts[1].equals("4") || parts[1].equals("20"))) {
ArrayList> lines = new ArrayList>(7);
ArrayList line = new ArrayList();
line.add(id);
line.add(parts[2]);
if (parts[1].equals("16")) {
line.add("-");
} else {
line.add("+");
}
line.add(parts[3]);
line.add(String.valueOf(Integer.parseInt(parts[3]) + (parts[9].length() - 1)));
line.add(parts[5]);
for (int i = 6; i < parts.length; ++i) {
if (parts[i].startsWith("NM:")) {
line.add(parts[i].replace("NM:i:", ""));
}
if (parts[i].startsWith("XA:Z:")) {
String[] parts2 = SPLIT_ON_SEMICOLON.split(parts[i].replace("XA:Z:", ""));
for (String piece : parts2) {
ArrayList t = new ArrayList(7);
t.add(id);
String[] parts3 = piece.split(",");
for (int j = 0; j < parts3.length; ++j) {
if (j == 1) {
if (parts3[j].startsWith("+")) {
t.add("+");
t.add(parts3[j].replace("+", ""));
t.add(String.valueOf(Integer.parseInt(parts3[j].replace("+", "")) + (parts[9].length() - 1)));
} else {
t.add("-");
t.add(parts3[j].replace("-", ""));
t.add(String.valueOf(Integer.parseInt(parts3[j].replace("-", "")) + (parts[9].length() - 1)));
}
}
t.add(parts3[j]);
}
if (Integer.parseInt(t.get(t.size() - 1)) <= maxDiff) {
lines.add(t);
}
}
}
}
if (Integer.parseInt(line.get(line.size() - 1)) <= maxDiff) {
lines.add(line);
}
if (readAlignments.containsKey(parts[0])) {
readAlignments.get(parts[0]).addAll(lines);
} else {
readAlignments.put(parts[0], lines);
}
}
}
}
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
}
return (readAlignments);
}
/**
* Read sam files
* if there are more than maxDiff differences in the read the row is skipped
* Degenerate bases are not counted as a mismatch (removed from edit distance)
*
* @param folderIn
* @param fileExtention
* @param maxDiff
* @return
*/
public static HashMap>> readInMultipleSamFiles2DG(String folderIn, String fileExtention, int maxDiff) {
HashMap>> readAlignments = new HashMap>>();
File file = new File(folderIn);
File[] files = file.listFiles();
ArrayList vecFiles = new ArrayList();
for (int f = 0; f < files.length; f++) {
if (files[f].getAbsolutePath().endsWith(fileExtention)) {
vecFiles.add(files[f]);
}
}
for (int f = 0; f < vecFiles.size(); f++) {
File currentFile = vecFiles.get(f);
System.out.println("Processing:\t" + f + "\t" + currentFile.getAbsolutePath());
String id = vecFiles.get(f).toString();
id = id.replace(currentFile.getParent(), "");
if (id.contains(".chromosome.")) {
id = id.split(".chromosome.")[1];
id = id.replace(".sam", "");
id = "Chr" + id;
} else if (id.contains(".nchr.")) {
id = "Non Chromosomal region";
} else if (id.contains(".ncrna.")) {
id = "Non Coding RNA Transcripts";
} else if (id.contains(".cdna.")) {
id = "Transcripts";
}
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(currentFile), ENCODING), 8096);
String str;
while ((str = in.readLine()) != null) {
//System.out.println(str);
if (!str.startsWith("@")) {
String[] parts = SPLIT_ON_TAB.split(str);
if (!(parts[1].equals("4") || parts[1].equals("20"))) {
ArrayList> lines = new ArrayList>(7);
ArrayList line = new ArrayList();
line.add(id);
line.add(parts[2]);
if (parts[1].equals("16")) {
line.add("-");
} else {
line.add("+");
}
line.add(parts[3]);
line.add(String.valueOf(Integer.parseInt(parts[3]) + (parts[9].length() - 1)));
//System.out.println(parts[9]);
int nrNs = getNrNs(parts[9]);
line.add(parts[5]);
for (int i = 10; i < parts.length; ++i) {
if (parts[i].startsWith("NM:")) {
int maxDif = Integer.parseInt(parts[i].replace("NM:i:", ""));
line.add(String.valueOf(maxDif - nrNs));
}
if (parts[i].startsWith("XA:Z:")) {
System.out.println("Skrewed");
}
}
if (Integer.parseInt(line.get(line.size() - 1)) <= maxDiff) {
lines.add(line);
}
if (readAlignments.containsKey(parts[0])) {
readAlignments.get(parts[0]).addAll(lines);
} else {
readAlignments.put(parts[0], lines);
}
}
}
}
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
}
return (readAlignments);
}
/**
 * Counts how many characters of a sequence are {@code N} or {@code n}.
 *
 * @param string sequence to inspect
 * @return number of {@code N}/{@code n} characters in {@code string}
 */
private static int getNrNs(String string) {
    int count = 0;
    final int length = string.length();
    for (int pos = 0; pos < length; pos++) {
        switch (string.charAt(pos)) {
            case 'N':
            case 'n':
                count++;
                break;
            default:
                break;
        }
    }
    return count;
}
/**
* Read annotation file.
* General read in
* @param annotationFile
* @param storingId
* @param sizeMap
* @return
*/
public static HashMap> readAnnotationFile(String annotationFile, int storingId, int sizeMap) {
HashMap> probeInfo = new HashMap>((int) Math.ceil(sizeMap / 0.75));
int entryId = 0;
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(annotationFile)), ENCODING), 8096);
String str = "";
str = in.readLine();
String[] header = SPLIT_ON_TAB.split(str);
while ((str = in.readLine()) != null) {
String[] strParts = SPLIT_ON_TAB.split(str);
HashMap t = new HashMap((int) Math.ceil(header.length / 0.75));
for (int i = 0; i < strParts.length; ++i) {
if (i != storingId) {
t.put(header[i], strParts[i]);
}
}
if (storingId == -1) {
probeInfo.put(String.valueOf(entryId), t);
entryId++;
} else if (storingId == -2) {
probeInfo.put(strParts[0]+"-"+strParts[1]+"-"+strParts[22], t);
entryId++;
}else {
probeInfo.put(strParts[storingId], t);
}
}
} catch (IOException e) {
e.printStackTrace();
System.out.println(e.getMessage());
System.exit(-1);
}
return (probeInfo);
}
/**
 * Reads two columns of a tab-delimited file into a key/value map.
 *
 * <p>When the same key occurs on several rows the last row wins. An {@link IOException} is
 * reported on standard output and terminates the JVM with exit code -1.</p>
 *
 * @param annotationFile path of the file to read
 * @param firstRowAsHeader if {@code true} the first line is skipped
 * @param key index of the column used as map key
 * @param val index of the column used as map value, or -1 to store the complete line
 * @param sizeMap expected number of rows, used to size the returned map
 * @return map from the key column to the value column (or to the whole line)
 */
public static HashMap<String, String> readAnnotationFileHash(String annotationFile, boolean firstRowAsHeader, int key, int val, int sizeMap) {
    HashMap<String, String> probeInfo = new HashMap<String, String>((int) Math.ceil(sizeMap / 0.75));
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(annotationFile)), ENCODING), 8096);
        try {
            if (firstRowAsHeader) {
                in.readLine();
            }
            String str;
            while ((str = in.readLine()) != null) {
                String[] strParts = SPLIT_ON_TAB.split(str);
                probeInfo.put(strParts[key], val == -1 ? str : strParts[val]);
            }
        } finally {
            in.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println(e.getMessage());
        System.exit(-1);
    }
    return probeInfo;
}
/**
* Read annotation file
* Supply int key and int for value
*
* @param annotationFile
* @param firstRowAsHeader
* @param key
* @param val
* @param sizeMap
* @return
*/
public static HashMap> readAnnotationFileHashMap(String annotationFile, boolean firstRowAsHeader, int key, int val1, int val2, int val3, int sizeMap) {
HashMap> probeInfo = new HashMap>((int) Math.ceil(sizeMap / 0.75));
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(annotationFile)), ENCODING), 8096);
String str = "";
if (firstRowAsHeader) {
str = in.readLine();
}
while ((str = in.readLine()) != null) {
String[] strParts = SPLIT_ON_TAB.split(str);
Triple tmp;
if(strParts[val2].contains(":")){
strParts[val2] = strParts[val2].split(":")[0];
}
if(strParts[val3].contains(":")){
strParts[val3] = strParts[val3].split(":")[1];
}
if(strParts[val1].equals("Y")){
strParts[val1] = "24";
} else if(strParts[val1].equals("X")){
strParts[val1] = "23";
} else {
}
tmp = new Triple(Integer.parseInt(strParts[val1]),Integer.parseInt(strParts[val2]),Integer.parseInt(strParts[val3]));
probeInfo.put(strParts[key], tmp);
}
} catch (IOException e) {
e.printStackTrace();
System.out.println(e.getMessage());
System.exit(-1);
}
return (probeInfo);
}
/**
 * Reads three integer columns of a tab-delimited file into a list of triples, one per row.
 *
 * <p>In the {@code col1} cell every occurrence of {@code chr} is removed, then {@code X}
 * becomes 23 and {@code Y} becomes 24 (case-insensitive). Rows whose {@code col1} cell is
 * still longer than two characters are skipped.</p>
 *
 * <p>An {@link IOException} is reported on standard output and terminates the JVM with
 * exit code -1.</p>
 *
 * @param annotationFile path of the file to read
 * @param col1 index of the column holding the first value (presumably the chromosome)
 * @param col2 index of the column holding the second value
 * @param col3 index of the column holding the third value
 * @param firstRowAsHeader if {@code true} the first line is skipped
 * @param sizeMap expected number of rows, used to size the returned list
 * @return the {@code (col1, col2, col3)} triples of all rows that were not skipped, in file order
 * @throws NumberFormatException if a cell of a kept row is not an integer
 */
public static ArrayList<Triple<Integer, Integer, Integer>> readAnnotationFileArrayList(String annotationFile, int col1, int col2, int col3, boolean firstRowAsHeader, int sizeMap) {
    ArrayList<Triple<Integer, Integer, Integer>> probeInfo = new ArrayList<Triple<Integer, Integer, Integer>>((int) Math.ceil(sizeMap / 0.75));
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(annotationFile)), ENCODING), 8096);
        try {
            if (firstRowAsHeader) {
                in.readLine();
            }
            String str;
            while ((str = in.readLine()) != null) {
                String[] strParts = SPLIT_ON_TAB.split(str);
                String chromosome = strParts[col1].replace("chr", "");
                if (chromosome.equalsIgnoreCase("Y")) {
                    chromosome = "24";
                } else if (chromosome.equalsIgnoreCase("X")) {
                    chromosome = "23";
                }
                // Write the normalised value back so overlapping column indices see it, as before.
                strParts[col1] = chromosome;
                // NOTE(review): a one- or two-letter non-numeric name (e.g. "M", "MT") passes this
                // length test and then fails in parseInt.
                if (chromosome.length() <= 2) {
                    probeInfo.add(new Triple<Integer, Integer, Integer>(
                            Integer.parseInt(strParts[col1]),
                            Integer.parseInt(strParts[col2]),
                            Integer.parseInt(strParts[col3])));
                }
            }
        } finally {
            in.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println(e.getMessage());
        System.exit(-1);
    }
    return probeInfo;
}
/**
 * Reads a GTF file and maps gene ids and transcript ids to the gene name.
 *
 * <p>Only lines with exactly nine tab-separated columns are used. The ninth column is split
 * into {@code key value} attributes; double quotes are removed and only the first
 * space-separated token of a value is kept. For {@code gene_id} and {@code transcript_id}
 * everything from the first dot on (the version suffix) is dropped, and both ids are mapped to
 * the {@code gene_name} attribute of the same line. A row lacking one of the two ids simply
 * contributes no mapping for it.</p>
 *
 * <p>If an id was already mapped to a different name, the id, the known name and the new name
 * are printed and the JVM is terminated with exit code 0. An {@link IOException} is
 * reported on standard output and terminates the JVM with exit code -1.</p>
 *
 * @param annotationFile path of the GTF file
 * @param sizeMap expected number of ids, used to size the returned map
 * @return map from unversioned gene id / transcript id to gene name
 */
public static HashMap<String, String> readGTFAnnotationFileHash(String annotationFile, int sizeMap) {
    HashMap<String, String> probeInfo = new HashMap<String, String>((int) Math.ceil(sizeMap / 0.75));
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(annotationFile)), ENCODING), 8096);
        try {
            String str;
            while ((str = in.readLine()) != null) {
                String[] strParts = SPLIT_ON_TAB.split(str);
                if (strParts.length != 9) {
                    continue;
                }
                String attributeColumn = strParts[8];
                // The column normally ends with ';', which the "; " split would otherwise leave
                // attached to the value of the last attribute.
                if (attributeColumn.endsWith(";")) {
                    attributeColumn = attributeColumn.substring(0, attributeColumn.length() - 1);
                }
                HashMap<String, String> attributes = new HashMap<String, String>();
                for (String attribute : SPLIT_ON_SEMICOLON2.split(attributeColumn)) {
                    String[] keyValue = SPLIT_ON_SPACE.split(attribute.replaceAll("\"", ""));
                    if (keyValue.length > 1) {
                        attributes.put(keyValue[0], keyValue[1]);
                    }
                }
                String geneName = attributes.get("gene_name");
                registerGtfIdentifier(probeInfo, attributes.get("gene_id"), geneName);
                registerGtfIdentifier(probeInfo, attributes.get("transcript_id"), geneName);
            }
        } finally {
            in.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println(e.getMessage());
        System.exit(-1);
    }
    return probeInfo;
}

/**
 * Maps the unversioned form of {@code versionedId} to {@code geneName}, or terminates the JVM
 * when the id is already mapped to a different name. Does nothing for a {@code null} id.
 */
private static void registerGtfIdentifier(HashMap<String, String> probeInfo, String versionedId, String geneName) {
    if (versionedId == null) {
        return;
    }
    String id = versionedId.split("\\.")[0];
    if (!probeInfo.containsKey(id)) {
        probeInfo.put(id, geneName);
        return;
    }
    String known = probeInfo.get(id);
    boolean sameName = (known == null) ? (geneName == null) : known.equals(geneName);
    if (!sameName) {
        System.out.println(id + "\t" + known + "\t" + geneName);
        // NOTE(review): exit code 0 on an inconsistency is kept from the original.
        System.exit(0);
    }
}
/**
* Read SNP information files
* @param annotationFileFolder
* @param minMaf
* @param firstRowAsHeader
* @return
*/
public static HashMap> readMultipleSNPAnnotationFilesSmall(String annotationFileFolder, double minMaf, boolean firstRowAsHeader) {
HashMap> snpInfo = new HashMap>();
File file = new File(annotationFileFolder);
File[] files = file.listFiles();
ArrayList vecFiles = new ArrayList();
for (int f = 0; f < files.length; f++) {
if (files[f].getAbsolutePath().endsWith(".txt")) {
vecFiles.add(files[f]);
}
}
for (int f = 0; f < vecFiles.size(); f++) {
File currentFile = vecFiles.get(f);
System.out.println("Processing:\t" + f + "\t" + currentFile.getAbsolutePath());
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(currentFile), ENCODING), 8096);
String str = "";
if (firstRowAsHeader) {
str = in.readLine();
}
String[] header = SPLIT_ON_TAB.split(str);
while ((str = in.readLine()) != null) {
String[] strParts = SPLIT_ON_TAB.split(str);
if (!strParts[5].isEmpty()) {
//System.out.println(strParts[5]);
if (Double.parseDouble(strParts[5]) > minMaf) {
if (snpInfo.containsKey(strParts[2])) {
snpInfo.get(strParts[2]).add(Integer.parseInt(strParts[3]));
} else {
HashSet locations = new HashSet();
snpInfo.put(strParts[2], locations);
}
}
}
}
} catch (IOException e) {
e.printStackTrace();
System.out.println(e.getMessage());
System.exit(-1);
}
}
return (snpInfo);
}
/**
* Read SNP information files
* @param annotationFileFolder
* @param minMaf
* @param firstRowAsHeader
* @return
*/
public static HashMap> readMultipleSNPAnnotationFiles(String annotationFileFolder, double minMaf, boolean firstRowAsHeader) {
HashMap> snpInfo = new HashMap>(25);
File file = new File(annotationFileFolder);
File[] files = file.listFiles();
ArrayList vecFiles = new ArrayList();
for (int f = 0; f < files.length; f++) {
if (files[f].getAbsolutePath().endsWith(".txt")) {
vecFiles.add(files[f]);
}
}
for (int f = 0; f < vecFiles.size(); f++) {
File currentFile = vecFiles.get(f);
System.out.println("Processing:\t" + f + "\t" + currentFile.getAbsolutePath());
String currectChr = "";
HashMap locations = new HashMap();
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(currentFile), ENCODING), 8096);
String str = "";
if (firstRowAsHeader) {
str = in.readLine();
}
String[] header = SPLIT_ON_TAB.split(str);
while ((str = in.readLine()) != null) {
String[] strParts = SPLIT_ON_TAB.split(str);
StringBuilder keys = new StringBuilder(strParts[0]);
if(strParts.length>8 && !strParts[8].isEmpty()){
keys.append(";").append(strParts[8]);
}
if (!strParts[5].isEmpty()) {
//System.out.println(strParts[5]);
if (Double.parseDouble(strParts[5]) >= minMaf) {
if (locations.size()>0) {
if(locations.containsKey(Integer.parseInt(strParts[3])) && strParts.length>8){
StringBuilder newKeys = new StringBuilder(locations.get(Integer.parseInt(strParts[3])));
newKeys.append(";").append(strParts[8]);
locations.put(Integer.parseInt(strParts[3]), newKeys.toString());
} else{
locations.put(Integer.parseInt(strParts[3]), keys.toString());
}
} else {
locations.put(Integer.parseInt(strParts[3]), keys.toString());
currectChr = strParts[2];
}
}
}
}
ArrayList keys = new ArrayList();
keys.addAll(locations.keySet());
Collections.sort(keys);
LinkedHashMap locations2 = new LinkedHashMap((int)(Math.round((double)locations.size() / 0.75)));
for(Integer i : keys){
locations2.put(i, locations.get(i));
}
locations = null;
snpInfo.put(currectChr, locations2);
} catch (IOException e) {
e.printStackTrace();
System.out.println(e.getMessage());
System.exit(-1);
}
}
return (snpInfo);
}
/**
* Read SNP information files
* @param annotationFileFolder
* @param minMaf
* @param firstRowAsHeader
* @return
*/
public static HashMap> readMultipleSNPAnnotationFiles2(String annotationFileFolder, double minMaf, boolean firstRowAsHeader) {
HashMap> snpInfo = new HashMap>(25);
File file = new File(annotationFileFolder);
File[] files = file.listFiles();
ArrayList vecFiles = new ArrayList();
for (int f = 0; f < files.length; f++) {
if (files[f].getAbsolutePath().endsWith(".txt")) {
vecFiles.add(files[f]);
}
}
for (int f = 0; f < vecFiles.size(); f++) {
File currentFile = vecFiles.get(f);
System.out.println("Processing:\t" + f + "\t" + currentFile.getAbsolutePath());
String currectChr = "";
HashMap locations = new HashMap();
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(currentFile), ENCODING), 8096);
String str = "";
if (firstRowAsHeader) {
str = in.readLine();
}
String[] header = SPLIT_ON_TAB.split(str);
while ((str = in.readLine()) != null) {
String[] strParts = SPLIT_ON_TAB.split(str);
StringBuilder keys = new StringBuilder(strParts[0]);
if(strParts.length>8 && !strParts[8].isEmpty()){
keys.append(";").append(strParts[8]);
}
if (!strParts[5].isEmpty()) {
//System.out.println(strParts[5]);
if (Double.parseDouble(strParts[5]) >= minMaf) {
if (locations.size()>0) {
if(locations.containsKey(Integer.parseInt(strParts[3])) && strParts.length>8){
StringBuilder newKeys = new StringBuilder(locations.get(Integer.parseInt(strParts[3])));
newKeys.append(";").append(strParts[8]);
locations.put(Integer.parseInt(strParts[3]), newKeys.toString());
} else{
locations.put(Integer.parseInt(strParts[3]), keys.toString());
}
} else {
locations.put(Integer.parseInt(strParts[3]), keys.toString());
currectChr = strParts[2];
}
}
}
}
snpInfo.put(currectChr, locations);
} catch (IOException e) {
e.printStackTrace();
System.out.println(e.getMessage());
System.exit(-1);
}
}
return (snpInfo);
}
/**
* Read SNP information files
* @param annotationFileFolder
* @param minMaf
* @param firstRowAsHeader
* @return
*/
public static HashMap> readMultipleSNPAnnotationFiles3(String annotationFileFolder, double minMaf, boolean firstRowAsHeader) {
HashMap> snpInfo = new HashMap>(25);
File file = new File(annotationFileFolder);
File[] files = file.listFiles();
ArrayList vecFiles = new ArrayList();
for (int f = 0; f < files.length; f++) {
if (files[f].getAbsolutePath().endsWith(".txt")) {
vecFiles.add(files[f]);
}
}
for (int f = 0; f < vecFiles.size(); f++) {
File currentFile = vecFiles.get(f);
System.out.println("Processing:\t" + f + "\t" + currentFile.getAbsolutePath());
String currectChr = "";
TreeMap locations = new TreeMap();
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(currentFile), ENCODING), 8096);
String str = "";
if (firstRowAsHeader) {
str = in.readLine();
}
String[] header = SPLIT_ON_TAB.split(str);
while ((str = in.readLine()) != null) {
String[] strParts = SPLIT_ON_TAB.split(str);
StringBuilder keys = new StringBuilder(strParts[0]);
if(strParts.length>8 && !strParts[8].isEmpty()){
keys.append(";").append(strParts[8]);
}
if (!strParts[5].isEmpty()) {
//System.out.println(strParts[5]);
if (Double.parseDouble(strParts[5]) >= minMaf) {
if (locations.size()>0) {
if(locations.containsKey(Integer.parseInt(strParts[3])) && strParts.length>8){
StringBuilder newKeys = new StringBuilder(locations.get(Integer.parseInt(strParts[3])));
newKeys.append(";").append(strParts[8]);
locations.put(Integer.parseInt(strParts[3]), newKeys.toString());
} else{
locations.put(Integer.parseInt(strParts[3]), keys.toString());
}
} else {
locations.put(Integer.parseInt(strParts[3]), keys.toString());
currectChr = strParts[2];
}
}
}
}
snpInfo.put(currectChr, locations);
} catch (IOException e) {
e.printStackTrace();
System.out.println(e.getMessage());
System.exit(-1);
}
}
return (snpInfo);
}
/**
 * Reads a text file into a set containing each distinct line.
 *
 * <p>An {@link IOException} is printed to standard output and terminates the JVM with
 * exit code -1.</p>
 *
 * @param probeFilteringFiles path of the file to read
 * @return the distinct lines of the file
 */
public static HashSet<String> readFilterHash(String probeFilteringFiles) {
    HashSet<String> probesToBeRemoved = new HashSet<String>();
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(probeFilteringFiles)), ENCODING), 8096);
        try {
            String str;
            while ((str = in.readLine()) != null) {
                probesToBeRemoved.add(str);
            }
        } finally {
            in.close();
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
        System.exit(-1);
    }
    return probesToBeRemoved;
}
/**
 * Reads a text file into a list holding every line in file order (duplicates kept).
 *
 * <p>An {@link IOException} is printed to standard output and terminates the JVM with
 * exit code -1.</p>
 *
 * @param probeFilteringFiles path of the file to read
 * @return the lines of the file, in order
 */
public static ArrayList<String> readListToArrayList(String probeFilteringFiles) {
    ArrayList<String> probesToBeRemoved = new ArrayList<String>();
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(probeFilteringFiles)), ENCODING), 8096);
        try {
            String str;
            while ((str = in.readLine()) != null) {
                probesToBeRemoved.add(str);
            }
        } finally {
            in.close();
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
        System.exit(-1);
    }
    return probesToBeRemoved;
}
/**
* Read multiple file into HashSet
*
* @param fileWithAnnotation
* @return Sample annotation
*/
public static HashSet readFilterHash2(String[] probeFilteringFiles) {
ArrayList> probesToBeRemoved = new ArrayList>();
for (String s : probeFilteringFiles) {
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(s)), ENCODING), 8096);
String str;
HashSet tmpPprobesToBeRemoved = new HashSet();
while ((str = in.readLine()) != null) {
tmpPprobesToBeRemoved.add(str);
}
in.close();
probesToBeRemoved.add(tmpPprobesToBeRemoved);
} catch (IOException e) {
System.out.println(e.getMessage());
System.exit(-1);
}
}
HashSet finalSet = probesToBeRemoved.get(0);
for (int i = 1; i < probesToBeRemoved.size(); ++i) {
finalSet.retainAll(probesToBeRemoved.get(i));
}
return (finalSet);
}
/**
 * Loads a matrix file into a {@link DoubleMatrixDataset}.
 *
 * <p>Without a filter the single-argument constructor is used; with a filter the dataset is
 * built with {@code (eigenVectorFile, null, rowsToInclude)}. If loading fails with an
 * {@link IOException} the error is logged at SEVERE level and the dataset created by the
 * no-argument constructor is returned instead.</p>
 *
 * @param eigenVectorFile path of the matrix file
 * @param rowsToInclude identifiers to restrict the dataset to, or {@code null} for no filter
 *        (NOTE(review): passed as the third constructor argument; confirm that this
 *        position filters rows)
 * @return the loaded dataset, or a default-constructed one when loading failed
 */
public static DoubleMatrixDataset readDoubleMatrixFile(String eigenVectorFile, Set rowsToInclude) {
    DoubleMatrixDataset dataset = new DoubleMatrixDataset();
    try {
        dataset = (rowsToInclude != null)
                ? new DoubleMatrixDataset(eigenVectorFile, null, rowsToInclude)
                : new DoubleMatrixDataset(eigenVectorFile);
    } catch (IOException ioFailure) {
        Logger.getLogger(reading.class.getName()).log(Level.SEVERE, null, ioFailure);
    }
    return dataset;
}
public static HashMap> readMetaAnalysisResults(String metaAnalysisScores, boolean firstRowAsHeader, int key, int val1, int val2, int sizeMap) {
HashMap> probeInfo = new HashMap>((int) Math.ceil(sizeMap / 0.75));
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(metaAnalysisScores)), ENCODING), 8096);
String str = "";
if (firstRowAsHeader) {
str = in.readLine();
}
while ((str = in.readLine()) != null) {
String[] strParts = SPLIT_ON_TAB.split(str);
Pair tmp = new Pair(strParts[val1],Double.parseDouble(strParts[val2]));
probeInfo.put(strParts[key], tmp);
}
} catch (IOException e) {
e.printStackTrace();
System.out.println(e.getMessage());
System.exit(-1);
}
return (probeInfo);
}
public static HashMap>> readAnnotationFileHashMap2(String annotationFile, boolean firstRowAsHeader, int key, int val1, int val2, int val3, int sizeMap) {
HashMap>> probeInfo = new HashMap>>((int) Math.ceil(sizeMap / 0.75));
try {
BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(annotationFile)), ENCODING), 8096);
String str = "";
if (firstRowAsHeader) {
str = in.readLine();
}
while ((str = in.readLine()) != null) {
String[] strParts = SPLIT_ON_TAB.split(str);
Pair tmp;
if(strParts[val2].contains(":")){
strParts[val2] = strParts[val2].split(":")[0];
}
if(strParts[val3].contains(":")){
strParts[val3] = strParts[val3].split(":")[1];
}
if(strParts[val1].equals("Y")){
strParts[val1] = "24";
} else if(strParts[val1].equals("X")){
strParts[val1] = "23";
} else {
}
int chr = Integer.parseInt(strParts[val1]);
tmp = new Pair(Integer.parseInt(strParts[val2]),Integer.parseInt(strParts[val3]));
if(probeInfo.containsKey(chr)){
probeInfo.get(chr).put(strParts[key], tmp);
} else {
HashMap> tmpje = new HashMap>();
tmpje.put(strParts[key], tmp);
probeInfo.put(chr, tmpje);
}
}
} catch (IOException e) {
e.printStackTrace();
System.out.println(e.getMessage());
System.exit(-1);
}
return (probeInfo);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy