org.opencb.cellbase.lib.builders.RegulatoryRegionBuilder Maven / Gradle / Ivy
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.cellbase.lib.builders;
import org.opencb.biodata.models.core.RegulatoryFeature;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.commons.utils.FileUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.*;
import java.util.*;
/**
* User: fsalavert.
* Date: 4/10/13
* Time: 10:14 AM
*/
@Deprecated
public class RegulatoryRegionBuilder extends CellBaseBuilder {
private static final int CHUNK_SIZE = 2000;
private static final String REGULATORY_FEATURES = "regulatory_features";
@Deprecated
private static final String DEPRECATED_MOTIF_FEATURES = "deprecated_motif_features";
private static final String MOTIF_FEATURES = "motif_features";
private static final String FEATURE_TYPE = "feature_type";
private static final String ID = "id";
private static final String BINDING_MATRIX = "binding_matrix";
private static final String MOTIF_FEATURE_TYPE = "motif_feature_type";
private Path regulatoryRegionPath;
public RegulatoryRegionBuilder(Path regulatoryRegionFilesDir, CellBaseSerializer serializer) {
super(serializer);
this.regulatoryRegionPath = regulatoryRegionFilesDir;
}
public void createSQLiteRegulatoryFiles(Path regulatoryRegionPath)
throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException {
List gffColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "group");
List gffColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT", "TEXT");
// Path regulatoryRegionPath = regulationDir.toPath();
Path filePath;
filePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE);
createSQLiteRegulatoryFiles(filePath, REGULATORY_FEATURES, gffColumnNames, gffColumnTypes);
filePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE);
createSQLiteRegulatoryFiles(filePath, MOTIF_FEATURES, gffColumnNames, gffColumnTypes);
// TODO: REMOVE
// >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DEPRECATED
filePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz");
createSQLiteRegulatoryFiles(filePath, "annotated_features", gffColumnNames, gffColumnTypes);
filePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz");
createSQLiteRegulatoryFiles(filePath, DEPRECATED_MOTIF_FEATURES, gffColumnNames, gffColumnTypes);
filePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz");
createSQLiteRegulatoryFiles(filePath, "regulatory_features_multicell", gffColumnNames, gffColumnTypes);
// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< DEPRECATED
// GFFColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame");
// GFFColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT");
filePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz");
if (Files.exists(filePath)) {
createSQLiteRegulatoryFiles(filePath, "mirna_uniq", gffColumnNames, gffColumnTypes);
}
}
@Override
public void parse() throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException {
if (regulatoryRegionPath == null || !Files.exists(regulatoryRegionPath) || !Files.isDirectory(regulatoryRegionPath)) {
throw new IOException("Regulation directory whether does not exist, is not a directory or cannot be read");
}
// Create the SQLite databases
createSQLiteRegulatoryFiles(regulatoryRegionPath);
String chunkIdSuffix = CHUNK_SIZE / 1000 + "k";
Path regulatoryFilePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE + ".db");
Path motifFilePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE + ".db");
Path annotatedFilePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz.db");
Path deprecatedMotifFilePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz.db");
Path deprecatedRegulatoryFilePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz.db");
Path mirnaFilePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz.db");
List filePaths = Arrays.asList(regulatoryFilePath, motifFilePath, annotatedFilePath,
deprecatedMotifFilePath, deprecatedRegulatoryFilePath);
List tableNames = Arrays.asList(REGULATORY_FEATURES, MOTIF_FEATURES, "annotated_features",
DEPRECATED_MOTIF_FEATURES, "regulatory_features_multicell");
if (Files.exists(mirnaFilePath)) {
filePaths.add(mirnaFilePath);
tableNames.add("mirna_uniq");
}
// Fetching and joining all chromosomes found in the different databases
Set setChr = new HashSet<>();
setChr.addAll(getChromosomesList(regulatoryFilePath, REGULATORY_FEATURES));
setChr.addAll(getChromosomesList(motifFilePath, MOTIF_FEATURES));
setChr.addAll(getChromosomesList(annotatedFilePath, "annotated_features"));
setChr.addAll(getChromosomesList(deprecatedMotifFilePath, DEPRECATED_MOTIF_FEATURES));
setChr.addAll(getChromosomesList(deprecatedRegulatoryFilePath, "regulatory_features_multicell"));
if (Files.exists(mirnaFilePath)) {
setChr.addAll(getChromosomesList(mirnaFilePath, "mirna_uniq"));
}
List chromosomes = new ArrayList<>(setChr);
List regulatoryFeatures;
HashSet chunksHash;
for (String chromosome : chromosomes) {
for (int i = 0; i < tableNames.size(); i++) {
chunksHash = new HashSet<>();
regulatoryFeatures = queryChromosomesRegulatoryDB(filePaths.get(i), tableNames.get(i), chromosome);
for (RegulatoryFeature regulatoryFeature : regulatoryFeatures) {
int firstChunkId = getChunkId(regulatoryFeature.getStart(), CHUNK_SIZE);
int lastChunkId = getChunkId(regulatoryFeature.getEnd(), CHUNK_SIZE);
List chunkIds = new ArrayList<>();
String chunkId;
for (int j = firstChunkId; j <= lastChunkId; j++) {
chunkId = chromosome + "_" + j + "_" + chunkIdSuffix;
chunkIds.add(chunkId);
//count chunks
if (!chunksHash.contains(j)) {
chunksHash.add(j);
}
}
// regulatoryFeature.setChunkIds(chunkIds);
// remove 'chr' prefix
// if (genericFeature.getChromosome() != null) {
// genericFeature.setSequenceName(genericFeature.getSequenceName().replace("chr", ""));
// }
serializer.serialize(regulatoryFeature);
}
}
}
}
public void createSQLiteRegulatoryFiles(Path filePath, String tableName, List columnNames, List columnTypes)
throws ClassNotFoundException, IOException, SQLException {
int limitRows = 100000;
int batchCount = 0;
if (!Files.exists(filePath) || Files.size(filePath) == 0) {
return;
}
Path dbPath = Paths.get(filePath.toString() + ".db");
if (Files.exists(dbPath) && Files.size(dbPath) > 0) {
return;
}
BufferedReader br = FileUtils.newBufferedReader(filePath);
Class.forName("org.sqlite.JDBC");
Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString());
conn.setAutoCommit(false); //Set false to perform commits manually and increase performance on insertion
//Create table query
Statement createTables = conn.createStatement();
StringBuilder sbQuery = new StringBuilder();
sbQuery.append("CREATE TABLE if not exists " + tableName + "(");
for (int i = 0; i < columnNames.size(); i++) { //columnNames and columnTypes must have the same size
sbQuery.append("'" + columnNames.get(i) + "' " + columnTypes.get(i) + ",");
}
sbQuery.deleteCharAt(sbQuery.length() - 1);
sbQuery.append(")");
System.out.println(sbQuery.toString());
createTables.executeUpdate(sbQuery.toString());
//Prepare insert query
sbQuery = new StringBuilder();
sbQuery.append("INSERT INTO " + tableName + "(");
for (int i = 0; i < columnNames.size(); i++) {
sbQuery.append("'" + columnNames.get(i) + "',");
}
sbQuery.deleteCharAt(sbQuery.length() - 1);
sbQuery.append(") values (");
sbQuery.append(repeat("?,", columnNames.size()));
sbQuery.deleteCharAt(sbQuery.length() - 1);
sbQuery.append(")");
System.out.println(sbQuery.toString());
PreparedStatement ps = conn.prepareStatement(sbQuery.toString());
//Read file
String line = null;
while ((line = br.readLine()) != null) {
insertByType(ps, getFields(line, tableName), columnTypes);
ps.addBatch();
batchCount++;
//commit batch
if (batchCount % limitRows == 0 && batchCount != 0) {
ps.executeBatch();
conn.commit();
}
}
br.close();
//Execute last Batch
ps.executeBatch();
conn.commit();
//Create index
System.out.println("creating indices...");
createTables.executeUpdate("CREATE INDEX " + tableName + "_seqname_idx on " + tableName + "(" + columnNames.get(0) + ")");
System.out.println("indices created.");
conn.commit();
conn.close();
}
public List getChromosomesList(Path dbPath, String tableName) throws IOException {
try {
FileUtils.checkFile(dbPath);
} catch (IOException e) {
logger.warn(e.getMessage());
return Collections.emptyList();
}
List chromosomes = new ArrayList<>();
try {
Class.forName("org.sqlite.JDBC");
Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString());
Statement query = conn.createStatement();
ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName);
// ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName + " where seqname like 'chr%'");
while (rs.next()) {
chromosomes.add(rs.getString(1));
}
conn.close();
} catch (ClassNotFoundException | SQLException e) {
e.printStackTrace();
}
return chromosomes;
}
public List queryChromosomesRegulatoryDB(Path dbPath, String tableName, String chromosome) {
try {
FileUtils.checkFile(dbPath);
} catch (IOException e) {
logger.warn(e.getMessage());
return Collections.emptyList();
}
Connection conn;
List regulatoryFeatures = new ArrayList<>();
try {
Class.forName("org.sqlite.JDBC");
conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString());
Statement query = conn.createStatement();
ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='" + chromosome + "'");
// ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='chr" + chromosome + "'");
while (rs.next()) {
regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName));
}
conn.close();
} catch (ClassNotFoundException | SQLException e) {
e.printStackTrace();
}
return regulatoryFeatures;
}
public static List queryRegulatoryDB(Path dbPath, String tableName, String chrFile, int start, int end) {
Connection conn = null;
List regulatoryFeatures = new ArrayList<>();
try {
Class.forName("org.sqlite.JDBC");
conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString());
Statement query = conn.createStatement();
ResultSet rs = query.executeQuery("select * from " + tableName + " where start<=" + end + " AND end>=" + start);
while (rs.next()) {
regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName));
}
conn.close();
} catch (ClassNotFoundException | SQLException e) {
e.printStackTrace();
}
return regulatoryFeatures;
}
private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs, String tableName) throws SQLException {
RegulatoryFeature regulatoryFeature = null;
switch (tableName.toLowerCase()) {
case REGULATORY_FEATURES:
regulatoryFeature = getRegulatoryFeature(rs);
break;
case MOTIF_FEATURES:
regulatoryFeature = getMotifFeature(rs);
break;
case "annotated_features":
regulatoryFeature = getAnnotatedFeature(rs);
break;
case "regulatory_features_multicell":
regulatoryFeature = getDeprecatedRegulatoryFeature(rs);
break;
case DEPRECATED_MOTIF_FEATURES:
regulatoryFeature = getDeprecatedMotifFeature(rs);
break;
case "mirna_uniq":
regulatoryFeature = getMirnaFeature(rs);
break;
default:
break;
}
return regulatoryFeature;
}
private static RegulatoryFeature getMotifFeature(ResultSet rs) throws SQLException {
// GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3
RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
Map groupFields = getGroupFields(rs.getString(9));
regulatoryFeature.setChromosome(rs.getString(1));
regulatoryFeature.setSource(rs.getString(2));
regulatoryFeature.setFeatureType(rs.getString(3));
regulatoryFeature.setStart(rs.getInt(4));
regulatoryFeature.setEnd(rs.getInt(5));
regulatoryFeature.setScore(rs.getString(6));
regulatoryFeature.setStrand(rs.getString(7));
// Seems weird that the motif_feature_type property is used to fill the Name field. However, this is how the
// it was being done from the previous ENSEMBL files
regulatoryFeature.setName(groupFields.get(MOTIF_FEATURE_TYPE));
regulatoryFeature.setMatrix(groupFields.get(BINDING_MATRIX));
return regulatoryFeature;
}
private static RegulatoryFeature getRegulatoryFeature(ResultSet rs) throws SQLException {
// GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3
RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
Map groupFields = getGroupFields(rs.getString(9));
regulatoryFeature.setId(groupFields.get(ID));
regulatoryFeature.setChromosome(rs.getString(1));
regulatoryFeature.setSource(rs.getString(2));
regulatoryFeature.setFeatureType(groupFields.get(FEATURE_TYPE).replace(" ", "_"));
regulatoryFeature.setStart(rs.getInt(4));
regulatoryFeature.setEnd(rs.getInt(5));
regulatoryFeature.setScore(rs.getString(6));
regulatoryFeature.setStrand(rs.getString(7));
return regulatoryFeature;
}
private static RegulatoryFeature getAnnotatedFeature(ResultSet rs) throws SQLException {
// GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3
RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
Map groupFields = getGroupFields(rs.getString(9));
regulatoryFeature.setChromosome(rs.getString(1));
regulatoryFeature.setSource(rs.getString(2));
regulatoryFeature.setFeatureType(rs.getString(3));
regulatoryFeature.setStart(rs.getInt(4));
regulatoryFeature.setEnd(rs.getInt(5));
regulatoryFeature.setScore(rs.getString(6));
regulatoryFeature.setStrand(rs.getString(7));
regulatoryFeature.setFrame(rs.getString(8));
regulatoryFeature.setName(groupFields.get("name"));
regulatoryFeature.setAlias(groupFields.get("alias"));
regulatoryFeature.setFeatureClass(groupFields.get("class"));
regulatoryFeature.getCellTypes().add(groupFields.get("cell_type"));
return regulatoryFeature;
}
@Deprecated
private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs) throws SQLException {
// GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3
RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
Map groupFields = getGroupFields(rs.getString(9));
regulatoryFeature.setChromosome(rs.getString(1));
regulatoryFeature.setSource(rs.getString(2));
regulatoryFeature.setFeatureType(rs.getString(3));
regulatoryFeature.setStart(rs.getInt(4));
regulatoryFeature.setEnd(rs.getInt(5));
regulatoryFeature.setScore(rs.getString(6));
regulatoryFeature.setStrand(rs.getString(7));
regulatoryFeature.setFrame(rs.getString(8));
regulatoryFeature.setFrame(rs.getString(9));
return regulatoryFeature;
}
@Deprecated
private static RegulatoryFeature getDeprecatedMotifFeature(ResultSet rs) throws SQLException {
// GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3
RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
Map groupFields = getGroupFields(rs.getString(9));
regulatoryFeature.setChromosome(rs.getString(1));
regulatoryFeature.setSource(rs.getString(2));
regulatoryFeature.setFeatureType(rs.getString(3) + "_motif");
regulatoryFeature.setStart(rs.getInt(4));
regulatoryFeature.setEnd(rs.getInt(5));
regulatoryFeature.setScore(rs.getString(6));
regulatoryFeature.setStrand(rs.getString(7));
regulatoryFeature.setFrame(rs.getString(8));
String[] split = groupFields.get("name").split(":");
regulatoryFeature.setName(split[0]);
regulatoryFeature.setMatrix(split[1]);
return regulatoryFeature;
}
private static RegulatoryFeature getMirnaFeature(ResultSet rs) throws SQLException {
// GFF https://genome.ucsc.edu/FAQ/FAQformat.html#format3
RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
Map groupFields = getGroupFields(rs.getString(9));
regulatoryFeature.setChromosome(rs.getString(1));
regulatoryFeature.setSource(rs.getString(2));
regulatoryFeature.setFeatureType(rs.getString(3));
regulatoryFeature.setStart(rs.getInt(4));
regulatoryFeature.setEnd(rs.getInt(5));
regulatoryFeature.setScore(rs.getString(6));
regulatoryFeature.setStrand(rs.getString(7));
regulatoryFeature.setFrame(rs.getString(8));
regulatoryFeature.setFeatureClass("microRNA");
regulatoryFeature.setName(groupFields.get("name"));
return regulatoryFeature;
}
private static Map getGroupFields(String group) {
//process group column
Map groupFields = new HashMap<>();
String[] attributeFields = group.split(";");
String[] attributeKeyValue;
for (String attributeField : attributeFields) {
attributeKeyValue = attributeField.trim().split("=");
groupFields.put(attributeKeyValue[0].toLowerCase(), attributeKeyValue[1]);
}
return groupFields;
}
public static List getFields(String line, String tableName) {
List fields = new ArrayList<>();
switch (tableName.toLowerCase()) {
case REGULATORY_FEATURES:
fields = getRegulatoryFeaturesFields(line);
break;
case MOTIF_FEATURES:
fields = getMotifFeaturesFields(line);
break;
case "annotated_features":
fields = getAnnotatedFeaturesFields(line);
break;
case "regulatory_features_multicell":
fields = getRegulatoryFeaturesFields(line);
break;
case DEPRECATED_MOTIF_FEATURES:
fields = getMotifFeaturesFields(line);
break;
case "mirna_uniq":
fields = getMirnaFeaturesFields(line);
break;
default:
break;
}
return fields;
}
@Deprecated
public static List getAnnotatedFeaturesFields(String line) {
String[] fields = line.split("\t");
fields[0] = fields[0].replace("chr", "");
return Arrays.asList(fields);
}
public static List getRegulatoryFeaturesFields(String line) {
String[] fields = line.split("\t");
fields[0] = fields[0].replace("chr", "");
return Arrays.asList(fields);
}
public static List getMotifFeaturesFields(String line) {
String[] fields = line.split("\t");
fields[0] = fields[0].replace("chr", "");
return Arrays.asList(fields);
}
public static List getMirnaFeaturesFields(String line) {
String[] fields = line.split("\t");
fields[0] = fields[0].replace("chr", "");
return Arrays.asList(fields);
}
public static void insertByType(PreparedStatement ps, List fields, List types) throws SQLException {
//Datatypes In SQLite Version 3 -> http://www.sqlite.org/datatype3.html
String raw;
String type;
if (types.size() == fields.size()) {
for (int i = 0; i < fields.size(); i++) { //columnNames and columnTypes must have same size
int sqliteIndex = i + 1;
raw = fields.get(i);
type = types.get(i);
switch (type) {
case "INTEGER":
case "INT":
ps.setInt(sqliteIndex, Integer.parseInt(raw));
break;
case "REAL":
ps.setFloat(sqliteIndex, Float.parseFloat(raw));
break;
case "TEXT":
ps.setString(sqliteIndex, raw);
break;
default:
ps.setString(sqliteIndex, raw);
break;
}
}
}
}
public String repeat(String s, int n) {
if (s == null) {
return null;
}
final StringBuilder sb = new StringBuilder();
for (int i = 0; i < n; i++) {
sb.append(s);
}
return sb.toString();
}
private int getChunkId(int position, int chunksize) {
if (chunksize <= 0) {
return position / CHUNK_SIZE;
} else {
return position / chunksize;
}
}
private int getChunkStart(int id, int chunksize) {
if (chunksize <= 0) {
return (id == 0) ? 1 : id * CHUNK_SIZE;
} else {
return (id == 0) ? 1 : id * chunksize;
}
}
private int getChunkEnd(int id, int chunksize) {
if (chunksize <= 0) {
return (id * CHUNK_SIZE) + CHUNK_SIZE - 1;
} else {
return (id * chunksize) + chunksize - 1;
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy