All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opencb.cellbase.lib.builders.RegulatoryRegionBuilder Maven / Gradle / Ivy

There is a newer version: 6.3.0
Show newest version
/*
 * Copyright 2015-2020 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.cellbase.lib.builders;

import org.opencb.biodata.models.core.RegulatoryFeature;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.commons.utils.FileUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.*;
import java.util.*;

/**
 * User: fsalavert.
 * Date: 4/10/13
 * Time: 10:14 AM
 */
@Deprecated
public class RegulatoryRegionBuilder extends CellBaseBuilder {

    private static final int CHUNK_SIZE = 2000;
    private static final String REGULATORY_FEATURES = "regulatory_features";
    @Deprecated
    private static final String DEPRECATED_MOTIF_FEATURES = "deprecated_motif_features";
    private static final String MOTIF_FEATURES = "motif_features";
    private static final String FEATURE_TYPE = "feature_type";
    private static final String ID = "id";
    private static final String BINDING_MATRIX = "binding_matrix";
    private static final String MOTIF_FEATURE_TYPE = "motif_feature_type";
    private Path regulatoryRegionPath;

    public RegulatoryRegionBuilder(Path regulatoryRegionFilesDir, CellBaseSerializer serializer) {
        super(serializer);

        this.regulatoryRegionPath = regulatoryRegionFilesDir;

    }

    public void createSQLiteRegulatoryFiles(Path regulatoryRegionPath)
            throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException {
        List gffColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "group");
        List gffColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT", "TEXT");

        //        Path regulatoryRegionPath = regulationDir.toPath();

        Path filePath;

        filePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE);
        createSQLiteRegulatoryFiles(filePath, REGULATORY_FEATURES, gffColumnNames, gffColumnTypes);

        filePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE);
        createSQLiteRegulatoryFiles(filePath, MOTIF_FEATURES, gffColumnNames, gffColumnTypes);

        // TODO: REMOVE
        // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DEPRECATED
        filePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz");
        createSQLiteRegulatoryFiles(filePath, "annotated_features", gffColumnNames, gffColumnTypes);


        filePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz");
        createSQLiteRegulatoryFiles(filePath, DEPRECATED_MOTIF_FEATURES, gffColumnNames, gffColumnTypes);


        filePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz");
        createSQLiteRegulatoryFiles(filePath, "regulatory_features_multicell", gffColumnNames, gffColumnTypes);
        // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< DEPRECATED



//  GFFColumnNames = Arrays.asList("seqname", "source", "feature", "start", "end", "score", "strand", "frame");
//  GFFColumnTypes = Arrays.asList("TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT", "TEXT", "TEXT");
        filePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz");
        if (Files.exists(filePath)) {
            createSQLiteRegulatoryFiles(filePath, "mirna_uniq", gffColumnNames, gffColumnTypes);
        }

    }

    @Override
    public void parse() throws SQLException, IOException, ClassNotFoundException, NoSuchMethodException {
        if (regulatoryRegionPath == null || !Files.exists(regulatoryRegionPath) || !Files.isDirectory(regulatoryRegionPath)) {
            throw new IOException("Regulation directory whether does not exist, is not a directory or cannot be read");
        }

        // Create the SQLite databases
        createSQLiteRegulatoryFiles(regulatoryRegionPath);

        String chunkIdSuffix = CHUNK_SIZE / 1000 + "k";

        Path regulatoryFilePath = regulatoryRegionPath.resolve(EtlCommons.REGULATORY_FEATURES_FILE + ".db");
        Path motifFilePath = regulatoryRegionPath.resolve(EtlCommons.MOTIF_FEATURES_FILE + ".db");
        Path annotatedFilePath = regulatoryRegionPath.resolve("AnnotatedFeatures.gff.gz.db");
        Path deprecatedMotifFilePath = regulatoryRegionPath.resolve("MotifFeatures.gff.gz.db");
        Path deprecatedRegulatoryFilePath = regulatoryRegionPath.resolve("RegulatoryFeatures_MultiCell.gff.gz.db");
        Path mirnaFilePath = regulatoryRegionPath.resolve("mirna_uniq.gff.gz.db");

        List filePaths = Arrays.asList(regulatoryFilePath, motifFilePath, annotatedFilePath,
                deprecatedMotifFilePath, deprecatedRegulatoryFilePath);
        List tableNames = Arrays.asList(REGULATORY_FEATURES, MOTIF_FEATURES, "annotated_features",
                DEPRECATED_MOTIF_FEATURES, "regulatory_features_multicell");

        if (Files.exists(mirnaFilePath)) {
            filePaths.add(mirnaFilePath);
            tableNames.add("mirna_uniq");
        }

        // Fetching and joining all chromosomes found in the different databases
        Set setChr = new HashSet<>();
        setChr.addAll(getChromosomesList(regulatoryFilePath, REGULATORY_FEATURES));
        setChr.addAll(getChromosomesList(motifFilePath, MOTIF_FEATURES));
        setChr.addAll(getChromosomesList(annotatedFilePath, "annotated_features"));
        setChr.addAll(getChromosomesList(deprecatedMotifFilePath, DEPRECATED_MOTIF_FEATURES));
        setChr.addAll(getChromosomesList(deprecatedRegulatoryFilePath, "regulatory_features_multicell"));
        if (Files.exists(mirnaFilePath)) {
            setChr.addAll(getChromosomesList(mirnaFilePath, "mirna_uniq"));
        }

        List chromosomes = new ArrayList<>(setChr);
        List regulatoryFeatures;
        HashSet chunksHash;
        for (String chromosome : chromosomes) {
            for (int i = 0; i < tableNames.size(); i++) {
                chunksHash = new HashSet<>();
                regulatoryFeatures = queryChromosomesRegulatoryDB(filePaths.get(i), tableNames.get(i), chromosome);
                for (RegulatoryFeature regulatoryFeature : regulatoryFeatures) {
                    int firstChunkId = getChunkId(regulatoryFeature.getStart(), CHUNK_SIZE);
                    int lastChunkId = getChunkId(regulatoryFeature.getEnd(), CHUNK_SIZE);

                    List chunkIds = new ArrayList<>();
                    String chunkId;
                    for (int j = firstChunkId; j <= lastChunkId; j++) {
                        chunkId = chromosome + "_" + j + "_" + chunkIdSuffix;
                        chunkIds.add(chunkId);
                        //count chunks
                        if (!chunksHash.contains(j)) {
                            chunksHash.add(j);
                        }
                    }
//                    regulatoryFeature.setChunkIds(chunkIds);

                    // remove 'chr' prefix
//                    if (genericFeature.getChromosome() != null) {
//                        genericFeature.setSequenceName(genericFeature.getSequenceName().replace("chr", ""));
//                    }
                    serializer.serialize(regulatoryFeature);
                }
            }
        }
    }


    public void createSQLiteRegulatoryFiles(Path filePath, String tableName, List columnNames, List columnTypes)
            throws ClassNotFoundException, IOException, SQLException {
        int limitRows = 100000;
        int batchCount = 0;

        if (!Files.exists(filePath) || Files.size(filePath) == 0) {
            return;
        }

        Path dbPath = Paths.get(filePath.toString() + ".db");
        if (Files.exists(dbPath) && Files.size(dbPath) > 0) {
            return;
        }

        BufferedReader br = FileUtils.newBufferedReader(filePath);

        Class.forName("org.sqlite.JDBC");
        Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString());
        conn.setAutoCommit(false); //Set false to perform commits manually and increase performance on insertion

        //Create table query
        Statement createTables = conn.createStatement();

        StringBuilder sbQuery = new StringBuilder();
        sbQuery.append("CREATE TABLE if not exists " + tableName + "(");
        for (int i = 0; i < columnNames.size(); i++) {    //columnNames and columnTypes must have the same size
            sbQuery.append("'" + columnNames.get(i) + "' " + columnTypes.get(i) + ",");
        }
        sbQuery.deleteCharAt(sbQuery.length() - 1);
        sbQuery.append(")");

        System.out.println(sbQuery.toString());
        createTables.executeUpdate(sbQuery.toString());

        //Prepare insert query
        sbQuery = new StringBuilder();
        sbQuery.append("INSERT INTO " + tableName + "(");
        for (int i = 0; i < columnNames.size(); i++) {
            sbQuery.append("'" + columnNames.get(i) + "',");
        }
        sbQuery.deleteCharAt(sbQuery.length() - 1);
        sbQuery.append(") values (");
        sbQuery.append(repeat("?,", columnNames.size()));
        sbQuery.deleteCharAt(sbQuery.length() - 1);
        sbQuery.append(")");
        System.out.println(sbQuery.toString());

        PreparedStatement ps = conn.prepareStatement(sbQuery.toString());

        //Read file
        String line = null;
        while ((line = br.readLine()) != null) {

            insertByType(ps, getFields(line, tableName), columnTypes);
            ps.addBatch();
            batchCount++;

            //commit batch
            if (batchCount % limitRows == 0 && batchCount != 0) {
                ps.executeBatch();
                conn.commit();
            }

        }
        br.close();

        //Execute last Batch
        ps.executeBatch();
        conn.commit();

        //Create index
        System.out.println("creating indices...");
        createTables.executeUpdate("CREATE INDEX " + tableName + "_seqname_idx on " + tableName + "(" + columnNames.get(0) + ")");
        System.out.println("indices created.");

        conn.commit();
        conn.close();
    }

    public List getChromosomesList(Path dbPath, String tableName) throws IOException {

        try {
            FileUtils.checkFile(dbPath);
        } catch (IOException e) {
            logger.warn(e.getMessage());
            return Collections.emptyList();
        }

        List chromosomes = new ArrayList<>();
        try {
            Class.forName("org.sqlite.JDBC");
            Connection conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString());

            Statement query = conn.createStatement();
            ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName);
//            ResultSet rs = query.executeQuery("select distinct(seqname) from " + tableName + " where seqname like 'chr%'");

            while (rs.next()) {
                chromosomes.add(rs.getString(1));
            }
            conn.close();

        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
        return chromosomes;
    }

    public List queryChromosomesRegulatoryDB(Path dbPath, String tableName, String chromosome) {

        try {
            FileUtils.checkFile(dbPath);
        } catch (IOException e) {
            logger.warn(e.getMessage());
            return Collections.emptyList();
        }

        Connection conn;
        List regulatoryFeatures = new ArrayList<>();
        try {
            Class.forName("org.sqlite.JDBC");
            conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString());

            Statement query = conn.createStatement();
            ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='" + chromosome + "'");
//            ResultSet rs = query.executeQuery("select * from " + tableName + " where seqname='chr" + chromosome + "'");
            while (rs.next()) {
                regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName));
            }
            conn.close();

        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
        return regulatoryFeatures;
    }

    public static List queryRegulatoryDB(Path dbPath, String tableName, String chrFile, int start, int end) {
        Connection conn = null;
        List regulatoryFeatures = new ArrayList<>();
        try {
            Class.forName("org.sqlite.JDBC");
            conn = DriverManager.getConnection("jdbc:sqlite:" + dbPath.toString());

            Statement query = conn.createStatement();
            ResultSet rs = query.executeQuery("select * from " + tableName + " where start<=" + end + " AND end>=" + start);

            while (rs.next()) {
                regulatoryFeatures.add(getDeprecatedRegulatoryFeature(rs, tableName));
            }
            conn.close();

        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
        }
        return regulatoryFeatures;
    }

    private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs, String tableName) throws SQLException {
        RegulatoryFeature regulatoryFeature = null;
        switch (tableName.toLowerCase()) {
            case REGULATORY_FEATURES:
                regulatoryFeature = getRegulatoryFeature(rs);
                break;
            case MOTIF_FEATURES:
                regulatoryFeature = getMotifFeature(rs);
                break;
            case "annotated_features":
                regulatoryFeature = getAnnotatedFeature(rs);
                break;
            case "regulatory_features_multicell":
                regulatoryFeature = getDeprecatedRegulatoryFeature(rs);
                break;
            case DEPRECATED_MOTIF_FEATURES:
                regulatoryFeature = getDeprecatedMotifFeature(rs);
                break;
            case "mirna_uniq":
                regulatoryFeature = getMirnaFeature(rs);
                break;
            default:
                break;
        }
        return regulatoryFeature;
    }

    private static RegulatoryFeature getMotifFeature(ResultSet rs) throws SQLException {
        //   GFF     https://genome.ucsc.edu/FAQ/FAQformat.html#format3
        RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
        Map groupFields = getGroupFields(rs.getString(9));

        regulatoryFeature.setChromosome(rs.getString(1));
        regulatoryFeature.setSource(rs.getString(2));
        regulatoryFeature.setFeatureType(rs.getString(3));
        regulatoryFeature.setStart(rs.getInt(4));
        regulatoryFeature.setEnd(rs.getInt(5));
        regulatoryFeature.setScore(rs.getString(6));
        regulatoryFeature.setStrand(rs.getString(7));

        // Seems weird that the motif_feature_type property is used to fill the Name field. However, this is how the
        // it was being done from the previous ENSEMBL files
        regulatoryFeature.setName(groupFields.get(MOTIF_FEATURE_TYPE));

        regulatoryFeature.setMatrix(groupFields.get(BINDING_MATRIX));

        return regulatoryFeature;
    }

    private static RegulatoryFeature getRegulatoryFeature(ResultSet rs) throws SQLException {
        //   GFF     https://genome.ucsc.edu/FAQ/FAQformat.html#format3
        RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
        Map groupFields = getGroupFields(rs.getString(9));

        regulatoryFeature.setId(groupFields.get(ID));
        regulatoryFeature.setChromosome(rs.getString(1));
        regulatoryFeature.setSource(rs.getString(2));
        regulatoryFeature.setFeatureType(groupFields.get(FEATURE_TYPE).replace(" ", "_"));
        regulatoryFeature.setStart(rs.getInt(4));
        regulatoryFeature.setEnd(rs.getInt(5));
        regulatoryFeature.setScore(rs.getString(6));
        regulatoryFeature.setStrand(rs.getString(7));

        return regulatoryFeature;
    }

    private static RegulatoryFeature getAnnotatedFeature(ResultSet rs) throws SQLException {
        //   GFF     https://genome.ucsc.edu/FAQ/FAQformat.html#format3
        RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
        Map groupFields = getGroupFields(rs.getString(9));

        regulatoryFeature.setChromosome(rs.getString(1));
        regulatoryFeature.setSource(rs.getString(2));
        regulatoryFeature.setFeatureType(rs.getString(3));
        regulatoryFeature.setStart(rs.getInt(4));
        regulatoryFeature.setEnd(rs.getInt(5));
        regulatoryFeature.setScore(rs.getString(6));
        regulatoryFeature.setStrand(rs.getString(7));
        regulatoryFeature.setFrame(rs.getString(8));

        regulatoryFeature.setName(groupFields.get("name"));
        regulatoryFeature.setAlias(groupFields.get("alias"));
        regulatoryFeature.setFeatureClass(groupFields.get("class"));
        regulatoryFeature.getCellTypes().add(groupFields.get("cell_type"));

        return regulatoryFeature;
    }

    @Deprecated
    private static RegulatoryFeature getDeprecatedRegulatoryFeature(ResultSet rs) throws SQLException {
        //   GFF     https://genome.ucsc.edu/FAQ/FAQformat.html#format3
        RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
        Map groupFields = getGroupFields(rs.getString(9));

        regulatoryFeature.setChromosome(rs.getString(1));
        regulatoryFeature.setSource(rs.getString(2));
        regulatoryFeature.setFeatureType(rs.getString(3));
        regulatoryFeature.setStart(rs.getInt(4));
        regulatoryFeature.setEnd(rs.getInt(5));
        regulatoryFeature.setScore(rs.getString(6));
        regulatoryFeature.setStrand(rs.getString(7));
        regulatoryFeature.setFrame(rs.getString(8));
        regulatoryFeature.setFrame(rs.getString(9));

        return regulatoryFeature;
    }

    @Deprecated
    private static RegulatoryFeature getDeprecatedMotifFeature(ResultSet rs) throws SQLException {
        //   GFF     https://genome.ucsc.edu/FAQ/FAQformat.html#format3
        RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
        Map groupFields = getGroupFields(rs.getString(9));

        regulatoryFeature.setChromosome(rs.getString(1));
        regulatoryFeature.setSource(rs.getString(2));
        regulatoryFeature.setFeatureType(rs.getString(3) + "_motif");
        regulatoryFeature.setStart(rs.getInt(4));
        regulatoryFeature.setEnd(rs.getInt(5));
        regulatoryFeature.setScore(rs.getString(6));
        regulatoryFeature.setStrand(rs.getString(7));
        regulatoryFeature.setFrame(rs.getString(8));

        String[] split = groupFields.get("name").split(":");
        regulatoryFeature.setName(split[0]);
        regulatoryFeature.setMatrix(split[1]);

        return regulatoryFeature;
    }

    private static RegulatoryFeature getMirnaFeature(ResultSet rs) throws SQLException {
        //   GFF     https://genome.ucsc.edu/FAQ/FAQformat.html#format3
        RegulatoryFeature regulatoryFeature = new RegulatoryFeature();
        Map groupFields = getGroupFields(rs.getString(9));

        regulatoryFeature.setChromosome(rs.getString(1));
        regulatoryFeature.setSource(rs.getString(2));
        regulatoryFeature.setFeatureType(rs.getString(3));
        regulatoryFeature.setStart(rs.getInt(4));
        regulatoryFeature.setEnd(rs.getInt(5));
        regulatoryFeature.setScore(rs.getString(6));
        regulatoryFeature.setStrand(rs.getString(7));
        regulatoryFeature.setFrame(rs.getString(8));

        regulatoryFeature.setFeatureClass("microRNA");
        regulatoryFeature.setName(groupFields.get("name"));

        return regulatoryFeature;
    }

    private static Map getGroupFields(String group) {
        //process group column
        Map groupFields = new HashMap<>();
        String[] attributeFields = group.split(";");
        String[] attributeKeyValue;
        for (String attributeField : attributeFields) {
            attributeKeyValue = attributeField.trim().split("=");
            groupFields.put(attributeKeyValue[0].toLowerCase(), attributeKeyValue[1]);
        }
        return groupFields;
    }


    public static List getFields(String line, String tableName) {
        List fields = new ArrayList<>();
        switch (tableName.toLowerCase()) {
            case REGULATORY_FEATURES:
                fields = getRegulatoryFeaturesFields(line);
                break;
            case MOTIF_FEATURES:
                fields = getMotifFeaturesFields(line);
                break;
            case "annotated_features":
                fields = getAnnotatedFeaturesFields(line);
                break;
            case "regulatory_features_multicell":
                fields = getRegulatoryFeaturesFields(line);
                break;
            case DEPRECATED_MOTIF_FEATURES:
                fields = getMotifFeaturesFields(line);
                break;
            case "mirna_uniq":
                fields = getMirnaFeaturesFields(line);
                break;
            default:
                break;
        }
        return fields;
    }

    @Deprecated
    public static List getAnnotatedFeaturesFields(String line) {
        String[] fields = line.split("\t");
        fields[0] = fields[0].replace("chr", "");
        return Arrays.asList(fields);
    }

    public static List getRegulatoryFeaturesFields(String line) {
        String[] fields = line.split("\t");
        fields[0] = fields[0].replace("chr", "");
        return Arrays.asList(fields);
    }

    public static List getMotifFeaturesFields(String line) {
        String[] fields = line.split("\t");
        fields[0] = fields[0].replace("chr", "");
        return Arrays.asList(fields);
    }

    public static List getMirnaFeaturesFields(String line) {
        String[] fields = line.split("\t");
        fields[0] = fields[0].replace("chr", "");
        return Arrays.asList(fields);
    }

    public static void insertByType(PreparedStatement ps, List fields, List types) throws SQLException {
        //Datatypes In SQLite Version 3 -> http://www.sqlite.org/datatype3.html
        String raw;
        String type;
        if (types.size() == fields.size()) {
            for (int i = 0; i < fields.size(); i++) { //columnNames and columnTypes must have same size
                int sqliteIndex = i + 1;
                raw = fields.get(i);
                type = types.get(i);

                switch (type) {
                    case "INTEGER":
                    case "INT":
                        ps.setInt(sqliteIndex, Integer.parseInt(raw));
                        break;
                    case "REAL":
                        ps.setFloat(sqliteIndex, Float.parseFloat(raw));
                        break;
                    case "TEXT":
                        ps.setString(sqliteIndex, raw);
                        break;
                    default:
                        ps.setString(sqliteIndex, raw);
                        break;
                }
            }
        }

    }

    public String repeat(String s, int n) {
        if (s == null) {
            return null;
        }
        final StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            sb.append(s);
        }
        return sb.toString();
    }

    private int getChunkId(int position, int chunksize) {
        if (chunksize <= 0) {
            return position / CHUNK_SIZE;
        } else {
            return position / chunksize;
        }
    }

    private int getChunkStart(int id, int chunksize) {
        if (chunksize <= 0) {
            return (id == 0) ? 1 : id * CHUNK_SIZE;
        } else {
            return (id == 0) ? 1 : id * chunksize;
        }
    }

    private int getChunkEnd(int id, int chunksize) {
        if (chunksize <= 0) {
            return (id * CHUNK_SIZE) + CHUNK_SIZE - 1;
        } else {
            return (id * chunksize) + chunksize - 1;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy