All downloads are free. The search and download features use the official Maven repository.

umcg.genetica.text.parsing.Demographics Maven / Gradle / Ivy

There is a newer version: 1.0.7
Show newest version
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package umcg.genetica.text.parsing;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import umcg.genetica.io.text.TextFile;

/**
 * Extracts gender and age information from a tab-separated sample annotation file
 * by running regular expressions over every column of every row.
 *
 * <p>All extraction methods read and discard the first line of the annotation file
 * before processing the remaining lines. Output rows are tab-separated and built
 * from annotation columns 12, 0, 3 and 13, the caller-supplied "gse" column and the
 * column in which the match was found (see {@link #outputRow}).
 *
 * <p>{@code TextFile} is opened with {@code true} for files that are written and
 * {@code false} for files that are read.
 *
 * @author juha
 */
public class Demographics {

    // ---- gender patterns -------------------------------------------------------------
    private static final Pattern FEMALE_WORD = Pattern.compile("\\bfemale\\b", Pattern.CASE_INSENSITIVE);
    private static final Pattern FEMALE_LABEL = Pattern.compile("\\b(gender|sex)[ ]*[:=]?[ ]*f", Pattern.CASE_INSENSITIVE);
    private static final Pattern MALE_WORD = Pattern.compile("\\bmale\\b", Pattern.CASE_INSENSITIVE);
    private static final Pattern MALE_LABEL = Pattern.compile("\\b(gender|sex)[ ]*[:=]?[ ]*m(?!atched)", Pattern.CASE_INSENSITIVE);

    // ---- age patterns shared by writeAges and writeAgesMethylation ------------------------
    private static final Pattern AGE_RANGE = Pattern.compile("\\bage[ ]*[:=]?[ ]*([0-9]+[ ]*(to|-)[ ]*[0-9]+[ ]*(?!week))", Pattern.CASE_INSENSITIVE);

    // ---- age patterns used by writeAges ------------------------------------------------
    private static final Pattern SIMPLE_AGE = Pattern.compile("\\bage[ ]*[:=]?[ ]*([0-9]+[\\.]?[0-9]*)", Pattern.CASE_INSENSITIVE);
    private static final Pattern SIMPLE_AGE_MONTHS = Pattern.compile("\\bage[ ]*[:=]?[ ]*([0-9]+[\\.]?[0-9]*[ ]*m(?!enopaus|iller))", Pattern.CASE_INSENSITIVE);
    private static final Pattern SIMPLE_AGE_WEEKS = Pattern.compile("\\bage[ ]*[:=]?[ ]*([0-9]+[\\.]?[0-9]*[ ]*(week|gestational week))", Pattern.CASE_INSENSITIVE);
    private static final Pattern SIMPLE_AGE_DAYS = Pattern.compile("\\bage[ ]*[:=]?[ ]*([0-9]+[\\.]?[0-9]*[ ]*days(?! of symptoms))", Pattern.CASE_INSENSITIVE);

    // ---- age patterns used by writeAgesMethylation -------------------------------------
    private static final Pattern METH_AGE = Pattern.compile("\\bage[ ]*\\(*(y|yrs|years|months|[0-9]{4})*\\)*[:=]?[ ]*([0-9]+[\\.]?[0-9]*)", Pattern.CASE_INSENSITIVE);
    private static final Pattern METH_MONTHS_BEFORE_VALUE = Pattern.compile("\\bage[ ]+\\(*(months|m)+\\)*[:=]?[ ]*([0-9]+[\\.]?[0-9]*)", Pattern.CASE_INSENSITIVE);
    private static final Pattern METH_MONTHS_AFTER_VALUE = Pattern.compile("\\bage[ ]*[:=]?[ ]*([0-9]+[\\.]?[0-9]*)[ ]*\\(*(month|m(?!enopaus|iller))+\\)*", Pattern.CASE_INSENSITIVE);
    private static final Pattern METH_WEEKS = Pattern.compile("\\bage[ ]*[:=]?[ ]*([0-9]+[\\.]?[0-9]*)[ ]*\\(*week[s]*\\)*", Pattern.CASE_INSENSITIVE);
    private static final Pattern METH_DAYS = Pattern.compile("\\bage[ ]*[:=]?[ ]*([0-9]+[\\.]?[0-9]*)[ ]*\\(*day[s]*\\)*", Pattern.CASE_INSENSITIVE);
    private static final Pattern AGE_AT_DRAW = Pattern.compile("\\bageatdraw[:=]?[ ]*([0-9]+[\\.]?[0-9]*)", Pattern.CASE_INSENSITIVE);
    private static final Pattern AGE_AT_DIAGNOSIS = Pattern.compile("\\bageatdiagnosis[:=]?[ ]*([0-9]+[\\.]?[0-9]*)", Pattern.CASE_INSENSITIVE);
    private static final Pattern DURATION_T1D = Pattern.compile("\\bdurationt1d[:=]?[ ]*([0-9]+[\\.]?[0-9]*)", Pattern.CASE_INSENSITIVE);
    private static final Pattern AGE_AT_COLLECTION_MONTHS = Pattern.compile(".*age at collection \\(months\\): ([0-9]+[\\.]?[0-9]*).*", Pattern.CASE_INSENSITIVE);
    private static final Pattern AGE_AT_RECRUITMENT = Pattern.compile("\\bageatrecruitment[:=]?[ ]*([0-9]+[\\.]?[0-9]*)", Pattern.CASE_INSENSITIVE);
    private static final Pattern MATERNAL_AGE = Pattern.compile("\\bmaternal age", Pattern.CASE_INSENSITIVE);

    /** Splits an extracted range such as "20-30", "20 - 30" or "20 to 30" into its two bounds. */
    private static final Pattern RANGE_BOUNDS = Pattern.compile("([0-9]+)[ ]*(?:to|-)[ ]*([0-9]+)", Pattern.CASE_INSENSITIVE);

    /** Values above this that are not flagged as months are skipped by writeAgesMethylation. */
    private static final double MAX_AGE_NOT_IN_MONTHS = 125;

    /** combineAgesAndRanges only averages ranges whose (upper - lower) is below this. */
    private static final int RANGE_WIDTH_LIMIT = 19;

    private static final String TXT_SUFFIX = ".txt";

    private final String annotationFile;

    /**
     * @param annotationFile path of the tab-separated annotation file to read
     */
    public Demographics(String annotationFile) {
        this.annotationFile = annotationFile;
    }

    /**
     * Runs the gender extraction on a hard-coded annotation file. The command-line
     * arguments are ignored. The commented-out lines are earlier invocations kept as
     * usage examples for the other (private) extraction methods.
     */
    public static void main(String[] args) throws IOException {

//    new Demographics("/Data/GeneExpression/SampleAnnotation/GPL96GPL570/GPL96GPL570SampleAnnotation.txt").checkColumns("age");
//    new Demographics("/Data/GeneExpression/SampleAnnotation/GPL96GPL570/GPL96GPL570SampleAnnotation.txt").writeMatchingSamples("age", "/Data/Sasha/GPL96GPL570AgeSamples.txt");
//    new Demographics("/Data/GeneExpression/SampleAnnotation/GPL96GPL570/GPL96GPL570SampleAnnotation.txt").writeAgesMethylation("/Data/Sasha/GPL96GPL570AgeSamplesFinal.txt", 27, true);
//    new Demographics("/Data/GeneExpression/SampleAnnotation/GPL96GPL570/GPL96GPL570SampleAnnotation.txt").combineAgesAndRanges("/Data/Sasha/GPL96GPL570AgeSamplesFinal.txt", "/Data/Sasha/GPL96GPL570AgeSamplesFinalRanges.txt",
//        "/Data/Sasha/GPL96GPL570AgeSamplesWithRangesAveraged.txt");
//    new Demographics("/Data/GeneExpression/SampleAnnotation/GPL96GPL570/GPL96GPL570SampleAnnotation.txt").writeGenders("/Data/Sasha/GPL96GPL570FemaleSamples.txt", "/Data/Sasha/GPL96GPL570MaleSamples.txt", 27);
//    new Demographics("/Data/GeneExpressionFinal/SampleAnnotation/GPL8490/GPL8490_family_annotation_mesh2012.txt").writeGenders("/Data/MJ/GPL8490FemaleSamples.txt", "/Data/MJ/GPL8490MaleSamples.txt", 27);
//    new Demographics("/Data/GeneExpressionFinal/SampleAnnotation/GPL8490/GPL8490_family_annotation_mesh2012.txt").writeAges("/Data/MJ/GPL8490Ages.txt", 27);
//    new Demographics("D:\\UMCG\\Methylation_GPL8490\\GPL8490_raw_3112012\\GPL8490_family_annotation.txt").writeAgesMethylation("D:\\UMCG\\Methylation_GPL8490\\GPL8490_raw_3112012\\GPL8490Ages_New.txt", 27, true);
        String dir = "D:\\UMCG\\Methylation_GPL8490\\GPL8490_raw_3112012\\";
        new Demographics(dir + "GPL8490_family_annotation.txt")
                .writeGenders(dir + "GPL8490FemaleSamples.txt", dir + "GPL8490MaleSamples.txt", 27);
//    new Demographics("/Data/GeneExpression/SampleAnnotation/GPL96GPL570/GPL96GPL570SampleAnnotation.txt").writeMatchingSamples("female", "male", new int[]{7, 11, 30, 31, 33, 34}, "/Data/Sasha/GPL96GPL570FemaleSamples.txt");
//    new Demographics("/Data/GeneExpression/SampleAnnotation/GPL96GPL570/GPL96GPL570SampleAnnotation.txt").writeMatchingSamples("male", "female", new int[]{7, 11, 30, 31, 33, 34}, "/Data/Sasha/GPL96GPL570MaleSamples.txt");
    }

    /**
     * Writes one row per annotation line to the female or male output file, based on
     * the first column (left to right) that identifies a gender.
     *
     * <p>A column counts as female when it contains the word "female" but not the word
     * "male", and vice versa. Otherwise a "gender"/"sex" label followed by "f" or "m" is
     * accepted, unless the column contains "gender: f/m" or "gender: m/f" respectively.
     *
     * @param femaleFileName output file for rows classified as female
     * @param maleFileName   output file for rows classified as male
     * @param gseCol         index of the column written as the second output field
     * @throws IOException if reading or writing fails
     */
    private void writeGenders(String femaleFileName, String maleFileName, int gseCol) throws IOException {
        TextFile fOut = null;
        TextFile mOut = null;
        TextFile tf = null;
        try {
            fOut = new TextFile(femaleFileName, true);
            mOut = new TextFile(maleFileName, true);
            tf = new TextFile(annotationFile, false);

            tf.readLine(); // the first line is discarded
            String line;
            while ((line = tf.readLine()) != null) {
                String[] fields = line.split("\t");
                String gse = cell(fields, gseCol).trim();
                for (int i = 0; i < fields.length; i++) {
                    String value = fields[i];
                    boolean hasFemale = FEMALE_WORD.matcher(value).find();
                    boolean hasMale = MALE_WORD.matcher(value).find();
                    if (hasFemale && !hasMale) {
                        fOut.writeln(outputRow(fields, gse, null, i));
                        break;
                    } else if (hasMale && !hasFemale) {
                        mOut.writeln(outputRow(fields, gse, null, i));
                        break;
                    } else if (FEMALE_LABEL.matcher(value).find()) {
                        // "gender: f/m" names both genders, so it identifies neither
                        if (!value.toLowerCase().contains("gender: f/m")) {
                            fOut.writeln(outputRow(fields, gse, null, i));
                            break;
                        }
                    } else if (MALE_LABEL.matcher(value).find()) {
                        if (!value.toLowerCase().contains("gender: m/f")) {
                            mOut.writeln(outputRow(fields, gse, null, i));
                            break;
                        }
                    }
                }
            }
        } finally {
            closeAll(tf, fOut, mOut);
        }
    }

    /**
     * Extracts ages from the annotation file. For each line, the first column with a
     * usable age decides the output: a range ("age: 20-30", "age 20 to 30") goes to
     * the ranges file, a single number goes to {@code fileName}. Columns whose age is
     * followed by a month, week or day unit are passed over.
     *
     * @param fileName output file for single ages; ranges go to the file named by
     *                 {@link #rangesFileName(String)}
     * @param gseCol   index of the column written as the second output field
     * @throws IOException if reading or writing fails
     */
    private void writeAges(String fileName, int gseCol) throws IOException {
        TextFile out = null;
        TextFile rangeOut = null;
        TextFile tf = null;
        try {
            out = new TextFile(fileName, true);
            rangeOut = new TextFile(rangesFileName(fileName), true);
            tf = new TextFile(annotationFile, false);

            tf.readLine(); // the first line is discarded
            String line;
            while ((line = tf.readLine()) != null) {
                String[] fields = line.split("\t");
                String gse = cell(fields, gseCol).trim();
                for (int i = 0; i < fields.length; i++) {
                    String value = fields[i];
                    Matcher ageM = SIMPLE_AGE.matcher(value);
                    if (!ageM.find()) {
                        continue;
                    }
                    Matcher rangeM = AGE_RANGE.matcher(value);
                    if (rangeM.find()) {
                        rangeOut.writeln(outputRow(fields, gse, rangeM.group(1), i));
                        break;
                    }
                    boolean otherUnit = SIMPLE_AGE_MONTHS.matcher(value).find()
                            || SIMPLE_AGE_WEEKS.matcher(value).find()
                            || SIMPLE_AGE_DAYS.matcher(value).find();
                    if (!otherUnit) {
                        out.writeln(outputRow(fields, gse, ageM.group(1), i));
                        break;
                    }
                }
            }
        } finally {
            closeAll(tf, out, rangeOut);
        }
    }

    /**
     * Variant of {@link #writeAges} with a wider set of age notations. Per column:
     * <ul>
     * <li>a range is written to the ranges file and the scan moves on to the next
     *     column (it does not stop, unlike {@link #writeAges});</li>
     * <li>ages given in days or weeks, and "maternal age", are skipped;</li>
     * <li>ages flagged as months are divided by 12 before being written;</li>
     * <li>other ages are written as found, unless they exceed 125;</li>
     * <li>if the column holds no plain age and {@code specialSelection} is set, the
     *     dataset-specific keys handled by {@link #specialAge(String)} are tried.</li>
     * </ul>
     * The first column that yields a (non-range) age ends the scan of that line. The
     * number of ages written is printed to standard output.
     *
     * @param fileName         output file for ages; ranges go to the file named by
     *                         {@link #rangesFileName(String)}
     * @param gseCol           index of the column written as the second output field
     * @param specialSelection whether to try the dataset-specific age keys
     * @throws IOException if reading or writing fails
     */
    private void writeAgesMethylation(String fileName, int gseCol, boolean specialSelection) throws IOException {
        TextFile out = null;
        TextFile rangeOut = null;
        TextFile tf = null;
        int numberMatches = 0;
        try {
            out = new TextFile(fileName, true);
            rangeOut = new TextFile(rangesFileName(fileName), true);
            tf = new TextFile(annotationFile, false);

            tf.readLine(); // the first line is discarded
            String line;
            while ((line = tf.readLine()) != null) {
                String[] fields = line.split("\t");
                String gse = cell(fields, gseCol).trim();
                for (int i = 0; i < fields.length; i++) {
                    String value = fields[i];
                    Matcher ageM = METH_AGE.matcher(value);
                    if (ageM.find()) {
                        Matcher rangeM = AGE_RANGE.matcher(value);
                        if (rangeM.find()) {
                            rangeOut.writeln(outputRow(fields, gse, rangeM.group(1), i));
                            continue;
                        }
                        if (METH_DAYS.matcher(value).find()
                                || METH_WEEKS.matcher(value).find()
                                || MATERNAL_AGE.matcher(value).find()) {
                            continue;
                        }

                        boolean inMonths = METH_MONTHS_BEFORE_VALUE.matcher(value).find()
                                || METH_MONTHS_AFTER_VALUE.matcher(value).find();
                        String rawAge = ageM.group(2);
                        double age = Double.parseDouble(rawAge);
                        if (!inMonths && age > MAX_AGE_NOT_IN_MONTHS) {
                            continue;
                        }

                        // month values are converted by dividing by 12; others are written verbatim
                        String written = inMonths ? String.valueOf(age / 12) : rawAge;
                        out.writeln(outputRow(fields, gse, written, i));
                        numberMatches++;
                        break;
                    }

                    if (specialSelection) {
                        String special = specialAge(value);
                        if (special != null) {
                            out.writeln(outputRow(fields, gse, special, i));
                            numberMatches++;
                            break;
                        }
                    }
                }
            }
            System.out.println(numberMatches);
        } finally {
            closeAll(tf, out, rangeOut);
        }
    }

    /**
     * Tries the dataset-specific age keys on one column, in this order: "ageatdraw",
     * "ageatrecruitment", "age at collection (months): " (value divided by 12), and
     * finally "ageatdiagnosis" plus "durationt1d" (both must be present; their values
     * are summed).
     *
     * @return the age as text, or {@code null} if none of the keys is present
     */
    private static String specialAge(String value) {
        Matcher m = AGE_AT_DRAW.matcher(value);
        if (m.find()) {
            return m.group(1);
        }
        m = AGE_AT_RECRUITMENT.matcher(value);
        if (m.find()) {
            return m.group(1);
        }
        m = AGE_AT_COLLECTION_MONTHS.matcher(value);
        if (m.find()) {
            return String.valueOf(Double.parseDouble(m.group(1)) / 12);
        }
        Matcher diagnosis = AGE_AT_DIAGNOSIS.matcher(value);
        Matcher duration = DURATION_T1D.matcher(value);
        if (diagnosis.find() && duration.find()) {
            return String.valueOf(Double.parseDouble(diagnosis.group(1)) + Double.parseDouble(duration.group(1)));
        }
        return null;
    }

    /**
     * Writes "first column TAB matching column" for every column (of every line after
     * the first) that contains {@code matchThisWord} as a whole word, ignoring case.
     *
     * @param matchThisWord word to look for; it is inserted into the regular expression
     *                      unquoted, so regex metacharacters in it are interpreted
     * @param fileName      output file
     * @throws IOException if reading or writing fails
     */
    private void writeMatchingSamples(String matchThisWord, String fileName) throws IOException {
        Pattern wanted = Pattern.compile("\\b" + matchThisWord + "\\b", Pattern.CASE_INSENSITIVE);

        TextFile out = null;
        TextFile tf = null;
        try {
            out = new TextFile(fileName, true);
            tf = new TextFile(annotationFile, false);

            tf.readLine(); // the first line is discarded
            String line;
            while ((line = tf.readLine()) != null) {
                String[] fields = line.split("\t");
                for (int i = 0; i < fields.length; i++) {
                    if (wanted.matcher(fields[i]).find()) {
                        out.writeln(fields[0] + "\t" + fields[i]);
                    }
                }
            }
        } finally {
            closeAll(tf, out);
        }
    }

    /**
     * Writes the first column of every line (after the first) in which at least one of
     * the given columns contains {@code matchThisWord} but not {@code dontMatchThisWord},
     * both as whole words and ignoring case. Column indices beyond the end of a line are
     * ignored.
     *
     * @param matchThisWord     word that must be present; inserted into the regular
     *                          expression unquoted
     * @param dontMatchThisWord word that must be absent from the same column; inserted
     *                          into the regular expression unquoted
     * @param colsToSearch      indices of the columns to inspect, in order
     * @param fileName          output file
     * @throws IOException if reading or writing fails
     */
    private void writeMatchingSamples(String matchThisWord, String dontMatchThisWord, int[] colsToSearch, String fileName) throws IOException {
        Pattern wanted = Pattern.compile("\\b" + matchThisWord + "\\b", Pattern.CASE_INSENSITIVE);
        Pattern unwanted = Pattern.compile("\\b" + dontMatchThisWord + "\\b", Pattern.CASE_INSENSITIVE);

        TextFile out = null;
        TextFile tf = null;
        try {
            out = new TextFile(fileName, true);
            tf = new TextFile(annotationFile, false);

            tf.readLine(); // the first line is discarded
            String line;
            while ((line = tf.readLine()) != null) {
                String[] fields = line.split("\t");
                boolean matches = false;
                for (int col : colsToSearch) {
                    if (fields.length > col
                            && wanted.matcher(fields[col]).find()
                            && !unwanted.matcher(fields[col]).find()) {
                        matches = true;
                        break;
                    }
                }
                if (matches) {
                    out.writeln(fields[0]);
                }
            }
        } finally {
            closeAll(tf, out);
        }
    }

    /**
     * Prints, for every column that contains {@code query} at least once, the column
     * index and the number of lines (after the first) in which it does. The comparison
     * is a case-sensitive substring test. Columns beyond the width of the first line
     * are not counted. Prints nothing for an empty file.
     *
     * @param query substring to count
     * @throws IOException if reading fails
     */
    private void checkColumns(String query) throws IOException {
        TextFile tf = new TextFile(annotationFile, false);
        try {
            String line = tf.readLine();
            if (line == null) {
                return; // empty file: no columns to report
            }
            int[] counts = new int[line.split("\t").length];
            while ((line = tf.readLine()) != null) {
                String[] fields = line.split("\t");
                int limit = Math.min(fields.length, counts.length);
                for (int col = 0; col < limit; col++) {
                    if (fields[col].contains(query)) {
                        counts[col]++;
                    }
                }
            }
            for (int col = 0; col < counts.length; col++) {
                if (counts[col] > 0) {
                    System.out.println(col + "\t" + counts[col]);
                }
            }
        } finally {
            tf.close();
        }
    }

    /**
     * Copies every line of {@code agefile} to {@code outfile}, then appends the rows of
     * {@code rangefile} whose range (fourth field) could be averaged, with the range
     * replaced by that average. Rows with fewer than four fields, an unparseable range,
     * a range 19 or more wide, or an average that is not positive are left out.
     *
     * @param agefile   file with single ages, copied verbatim
     * @param rangefile file with age ranges in the fourth field
     * @param outfile   combined output file
     * @throws IOException if reading or writing fails
     */
    private void combineAgesAndRanges(String agefile, String rangefile, String outfile) throws IOException {
        TextFile in = null;
        TextFile out = null;
        try {
            in = new TextFile(agefile, false);
            out = new TextFile(outfile, true);
            String line;
            while ((line = in.readLine()) != null) {
                out.writeln(line);
            }

            // close the ages file before opening the ranges file; clear the reference
            // first so the finally block never closes it a second time
            TextFile finished = in;
            in = null;
            finished.close();

            in = new TextFile(rangefile, false);
            while ((line = in.readLine()) != null) {
                String[] fields = line.split("\t");
                if (fields.length < 4) {
                    continue; // no range field on this row
                }
                double age = averageOfRange(fields[3]);
                if (age > 0) {
                    String delim = "";
                    for (int i = 0; i < 3; i++) {
                        out.write(delim + fields[i]);
                        delim = "\t";
                    }
                    out.write(delim + age);
                    for (int i = 4; i < fields.length; i++) {
                        out.write(delim + fields[i]);
                    }
                    out.writeln();
                }
            }
        } finally {
            closeAll(in, out);
        }
    }

    /**
     * Averages the two bounds of a range written as "a-b" or "a to b", with optional
     * spaces around the separator (the forms {@link #AGE_RANGE} can capture).
     *
     * @return the mean of the bounds, or -1 if the text is not such a range, a bound
     *         does not fit an {@code int}, or (upper - lower) is 19 or more
     */
    private static double averageOfRange(String rangeText) {
        Matcher bounds = RANGE_BOUNDS.matcher(rangeText.trim());
        if (!bounds.matches()) {
            return -1;
        }
        int lower;
        int upper;
        try {
            lower = Integer.parseInt(bounds.group(1));
            upper = Integer.parseInt(bounds.group(2));
        } catch (NumberFormatException tooManyDigits) {
            return -1;
        }
        if (upper - lower < RANGE_WIDTH_LIMIT) {
            return ((double) lower + upper) / 2;
        }
        return -1;
    }

    /**
     * Builds one tab-separated output row: column 12, gse, column 0, the extracted
     * value (omitted when {@code null}), column 3, column 13 and the matched column.
     */
    private static String outputRow(String[] fields, String gse, String extractedValue, int matchedCol) {
        StringBuilder row = new StringBuilder();
        row.append(cell(fields, 12)).append('\t').append(gse).append('\t').append(cell(fields, 0));
        if (extractedValue != null) {
            row.append('\t').append(extractedValue);
        }
        row.append('\t').append(cell(fields, 3));
        row.append('\t').append(cell(fields, 13));
        row.append('\t').append(fields[matchedCol]);
        return row.toString();
    }

    /**
     * Returns the field at {@code index}, or the empty string when the row is shorter.
     * {@code String.split} drops trailing empty fields, so a missing trailing column
     * is an empty cell in the file rather than an error.
     */
    private static String cell(String[] fields, int index) {
        return index < fields.length ? fields[index] : "";
    }

    /**
     * Derives the ranges file name from an output file name: a trailing ".txt" is
     * removed if present and "Ranges.txt" is appended.
     */
    private static String rangesFileName(String fileName) {
        String base = fileName.endsWith(TXT_SUFFIX)
                ? fileName.substring(0, fileName.length() - TXT_SUFFIX.length())
                : fileName;
        return base + "Ranges.txt";
    }

    /**
     * Closes every non-null file, attempting all of them even if one fails, and then
     * rethrows the first failure.
     */
    private static void closeAll(TextFile... files) throws IOException {
        IOException firstFailure = null;
        for (TextFile file : files) {
            if (file == null) {
                continue;
            }
            try {
                file.close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure != null) {
            throw firstFailure;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy