All downloads are free. The search and download functionality uses the official Maven repository.

GNormPluslib.GN Maven / Gradle / Ivy

/**
 * Project: GNormPlus
 * Function: Gene Normalization
 */

package GNormPluslib;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.BreakIterator;
import java.time.LocalDate;
import java.time.ZoneId;
import java.text.DecimalFormat;
import java.math.RoundingMode;

import javax.xml.stream.XMLStreamException;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;

public class GN {
    // Static map shared by every GN instance; it is neither read nor written in the part of the file visible here.
    // NOTE(review): declared as a raw HashMap — key/value types cannot be determined from this view; confirm against its users.
    public static HashMap MatchedTokens_hash = new HashMap();
    // Processing state supplied through the constructor. The methods of this class reach the BioC document
    // (getBioCDocobj()) and the per-run lookup hashes (e.g. getPmid2ChromosomeGene_hash()) through it.
    private GNPProcessingData data;

    /**
     * Creates a gene normalizer bound to the given processing state.
     *
     * @param data state object stored in the {@code data} field and consulted by the methods of this class
     */
    public GN(GNPProcessingData data) {
        this.data = data;
    }

    /** Finds an id of the form {@code <digits>-<digits>} and captures the digits after the hyphen. */
    private static final Pattern SCORING_HYPHENATED_ID = Pattern.compile("[0-9]+\\-([0-9]+)");
    /** A digit immediately followed by a lower-case letter (a space is inserted between them). */
    private static final Pattern SCORING_DIGIT_THEN_LETTER = Pattern.compile("([0-9])([a-z])");
    /** A lower-case letter immediately followed by a digit (a space is inserted between them). */
    private static final Pattern SCORING_LETTER_THEN_DIGIT = Pattern.compile("([a-z])([0-9])");
    /** Any non-word character, hyphen or underscore (replaced by a space). */
    private static final Pattern SCORING_SEPARATOR = Pattern.compile("([\\W\\-\\_])");
    /** A run of one or more spaces (collapsed to a single space). */
    private static final Pattern SCORING_SPACE_RUN = Pattern.compile("[ ]+");

    /**
     * Splits a piece of text into the tokens used by {@link #ScoringFunction}: the text is
     * lower-cased, digit/letter boundaries are separated, every non-word character, hyphen and
     * underscore becomes a space, runs of spaces are collapsed, and the result is split on spaces.
     *
     * @param text text to tokenize; must not be {@code null}
     * @return the tokens (may contain an empty string, e.g. for empty input or a leading separator)
     */
    private static String[] scoringTokens(String text) {
        String normalized = text.toLowerCase();
        normalized = SCORING_DIGIT_THEN_LETTER.matcher(normalized).replaceAll("$1 $2");
        normalized = SCORING_LETTER_THEN_DIGIT.matcher(normalized).replaceAll("$1 $2");
        normalized = SCORING_SEPARATOR.matcher(normalized).replaceAll(" ");
        normalized = SCORING_SPACE_RUN.matcher(normalized).replaceAll(" ");
        return normalized.split(" ");
    }

    /**
     * Scores one candidate gene id against the mentions recognized in the current document.
     *
     * <p>The id is turned into a lexicon key: {@code "Homo:" + <digits after the hyphen>} when it
     * contains {@code <digits>-<digits>}, otherwise {@code "Gene:" + geneid}. If
     * {@code GNormPlus.GeneScoring_hash} has no entry for that key the score is {@code 0.0}.
     * Otherwise every lexicon token of the gene that also occurs in some mention contributes
     * {@code TF * IDF * (1 / (1 - TF))}, where {@code TF} is the token's lexicon frequency divided
     * by the gene's total token count and {@code IDF} comes from
     * {@code GNormPlus.GeneScoringDF_hash}. Each such token that also equals a token of
     * {@code LF} adds {@code 1} to the score.
     *
     * @param geneid       candidate id, either {@code <digits>} or {@code <digits>-<digits>}
     * @param Mention_hash recognized mentions of the document; only the keys are read
     * @param LF           long form associated with the mention, or {@code ""} when there is none;
     *                     must not be {@code null}
     * @return the score, or {@code 0.0} when the gene is not in the scoring lexicon
     */
    private double ScoringFunction(String geneid, HashMap<String, String> Mention_hash, String LF) {
        // Tokenized up front, exactly as before, so a null LF fails regardless of the lexicon lookup.
        final String[] longFormTokens = scoringTokens(LF);

        final Matcher hyphenated = SCORING_HYPHENATED_ID.matcher(geneid);
        final String lexiconKey = hyphenated.find() ? "Homo:" + hyphenated.group(1) : "Gene:" + geneid;

        if (!GNormPlus.GeneScoring_hash.containsKey(lexiconKey)) {
            return 0.0;
        }

        /*
         * Tokens in Query (Gene id lexicon).
         * Entry layout, tab separated: cmk-1,cytidylate-1,kinase-1,mssa-1	0.4096	4	0.0625	1	2.0
         * Column 0 holds "token-frequency" pairs and column 2 the total token count. The remaining
         * columns fed a normalisation step that is disabled, so they are no longer parsed.
         */
        final String[] fields = GNormPlus.GeneScoring_hash.get(lexiconKey).split("\t");
        final HashMap<String, Double> lexiconFrequency = new HashMap<>();
        for (String tokenAndFrequency : fields[0].split(",")) {
            String[] pair = tokenAndFrequency.split("-");
            lexiconFrequency.put(pair[0], Double.parseDouble(pair[1]));
        }
        final double totalTokens = Double.parseDouble(fields[2]);

        /*
         * Tokens in Document (recognized mentions): keep the lexicon tokens that occur in any mention.
         */
        final HashMap<String, Double> matchedFrequency = new HashMap<>();
        for (String mention : Mention_hash.keySet()) {
            for (String token : scoringTokens(mention)) {
                if (lexiconFrequency.containsKey(token)) {
                    matchedFrequency.put(token, lexiconFrequency.get(token));
                }
            }
        }

        double score = 0.0;
        int longFormMatches = 0;
        for (Map.Entry<String, Double> matched : matchedFrequency.entrySet()) {
            final String token = matched.getKey();

            // Count how often this matched token appears among the long-form tokens.
            for (String longFormToken : longFormTokens) {
                if (longFormToken.equals(token)) {
                    longFormMatches++;
                }
            }

            final double tf = matched.getValue() / totalTokens;
            final double idf = GNormPlus.GeneScoringDF_hash.get(token);
            // NOTE(review): when tf == 1 (a token's frequency equals the total token count) the factor
            // 1 / (1 - tf) is a floating-point division by zero and the score becomes Infinity or NaN.
            // Left unchanged because it affects ranking; confirm whether that is intended.
            score += tf * idf * (1 / (1 - tf));
        }

        if (longFormMatches > 0) {
            score += longFormMatches;
        }
        return score;
    }

    /** "{@code ...X p}" / "{@code ...1p}": a trailing {@code p} after a digit or upper-case letter is dropped. */
    private static final Pattern PREPROC_TRAILING_P = Pattern.compile("^(.*[0-9A-Z])[ ]*p$");
    /** A trailing {@code nu} is dropped. */
    private static final Pattern PREPROC_TRAILING_NU = Pattern.compile("^(.+)nu$");
    /** The last {@code alpha} in the mention is shortened to {@code a}. */
    private static final Pattern PREPROC_ALPHA = Pattern.compile("^(.*)alpha(.*)$");
    /** The last {@code beta} in the mention is shortened to {@code b}. */
    private static final Pattern PREPROC_BETA = Pattern.compile("^(.*)beta(.*)$");
    /** A trailing {@code a} after a digit is expanded to {@code alpha}. */
    private static final Pattern PREPROC_TRAILING_A = Pattern.compile("^(.+[0-9])a$");
    /** A trailing {@code b} after a digit is expanded to {@code beta}. */
    private static final Pattern PREPROC_TRAILING_B = Pattern.compile("^(.+[0-9])b$");
    /** {@code II} before a final lower-case letter is rewritten as {@code 2}. */
    private static final Pattern PREPROC_ROMAN_TWO = Pattern.compile("^(.+)II([a-z])$");
    /** {@code III} before a final lower-case letter is rewritten as {@code 3}. */
    private static final Pattern PREPROC_ROMAN_THREE = Pattern.compile("^(.+)III([a-z])$");

    /**
     * Appends spelling variants to a {@code |}-separated list of gene mentions.
     *
     * <p>Only the mentions present in the input are examined (variants added here are not expanded
     * again). For each mention the rules are tried in a fixed order and every rule that applies
     * appends one variant, so the returned string is the input followed by zero or more
     * {@code "|" + variant} parts.
     *
     * @param mentions {@code |}-separated mentions
     * @return the input with the generated variants appended; equal to the input when no rule applied
     */
    private static String expandGeneMentionVariants(String mentions) {
        final StringBuilder expanded = new StringBuilder(mentions);
        for (String mention : mentions.split("\\|")) {
            Matcher rule = PREPROC_TRAILING_P.matcher(mention);
            if (rule.find()) {
                expanded.append('|').append(rule.group(1));
            }
            rule = PREPROC_TRAILING_NU.matcher(mention);
            if (rule.find()) {
                expanded.append('|').append(rule.group(1));
            }
            rule = PREPROC_ALPHA.matcher(mention);
            if (rule.find()) {
                expanded.append('|').append(rule.group(1)).append("a").append(rule.group(2));
            }
            rule = PREPROC_BETA.matcher(mention);
            if (rule.find()) {
                expanded.append('|').append(rule.group(1)).append("b").append(rule.group(2));
            }
            rule = PREPROC_TRAILING_A.matcher(mention);
            if (rule.find()) {
                expanded.append('|').append(rule.group(1)).append("alpha");
            }
            rule = PREPROC_TRAILING_B.matcher(mention);
            if (rule.find()) {
                expanded.append('|').append(rule.group(1)).append("beta");
            }
            // NOTE(review): a mention ending in "III" + letter also satisfies the "II" rule (e.g.
            // "TFIIIa" yields "TFI2a" in addition to "TF3a"). Kept as-is to preserve the generated
            // candidates; confirm whether the extra variant is wanted.
            rule = PREPROC_ROMAN_TWO.matcher(mention);
            if (rule.find()) {
                expanded.append('|').append(rule.group(1)).append("2").append(rule.group(2));
            }
            rule = PREPROC_ROMAN_THREE.matcher(mention);
            if (rule.find()) {
                expanded.append('|').append(rule.group(1)).append("3").append(rule.group(2));
            }
        }
        return expanded.toString();
    }

    /**
     * Adds spelling variants to every annotation of type {@code Gene} in the BioC document held by
     * {@code data}.
     *
     * <p>Each annotation is a tab-separated record {@code start, last, mentions, type[, id]}. When
     * at least one variant is generated for a Gene annotation, the record is rewritten in place with
     * the extended mention list; the id column is written as an empty string when the original
     * record had none. Other annotations are left untouched.
     *
     * @param Filename     not used by this method
     * @param FilenameBioC not used by this method
     * @throws IOException        declared for callers; not thrown by the code in this method
     * @throws XMLStreamException declared for callers; not thrown by the code in this method
     */
    public void PreProcessing4GN(String Filename, String FilenameBioC) throws IOException, XMLStreamException {
        for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++) {
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) {
                for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) {
                    final String[] anno = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    final String start = anno[0];
                    final String last = anno[1];
                    final String mentions = anno[2];
                    final String type = anno[3];
                    final String id = (anno.length >= 5) ? anno[4] : "";

                    if (!type.equals("Gene")) {
                        continue;
                    }

                    final String expanded = expandGeneMentionVariants(mentions);
                    // Every generated variant adds at least the "|" separator, so a longer string
                    // means at least one rule fired.
                    if (expanded.length() > mentions.length()) {
                        data.getBioCDocobj().Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + expanded + "\t" + type + "\t" + id);
                    }
                }
            }
        }
        //data.getBioCDocobj().BioCOutput(Filename,FilenameBioC,data.getBioCDocobj().Annotations,false,true);
    }

    /**
     * Records, for every document, the ids attached to chromosome-location hits found in its passages.
     *
     * <p>Each passage text is searched with
     * {@code GNormPlus.PT_GeneChromosome.SearchMentionLocation(text, "ChromosomeLocation")}. Every hit
     * is a tab-separated record whose fourth column holds ids separated by {@code |} or {@code ,};
     * for each id the key {@code Pmid + "\t" + id} is stored (with an empty value) in
     * {@code data.getPmid2ChromosomeGene_hash()}.
     *
     * @param Filename     not used by this method
     * @param FilenameBioC not used by this method
     * @throws IOException        declared for callers; not thrown by the code in this method
     * @throws XMLStreamException declared for callers; not thrown by the code in this method
     */
    public void ChromosomeRecognition(String Filename, String FilenameBioC) throws IOException, XMLStreamException {
        for (int i = 0; i < data.getBioCDocobj().PMIDs.size(); i++) /** PMIDs : i */ {
            String Pmid = data.getBioCDocobj().PMIDs.get(i);
            for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++) /** Paragraphs : j */ {
                String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j); // Passage context

                /** Chromosome recognition */
                // Element type restored: each hit is split as a String below, which a raw list does not allow.
                ArrayList<String> locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext, "ChromosomeLocation");
                for (String location : locations) {
                    // Columns 0-2 of the record (start, last, mention) are not needed here; only column 3 is read.
                    String[] anno = location.split("\t");
                    String ids = anno[3];
                    for (String id : ids.split("[\\|,]")) {
                        data.getPmid2ChromosomeGene_hash().put(Pmid + "\t" + id, "");
                    }
                }
            }
        }
        //data.getBioCDocobj().BioCOutput(Filename,FilenameBioC,data.getBioCDocobj().Annotations,false,true);
    }

    public void GeneNormalization(String Filename, String FilenameBioC, boolean GeneIDMatch) throws IOException, XMLStreamException {
        final DecimalFormat df = new DecimalFormat("0.####");
        df.setRoundingMode(RoundingMode.HALF_UP);

        //Tokenization
        for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++) /** PMIDs : i */ {
            String Pmid = data.getBioCDocobj().PMIDs.get(i);

            /** Species */
            HashMap Species_hash = new HashMap();
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) /** Paragraphs : j */ {
                for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) /** Annotation : k */ {
                    String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    String mentions = anno[2];
                    String type = anno[3];
                    if (type.matches("(Species|Genus|Strain|CellLine|Cell)")) {
                        Species_hash.put(mentions, "");
                    }
                }
            }


            /*
             * Collect Gene mentions :
             *
             *  GeneMention-taxid	->	"ID" : geneid
             *  					->	"type" : "Gene"
             *  					->	start1-last1 : ""
             *  					->	start2-last2 : ""
             *  					->	start3-last3 : ""
             */

            String tiabs = "";
            for (int j = 0; j < data.getBioCDocobj().PassageContexts.get(i).size(); j++) /** Paragraphs : j */ {
                tiabs = tiabs + data.getBioCDocobj().PassageContexts.get(i).get(j).toLowerCase();
            }
            HashMap> GeneMention_hash = new HashMap>();
            HashMap Mention_hash = new HashMap();
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) /** Paragraphs : j */ {
                for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) /** Annotation : k */ {
                    String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    String start = anno[0];
                    String last = anno[1];
                    String mentions = anno[2];
                    String type = anno[3];
                    String taxids = "Tax:9606";

                    if (anno.length >= 5) {
                        taxids = anno[4];
                    }
                    String mentions_tmp = mentions.toLowerCase();
                    mentions_tmp = mentions_tmp.replaceAll("[\\W\\-\\_]", "");
                    mentions_tmp = mentions_tmp.replaceAll("[0-9]", "0");
                    taxids = taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):", "");
                    if (taxids.equals("")) {
                        taxids = "9606";
                    }
                    /** Filtering */
                    boolean found_filter = false;
                    if (data.getFiltering_hash().containsKey(mentions_tmp)) // filtering
                    {
                        found_filter = true;
                    }

                    if (found_filter == false) //abbreviation
                    {
                        for (String f : GNormPlus.Filtering_WithLongForm_hash.keySet()) {
                            if (data.getBioCDocobj().Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]" + f + "\tGene.*") ||
                                    data.getBioCDocobj().Annotations.get(i).get(j).get(k).matches(".*\\t" + f + "\\|[^\t]+\tGene.*")
                            ) {
                                String lf = GNormPlus.Filtering_WithLongForm_hash.get(f);
                                if (tiabs.matches(".*" + lf + ".*")) {
                                    found_filter = true;
                                    break;
                                }
                            }
                        }
                    }

                    if (found_filter == false) {
                        if (data.getBioCDocobj().Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") ||
                                data.getBioCDocobj().Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191	Wuhan's
                        ) {
                            found_filter = true;

                        }
                    }

                    if (found_filter == false) {
                        if (type.matches("Gene")) {
                            if (GeneMention_hash.containsKey(mentions + "\t" + taxids)) {
                                GeneMention_hash.get(mentions + "\t" + taxids).put(start + "\t" + last, "");
                            } else {
                                HashMap offset_hash = new HashMap();
                                offset_hash.put(start + "\t" + last, "");
                                GeneMention_hash.put(mentions + "\t" + taxids, offset_hash);
                                GeneMention_hash.get(mentions + "\t" + taxids).put("type", type);
                                Mention_hash.put(mentions, "Gene");
                            }
                        } else if (type.matches("(FamilyName|DomainMotif)")) {
                            String GMs[] = mentions.split("\\|");
                            for (int g = 0; g < GMs.length; g++) {
                                String mention = GMs[g];
                                Mention_hash.put(mention, "FamilyDomain");
                            }
                        }
                    }

                }
            }

            /*
             * Gene id refinement:
             *  1. Official name
             *  2. only one gene
             */
            HashMap GuaranteedGene2ID = new HashMap();
            HashMap MultiGene2ID = new HashMap();
            for (String GeneMentionTax : GeneMention_hash.keySet()) {
                String GT[] = GeneMentionTax.split("\\t");
                String mentions = GT[0];
                String taxids = GT[1];
                String GMs[] = mentions.split("\\|");

                HashMap taxids_hash = new HashMap();
                String taxids_arr[] = taxids.split(",");
                for (int t = 0; t < taxids_arr.length; t++) {
                    taxids_hash.put(taxids_arr[t], "");
                }

                for (int ms = 0; ms < GMs.length; ms++) {
                    String mention = GMs[ms];
                    String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); /** searched by PT_Gene */
                    String IDs[] = IDstr.split("\\|");

                    /*
                     * printing the ambiguous gene mentions and candidates
                     */
                    //String IDs_s[]=IDstr.split(",");
                    //if(IDs_s.length>1)
                    //{
                    //	System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr);
                    //}

                    for (int c = 0; c < IDs.length; c++) {
                        String tax2ID[] = IDs[c].split(":"); // tax2ID[0] = taxid ; tax2ID[1] = geneids
                        if (taxids_hash.containsKey(tax2ID[0])) {
                            String geneid = tax2ID[1];
                            String TargetTax = tax2ID[0];
                            GeneMention_hash.get(GeneMentionTax).put("ID", geneid);
                            GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax);
                            break;
                        }
                    }

                    //geneid refinement
                    if (GeneMention_hash.get(GeneMentionTax).containsKey("ID")) {
                        Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))");
                        Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID"));

                        if (mtmp.find()) // 1. Official Name
                        {
                            GeneMention_hash.get(GeneMentionTax).put("ID", mtmp.group(1));
                            GuaranteedGene2ID.put(GeneMentionTax, mtmp.group(1));
                        } else if (GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) // 2. only one gene
                        {
                            GuaranteedGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
                        } else {
                            String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(",");
                            boolean FoundByChroLoca = false;
                            for (int idcount = 0; idcount < ID.length; idcount++) {
                                if (data.getPmid2ChromosomeGene_hash().containsKey(Pmid + "\t" + ID[idcount])) // 3. Chromosome location
                                {
                                    GuaranteedGene2ID.put(GeneMentionTax, ID[idcount]);
                                    FoundByChroLoca = true;
                                    break;
                                }
                            }
                            if (FoundByChroLoca == false) {
                                MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
                            }
                        }
                    }
                    if (GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3"))) {
                        break;
                    }
                }
            }

            /*
             * Gene id refinement:
             *  3. multiple genes but can be inferred by 1. and 2.
             */
            for (String GeneMentionTax_M : MultiGene2ID.keySet()) {
                for (String GeneMentionTax_G : GuaranteedGene2ID.keySet()) {
                    String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(",");
                    for (int m = 0; m < MG.length; m++) {
                        if (MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G))) {
                            GeneMention_hash.get(GeneMentionTax_M).put("ID", MG[m]);
                        }
                    }
                }
            }

            /*
             * Gene id refinement:
             *  4. FullName -> Abbreviation
             */
            for (String GeneMentionTax : GeneMention_hash.keySet()) {
                String MT[] = GeneMentionTax.split("\\t");
                if (data.getPmidLF2Abb_hash().containsKey(Pmid + "\t" + MT[0])) {
                    String GeneMentionTax_Abb = data.getPmidLF2Abb_hash().get(Pmid + "\t" + MT[0]) + "\t" + MT[1];
                    if (GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) {
                        GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
                    }
                }
            }

            /*
             * Gene id refinement:
             *  5. Ranking by scoring function (inference network)
             */
            for (String GeneMentionTax : GeneMention_hash.keySet()) {
                if (GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+")) {
                    String geneids = GeneMention_hash.get(GeneMentionTax).get("ID");
                    String geneid[] = geneids.split(",");

                    String OutputStyle = "Top1";
                    if (OutputStyle.equals("Top1")) {
                        //only return the best one
                        double max_score = 0.0;
                        String target_geneid = "";
                        for (int g = 0; g < geneid.length; g++) {
                            String MT[] = GeneMentionTax.split("\\t");
                            String LF = "";
                            if (data.getPmidAbb2LF_hash().containsKey(Pmid + "\t" + MT[0])) {
                                LF = data.getPmidAbb2LF_hash().get(Pmid + "\t" + MT[0]);
                            }
                            double score = ScoringFunction(geneid[g], Mention_hash, LF);
                            if (score > max_score) {
                                max_score = score;
                                target_geneid = geneid[g];
                            } else if (score == 0.0) {
                                //System.out.println(GeneMentionTax);
                            }
                        }
                        GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid);
                    } else // "All"
                    {
                        //return all geneids
                        String geneSTR = "";
                        for (int g = 0; g < geneid.length; g++) {
                            String MT[] = GeneMentionTax.split("\\t");
                            String LF = "";
                            if (data.getPmidAbb2LF_hash().containsKey(Pmid + "\t" + MT[0])) {
                                LF = data.getPmidAbb2LF_hash().get(Pmid + "\t" + MT[0]);
                            }
                            double score = ScoringFunction(geneid[g], Mention_hash, LF);
                            String hoge = df.format(score);
                            score = Double.parseDouble(hoge);

                            if (geneSTR.equals("")) {
                                geneSTR = geneid[g] + "-" + score;
                            } else {
                                geneSTR = geneSTR + "," + geneid[g] + "-" + score;
                            }
                        }
                        GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR);
                    }
                }
            }

            /*
             * Gene id refinement: - removed (Reason: cause too much False Positive)
             *  6. Abbreviation -> FullName
             *
             */
            for (String GeneMentionTax : GeneMention_hash.keySet()) {
                String MT[] = GeneMentionTax.split("\\t");
                if (data.getPmidAbb2LF_hash().containsKey(Pmid + "\t" + MT[0])) {
                    String GeneMentionTax_LF = data.getPmidAbb2LF_hash().get(Pmid + "\t" + MT[0]) + "\t" + MT[1];
                    if (GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) {
                        GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
                    }
                }
            }

            /*
             * Gene id refinement:
             *  7. The inference network tokens of Abbreviation.ID should contain at least LF tokens
             *  8. The short mention should be filtered if not long form support
             */
            ArrayList removeGMT = new ArrayList();
            for (String GeneMentionTax : GeneMention_hash.keySet()) {
                String GT[] = GeneMentionTax.split("\\t");
                String mentions = GT[0];
                String tax = GT[1];
                if (GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) {
                    String type = GeneMention_hash.get(GeneMentionTax).get("type");
                    String id = GeneMention_hash.get(GeneMentionTax).get("ID");
                    String geneid = "";
                    Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$");
                    Pattern ptmp2 = Pattern.compile("^([0-9]+)$");
                    Matcher mtmp1 = ptmp1.matcher(id);
                    Matcher mtmp2 = ptmp2.matcher(id);
                    //System.out.println(id);
                    if (mtmp1.find()) {
                        geneid = "Homo:" + mtmp1.group(2);
                    } else if (mtmp2.find()) {
                        geneid = "Gene:" + mtmp2.group(1);
                    }

                    boolean LongFormTknMatch = false;
                    boolean LongFormExist = true;
                    if (GNormPlus.GeneScoring_hash.containsKey(geneid)) {
                        if (data.getPmidAbb2LF_lc_hash().containsKey(Pmid + "\t" + mentions.toLowerCase())) {
                            /*
                             * token in lexicon : tkn_lexicon
                             * token in mention : tkn_mention
                             */
                            String l[] = GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293	cmk-1,cytidylate-1,kinase-1,mssa-1	0.4096	4	0.0625	1	2.0
                            String tkns_Gene[] = l[0].split(",");
                            ArrayList tkn_lexicon = new ArrayList();
                            for (int ti = 0; ti < tkns_Gene.length; ti++) {
                                String Tkn_Freq[] = tkns_Gene[ti].split("-");
                                tkn_lexicon.add(Tkn_Freq[0]);
                            }

                            String LF_lc = data.getPmidAbb2LF_lc_hash().get(Pmid + "\t" + mentions.toLowerCase());
                            LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
                            LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
                            String tkn_mention[] = LF_lc.split("[\\W\\-\\_]");
                            for (int tl = 0; tl < tkn_lexicon.size(); tl++) {
                                for (int tm = 0; tm < tkn_mention.length; tm++) {
                                    if (tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+"))) {
                                        LongFormTknMatch = true;
                                    }
                                }
                            }
                        } else {
                            LongFormExist = false;
                        }
                    } else {
                        LongFormTknMatch = true;
                    } // exception

                    if (LongFormTknMatch == false && LongFormExist == true) // 7.
                    {
                        removeGMT.add(GeneMentionTax); //remove short form
                        removeGMT.add(data.getPmidAbb2LF_hash().get(Pmid + "\t" + mentions) + "\t" + tax); //remove long form
                    } else if (mentions.length() <= 2 && LongFormExist == false) // 8.
                    {
                        removeGMT.add(GeneMentionTax);
                    }
                }
            }

            for (int gmti = 0; gmti < removeGMT.size(); gmti++) // remove
            {
                GeneMention_hash.remove(removeGMT.get(gmti));
            }

            // Append gene ids
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraphs : j
            {
                for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) // Annotation : k
                {
                    String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    String start = anno[0];
                    String last = anno[1];
                    String mentions = anno[2];
                    String type = anno[3];
                    String taxid_org = "Tax:9606";
                    if (anno.length >= 5) {
                        taxid_org = anno[4];
                    }
                    String taxids = taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):", "");
                    String GMs[] = mentions.split("\\|");

                    if (GeneMention_hash.containsKey(mentions + "\t" + taxids) && GeneMention_hash.get(mentions + "\t" + taxids).containsKey("TargetTax")) {
                        String taxtype = taxid_org.replaceAll(":([0-9,]+)", "");
                        String taxid = GeneMention_hash.get(mentions + "\t" + taxids).get("TargetTax");
                        data.getBioCDocobj().Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + taxtype + ":" + taxid);
                    }

                    if (type.equals("Gene")) {
                        data.getBioCDocobj().Annotations.get(i).get(j).set(k, data.getBioCDocobj().Annotations.get(i).get(j).get(k) + "|");


                        if (GeneMention_hash.containsKey(mentions + "\t" + taxids) && GeneMention_hash.get(mentions + "\t" + taxids).containsKey("ID")) {
                            data.getBioCDocobj().Annotations.get(i).get(j).set(k, data.getBioCDocobj().Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions + "\t" + taxids).get("ID") + ",");
                        } else // cannot find appropriate species
                        {
                            //System.out.println(mention+"\t"+taxid);
                        }
                        data.getBioCDocobj().Annotations.get(i).get(j).set(k, data.getBioCDocobj().Annotations.get(i).get(j).get(k).substring(0, data.getBioCDocobj().Annotations.get(i).get(j).get(k).length() - 1)); // remove ",$"
                    }
                }
            }

            //Extend to all gene mentions
            HashMap GeneMentions = new HashMap(); // Extending Gene mentions
            HashMap GeneMentionLocation = new HashMap(); // Extending Gene mentions
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
            {
                for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) // Annotation : k
                {
                    String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    int start = Integer.parseInt(anno[0]);
                    int last = Integer.parseInt(anno[1]);
                    String mentions = anno[2];
                    String type = anno[3];
                    String id = "Tax:9606";
                    if (anno.length >= 5) {
                        id = anno[4];
                    }
                    if (type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)")) {
                        GeneMentions.put(mentions.toLowerCase(), id);
                        for (int s = start; s <= last; s++) {
                            GeneMentionLocation.put(j + "\t" + s, "");
                        }
                    } else if (type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)")) {
                        GeneMentions.put(mentions.toLowerCase(), id);
                        for (int s = start; s <= last; s++) {
                            GeneMentionLocation.put(j + "\t" + s, "");
                        }
                    }
                }
            }
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
            {
                if (data.getBioCDocobj().PassageContexts.size() > i && data.getBioCDocobj().PassageContexts.get(i).size() > j) {
                    String PassageContexts = " " + data.getBioCDocobj().PassageContexts.get(i).get(j) + " ";
                    String PassageContexts_tmp = PassageContexts.toLowerCase();
                    for (String gm : GeneMentions.keySet()) {
                        String id = GeneMentions.get(gm);
                        if (gm.length() >= 3) {
                            gm = gm.replaceAll("[ ]*[\\|]*$", "");
                            gm = gm.replaceAll("^[\\|]*[ ]*", "");
                            gm = gm.replaceAll("[\\|][\\|]+", "\\|");
                            if (!gm.matches("[\\W\\-\\_]*")) {
                                gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
                                Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])(" + gm + ")([\\W\\-\\_].*)$");
                                Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
                                while (mtmp.find()) {
                                    String pre = mtmp.group(1);
                                    String gmtmp = mtmp.group(2);
                                    String post = mtmp.group(3);

                                    int start = pre.length() - 1;
                                    int last = start + gmtmp.length();
                                    if (PassageContexts.length() >= last + 1) {
                                        String mention = PassageContexts.substring(start + 1, last + 1);
                                        if (!GeneMentionLocation.containsKey(j + "\t" + start) && !GeneMentionLocation.containsKey(j + "\t" + last)) {
                                            data.getBioCDocobj().Annotations.get(i).get(j).add(start + "\t" + last + "\t" + mention + "\tGene\t" + id);
                                        }
                                    }
                                    gmtmp = gmtmp.replaceAll(".", "\\@");
                                    PassageContexts_tmp = pre + "" + gmtmp + "" + post;
                                    mtmp = ptmp.matcher(PassageContexts_tmp);
                                }
                            }
                        }
                    }
                }
            }

            //Apply to FamilyNames
            HashMap geneids = new HashMap(); // Extending Gene mentions
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
            {
                for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) // Annotation : k
                {
                    String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    String type = anno[3];
                    if (type.equals("Gene")) {
                        String id = "Tax:9606";
                        if (anno.length >= 5) {
                            id = anno[4];
                        }
                        Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$");
                        Matcher mtmp0 = ptmp0.matcher(id);
                        Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
                        Matcher mtmp1 = ptmp1.matcher(id);
                        if (mtmp0.find()) {
                            geneids.put(mtmp0.group(3), "");
                        }
                        if (mtmp1.find()) {
                            geneids.put(mtmp1.group(3), "");
                        }
                    }
                }
            }
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
            {
                for (int k = data.getBioCDocobj().Annotations.get(i).get(j).size() - 1; k >= 0; k--) // Annotation : k
                {
                    String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    String mention = anno[2];
                    String type = anno[3];
                    if (type.matches("(FamilyName|DomainMotif)")) {
                        String id = "Tax:9606";
                        if (anno.length >= 5) {
                            id = anno[4];
                        }
                        String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention);
                        String IDstr[] = IDstrs.split("\\|");
                        String ids = "";
                        for (int id_i = 0; id_i < IDstr.length; id_i++) {
                            if (geneids.containsKey(IDstr[id_i])) {
                                if (ids.equals("")) {
                                    ids = IDstr[id_i];
                                } else {
                                    ids = ids + ";" + IDstr[id_i];
                                }
                            }
                        }
                        if (!ids.equals("")) {
                            if (type.equals("FamilyName")) {
                                type = "Gene";
                            }
                            String Annotation_k = anno[0] + "\t" + anno[1] + "\t" + anno[2] + "\t" + type + "\tTax:9606";
                            if (anno.length >= 5) {
                                Annotation_k = anno[0] + "\t" + anno[1] + "\t" + anno[2] + "\t" + type + "\t" + anno[4];
                            }
                            data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotation_k + "|" + ids);
                        } else {
                            // Added by Erik Faessler, JULIE Lab: Output FamilyNames but suppress output of empty annotations
                            if (type.equals("DomainMotif") || Integer.parseInt(anno[1]) - Integer.parseInt(anno[0]) <= 0)
                                data.getBioCDocobj().Annotations.get(i).get(j).remove(k);
                        }
                    }
                }
            }
            //Species "*" and "(anti)" removed.
            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
            {
                for (int k = data.getBioCDocobj().Annotations.get(i).get(j).size() - 1; k >= 0; k--) // Annotation : k
                {
                    String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    String type = anno[3];
                    if (type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell")) {
                        String id = anno[4];
                        id = id.replaceAll("\\*", "");
                        id = id.replaceAll("\\(anti\\)", "");
                        String Annotation_k = anno[0] + "\t" + anno[1] + "\t" + anno[2] + "\t" + type + "\t" + id;
                        data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotation_k);
                    }
                }
            }

            for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
            {

                for (int k = data.getBioCDocobj().Annotations.get(i).get(j).size() - 1; k >= 0; k--) // Annotation : k
                {
                    String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
                    int start = Integer.parseInt(anno[0]);
                    int last = Integer.parseInt(anno[1]);
                    String mention = anno[2];
                    String type = anno[3];
                    String id = anno[4];
                    if (type.equals("Gene") && Species_hash.containsKey(mention)) {
                        data.getBioCDocobj().Annotations.get(i).get(j).remove(k);
                    } else if (type.equals("Gene") && id.equals("")) {
                        data.getBioCDocobj().Annotations.get(i).get(j).remove(k);
                    } else {
                        for (int k1 = data.getBioCDocobj().Annotations.get(i).get(j).size() - 1; k1 >= 0; k1--) // Annotation : k
                        {
                            if (k1 != k) {
                                String anno1[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k1).split("\t");
                                int start1 = Integer.parseInt(anno1[0]);
                                int last1 = Integer.parseInt(anno1[1]);
                                if ((start1 < start && last1 >= last) || (start1 <= start && last1 > last)) {
                                    data.getBioCDocobj().Annotations.get(i).get(j).remove(k);
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }
        if (GeneIDMatch == true) {
            //data.getBioCDocobj().BioCOutput(Filename,FilenameBioC,data.getBioCDocobj().Annotations,false,true);
        } else {
            data.getBioCDocobj().BioCOutput(Filename, FilenameBioC, data.getBioCDocobj().Annotations, true, true);
        }
    }

    /*
     * Search potential GeneID mentions in a text by regular-expression pattern match
     */
    /**
     * Finds tokens in {@code Doc} that mix letters and digits and reports them
     * as candidate gene identifiers.
     *
     * <p>Each returned entry is a tab-separated string
     * {@code start \t last \t mention \t GeneID}, where {@code start} and
     * {@code last} are offsets into {@code Doc} such that
     * {@code Doc.substring(start, last)} is the matched token. Tokens that
     * contain any of {@code ' ; [ ] + * \} are ignored.
     *
     * <p>Tokens longer than six characters that denote a numeric range, either
     * {@code stem<num>-<num>} or {@code stem<num>-stem<num>}, additionally
     * yield one entry per number in the range (see
     * {@link #addGeneIdRange}); these entries share the offsets of the whole
     * token and precede the entry for the token itself.
     *
     * <p>Every token is reported exactly once. Because the leading group of the
     * search pattern is greedy, the last remaining candidate in the text is
     * found first, so entries are ordered from the end of the text backwards.
     *
     * @param Doc the passage text to scan; {@code null} or empty yields an empty list
     * @return the candidate locations, possibly empty, never {@code null}
     */
    public ArrayList<String> SearchGeneIDLocation(String Doc) {
        ArrayList<String> location = new ArrayList<String>();
        if (Doc == null || Doc.isEmpty()) {
            return location;
        }

        // Compiled once per call instead of once per loop iteration.
        final Pattern candidate = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$");
        final Pattern forbiddenChars = Pattern.compile(".*[\\'\\;\\[\\]\\+\\*\\\\].*");
        final Pattern parenPair = Pattern.compile(".*\\(.*\\).*");
        final Pattern noParens = Pattern.compile("[^\\(\\)]+");
        final Pattern simpleRange = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$");
        final Pattern repeatedStemRange = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$");

        // Pad with spaces so tokens at either end of the text still have the
        // non-alphanumeric neighbours the pattern requires.
        String masked = " " + Doc + " ";
        Matcher hit = candidate.matcher(masked);
        while (hit.find()) {
            String before = hit.group(1);
            String token = hit.group(2);
            String after = hit.group(3);

            // -1 compensates for the leading pad space, giving offsets into Doc.
            int start = before.length() - 1;
            int last = start + token.length();
            String mention = Doc.substring(start, last);

            if (!forbiddenChars.matcher(mention).matches()) {
                boolean parensOk = parenPair.matcher(mention).matches() || noParens.matcher(mention).matches();
                if (last - start > 6 && parensOk) {
                    Matcher simple = simpleRange.matcher(mention);
                    Matcher repeated = repeatedStemRange.matcher(mention);
                    if (simple.find()) {
                        addGeneIdRange(location, start, last, simple.group(1), simple.group(2), simple.group(3));
                    } else if (repeated.find()) {
                        // Only expand when both ends of the range share the same stem.
                        if (repeated.group(1).equals(repeated.group(3))) {
                            addGeneIdRange(location, start, last, repeated.group(1), repeated.group(2), repeated.group(4));
                        }
                    }
                }
                location.add(start + "\t" + last + "\t" + mention + "\tGeneID");
            }

            // Mask the handled token with '@' (same length, so offsets stay
            // aligned with Doc) and search again for the next candidate.
            StringBuilder next = new StringBuilder(masked.length());
            next.append(before);
            for (int c = 0; c < token.length(); c++) {
                next.append('@');
            }
            next.append(after);
            masked = next.toString();
            hit = candidate.matcher(masked);
        }
        return location;
    }

    /**
     * Appends one GeneID entry per number of an inclusive numeric range.
     *
     * <p>Nothing is added unless both numbers have at most six digits and the
     * second exceeds the first by 1 to 20. Each generated identifier is
     * {@code stem + leading zeros of firstDigits + n}; identifiers shorter
     * than five characters are dropped.
     */
    private static void addGeneIdRange(List<String> out, int start, int last, String stem, String firstDigits, String lastDigits) {
        if (firstDigits.length() > 6 || lastDigits.length() > 6) {
            return;
        }
        int from = Integer.parseInt(firstDigits);
        int to = Integer.parseInt(lastDigits);
        int span = to - from;
        if (span <= 0 || span > 20) {
            return;
        }

        // Leading zeros of the first number are reused as a fixed prefix.
        int zeros = 0;
        while (zeros < firstDigits.length() && firstDigits.charAt(zeros) == '0') {
            zeros++;
        }
        String zeroPrefix = firstDigits.substring(0, zeros);

        for (int n = from; n <= to; n++) {
            String expanded = stem + zeroPrefix + n;
            if (expanded.length() >= 5) {
                out.add(start + "\t" + last + "\t" + expanded + "\tGeneID");
            }
        }
    }

    /**
     * Annotates gene identifiers that appear literally in the passage texts and
     * writes the result through {@code BioCOutput}.
     *
     * <p>For every passage of every document, candidate tokens are collected
     * with {@link #SearchGeneIDLocation(String)}. Each candidate mention is
     * lower-cased, stripped of non-word characters, hyphens and underscores,
     * and looked up in {@code GNormPlus.GeneIDs_hash}. On a hit, the candidate
     * location followed by {@code "\tGeneID:" + <looked-up value>} is appended
     * to that passage's annotations.
     *
     * @param Filename     passed through to {@code BioCOutput}
     * @param FilenameBioC passed through to {@code BioCOutput}
     * @throws IOException        declared by this method; presumably raised by {@code BioCOutput} (confirm)
     * @throws XMLStreamException declared by this method; presumably raised by {@code BioCOutput} (confirm)
     */
    public void GeneIDRecognition(String Filename, String FilenameBioC) throws IOException, XMLStreamException {
        // Compiled once instead of on every replaceAll call inside the loops.
        final Pattern nonKeyChars = Pattern.compile("[\\W\\-\\_]+");

        for (int i = 0; i < data.getBioCDocobj().PMIDs.size(); i++) /** PMIDs : i */ {
            for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++) /** Paragraphs : j */ {
                String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j); // Passage context
                /** GeneID recognition by pattern match */
                List<String> locations = SearchGeneIDLocation(PassageContext);
                for (String candidate : locations) {
                    String anno[] = candidate.split("\t");
                    // Locale.ROOT keeps the ASCII lookup key stable regardless of
                    // the JVM's default locale (e.g. Turkish dotless i).
                    // NOTE(review): assumes GeneIDs_hash keys are ASCII lower-case -- confirm.
                    String mention = anno[2].toLowerCase(Locale.ROOT);
                    mention = nonKeyChars.matcher(mention).replaceAll("");
                    if (GNormPlus.GeneIDs_hash.containsKey(mention)) {
                        data.getBioCDocobj().Annotations.get(i).get(j).add(candidate + "\tGeneID:" + GNormPlus.GeneIDs_hash.get(mention)); //paragraph
                    }
                }
            }
        }
        data.getBioCDocobj().BioCOutput(Filename, FilenameBioC, data.getBioCDocobj().Annotations, true, true);
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy