// GNormPluslib.GN (listing header left over from the web page this source was copied from)
/**
* Project: GNormPlus
* Function: Gene Normalization
*/
package GNormPluslib;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.BreakIterator;
import java.time.LocalDate;
import java.time.ZoneId;
import java.text.DecimalFormat;
import java.math.RoundingMode;
import javax.xml.stream.XMLStreamException;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
public class GN {
// Shared static map. Nothing in the visible part of this class reads or writes it.
// NOTE(review): declared as a raw HashMap - the key/value types cannot be told from here; confirm against its users.
public static HashMap MatchedTokens_hash = new HashMap();
// Processing state for the current run. The methods below reach the BioC document, the abbreviation/long-form
// maps, the filter list and the chromosome-gene map through its getters.
private GNPProcessingData data;
/**
 * Creates a gene normaliser that works on the given processing data.
 *
 * @param data the processing data every method of this instance reads and updates
 */
public GN(GNPProcessingData data) {
this.data = data;
}
/**
 * Scores a candidate gene identifier against the mentions collected for the current document.
 *
 * <p>The candidate's lexicon tokens are read from {@code GNormPlus.GeneScoring_hash}. Every lexicon
 * token that also occurs in one of the document mentions contributes
 * {@code TF * IDF * (1 / (1 - TF))}, where {@code TF} is the token frequency divided by the
 * candidate's total token count and {@code IDF} comes from {@code GNormPlus.GeneScoringDF_hash}.
 * Each long-form token equal to such a shared token adds 1 to the final score.
 *
 * @param geneid candidate id; if it contains {@code <digits>-<digits>} it is looked up as
 *        {@code "Homo:" + <second number>}, otherwise as {@code "Gene:" + geneid}
 * @param Mention_hash mentions of the document; only the keys are read
 * @param LF long form associated with the mention being resolved, or the empty string
 * @return the score, or {@code 0.0} when the candidate has no entry in the scoring lexicon
 */
private double ScoringFunction(String geneid, HashMap<String, String> Mention_hash, String LF) {
    String LF_tkn[] = tokenizeForScoring(LF).split(" ");
    int LF_ParticalMatch = 0;

    // "<gene>-<homologene>" ids are scored through their homologene group.
    Matcher homoMatcher = Pattern.compile("[0-9]+\\-([0-9]+)").matcher(geneid);
    if (homoMatcher.find()) {
        geneid = "Homo:" + homoMatcher.group(1);
    } else {
        geneid = "Gene:" + geneid;
    }

    if (!GNormPlus.GeneScoring_hash.containsKey(geneid)) {
        return 0.0;
    }

    // Lexicon record, tab separated: "tkn-freq,tkn-freq,..." then numeric columns.
    // Column 2 is used as the total token count; the remaining numeric columns are not used here.
    String l[] = GNormPlus.GeneScoring_hash.get(geneid).split("\t");
    HashMap<String, Double> TermFrequency = new HashMap<String, Double>();
    for (String tokenAndFreq : l[0].split(",")) {
        String Tkn_Freq[] = tokenAndFreq.split("-");
        TermFrequency.put(Tkn_Freq[0], Double.parseDouble(Tkn_Freq[1]));
    }
    double AllTknNum = Double.parseDouble(l[2]);

    // Lexicon tokens that also appear in at least one document mention.
    HashMap<String, Double> TF = new HashMap<String, Double>();
    for (String Mention : Mention_hash.keySet()) {
        for (String mentionToken : tokenizeForScoring(Mention).split(" ")) {
            if (TermFrequency.containsKey(mentionToken)) {
                TF.put(mentionToken, TermFrequency.get(mentionToken));
            }
        }
    }

    double score = 0.0;
    for (String Tkn : TF.keySet()) {
        for (int t = 0; t < LF_tkn.length; t++) {
            if (LF_tkn[t].equals(Tkn)) {
                LF_ParticalMatch++;
            }
        }
        double TFij = TF.get(Tkn) / AllTknNum;
        double IDFi = GNormPlus.GeneScoringDF_hash.get(Tkn);
        // NOTE(review): when TFij is exactly 1 the factor below is 1/0.0, i.e. Infinity.
        score = score + TFij * IDFi * (1 / (1 - TFij));
    }
    if (LF_ParticalMatch > 0) {
        score = score + LF_ParticalMatch;
    }
    return score;
}

/**
 * Lower-cases the text, separates digits from letters and replaces every non-word character,
 * hyphen and underscore by a single blank, so that the result can be split on {@code " "}.
 */
private static String tokenizeForScoring(String text) {
    String normalized = text.toLowerCase();
    normalized = normalized.replaceAll("([0-9])([a-z])", "$1 $2");
    normalized = normalized.replaceAll("([a-z])([0-9])", "$1 $2");
    normalized = normalized.replaceAll("([\\W\\-\\_])", " ");
    normalized = normalized.replaceAll("[ ]+", " ");
    return normalized;
}
/**
 * Adds spelling variants to every annotation of type {@code Gene} in the BioC document.
 *
 * <p>An annotation is a tab separated string {@code start, last, mentions, type[, id]} where
 * {@code mentions} holds {@code |}-separated surface forms. For each surface form present on entry
 * the following variants are appended to {@code mentions} when the form matches:
 * trailing {@code p} removed, trailing {@code nu} removed, {@code alpha -> a}, {@code beta -> b},
 * trailing {@code a -> alpha} / {@code b -> beta} after a digit, and {@code II -> 2} /
 * {@code III -> 3} before a final lower-case letter. A changed annotation is written back with
 * five columns; the id column is empty when the annotation had none.
 *
 * @param Filename not used by this method
 * @param FilenameBioC not used by this method
 */
public void PreProcessing4GN(String Filename, String FilenameBioC) throws IOException, XMLStreamException {
    // Compiled once instead of once per mention inside the innermost loop.
    final Pattern trailingP = Pattern.compile("^(.*[0-9A-Z])[ ]*p$");
    final Pattern trailingNu = Pattern.compile("^(.+)nu$");
    final Pattern alphaWord = Pattern.compile("^(.*)alpha(.*)$");
    final Pattern betaWord = Pattern.compile("^(.*)beta(.*)$");
    final Pattern trailingA = Pattern.compile("^(.+[0-9])a$");
    final Pattern trailingB = Pattern.compile("^(.+[0-9])b$");
    final Pattern romanTwo = Pattern.compile("^(.+)II([a-z])$");
    final Pattern romanThree = Pattern.compile("^(.+)III([a-z])$");

    for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++) {
        for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) {
            List<String> paragraphAnnotations = data.getBioCDocobj().Annotations.get(i).get(j);
            for (int k = 0; k < paragraphAnnotations.size(); k++) {
                String anno[] = paragraphAnnotations.get(k).split("\t");
                String start = anno[0];
                String last = anno[1];
                String mentions = anno[2];
                String type = anno[3];
                String id = anno.length >= 5 ? anno[4] : "";
                if (!type.equals("Gene")) {
                    continue;
                }

                // Only the surface forms present on entry are inspected; variants are appended to 'mentions'.
                boolean update = false;
                for (String surfaceForm : mentions.split("\\|")) {
                    Matcher m = trailingP.matcher(surfaceForm);
                    if (m.find()) {
                        mentions = mentions + "|" + m.group(1);
                        update = true;
                    }
                    m = trailingNu.matcher(surfaceForm);
                    if (m.find()) {
                        mentions = mentions + "|" + m.group(1);
                        update = true;
                    }
                    m = alphaWord.matcher(surfaceForm);
                    if (m.find()) {
                        mentions = mentions + "|" + m.group(1) + "a" + m.group(2);
                        update = true;
                    }
                    m = betaWord.matcher(surfaceForm);
                    if (m.find()) {
                        mentions = mentions + "|" + m.group(1) + "b" + m.group(2);
                        update = true;
                    }
                    m = trailingA.matcher(surfaceForm);
                    if (m.find()) {
                        mentions = mentions + "|" + m.group(1) + "alpha";
                        update = true;
                    }
                    m = trailingB.matcher(surfaceForm);
                    if (m.find()) {
                        mentions = mentions + "|" + m.group(1) + "beta";
                        update = true;
                    }
                    m = romanTwo.matcher(surfaceForm);
                    if (m.find()) {
                        mentions = mentions + "|" + m.group(1) + "2" + m.group(2);
                        update = true;
                    }
                    m = romanThree.matcher(surfaceForm);
                    if (m.find()) {
                        mentions = mentions + "|" + m.group(1) + "3" + m.group(2);
                        update = true;
                    }
                }
                if (update) {
                    paragraphAnnotations.set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + id);
                }
            }
        }
    }
}
/**
 * Searches every passage of every document for chromosome locations and records the gene ids
 * attached to them.
 *
 * <p>Each hit returned by {@code GNormPlus.PT_GeneChromosome.SearchMentionLocation} is a tab
 * separated string whose fourth column holds ids separated by {@code |} or {@code ,}. For every
 * such id the key {@code pmid + "\t" + id} is stored (with an empty value) in
 * {@code data.getPmid2ChromosomeGene_hash()}.
 *
 * @param Filename not used by this method
 * @param FilenameBioC not used by this method
 */
public void ChromosomeRecognition(String Filename, String FilenameBioC) throws IOException, XMLStreamException {
    for (int i = 0; i < data.getBioCDocobj().PMIDs.size(); i++) {
        String Pmid = data.getBioCDocobj().PMIDs.get(i);
        for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++) {
            String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j);
            List<String> locations = GNormPlus.PT_GeneChromosome.SearchMentionLocation(PassageContext, "ChromosomeLocation");
            for (String location : locations) {
                // Columns: start, last, mention, ids
                String ids = location.split("\t")[3];
                for (String id : ids.split("[\\|,]")) {
                    data.getPmid2ChromosomeGene_hash().put(Pmid + "\t" + id, "");
                }
            }
        }
    }
}
public void GeneNormalization(String Filename, String FilenameBioC, boolean GeneIDMatch) throws IOException, XMLStreamException {
final DecimalFormat df = new DecimalFormat("0.####");
df.setRoundingMode(RoundingMode.HALF_UP);
//Tokenization
for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++) /** PMIDs : i */ {
String Pmid = data.getBioCDocobj().PMIDs.get(i);
/** Species */
HashMap Species_hash = new HashMap();
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) /** Paragraphs : j */ {
for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) /** Annotation : k */ {
String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
String mentions = anno[2];
String type = anno[3];
if (type.matches("(Species|Genus|Strain|CellLine|Cell)")) {
Species_hash.put(mentions, "");
}
}
}
/*
* Collect Gene mentions :
*
* GeneMention-taxid -> "ID" : geneid
* -> "type" : "Gene"
* -> start1-last1 : ""
* -> start2-last2 : ""
* -> start3-last3 : ""
*/
String tiabs = "";
for (int j = 0; j < data.getBioCDocobj().PassageContexts.get(i).size(); j++) /** Paragraphs : j */ {
tiabs = tiabs + data.getBioCDocobj().PassageContexts.get(i).get(j).toLowerCase();
}
HashMap> GeneMention_hash = new HashMap>();
HashMap Mention_hash = new HashMap();
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) /** Paragraphs : j */ {
for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) /** Annotation : k */ {
String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
String start = anno[0];
String last = anno[1];
String mentions = anno[2];
String type = anno[3];
String taxids = "Tax:9606";
if (anno.length >= 5) {
taxids = anno[4];
}
String mentions_tmp = mentions.toLowerCase();
mentions_tmp = mentions_tmp.replaceAll("[\\W\\-\\_]", "");
mentions_tmp = mentions_tmp.replaceAll("[0-9]", "0");
taxids = taxids.replaceAll("(Focus|Right|Left|Prefix|Tax):", "");
if (taxids.equals("")) {
taxids = "9606";
}
/** Filtering */
boolean found_filter = false;
if (data.getFiltering_hash().containsKey(mentions_tmp)) // filtering
{
found_filter = true;
}
if (found_filter == false) //abbreviation
{
for (String f : GNormPlus.Filtering_WithLongForm_hash.keySet()) {
if (data.getBioCDocobj().Annotations.get(i).get(j).get(k).matches(".*[\\t\\|]" + f + "\tGene.*") ||
data.getBioCDocobj().Annotations.get(i).get(j).get(k).matches(".*\\t" + f + "\\|[^\t]+\tGene.*")
) {
String lf = GNormPlus.Filtering_WithLongForm_hash.get(f);
if (tiabs.matches(".*" + lf + ".*")) {
found_filter = true;
break;
}
}
}
}
if (found_filter == false) {
if (data.getBioCDocobj().Annotations.get(i).get(j).get(k).matches(".*[\\t\\|][a-z]\tGene.*") ||
data.getBioCDocobj().Annotations.get(i).get(j).get(k).matches(".*\\t[a-z]\\|[^\t]+\tGene.*") //32171191 Wuhan's
) {
found_filter = true;
}
}
if (found_filter == false) {
if (type.matches("Gene")) {
if (GeneMention_hash.containsKey(mentions + "\t" + taxids)) {
GeneMention_hash.get(mentions + "\t" + taxids).put(start + "\t" + last, "");
} else {
HashMap offset_hash = new HashMap();
offset_hash.put(start + "\t" + last, "");
GeneMention_hash.put(mentions + "\t" + taxids, offset_hash);
GeneMention_hash.get(mentions + "\t" + taxids).put("type", type);
Mention_hash.put(mentions, "Gene");
}
} else if (type.matches("(FamilyName|DomainMotif)")) {
String GMs[] = mentions.split("\\|");
for (int g = 0; g < GMs.length; g++) {
String mention = GMs[g];
Mention_hash.put(mention, "FamilyDomain");
}
}
}
}
}
/*
* Gene id refinement:
* 1. Official name
* 2. only one gene
*/
HashMap GuaranteedGene2ID = new HashMap();
HashMap MultiGene2ID = new HashMap();
for (String GeneMentionTax : GeneMention_hash.keySet()) {
String GT[] = GeneMentionTax.split("\\t");
String mentions = GT[0];
String taxids = GT[1];
String GMs[] = mentions.split("\\|");
HashMap taxids_hash = new HashMap();
String taxids_arr[] = taxids.split(",");
for (int t = 0; t < taxids_arr.length; t++) {
taxids_hash.put(taxids_arr[t], "");
}
for (int ms = 0; ms < GMs.length; ms++) {
String mention = GMs[ms];
String IDstr = GNormPlus.PT_Gene.MentionMatch(mention); /** searched by PT_Gene */
String IDs[] = IDstr.split("\\|");
/*
* printing the ambiguous gene mentions and candidates
*/
//String IDs_s[]=IDstr.split(",");
//if(IDs_s.length>1)
//{
// System.out.println(Pmid+"\t"+mention+"\t"+mentions+"\t"+IDstr);
//}
for (int c = 0; c < IDs.length; c++) {
String tax2ID[] = IDs[c].split(":"); // tax2ID[0] = taxid ; tax2ID[1] = geneids
if (taxids_hash.containsKey(tax2ID[0])) {
String geneid = tax2ID[1];
String TargetTax = tax2ID[0];
GeneMention_hash.get(GeneMentionTax).put("ID", geneid);
GeneMention_hash.get(GeneMentionTax).put("TargetTax", TargetTax);
break;
}
}
//geneid refinement
if (GeneMention_hash.get(GeneMentionTax).containsKey("ID")) {
Pattern ptmp = Pattern.compile("\\*([0-9]+(\\-[0-9]+|))");
Matcher mtmp = ptmp.matcher(GeneMention_hash.get(GeneMentionTax).get("ID"));
if (mtmp.find()) // 1. Official Name
{
GeneMention_hash.get(GeneMentionTax).put("ID", mtmp.group(1));
GuaranteedGene2ID.put(GeneMentionTax, mtmp.group(1));
} else if (GeneMention_hash.get(GeneMentionTax).get("ID").matches("[0-9]+(\\-[0-9]+|)")) // 2. only one gene
{
GuaranteedGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
} else {
String ID[] = GeneMention_hash.get(GeneMentionTax).get("ID").split(",");
boolean FoundByChroLoca = false;
for (int idcount = 0; idcount < ID.length; idcount++) {
if (data.getPmid2ChromosomeGene_hash().containsKey(Pmid + "\t" + ID[idcount])) // 3. Chromosome location
{
GuaranteedGene2ID.put(GeneMentionTax, ID[idcount]);
FoundByChroLoca = true;
break;
}
}
if (FoundByChroLoca == false) {
MultiGene2ID.put(GeneMentionTax, GeneMention_hash.get(GeneMentionTax).get("ID"));
}
}
}
if (GNormPlus.suffixprefix_orig2modified.containsKey(mention) && (!IDstr.equals("-1")) && (!IDstr.equals("-2")) && (!IDstr.equals("-3"))) {
break;
}
}
}
/*
* Gene id refinement:
* 3. multiple genes but can be inferred by 1. and 2.
*/
for (String GeneMentionTax_M : MultiGene2ID.keySet()) {
for (String GeneMentionTax_G : GuaranteedGene2ID.keySet()) {
String MG[] = MultiGene2ID.get(GeneMentionTax_M).split(",");
for (int m = 0; m < MG.length; m++) {
if (MG[m].equals(GuaranteedGene2ID.get(GeneMentionTax_G))) {
GeneMention_hash.get(GeneMentionTax_M).put("ID", MG[m]);
}
}
}
}
/*
* Gene id refinement:
* 4. FullName -> Abbreviation
*/
for (String GeneMentionTax : GeneMention_hash.keySet()) {
String MT[] = GeneMentionTax.split("\\t");
if (data.getPmidLF2Abb_hash().containsKey(Pmid + "\t" + MT[0])) {
String GeneMentionTax_Abb = data.getPmidLF2Abb_hash().get(Pmid + "\t" + MT[0]) + "\t" + MT[1];
if (GeneMention_hash.containsKey(GeneMentionTax_Abb) && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) {
GeneMention_hash.get(GeneMentionTax_Abb).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
}
}
}
/*
* Gene id refinement:
* 5. Ranking by scoring function (inference network)
*/
for (String GeneMentionTax : GeneMention_hash.keySet()) {
if (GeneMention_hash.get(GeneMentionTax).containsKey("ID") && GeneMention_hash.get(GeneMentionTax).get("ID").matches(".+,.+")) {
String geneids = GeneMention_hash.get(GeneMentionTax).get("ID");
String geneid[] = geneids.split(",");
String OutputStyle = "Top1";
if (OutputStyle.equals("Top1")) {
//only return the best one
double max_score = 0.0;
String target_geneid = "";
for (int g = 0; g < geneid.length; g++) {
String MT[] = GeneMentionTax.split("\\t");
String LF = "";
if (data.getPmidAbb2LF_hash().containsKey(Pmid + "\t" + MT[0])) {
LF = data.getPmidAbb2LF_hash().get(Pmid + "\t" + MT[0]);
}
double score = ScoringFunction(geneid[g], Mention_hash, LF);
if (score > max_score) {
max_score = score;
target_geneid = geneid[g];
} else if (score == 0.0) {
//System.out.println(GeneMentionTax);
}
}
GeneMention_hash.get(GeneMentionTax).put("ID", target_geneid);
} else // "All"
{
//return all geneids
String geneSTR = "";
for (int g = 0; g < geneid.length; g++) {
String MT[] = GeneMentionTax.split("\\t");
String LF = "";
if (data.getPmidAbb2LF_hash().containsKey(Pmid + "\t" + MT[0])) {
LF = data.getPmidAbb2LF_hash().get(Pmid + "\t" + MT[0]);
}
double score = ScoringFunction(geneid[g], Mention_hash, LF);
String hoge = df.format(score);
score = Double.parseDouble(hoge);
if (geneSTR.equals("")) {
geneSTR = geneid[g] + "-" + score;
} else {
geneSTR = geneSTR + "," + geneid[g] + "-" + score;
}
}
GeneMention_hash.get(GeneMentionTax).put("ID", geneSTR);
}
}
}
/*
* Gene id refinement: - removed (Reason: cause too much False Positive)
* 6. Abbreviation -> FullName
*
*/
for (String GeneMentionTax : GeneMention_hash.keySet()) {
String MT[] = GeneMentionTax.split("\\t");
if (data.getPmidAbb2LF_hash().containsKey(Pmid + "\t" + MT[0])) {
String GeneMentionTax_LF = data.getPmidAbb2LF_hash().get(Pmid + "\t" + MT[0]) + "\t" + MT[1];
if (GeneMention_hash.containsKey(GeneMentionTax_LF) && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) {
GeneMention_hash.get(GeneMentionTax_LF).put("ID", GeneMention_hash.get(GeneMentionTax).get("ID"));
}
}
}
/*
* Gene id refinement:
* 7. The inference network tokens of Abbreviation.ID should contain at least LF tokens
* 8. The short mention should be filtered if not long form support
*/
ArrayList removeGMT = new ArrayList();
for (String GeneMentionTax : GeneMention_hash.keySet()) {
String GT[] = GeneMentionTax.split("\\t");
String mentions = GT[0];
String tax = GT[1];
if (GeneMention_hash.get(GeneMentionTax).containsKey("type") && GeneMention_hash.get(GeneMentionTax).get("type").equals("Gene") && GeneMention_hash.get(GeneMentionTax).containsKey("ID")) {
String type = GeneMention_hash.get(GeneMentionTax).get("type");
String id = GeneMention_hash.get(GeneMentionTax).get("ID");
String geneid = "";
Pattern ptmp1 = Pattern.compile("^([0-9]+)\\-([0-9]+)$");
Pattern ptmp2 = Pattern.compile("^([0-9]+)$");
Matcher mtmp1 = ptmp1.matcher(id);
Matcher mtmp2 = ptmp2.matcher(id);
//System.out.println(id);
if (mtmp1.find()) {
geneid = "Homo:" + mtmp1.group(2);
} else if (mtmp2.find()) {
geneid = "Gene:" + mtmp2.group(1);
}
boolean LongFormTknMatch = false;
boolean LongFormExist = true;
if (GNormPlus.GeneScoring_hash.containsKey(geneid)) {
if (data.getPmidAbb2LF_lc_hash().containsKey(Pmid + "\t" + mentions.toLowerCase())) {
/*
* token in lexicon : tkn_lexicon
* token in mention : tkn_mention
*/
String l[] = GNormPlus.GeneScoring_hash.get(geneid).split("\t"); // Gene:2664293 cmk-1,cytidylate-1,kinase-1,mssa-1 0.4096 4 0.0625 1 2.0
String tkns_Gene[] = l[0].split(",");
ArrayList tkn_lexicon = new ArrayList();
for (int ti = 0; ti < tkns_Gene.length; ti++) {
String Tkn_Freq[] = tkns_Gene[ti].split("-");
tkn_lexicon.add(Tkn_Freq[0]);
}
String LF_lc = data.getPmidAbb2LF_lc_hash().get(Pmid + "\t" + mentions.toLowerCase());
LF_lc = LF_lc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
LF_lc = LF_lc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
String tkn_mention[] = LF_lc.split("[\\W\\-\\_]");
for (int tl = 0; tl < tkn_lexicon.size(); tl++) {
for (int tm = 0; tm < tkn_mention.length; tm++) {
if (tkn_lexicon.get(tl).equals(tkn_mention[tm]) && (!tkn_mention[tm].matches("[0-9]+"))) {
LongFormTknMatch = true;
}
}
}
} else {
LongFormExist = false;
}
} else {
LongFormTknMatch = true;
} // exception
if (LongFormTknMatch == false && LongFormExist == true) // 7.
{
removeGMT.add(GeneMentionTax); //remove short form
removeGMT.add(data.getPmidAbb2LF_hash().get(Pmid + "\t" + mentions) + "\t" + tax); //remove long form
} else if (mentions.length() <= 2 && LongFormExist == false) // 8.
{
removeGMT.add(GeneMentionTax);
}
}
}
for (int gmti = 0; gmti < removeGMT.size(); gmti++) // remove
{
GeneMention_hash.remove(removeGMT.get(gmti));
}
// Append gene ids
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraphs : j
{
for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) // Annotation : k
{
String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
String start = anno[0];
String last = anno[1];
String mentions = anno[2];
String type = anno[3];
String taxid_org = "Tax:9606";
if (anno.length >= 5) {
taxid_org = anno[4];
}
String taxids = taxid_org.replaceAll("(Focus|Right|Left|Prefix|Tax):", "");
String GMs[] = mentions.split("\\|");
if (GeneMention_hash.containsKey(mentions + "\t" + taxids) && GeneMention_hash.get(mentions + "\t" + taxids).containsKey("TargetTax")) {
String taxtype = taxid_org.replaceAll(":([0-9,]+)", "");
String taxid = GeneMention_hash.get(mentions + "\t" + taxids).get("TargetTax");
data.getBioCDocobj().Annotations.get(i).get(j).set(k, start + "\t" + last + "\t" + mentions + "\t" + type + "\t" + taxtype + ":" + taxid);
}
if (type.equals("Gene")) {
data.getBioCDocobj().Annotations.get(i).get(j).set(k, data.getBioCDocobj().Annotations.get(i).get(j).get(k) + "|");
if (GeneMention_hash.containsKey(mentions + "\t" + taxids) && GeneMention_hash.get(mentions + "\t" + taxids).containsKey("ID")) {
data.getBioCDocobj().Annotations.get(i).get(j).set(k, data.getBioCDocobj().Annotations.get(i).get(j).get(k) + GeneMention_hash.get(mentions + "\t" + taxids).get("ID") + ",");
} else // cannot find appropriate species
{
//System.out.println(mention+"\t"+taxid);
}
data.getBioCDocobj().Annotations.get(i).get(j).set(k, data.getBioCDocobj().Annotations.get(i).get(j).get(k).substring(0, data.getBioCDocobj().Annotations.get(i).get(j).get(k).length() - 1)); // remove ",$"
}
}
}
//Extend to all gene mentions
HashMap GeneMentions = new HashMap(); // Extending Gene mentions
HashMap GeneMentionLocation = new HashMap(); // Extending Gene mentions
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
{
for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) // Annotation : k
{
String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
int start = Integer.parseInt(anno[0]);
int last = Integer.parseInt(anno[1]);
String mentions = anno[2];
String type = anno[3];
String id = "Tax:9606";
if (anno.length >= 5) {
id = anno[4];
}
if (type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)")) {
GeneMentions.put(mentions.toLowerCase(), id);
for (int s = start; s <= last; s++) {
GeneMentionLocation.put(j + "\t" + s, "");
}
} else if (type.equals("Gene") && id.matches("(Focus|Right|Left|Prefix|Tax)\\:([0-9]+)\\|([0-9]+)")) {
GeneMentions.put(mentions.toLowerCase(), id);
for (int s = start; s <= last; s++) {
GeneMentionLocation.put(j + "\t" + s, "");
}
}
}
}
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
{
if (data.getBioCDocobj().PassageContexts.size() > i && data.getBioCDocobj().PassageContexts.get(i).size() > j) {
String PassageContexts = " " + data.getBioCDocobj().PassageContexts.get(i).get(j) + " ";
String PassageContexts_tmp = PassageContexts.toLowerCase();
for (String gm : GeneMentions.keySet()) {
String id = GeneMentions.get(gm);
if (gm.length() >= 3) {
gm = gm.replaceAll("[ ]*[\\|]*$", "");
gm = gm.replaceAll("^[\\|]*[ ]*", "");
gm = gm.replaceAll("[\\|][\\|]+", "\\|");
if (!gm.matches("[\\W\\-\\_]*")) {
gm = gm.replaceAll("([^A-Za-z0-9\\| ])", "\\\\$1");
Pattern ptmp = Pattern.compile("^(.*[\\W\\-\\_])(" + gm + ")([\\W\\-\\_].*)$");
Matcher mtmp = ptmp.matcher(PassageContexts_tmp);
while (mtmp.find()) {
String pre = mtmp.group(1);
String gmtmp = mtmp.group(2);
String post = mtmp.group(3);
int start = pre.length() - 1;
int last = start + gmtmp.length();
if (PassageContexts.length() >= last + 1) {
String mention = PassageContexts.substring(start + 1, last + 1);
if (!GeneMentionLocation.containsKey(j + "\t" + start) && !GeneMentionLocation.containsKey(j + "\t" + last)) {
data.getBioCDocobj().Annotations.get(i).get(j).add(start + "\t" + last + "\t" + mention + "\tGene\t" + id);
}
}
gmtmp = gmtmp.replaceAll(".", "\\@");
PassageContexts_tmp = pre + "" + gmtmp + "" + post;
mtmp = ptmp.matcher(PassageContexts_tmp);
}
}
}
}
}
}
//Apply to FamilyNames
HashMap geneids = new HashMap(); // Extending Gene mentions
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
{
for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) // Annotation : k
{
String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
String type = anno[3];
if (type.equals("Gene")) {
String id = "Tax:9606";
if (anno.length >= 5) {
id = anno[4];
}
Pattern ptmp0 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)$");
Matcher mtmp0 = ptmp0.matcher(id);
Pattern ptmp1 = Pattern.compile("^(Focus|Right|Left|Prefix|GeneID|Tax)\\:([0-9]+)\\|([0-9]+)\\-([0-9]+)$");
Matcher mtmp1 = ptmp1.matcher(id);
if (mtmp0.find()) {
geneids.put(mtmp0.group(3), "");
}
if (mtmp1.find()) {
geneids.put(mtmp1.group(3), "");
}
}
}
}
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
{
for (int k = data.getBioCDocobj().Annotations.get(i).get(j).size() - 1; k >= 0; k--) // Annotation : k
{
String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
String mention = anno[2];
String type = anno[3];
if (type.matches("(FamilyName|DomainMotif)")) {
String id = "Tax:9606";
if (anno.length >= 5) {
id = anno[4];
}
String IDstrs = GNormPlus.PT_FamilyName.MentionMatch(mention);
String IDstr[] = IDstrs.split("\\|");
String ids = "";
for (int id_i = 0; id_i < IDstr.length; id_i++) {
if (geneids.containsKey(IDstr[id_i])) {
if (ids.equals("")) {
ids = IDstr[id_i];
} else {
ids = ids + ";" + IDstr[id_i];
}
}
}
if (!ids.equals("")) {
if (type.equals("FamilyName")) {
type = "Gene";
}
String Annotation_k = anno[0] + "\t" + anno[1] + "\t" + anno[2] + "\t" + type + "\tTax:9606";
if (anno.length >= 5) {
Annotation_k = anno[0] + "\t" + anno[1] + "\t" + anno[2] + "\t" + type + "\t" + anno[4];
}
data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotation_k + "|" + ids);
} else {
// Added by Erik Faessler, JULIE Lab: Output FamilyNames but suppress output of empty annotations
if (type.equals("DomainMotif") || Integer.parseInt(anno[1]) - Integer.parseInt(anno[0]) <= 0)
data.getBioCDocobj().Annotations.get(i).get(j).remove(k);
}
}
}
}
//Species "*" and "(anti)" removed.
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
{
for (int k = data.getBioCDocobj().Annotations.get(i).get(j).size() - 1; k >= 0; k--) // Annotation : k
{
String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
String type = anno[3];
if (type.equals("Species") || type.equals("Genus") || type.equals("Strain") || type.equals("CellLine") || type.equals("Cell")) {
String id = anno[4];
id = id.replaceAll("\\*", "");
id = id.replaceAll("\\(anti\\)", "");
String Annotation_k = anno[0] + "\t" + anno[1] + "\t" + anno[2] + "\t" + type + "\t" + id;
data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotation_k);
}
}
}
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) // Paragraph
{
for (int k = data.getBioCDocobj().Annotations.get(i).get(j).size() - 1; k >= 0; k--) // Annotation : k
{
String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
int start = Integer.parseInt(anno[0]);
int last = Integer.parseInt(anno[1]);
String mention = anno[2];
String type = anno[3];
String id = anno[4];
if (type.equals("Gene") && Species_hash.containsKey(mention)) {
data.getBioCDocobj().Annotations.get(i).get(j).remove(k);
} else if (type.equals("Gene") && id.equals("")) {
data.getBioCDocobj().Annotations.get(i).get(j).remove(k);
} else {
for (int k1 = data.getBioCDocobj().Annotations.get(i).get(j).size() - 1; k1 >= 0; k1--) // Annotation : k
{
if (k1 != k) {
String anno1[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k1).split("\t");
int start1 = Integer.parseInt(anno1[0]);
int last1 = Integer.parseInt(anno1[1]);
if ((start1 < start && last1 >= last) || (start1 <= start && last1 > last)) {
data.getBioCDocobj().Annotations.get(i).get(j).remove(k);
break;
}
}
}
}
}
}
}
if (GeneIDMatch == true) {
//data.getBioCDocobj().BioCOutput(Filename,FilenameBioC,data.getBioCDocobj().Annotations,false,true);
} else {
data.getBioCDocobj().BioCOutput(Filename, FilenameBioC, data.getBioCDocobj().Annotations, true, true);
}
}
/*
* Search Potential GeneID in the Prefix Tree
*/
/**
 * Finds tokens that mix letters and digits and reports them as potential gene/accession ids.
 *
 * <p>Each result is the tab separated string {@code start, last, text, "GeneID"}, where
 * {@code start}/{@code last} are offsets into {@code Doc}. Tokens containing one of
 * {@code ' ; [ ] + * \} are skipped. For tokens longer than six characters that denote a numeric
 * range ({@code AB0008-0012} or {@code AB0008-AB0012}, at most six digits per number, span of 1 to
 * 20) every member of the range is reported in addition to the token itself; members shorter than
 * five characters are omitted. Matches are found from the end of the text towards the beginning.
 *
 * <p>Compared with the previous implementation each hit is reported once (it used to be repeated
 * once per character of the token), and expanded range members keep the digit width of the first
 * number ({@code AB0010}, not {@code AB00010}).
 *
 * @param Doc text to scan
 * @return the hits; empty when nothing matches
 */
public ArrayList<String> SearchGeneIDLocation(String Doc) {
    ArrayList<String> location = new ArrayList<String>();
    // Padded with blanks so a token at either end of the text still has a boundary character.
    String Doc_tmp = " " + Doc + " ";
    Pattern ptmp = Pattern.compile("^(.*[^A-Za-z0-9]+)([0-9]+\\S*[A-Za-z]+|[A-Za-z]+\\S*[0-9]+|[0-9]+\\S*[A-Za-z]+\\S*[0-9]+|[A-Za-z]+\\S*[0-9]+\\S*[A-Za-z]+)([^A-Za-z0-9]+.*)$");
    Pattern numericRange = Pattern.compile("^(.+[^0-9])([0-9]+)\\-([0-9]+)$");
    Pattern repeatedStemRange = Pattern.compile("^(.+[^0-9])([0-9]+)\\-(.+[^0-9])([0-9]+)$");

    Matcher mtmp = ptmp.matcher(Doc_tmp);
    while (mtmp.find()) {
        String str1 = mtmp.group(1);
        String str2 = mtmp.group(2);
        String str3 = mtmp.group(3);

        int start = str1.length() - 1; // -1 compensates for the leading blank
        int last = start + str2.length();
        String mention = Doc.substring(start, last);
        if (!mention.matches(".*[\\'\\;\\[\\]\\+\\*\\\\].*")) {
            if (last - start > 6 && (mention.matches(".*\\(.*\\).*") || mention.matches("[^\\(\\)]+"))) {
                Matcher mtmp1 = numericRange.matcher(mention);
                Matcher mtmp2 = repeatedStemRange.matcher(mention);
                if (mtmp1.find()) {
                    if (mtmp1.group(2).length() <= 6 && mtmp1.group(3).length() <= 6) {
                        appendExpandedIdRange(location, start, last, mtmp1.group(1), mtmp1.group(2), mtmp1.group(3));
                    }
                } else if (mtmp2.find()) {
                    if (mtmp2.group(2).length() <= 6 && mtmp2.group(4).length() <= 6
                            && mtmp2.group(1).equals(mtmp2.group(3))) {
                        appendExpandedIdRange(location, start, last, mtmp2.group(1), mtmp2.group(2), mtmp2.group(4));
                    }
                }
            }
            location.add(start + "\t" + last + "\t" + mention + "\tGeneID");
        }

        // Mask the token so the next search finds an earlier one.
        StringBuilder masked = new StringBuilder(str1);
        for (int m = 0; m < str2.length(); m++) {
            masked.append('@');
        }
        Doc_tmp = masked.append(str3).toString();
        mtmp = ptmp.matcher(Doc_tmp);
    }
    return location;
}

/**
 * Adds one hit per member of the range {@code stem+firstNumber .. stem+lastNumber} when the range
 * spans 1 to 20 numbers; numbers are zero-padded to the width of {@code firstNumber}.
 */
private static void appendExpandedIdRange(List<String> out, int start, int last, String stem, String firstNumber, String lastNumber) {
    int from = Integer.parseInt(firstNumber);
    int to = Integer.parseInt(lastNumber);
    if (to - from <= 0 || to - from > 20) {
        return;
    }
    int width = firstNumber.length();
    for (int n = from; n <= to; n++) {
        String digits = Integer.toString(n);
        StringBuilder id = new StringBuilder(stem);
        for (int pad = digits.length(); pad < width; pad++) {
            id.append('0');
        }
        id.append(digits);
        if (id.length() >= 5) {
            out.add(start + "\t" + last + "\t" + id + "\tGeneID");
        }
    }
}
/**
 * Adds {@code GeneID} annotations for identifier-like tokens that are known to
 * {@code GNormPlus.GeneIDs_hash}, then writes the BioC output.
 *
 * <p>Every passage is scanned with {@link #SearchGeneIDLocation}. The text of each hit is
 * lower-cased and stripped of non-word characters, hyphens and underscores before the lookup; on
 * success the hit plus {@code "\tGeneID:" + <looked-up value>} is appended to the annotations of
 * that passage.
 *
 * @param Filename passed to {@code BioCOutput}
 * @param FilenameBioC passed to {@code BioCOutput}
 */
public void GeneIDRecognition(String Filename, String FilenameBioC) throws IOException, XMLStreamException {
    for (int i = 0; i < data.getBioCDocobj().PMIDs.size(); i++) {
        for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++) {
            String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j);
            /** GeneID recognition by pattern match */
            List<String> locations = SearchGeneIDLocation(PassageContext);
            for (String location : locations) {
                // Columns: start, last, mention, "GeneID"
                String mention = location.split("\t")[2].toLowerCase();
                mention = mention.replaceAll("[\\W\\-\\_]+", "");
                if (GNormPlus.GeneIDs_hash.containsKey(mention)) {
                    data.getBioCDocobj().Annotations.get(i).get(j).add(location + "\tGeneID:" + GNormPlus.GeneIDs_hash.get(mention));
                }
            }
        }
    }
    data.getBioCDocobj().BioCOutput(Filename, FilenameBioC, data.getBioCDocobj().Annotations, true, true);
}
}
// (c) 2015 - 2025 Weber Informatics LLC | Privacy Policy (footer left over from the web page this source was copied from)