// GNormPluslib.SimConcept Maven / Gradle / Ivy
/**
* Project: GNormPlus
* Function: SimConcept : Simplify Composite mentions
*/
package GNormPluslib;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.BreakIterator;
import java.time.LocalDate;
import java.time.ZoneId;
import java.text.DecimalFormat;
import java.math.RoundingMode;
import javax.xml.stream.XMLStreamException;
import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.englishStemmer;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
public class SimConcept
{
/** Processing state the methods of this class read documents, annotations and abbreviation pairs from. */
private GNPProcessingData data;

/**
 * Creates a SimConcept bound to the given processing state.
 *
 * @param data processing state used by this instance; the reference is stored as-is
 *             (no copy is made and no null check is performed)
 */
public SimConcept(GNPProcessingData data)
{
    this.data = data;
}
/*
* Feature Extraction
*/
/**
 * Writes CRF training rows to {@code FilenameData} (UTF-8): for annotations that carry
 * a fifth tab-separated field (the ID / state string), the mention is tokenised and one
 * space-separated feature row is written per token, ending with the token's status
 * character taken from that ID string.
 *
 * NOTE(review): this listing has lost text in several places (apparently everything
 * between a '<' and a later '>' was stripped when the source was extracted). The affected
 * lines are marked below; the method cannot compile as shown and must be restored from
 * the upstream source before any behavioural change is attempted.
 *
 * @param FilenameData path of the feature file to create
 * @throws XMLStreamException declared by the signature; no visible statement throws it
 */
public void FeatureExtraction_Train(String FilenameData) throws XMLStreamException
{
try
{
/** output files */
// NOTE(review): the writer is only closed on the success path at the end of the try block.
BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data
//NLP modules
// presumably produces the "stem" column written below; its use falls in a truncated part of the listing
SnowballStemmer stemmer = new englishStemmer();
/** PMIDs : i */
for (int i = 0; i < data.getBioCDocobj().PMIDs.size(); i++)
{
String Pmid = data.getBioCDocobj().PMIDs.get(i);
/** Paragraphs : j */
for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++)
{
ArrayList Annotation = data.getBioCDocobj().Annotations.get(i).get(j);
/** Annotations : k
* 0 start
* 1 last
* 2 mention
* 3 type
* 4 id
*/
// Size before the loop starts; the loop below appends to this same list.
// Presumably used in the truncated condition further down to skip the appended entries -- confirm.
int Inital_Annotation_size=Annotation.size();
for (int k = 0; k < Annotation.size() ; k++) // k : Annotations
{
String anno[]=Annotation.get(k).split("\\t",-1);
int MentionStart= Integer.parseInt(anno[0]);
int MentionLast= Integer.parseInt(anno[1]);
String Mention = anno[2];
String Type = anno[3];
// Only annotations that carry an ID field are processed.
if(anno.length>4)
{
String ID = anno[4];
String TokenSTR = Mention;
// Tokenise: insert a space at digit/letter and upper/lower-case boundaries and around
// every non-word character, then collapse runs of spaces and trim both ends.
TokenSTR = TokenSTR.replaceAll("([0-9])([A-Za-z])", "$1 $2");
TokenSTR = TokenSTR.replaceAll("([A-Za-z])([0-9])", "$1 $2");
TokenSTR = TokenSTR.replaceAll("([A-Z])([a-z])", "$1 $2");
TokenSTR = TokenSTR.replaceAll("([a-z])([A-Z])", "$1 $2");
TokenSTR = TokenSTR.replaceAll("([\\W])", " $1 ");
TokenSTR = TokenSTR.replaceAll("[ ]+", " ");
TokenSTR = TokenSTR.replaceAll("^[ ]+", "");
TokenSTR = TokenSTR.replaceAll("[ ]+$", "");
/*
* Only for Gene
*/
// NOTE(review): the next line is truncated in this listing (the condition is incomplete and
// the declarations of t1, t2, t5 and tmp_ment are missing). Restore from the upstream source.
if(ID.equals("ASJAS") && kInteger.parseInt(t2))
{
// Appends synthetic variants of the mention (range and list phrasings built from t1, t2, t5),
// each with its own state string, to the annotation list that is being iterated.
tmp_ment=t1+" "+t2+" to "+t5;
Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNS");
tmp_ment=t1+" "+t2+" to -"+t5;
Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNOS");
tmp_ment=t1+" -"+t2+" to -"+t5;
Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tAASNOS");
tmp_ment=t1+" "+t2+" to "+t1+" "+t5;
Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNAS");
tmp_ment=t1+" "+t2+"-"+t5;
Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNS");
tmp_ment=t1+" "+t2+", "+t5+", and "+(Integer.parseInt(t5)+2);
Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASCSCCS");
tmp_ment=t1+" -"+t2+", -"+t5+", and -"+(Integer.parseInt(t5)+2);
Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tAASC0SCC0S");
}
}
}
// NOTE(review): the brace structure around this point does not match the loops above; text was lost here.
String Mention_tmp = Mention;
String tokens[]=TokenSTR.split(" ",-1);
//For Repeat
HashMap Token2Num = new HashMap ();
// NOTE(review): the next line is truncated (the loop that fills Token2Num and the declaration
// of AbbLFStatus_hash were merged into one line). Restore from the upstream source.
for(int p=0;p AbbLFStatus_hash = new HashMap ();
// For every long-form/abbreviation pair whose key starts with this PMID, look for
// "LF ( Abb )" or "Abb ( LF )" inside the tokenised mention.
for(String Pmid_LF : data.getPmidLF2Abb_hash().keySet())
{
// Key layout: PMID <tab> long form; the map value is the abbreviation.
String pf[] = Pmid_LF.split("\\t",-1);
if(pf[0].equals(Pmid))
{
String Abb = data.getPmidLF2Abb_hash().get(Pmid_LF);
String LF = pf[1];
// Same tokenisation as applied to the mention above.
// NOTE(review): unlike LF, Abb is not stripped of trailing spaces -- confirm whether intended.
Abb = Abb.replaceAll("([0-9])([A-Za-z])", "$1 $2");
Abb = Abb.replaceAll("([A-Za-z])([0-9])", "$1 $2");
Abb = Abb.replaceAll("([A-Z])([a-z])", "$1 $2");
Abb = Abb.replaceAll("([a-z])([A-Z])", "$1 $2");
Abb = Abb.replaceAll("([\\W])", " $1 ");
Abb = Abb.replaceAll("[ ]+", " ");
Abb = Abb.replaceAll("^[ ]+", "");
LF = LF.replaceAll("([0-9])([A-Za-z])", "$1 $2");
LF = LF.replaceAll("([A-Za-z])([0-9])", "$1 $2");
LF = LF.replaceAll("([A-Z])([a-z])", "$1 $2");
LF = LF.replaceAll("([a-z])([A-Z])", "$1 $2");
LF = LF.replaceAll("([\\W])", " $1 ");
LF = LF.replaceAll("[ ]+", " ");
LF = LF.replaceAll("^[ ]+", "");
LF = LF.replaceAll("[ ]+$", "");
// Backslash-escape every character other than letters, digits, '@' and space so the
// strings can be embedded literally in the regular expressions built below.
Abb=Abb.replaceAll("([^A-Za-z0-9@ ])","\\\\$1");
LF=LF.replaceAll("([^A-Za-z0-9@ ])","\\\\$1");
Abb=Abb.toLowerCase();
LF=LF.toLowerCase();
// ptmp1: long form followed by the abbreviation in parentheses; ptmp2: the reverse order.
Pattern ptmp1 = Pattern.compile("(.*)("+LF+")([ ]*\\([ ]*)("+Abb+")[ ]*\\).*");
Matcher mtmp1 = ptmp1.matcher(TokenSTR.toLowerCase());
Pattern ptmp2 = Pattern.compile("(.*)("+Abb+")([ ]*\\([ ]*)("+LF+")[ ]*\\).*");
Matcher mtmp2 = ptmp2.matcher(TokenSTR.toLowerCase());
// Character offsets of the long form and the abbreviation within the lower-cased TokenSTR.
int start_LF=0;
int last_LF=0;
int start_Abb=0;
int last_Abb=0;
if(mtmp1.find())
{
start_LF = mtmp1.group(1).length();
last_LF = start_LF+mtmp1.group(2).length();
start_Abb = last_LF+mtmp1.group(3).length();
last_Abb = start_Abb+mtmp1.group(4).length();
}
else if(mtmp2.find())
{
// NOTE(review): in this branch last_Abb is derived from start_LF and start_LF from last_LF,
// both of which are still 0 here, and last_LF is derived from start_Abb. The LF/Abb
// variables look swapped compared with the branch above -- confirm against upstream.
start_Abb = mtmp2.group(1).length();
last_Abb = start_LF+mtmp2.group(2).length();
start_LF = last_LF+mtmp2.group(3).length();
last_LF = start_Abb+mtmp2.group(4).length();
}
// NOTE(review): the next line is truncated (the loops that fill AbbLFStatus_hash, the start of
// the per-token loop over p and the declarations of Offset, WSB, WSF and stem are missing).
for(int l=start_LF;l0)
{
// Shape of the previous token: letter runs become "A", digit runs become "0".
String B=tokens[p-1];
B=B.replaceAll("[A-Za-z]+", "A");
B=B.replaceAll("[0-9]+", "0");
WSB="WSB:"+B;
}
// NOTE(review): the next line is truncated (the following-token feature and the digit count
// that sets tmp and Num_num are incomplete).
if(p3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();}
// The count features below are capped: more than three characters is reported as "4+".
//Number of Uppercase [A-Z]
String Num_Uc="";
tmp=tokens[p];
tmp=tmp.replaceAll("[^A-Z]","");
if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();}
//Number of Lowercase [a-z]
String Num_lc="";
tmp=tokens[p];
tmp=tmp.replaceAll("[^a-z]","");
if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();}
//Number of ALL char
String Num_All="";
if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();}
//specific character (;:,.->+_)
String SpecificC="__nil__";
if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_"))
{
SpecificC="-SpecificC1-";
}
else if(tokens[p].equals("(") || tokens[p].equals(")"))
{
SpecificC="-SpecificC2-";
}
else if(tokens[p].equals("{") || tokens[p].equals("}"))
{
SpecificC="-SpecificC3-";
}
else if(tokens[p].equals("[") || tokens[p].equals("]"))
{
SpecificC="-SpecificC4-";
}
else if(tokens[p].equals("\\") || tokens[p].equals("/"))
{
SpecificC="-SpecificC5-";
}
//Chemical Prefix/Suffix
// String.matches requires the whole token to match, so these test a token ending/starting with the listed affixes.
String ChemPreSuf="__nil__";
if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";}
else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";}
else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";}
else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";}
else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";}
//MentionType
String MentionType="__nil__";
if(GNormPlus.SimConceptMention2Type_hash.containsKey(tokens[p]))
{
MentionType = "-"+GNormPlus.SimConceptMention2Type_hash.get(tokens[p])+"-";
}
//Protein symbols
// NOTE(review): the three branches below assign ChemPreSuf, not ProteinSym, so ProteinSym is always
// written as "__nil__" and a chemical affix value may be overwritten. This looks unintended, but the
// written columns are presumably what the CRF template/model were built on -- confirm before changing.
String ProteinSym="__nil__";
if(tokens[p].matches(".*(glutamine|glutamic|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic|asparticacid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine|stop|frameshift).*")){ChemPreSuf="-ProteinSymFull-";}
else if(tokens[p].matches("(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr|fs|fsx)")){ChemPreSuf="-ProteinSymTri-";}
else if(tokens[p].matches("[CISQMNPKDTFAGHLRWVEYX]")){ChemPreSuf="-ProteinSymChar-";}
//Repeat
// Flags a token longer than one character whose Token2Num value exceeds 1, unless it consists only
// of non-word characters/underscores/digits or is one of the listed connective, Greek or Roman-numeral words.
// Token2Num presumably holds per-mention token counts (it is filled in a truncated part of the listing).
String Repeat="__nil__";
if(Token2Num.get(tokens[p])>1 && tokens[p].length()>1 && (!tokens[p].matches("([\\W\\-\\_0-9]+|and|or|alpha|beta|gamma|theta|zeta|delta|kappa|II|VI|IV|III)")))
{
Repeat="-Repeat-";
}
//Patterns
// Four character-class shapes of the token; each is "__nil__" when the token contains a non-word
// character, '-' or '_'. P1/P2 map character by character, P3/P4 collapse runs; P2/P4 ignore case.
String Pattern1 = tokens[p];
if(Pattern1.matches(".*[\\W\\-\\_].*"))
{
Pattern1="__nil__";
}
else
{
Pattern1=Pattern1.replaceAll("[A-Z]", "A");
Pattern1=Pattern1.replaceAll("[a-z]", "a");
Pattern1=Pattern1.replaceAll("[0-9]", "0");
Pattern1="P1:"+Pattern1;
}
String Pattern2 = tokens[p];
if(Pattern2.matches(".*[\\W\\-\\_].*"))
{
Pattern2="__nil__";
}
else
{
Pattern2=Pattern2.replaceAll("[A-Za-z]", "a");
Pattern2=Pattern2.replaceAll("[0-9]", "0");
Pattern2="P2:"+Pattern2;
}
String Pattern3 = tokens[p];
if(Pattern3.matches(".*[\\W\\-\\_].*"))
{
Pattern3="__nil__";
}
else
{
Pattern3=Pattern3.replaceAll("[A-Z]+", "A");
Pattern3=Pattern3.replaceAll("[a-z]+", "a");
Pattern3=Pattern3.replaceAll("[0-9]+", "0");
Pattern3="P3:"+Pattern3;
}
String Pattern4 = tokens[p];
if(Pattern4.matches(".*[\\W\\-\\_].*"))
{
Pattern4="__nil__";
}
else
{
Pattern4=Pattern4.replaceAll("[A-Za-z]+", "a");
Pattern4=Pattern4.replaceAll("[0-9]+", "0");
Pattern4="P4:"+Pattern4;
}
//prefix
// Five space-separated columns: the leading 1..5 characters, "__nil__" where the token is shorter.
String prefix="";
tmp=tokens[p];
if(tmp.length()>=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";}
if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";}
if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";}
if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";}
if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";}
//suffix
// Five space-separated columns: the trailing 1..5 characters, "__nil__" where the token is shorter.
String suffix="";
tmp=tokens[p];
if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";}
if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";}
if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";}
if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";}
if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";}
//Abbreviation & Long Form
// Looked up by the token's character offset within TokenSTR.
String AbbLF="__nil__";
if(AbbLFStatus_hash.containsKey(Offset))
{
AbbLF=AbbLFStatus_hash.get(Offset);
}
// Training label: the p-th character of the ID string. substring throws if ID has fewer than p+1 characters.
String Status = ID.substring(p, p+1);
// NOTE(review): Num_num is written twice in a row; possibly a placeholder for another column --
// confirm against the CRF template before changing the column layout.
FileData.write(tokens[p]+" "+WSB+" "+WSF+" "+stem
+" "+Num_num+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC
+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+Repeat
+" "+Pattern1+" "+Pattern2+" "+Pattern3+" "+Pattern4
+" "+prefix+" "+suffix+" "+AbbLF
+" "+Status+"\n");
// Advance past this token and the single space that follows it in TokenSTR.
Offset=Offset+tokens[p].length()+1;
// Diagnostic: report mentions whose state string is longer than their token count.
if(ID.length()>tokens.length)
{
System.out.println(ID+"\t"+TokenSTR);
}
}
// Blank line terminates the row sequence of one mention.
FileData.write("\n");
}
}
}
}
FileData.close();
}
// NOTE(review): this message is printed for any IOException, including failures while writing the output file.
catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");}
}
/**
 * Writes CRF feature rows (without a label column) to {@code FilenameData} (UTF-8) for
 * annotations whose type is "Gene". The mention field is split on '|' and one
 * space-separated feature row is written per token.
 *
 * NOTE(review): this listing has lost text in several places (apparently everything
 * between a '<' and a later '>' was stripped when the source was extracted). The affected
 * lines are marked below; the method cannot compile as shown and must be restored from
 * the upstream source before any behavioural change is attempted.
 *
 * @param FilenameData path of the feature file to create
 * @throws XMLStreamException declared by the signature; no visible statement throws it
 */
public void FeatureExtraction_Test(String FilenameData) throws XMLStreamException
{
try
{
/** output files */
// NOTE(review): the writer is only closed on the success path at the end of the try block.
BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data
//NLP modules
// presumably produces the "stem" column written below; its use falls in a truncated part of the listing
SnowballStemmer stemmer = new englishStemmer();
/** PMIDs : i */
for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++)
{
String Pmid = data.getBioCDocobj().PMIDs.get(i);
/** Paragraphs : j */
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++)
{
ArrayList Annotation = data.getBioCDocobj().Annotations.get(i).get(j);
/** Annotations : k
* 0 start
* 1 last
* 2 mention
* 3 type
* 4 id
*/
for (int k = 0; k < Annotation.size() ; k++) // k : Annotations
{
String anno[]=Annotation.get(k).split("\\t",-1);
String Mentions = anno[2];
String Type = anno[3];
// The mention field may hold several alternatives separated by '|'.
String MentionArr[]=Mentions.split("\\|",-1);
if(Type.equals("Gene"))
{
// NOTE(review): the next two lines are truncated (the loop over MentionArr, the tokenisation that
// defines TokenSTR/tokens, the loop that fills Token2Num and the declaration of AbbLFStatus_hash
// are missing). Restore from the upstream source.
for(int m=0;m Token2Num = new HashMap ();
for(int p=0;p AbbLFStatus_hash = new HashMap ();
// For every long-form/abbreviation pair whose key starts with this PMID, look for
// "LF ( Abb )" or "Abb ( LF )" inside the tokenised mention.
for(String Pmid_LF : data.getPmidLF2Abb_hash().keySet())
{
// Key layout: PMID <tab> long form; the map value is the abbreviation.
String pf[] = Pmid_LF.split("\\t",-1);
if(pf[0].equals(Pmid))
{
String Abb = data.getPmidLF2Abb_hash().get(Pmid_LF);
String LF = pf[1];
// Insert a space at digit/letter and upper/lower-case boundaries and around every non-word
// character, collapse runs of spaces and strip leading spaces (trailing spaces are not stripped here).
Abb = Abb.replaceAll("([0-9])([A-Za-z])", "$1 $2");
Abb = Abb.replaceAll("([A-Za-z])([0-9])", "$1 $2");
Abb = Abb.replaceAll("([A-Z])([a-z])", "$1 $2");
Abb = Abb.replaceAll("([a-z])([A-Z])", "$1 $2");
Abb = Abb.replaceAll("([\\W])", " $1 ");
Abb = Abb.replaceAll("[ ]+", " ");
Abb = Abb.replaceAll("^[ ]+", "");
LF = LF.replaceAll("([0-9])([A-Za-z])", "$1 $2");
LF = LF.replaceAll("([A-Za-z])([0-9])", "$1 $2");
LF = LF.replaceAll("([A-Z])([a-z])", "$1 $2");
LF = LF.replaceAll("([a-z])([A-Z])", "$1 $2");
LF = LF.replaceAll("([\\W])", " $1 ");
LF = LF.replaceAll("[ ]+", " ");
LF = LF.replaceAll("^[ ]+", "");
// Backslash-escape the listed punctuation characters so the strings can be embedded in the
// regular expressions built below. NOTE(review): this is an explicit list, so characters not
// listed (for example '<', '>', '"' or a backslash) are left unescaped -- confirm this is acceptable.
Abb=Abb.replaceAll("([\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\-\\=\\[\\]\\;\\'\\,\\.\\/\\{\\}\\|\\:\\?])","\\\\$1");
LF=LF.replaceAll("([\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\-\\=\\[\\]\\;\\'\\,\\.\\/\\{\\}\\|\\:\\?])","\\\\$1");
Abb=Abb.toLowerCase();
LF=LF.toLowerCase();
// ptmp1: long form followed by the abbreviation in parentheses; ptmp2: the reverse order.
Pattern ptmp1 = Pattern.compile("(.*)"
+ "("+LF+")"
+ "([ ]*\\([ ]*)"
+ "("+Abb+")"
+ "[ ]*\\).*");
Matcher mtmp1 = ptmp1.matcher(TokenSTR.toLowerCase());
Pattern ptmp2 = Pattern.compile("(.*)"
+ "("+Abb+")"
+ "([ ]*\\([ ]*)"
+ "("+LF+")"
+ "[ ]*\\).*");
Matcher mtmp2 = ptmp2.matcher(TokenSTR.toLowerCase());
// Character offsets of the long form and the abbreviation within the lower-cased TokenSTR.
int start_LF=0;
int last_LF=0;
int start_Abb=0;
int last_Abb=0;
if(mtmp1.find())
{
start_LF = mtmp1.group(1).length();
last_LF = start_LF+mtmp1.group(2).length();
start_Abb = last_LF+mtmp1.group(3).length();
last_Abb = start_Abb+mtmp1.group(4).length();
}
else if(mtmp2.find())
{
// NOTE(review): in this branch last_Abb is derived from start_LF and start_LF from last_LF,
// both of which are still 0 here, and last_LF is derived from start_Abb. The LF/Abb
// variables look swapped compared with the branch above -- confirm against upstream.
start_Abb = mtmp2.group(1).length();
last_Abb = start_LF+mtmp2.group(2).length();
start_LF = last_LF+mtmp2.group(3).length();
last_LF = start_Abb+mtmp2.group(4).length();
}
// NOTE(review): the next line is truncated (the loops that fill AbbLFStatus_hash, the start of
// the per-token loop over p and the declarations of Offset, WSB, WSF and stem are missing).
for(int l=start_LF;l0)
{
// Shape of the previous token: letter runs become "A", digit runs become "0".
String B=tokens[p-1];
B=B.replaceAll("[A-Za-z]+", "A");
B=B.replaceAll("[0-9]+", "0");
WSB="WSB:"+B;
}
// NOTE(review): the next line is truncated (the following-token feature and the digit count
// that sets tmp and Num_num are incomplete).
if(p3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();}
// The count features below are capped: more than three characters is reported as "4+".
//Number of Uppercase [A-Z]
String Num_Uc="";
tmp=tokens[p];
tmp=tmp.replaceAll("[^A-Z]","");
if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();}
//Number of Lowercase [a-z]
String Num_lc="";
tmp=tokens[p];
tmp=tmp.replaceAll("[^a-z]","");
if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();}
//Number of ALL char
String Num_All="";
if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();}
//specific character (;:,.->+_)
String SpecificC="__nil__";
if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_"))
{
SpecificC="-SpecificC1-";
}
else if(tokens[p].equals("(") || tokens[p].equals(")"))
{
SpecificC="-SpecificC2-";
}
else if(tokens[p].equals("{") || tokens[p].equals("}"))
{
SpecificC="-SpecificC3-";
}
else if(tokens[p].equals("[") || tokens[p].equals("]"))
{
SpecificC="-SpecificC4-";
}
else if(tokens[p].equals("\\") || tokens[p].equals("/"))
{
SpecificC="-SpecificC5-";
}
//Chemical Prefix/Suffix
// String.matches requires the whole token to match, so these test a token ending/starting with the listed affixes.
String ChemPreSuf="__nil__";
if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";}
else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";}
else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";}
else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";}
else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";}
//MentionType
String MentionType="__nil__";
if(GNormPlus.SimConceptMention2Type_hash.containsKey(tokens[p]))
{
MentionType = "-"+GNormPlus.SimConceptMention2Type_hash.get(tokens[p])+"-";
}
//Protein symbols
// NOTE(review): the three branches below assign ChemPreSuf, not ProteinSym, so ProteinSym is always
// written as "__nil__" and a chemical affix value may be overwritten. This looks unintended, but the
// written columns are presumably what the CRF template/model were built on -- confirm before changing.
String ProteinSym="__nil__";
if(tokens[p].matches(".*(glutamine|glutamic|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic|asparticacid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine|stop|frameshift).*")){ChemPreSuf="-ProteinSymFull-";}
else if(tokens[p].matches("(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr|fs|fsx)")){ChemPreSuf="-ProteinSymTri-";}
else if(tokens[p].matches("[CISQMNPKDTFAGHLRWVEYX]")){ChemPreSuf="-ProteinSymChar-";}
//Repeat
// Flags a token longer than one character whose Token2Num value exceeds 1, unless it consists only
// of non-word characters/underscores/digits or is one of the listed connective, Greek or Roman-numeral words.
// Token2Num presumably holds per-mention token counts (it is filled in a truncated part of the listing).
String Repeat="__nil__";
if(Token2Num.get(tokens[p])>1 && tokens[p].length()>1 && (!tokens[p].matches("([\\W\\-\\_0-9]+|and|or|alpha|beta|gamma|theta|zeta|delta|kappa|II|VI|IV|III)")))
{
Repeat="-Repeat-";
}
//Patterns
// Four character-class shapes of the token; each is "__nil__" when the token contains a non-word
// character, '-' or '_'. P1/P2 map character by character, P3/P4 collapse runs; P2/P4 ignore case.
String Pattern1 = tokens[p];
if(Pattern1.matches(".*[\\W\\-\\_].*"))
{
Pattern1="__nil__";
}
else
{
Pattern1=Pattern1.replaceAll("[A-Z]", "A");
Pattern1=Pattern1.replaceAll("[a-z]", "a");
Pattern1=Pattern1.replaceAll("[0-9]", "0");
Pattern1="P1:"+Pattern1;
}
String Pattern2 = tokens[p];
if(Pattern2.matches(".*[\\W\\-\\_].*"))
{
Pattern2="__nil__";
}
else
{
Pattern2=Pattern2.replaceAll("[A-Za-z]", "a");
Pattern2=Pattern2.replaceAll("[0-9]", "0");
Pattern2="P2:"+Pattern2;
}
String Pattern3 = tokens[p];
if(Pattern3.matches(".*[\\W\\-\\_].*"))
{
Pattern3="__nil__";
}
else
{
Pattern3=Pattern3.replaceAll("[A-Z]+", "A");
Pattern3=Pattern3.replaceAll("[a-z]+", "a");
Pattern3=Pattern3.replaceAll("[0-9]+", "0");
Pattern3="P3:"+Pattern3;
}
String Pattern4 = tokens[p];
if(Pattern4.matches(".*[\\W\\-\\_].*"))
{
Pattern4="__nil__";
}
else
{
Pattern4=Pattern4.replaceAll("[A-Za-z]+", "a");
Pattern4=Pattern4.replaceAll("[0-9]+", "0");
Pattern4="P4:"+Pattern4;
}
//prefix
// Five space-separated columns: the leading 1..5 characters, "__nil__" where the token is shorter.
String prefix="";
tmp=tokens[p];
if(tmp.length()>=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";}
if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";}
if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";}
if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";}
if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";}
//suffix
// Five space-separated columns: the trailing 1..5 characters, "__nil__" where the token is shorter.
String suffix="";
tmp=tokens[p];
if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";}
if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";}
if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";}
if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";}
if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";}
//Abbreviation & Long Form
// Looked up by the token's character offset within TokenSTR.
String AbbLF="__nil__";
if(AbbLFStatus_hash.containsKey(Offset))
{
AbbLF=AbbLFStatus_hash.get(Offset);
}
// Same columns as the training rows, minus the trailing status label.
// NOTE(review): Num_num is written twice in a row; possibly a placeholder for another column --
// confirm against the CRF template before changing the column layout.
FileData.write(tokens[p]+" "+WSB+" "+WSF+" "+stem
+" "+Num_num+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC
+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+Repeat
+" "+Pattern1+" "+Pattern2+" "+Pattern3+" "+Pattern4
+" "+prefix+" "+suffix+" "+AbbLF+"\n");
// Advance past this token and the single space that follows it in TokenSTR.
Offset=Offset+tokens[p].length()+1;
}
// Blank line terminates the row sequence of one mention.
FileData.write("\n");
}
}
}
}
}
FileData.close();
}
// NOTE(review): this message is printed for any IOException, including failures while writing the output file.
catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");}
}
/**
 * Runs the external {@code CRF/crf_test} tool on a feature file and copies every line
 * the tool prints on standard output into {@code FilenameOutput} (UTF-8).
 *
 * The output file is created (and truncated) by this method before the tool starts,
 * and the same path is also handed to the tool through its {@code -o} option.
 *
 * The command is passed as an argument list, so {@code model}, {@code FilenameOutput}
 * and {@code FilenameData} may contain spaces. The tool's standard error is forwarded
 * to this JVM's standard error so the child cannot block on an undrained pipe, and the
 * method waits for the tool to exit before returning.
 *
 * @param model          path of the CRF model file
 * @param FilenameData   path of the feature file to label
 * @param FilenameOutput path of the result file
 * @throws IOException if the output file cannot be opened. Failures while launching the
 *                     tool or copying its output are printed to standard output and
 *                     terminate the JVM, as before.
 */
public void CRF_test(String model, String FilenameData,String FilenameOutput) throws IOException
{
    File f = new File(FilenameOutput);
    // Opened outside the try block so that a failure to create the file still propagates to the caller.
    BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), "UTF-8"));
    Runtime runtime = Runtime.getRuntime();
    try (BufferedWriter out = fr)
    {
        Process process = new ProcessBuilder("CRF/crf_test", "-m", model, "-o", FilenameOutput, FilenameData)
                .redirectError(ProcessBuilder.Redirect.INHERIT)
                .start();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream(), "UTF-8")))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                out.write(line);
                out.newLine();
                out.flush();
            }
        }
        try
        {
            // Make sure the tool has finished writing its results before callers read them.
            process.waitFor();
        }
        catch (InterruptedException ie)
        {
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for CRF/crf_test", ie);
        }
    }
    catch (IOException e)
    {
        // Writer and reader are already closed here: try-with-resources closes them before this handler runs.
        System.out.println(e);
        // NOTE(review): exit status 0 reports success to the calling shell even though the run failed;
        // kept unchanged because existing pipelines may rely on it -- consider a non-zero status.
        runtime.exit(0);
    }
}
/**
 * Trains a CRF model by running the external {@code CRF/crf_learn} tool with the
 * options {@code -f 3 -c 4.0} and the template {@code CRF/template_SimConcept}, echoing
 * every line the tool prints on standard output to this JVM's standard output.
 *
 * The command is passed as an argument list, so {@code FilenameData} and {@code model}
 * may contain spaces. The tool's standard error is forwarded to this JVM's standard
 * error so the child cannot block on an undrained pipe, and the method waits for the
 * tool to exit before returning.
 *
 * @param model        path of the model file the tool should write
 * @param FilenameData path of the training feature file
 * @throws IOException declared by the signature. Failures while launching the tool or
 *                     reading its output are printed to standard output and terminate
 *                     the JVM, as before.
 */
public void CRF_learn(String model,String FilenameData) throws IOException
{
    Runtime runtime = Runtime.getRuntime();
    try
    {
        Process process = new ProcessBuilder("CRF/crf_learn", "-f", "3", "-c", "4.0", "CRF/template_SimConcept", FilenameData, model)
                .redirectError(ProcessBuilder.Redirect.INHERIT)
                .start();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream(), "UTF-8")))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                System.out.println(line);
                System.out.flush();
            }
        }
        try
        {
            // Make sure the model file is completely written before the caller uses it.
            process.waitFor();
        }
        catch (InterruptedException ie)
        {
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for CRF/crf_learn", ie);
        }
    }
    catch (IOException e)
    {
        System.out.println(e);
        // NOTE(review): exit status 0 reports success to the calling shell even though the run failed;
        // kept unchanged because existing pipelines may rely on it -- consider a non-zero status.
        runtime.exit(0);
    }
}
public void ReadCRFresult(String Filename,String FilenameOutput,String FilenameBioC) throws XMLStreamException, IOException
{
/** load CRF output */
ArrayList outputArr1 = new ArrayList();
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8"));
String line;
while ((line = inputfile.readLine()) != null)
{
outputArr1.add(line);
}
inputfile.close();
/**
* Recognize the mentions which can be simplified
*/
int Count_mention=0;
boolean Simplified=false;
String Mention="";
String Mention_NoSpace="";
String States="";
HashMap Mentions_hash = new HashMap();
HashMap States_hash = new HashMap();
HashMap Output_Split_mention_Ind = new HashMap();
HashMap Output_Split_mention = new HashMap();
for(int i=0;i Split_mention = new ArrayList();
ArrayList Split_state = new ArrayList();
String tmp_mention="";
String tmp_state="";
/**
* count = Mentions_count.get(i) : # of the mention in the corpus (543)
* Mentions_hash.get(count) : Original Mention (ORP - 1 to ORP - 6)
* States_hash.get(count) : States (AASNOOS)
*/
String TokenArr[]=Mentions_hash.get(MNoSpace).split(" ",-1);
String StateArr[]=States_hash.get(MNoSpace).split("",-1);
//refinement : isn't used
Pattern ptmp1 = Pattern.compile("^([S]+)([CN])([S]+)$");
Matcher mtmp1 = ptmp1.matcher(States_hash.get(MNoSpace));
if(mtmp1.find())
{
States_hash.put(MNoSpace, mtmp1.group(1)+"J"+mtmp1.group(3));
}
//Split BE
int len=TokenArr.length;
if(StateArr.length0)
{
Split_mention.add(tmp_mention);
Split_state.add(tmp_state);
}
tmp_mention = "";
tmp_state = "";
}
else //CNBF
{
tmp_mention = tmp_mention + TokenArr[s] + " ";
tmp_state = tmp_state + StateArr[s];
}
}
if(!tmp_mention.equals(""))
{
Split_mention.add(tmp_mention);
Split_state.add(tmp_state);
}
//Split B/F
for(int m=0;m strainsX = new ArrayList();
ArrayList STAstrainsX = new ArrayList();
String each_token[] = Split_mention.get(m).split(" ");
String each_state[] = Split_state.get(m).split("");
for(int s=0;s strainsCN = new ArrayList();
String CorN="";
String each_token[] = Split_mention.get(m).split(" ",-1);
String each_state[] = Split_state.get(m).split("",-1);
for(int k=0;k=4)
{
A=A.replace("s $", "");
}
A=A+"STRAINXXX";
strainCN=strainCN+each_token[k]+" ";
CNO_continous=0;
}
else if(each_state[k].matches("[CN]") && CNO_continous==0)
{
CorN=each_state[k];
strainsCN.add(strainCN);
strainCN="";
CNO_continous++;
}
else if(each_state[k].equals("J"))
{
if(!strainCN.equals("")){strainsCN.add(strainCN);}
A=A.replaceAll("STRAINXXXSTRAINXXX","STRAINXXX");
A=A.replaceAll("STRAINXXXSTRAINXXX","STRAINXXX");
ptmp1 = Pattern.compile("^(.+)s (.*)$");
mtmp1 = ptmp1.matcher(A);
if(mtmp1.find() && mtmp1.group(1).length()>=3 )
{
A = mtmp1.group(1)+ " "+mtmp1.group(2);
}
if(CorN.equals("C"))
{
for(int x=0;x2 && (tmp.substring(tmp.length()-2, tmp.length()-2).equals(" ")))
{
tmp = tmp.substring(0,tmp.length()-2);
}
if(Output_Split_mention_Ind.containsKey(MNoSpace))
{
Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
}
else
{
Output_Split_mention_Ind.put(MNoSpace, tmp);
}
}
}
else if(CorN.equals("N"))
{
if(strainsCN.contains(0) && strainsCN.contains(1))
{
String strain1= strainsCN.get(0).replaceAll(" ", "");
String strain2= strainsCN.get(1).replaceAll(" ", "");
if(strain1.matches("[0-9]+") && strain2.matches("[0-9]+"))
{
if(Integer.parseInt(strain2)-Integer.parseInt(strain1)<=20)
{
for(int strCount=Integer.parseInt(strain1);strCount<=Integer.parseInt(strain2);strCount++)
{
String tmp=A;
tmp = tmp.replace("STRAINXXX", Integer.toString(strCount));
tmp = tmp.replaceAll("[ ]+"," ");
if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))
{
tmp = tmp.substring(0,tmp.length()-2);
}
if(Output_Split_mention_Ind.containsKey(MNoSpace))
{
Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
}
else
{
Output_Split_mention_Ind.put(MNoSpace, tmp);
}
}
}
}
else if(strain1.matches("[A-Z]+ ") && strain2.matches("[A-Z]+ "))
{
int strInt1 = (int) strain1.replaceAll(" ", "").charAt(0);
int strInt2 = (int) strain2.replaceAll(" ", "").charAt(0);
if(strInt2-strInt1<=20)
{
for(int strCount=strInt1;strCount<=strInt2;strCount++)
{
String tmp=A;
tmp = tmp.replace("STRAINXXX", Integer.toString(strCount));
tmp = tmp.replaceAll("[ ]+"," ");
if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))
{
tmp = tmp.substring(0,tmp.length()-2);
}
if(Output_Split_mention_Ind.containsKey(MNoSpace))
{
Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
}
else
{
Output_Split_mention_Ind.put(MNoSpace, tmp);
}
}
}
}
else
{
if(Output_Split_mention.containsKey(MNoSpace))
{
Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m));
}
else
{
Output_Split_mention.put(MNoSpace, Split_mention.get(m));
}
}
}
}
else
{
if(Output_Split_mention.containsKey(MNoSpace))
{
Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m));
}
else
{
Output_Split_mention.put(MNoSpace, Split_mention.get(m));
}
}
A="";
strainCN="";
CNO_continous=0;
strainsCN = new ArrayList();
CorN="";
}
}
if(!strainCN.equals("")){strainsCN.add(strainCN);}
A=A.replaceAll("(STRAINXXX){2,}","STRAINXXX");
ptmp1 = Pattern.compile("^(.+)s (.*)$");
mtmp1 = ptmp1.matcher(A);
if(mtmp1.find() && mtmp1.group(1).length()>=3 )
{
A = mtmp1.group(1)+ " "+mtmp1.group(2);
}
if(CorN.equals("C"))
{
for(int x=0;x2 && (tmp.substring(tmp.length()-2, tmp.length()-2).equals(" ")))
{
tmp = tmp.substring(0,tmp.length()-2);
}
if(Output_Split_mention_Ind.containsKey(MNoSpace))
{
Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
}
else
{
Output_Split_mention_Ind.put(MNoSpace, tmp);
}
}
}
else if(CorN.equals("N"))
{
if(strainsCN.size()==2)
{
String strain1= strainsCN.get(0).replaceAll(" ", "");
String strain2= strainsCN.get(1).replaceAll(" ", "");
if(strain1.matches("[0-9]{1,7}") && strain2.matches("[0-9]{1,7}"))
{
if(Integer.parseInt(strain2)-Integer.parseInt(strain1)<=20)
{
for(int strCount=Integer.parseInt(strain1);strCount<=Integer.parseInt(strain2);strCount++)
{
String tmp=A;
tmp = tmp.replace("STRAINXXX", Integer.toString(strCount));
tmp = tmp.replaceAll("[ ]+"," ");
if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))
{
tmp = tmp.substring(0,tmp.length()-2);
}
if(Output_Split_mention_Ind.containsKey(MNoSpace))
{
Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
}
else
{
Output_Split_mention_Ind.put(MNoSpace, tmp);
}
}
}
}
else if(strain1.matches("[A-Z]+ ") && strain2.matches("[A-Z]+ "))
{
int strInt1 = (int) strain1.replaceAll(" ", "").charAt(0);
int strInt2 = (int) strain2.replaceAll(" ", "").charAt(0);
if(strInt2-strInt1<=20)
{
for(int strCount=strInt1;strCount<=strInt2;strCount++)
{
String tmp=A;
tmp = tmp.replace("STRAINXXX", Integer.toString(strCount));
tmp = tmp.replaceAll("[ ]+"," ");
if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))
{
tmp = tmp.substring(0,tmp.length()-2);
}
if(Output_Split_mention_Ind.containsKey(MNoSpace))
{
Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
}
else
{
Output_Split_mention_Ind.put(MNoSpace, tmp);
}
}
}
}
else
{
if(Output_Split_mention.containsKey(MNoSpace))
{
Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m));
}
else
{
Output_Split_mention.put(MNoSpace, Split_mention.get(m));
}
}
}
}
else
{
if(Output_Split_mention.containsKey(MNoSpace))
{
Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m));
}
else
{
Output_Split_mention.put(MNoSpace, Split_mention.get(m));
}
}
}
}
for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++)
{
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++)
{
int Annotation_Num = data.getBioCDocobj().Annotations.get(i).get(j).size();
for (int k = 0; k < Annotation_Num ; k++) // k : Annotations
{
String anno[]=data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\\t"); //Mention
String MenArr[]=anno[2].split("\\|");
for(int m=0;m Mentions = new ArrayList();
for(int m=0;m ii
// ii --> 2
for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++)
{
for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++)
{
int Annotation_Num = data.getBioCDocobj().Annotations.get(i).get(j).size();
for (int k = 0; k < Annotation_Num ; k++) // k : Annotations
{
String anno[]=data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\\t"); //Mention
String MenArr[]=anno[2].split("\\|");
HashMap Mentions = new HashMap();
for(int m=0;m
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy