All downloads are free. The search and download functionality uses the official Maven repository.

GNormPluslib.SimConcept Maven / Gradle / Ivy

/**
 * Project: GNormPlus
 * Function: SimConcept : Simplify Composite mentions
 */

package GNormPluslib;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.BreakIterator;
import java.time.LocalDate;
import java.time.ZoneId;
import java.text.DecimalFormat;
import java.math.RoundingMode;

import javax.xml.stream.XMLStreamException;

import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.englishStemmer;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;

public class SimConcept 
{
	private GNPProcessingData data;

	/**
	 * Creates a SimConcept instance that works on the given processing data.
	 *
	 * @param data processing data object stored in this instance and read by the
	 *             feature-extraction methods
	 */
	public SimConcept(GNPProcessingData data)
	{
		this.data = data;
	}

	/*
	 * Feature Extraction
	 */
	/**
	 * Writes the training feature file for the SimConcept model.
	 *
	 * <p>For every document index in {@code data.getBioCDocobj().PMIDs} and every passage of
	 * that document, each annotation string is split on tabs into start, last, mention, type
	 * and id. Only annotations that carry more than four fields are processed. The mention is
	 * tokenized by inserting spaces at digit/letter boundaries, at case changes and around
	 * non-word characters; one line of space-separated features is then written per token,
	 * ending with a status character taken from the id field at the token's index. A blank
	 * line is written after the tokens of an annotation.
	 *
	 * <p>NOTE(review): this listing is damaged. Text is missing in several places (marked
	 * below), so names such as {@code t1}, {@code t2}, {@code t5}, {@code tmp_ment},
	 * {@code Offset}, {@code WSB}, {@code WSF}, {@code stem} and {@code tmp} are used without a
	 * visible declaration, generic type arguments are gone, and the braces do not balance.
	 * Restore the method from the original source before relying on it.
	 *
	 * @param FilenameData path of the feature file to write (UTF-8)
	 * @throws XMLStreamException declared by the signature; no visible statement throws it
	 */
	public void FeatureExtraction_Train(String FilenameData) throws XMLStreamException
	{
		try 
		{
			/** output files */ 
			BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data
			//NLP modules
			SnowballStemmer stemmer = new englishStemmer();
			/** PMIDs : i */
			for (int i = 0; i < data.getBioCDocobj().PMIDs.size(); i++)
			{
				String Pmid = data.getBioCDocobj().PMIDs.get(i);
				
				/** Paragraphs : j */
				for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++)
				{
					ArrayList Annotation = data.getBioCDocobj().Annotations.get(i).get(j);
					/** Annotations : k 
					 * 0 start
					 * 1 last
					 * 2 mention
					 * 3 type
					 * 4 id
					 */
					int Inital_Annotation_size=Annotation.size();
					for (int k = 0; k < Annotation.size() ; k++)   // k : Annotations
					{
						String anno[]=Annotation.get(k).split("\\t",-1);
						int MentionStart= Integer.parseInt(anno[0]);
		        		int MentionLast= Integer.parseInt(anno[1]);
		        		String Mention = anno[2];
		        		String Type = anno[3];
		        		if(anno.length>4)
		        		{
			        		String ID = anno[4];
			        		
			        		// Tokenize: split digit/letter and case boundaries, isolate non-word characters, then collapse and trim spaces.
			        		String TokenSTR = Mention;
			        		TokenSTR = TokenSTR.replaceAll("([0-9])([A-Za-z])", "$1 $2");
			        		TokenSTR = TokenSTR.replaceAll("([A-Za-z])([0-9])", "$1 $2");
			        		TokenSTR = TokenSTR.replaceAll("([A-Z])([a-z])", "$1 $2");
			        		TokenSTR = TokenSTR.replaceAll("([a-z])([A-Z])", "$1 $2");
							TokenSTR = TokenSTR.replaceAll("([\\W])", " $1 ");
			        		TokenSTR = TokenSTR.replaceAll("[ ]+", " ");
			        		TokenSTR = TokenSTR.replaceAll("^[ ]+", "");
			        		TokenSTR = TokenSTR.replaceAll("[ ]+$", "");
			        		
			        		/*
			        		 * Only for Gene
			        		 */
			        		// NOTE(review): text is missing inside the condition below ("kInteger" is a fused fragment), and the
			        		// declarations of t1, t2, t5 and tmp_ment are lost. The visible body appends generated variants of the
			        		// mention to Annotation, the list that the k loop is iterating with a live size() bound.
			        		if(ID.equals("ASJAS") && kInteger.parseInt(t2))
									{
										tmp_ment=t1+" "+t2+" to "+t5;
										Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNS");
										tmp_ment=t1+" "+t2+" to -"+t5;
										Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNOS");
										tmp_ment=t1+" -"+t2+" to -"+t5;
										Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tAASNOS");
										tmp_ment=t1+" "+t2+" to "+t1+" "+t5;
										Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNAS");
										tmp_ment=t1+" "+t2+"-"+t5;
										Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASNS");
										tmp_ment=t1+" "+t2+", "+t5+", and "+(Integer.parseInt(t5)+2);
										Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tASCSCCS");
										tmp_ment=t1+" -"+t2+", -"+t5+", and -"+(Integer.parseInt(t5)+2);
										Annotation.add(MentionStart+"\t"+MentionLast+"\t"+tmp_ment+"\t"+Type+"\tAASC0SCC0S");
									}
								}
			        		}
			        		
			        		String Mention_tmp = Mention;
							String tokens[]=TokenSTR.split(" ",-1);
							
							//For Repeat
							HashMap  Token2Num = new HashMap ();
							// NOTE(review): the loop that fills Token2Num and the declaration of AbbLFStatus_hash are fused on the
							// next line; the text between them is lost.
							for(int p=0;p AbbLFStatus_hash = new HashMap ();
							// Look for "long form (abbreviation)" or "abbreviation (long form)" pairs of this PMID inside the mention.
							for(String Pmid_LF : data.getPmidLF2Abb_hash().keySet())
							{
								String pf[] = Pmid_LF.split("\\t",-1);
								if(pf[0].equals(Pmid))
								{
									String Abb = data.getPmidLF2Abb_hash().get(Pmid_LF);
									String LF = pf[1];
									
									Abb = Abb.replaceAll("([0-9])([A-Za-z])", "$1 $2");
									Abb = Abb.replaceAll("([A-Za-z])([0-9])", "$1 $2");
					        		Abb = Abb.replaceAll("([A-Z])([a-z])", "$1 $2");
					        		Abb = Abb.replaceAll("([a-z])([A-Z])", "$1 $2");
					        		Abb = Abb.replaceAll("([\\W])", " $1 ");
									Abb = Abb.replaceAll("[ ]+", " ");
					        		Abb = Abb.replaceAll("^[ ]+", "");
					        		
					        		LF = LF.replaceAll("([0-9])([A-Za-z])", "$1 $2");
					        		LF = LF.replaceAll("([A-Za-z])([0-9])", "$1 $2");
					        		LF = LF.replaceAll("([A-Z])([a-z])", "$1 $2");
					        		LF = LF.replaceAll("([a-z])([A-Z])", "$1 $2");
					        		LF = LF.replaceAll("([\\W])", " $1 ");
									LF = LF.replaceAll("[ ]+", " ");
					        		LF = LF.replaceAll("^[ ]+", "");
					        		LF = LF.replaceAll("[ ]+$", "");
					        		
									
									// Backslash-escape everything except letters, digits, '@' and space so both strings can be embedded in a regex.
									Abb=Abb.replaceAll("([^A-Za-z0-9@ ])","\\\\$1");
									LF=LF.replaceAll("([^A-Za-z0-9@ ])","\\\\$1");
									Abb=Abb.toLowerCase();
									LF=LF.toLowerCase();
									Pattern ptmp1 = Pattern.compile("(.*)("+LF+")([ ]*\\([ ]*)("+Abb+")[ ]*\\).*");
									Matcher mtmp1 = ptmp1.matcher(TokenSTR.toLowerCase());
									Pattern ptmp2 = Pattern.compile("(.*)("+Abb+")([ ]*\\([ ]*)("+LF+")[ ]*\\).*");
									Matcher mtmp2 = ptmp2.matcher(TokenSTR.toLowerCase());
									int start_LF=0;
									int last_LF=0;
									int start_Abb=0;
									int last_Abb=0;
									if(mtmp1.find())
									{
										start_LF = mtmp1.group(1).length();
										last_LF = start_LF+mtmp1.group(2).length();
										start_Abb = last_LF+mtmp1.group(3).length();
										last_Abb = start_Abb+mtmp1.group(4).length();
									}
									else if(mtmp2.find())
									{
										// NOTE(review): in this branch start_LF and last_LF are still 0 when they are read, so last_Abb and
										// start_LF ignore the length of group 1. This looks like a copy of the branch above with the
										// operands not swapped - confirm against the original source.
										start_Abb = mtmp2.group(1).length();
										last_Abb = start_LF+mtmp2.group(2).length();
										start_LF = last_LF+mtmp2.group(3).length();
										last_LF = start_Abb+mtmp2.group(4).length();
									}
									// NOTE(review): text is missing on the next line. The loop over the LF offsets is fused with the tail
									// of a later "if(p>0)"-style test; the code that fills AbbLFStatus_hash, opens the per-token loop and
									// declares Offset, WSB, WSF and stem is lost.
									for(int l=start_LF;l0)
			        			{
			        				String B=tokens[p-1];
			        				B=B.replaceAll("[A-Za-z]+", "A");
			        				B=B.replaceAll("[0-9]+", "0");
			        				WSB="WSB:"+B;
			        			}
			        			// NOTE(review): text is missing on the next line as well (a test on p fused with the digit-count
			        			// threshold); the declarations of Num_num and tmp are lost.
			        			if(p3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();}
								
								//Number of Uppercase [A-Z]
								String Num_Uc="";
								tmp=tokens[p];
								tmp=tmp.replaceAll("[^A-Z]","");
								if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();}
								
								//Number of Lowercase [a-z]
								String Num_lc="";
								tmp=tokens[p];
								tmp=tmp.replaceAll("[^a-z]","");
								if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();}
								
								//Number of ALL char
								String Num_All="";
								if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();}
								
								//specific character (;:,.->+_)
								String SpecificC="__nil__";
								if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_"))
								{
									SpecificC="-SpecificC1-";
								}
								else if(tokens[p].equals("(") || tokens[p].equals(")"))
								{
									SpecificC="-SpecificC2-";
								}
								else if(tokens[p].equals("{") || tokens[p].equals("}"))
								{
									SpecificC="-SpecificC3-";
								}
								else if(tokens[p].equals("[") || tokens[p].equals("]"))
								{
									SpecificC="-SpecificC4-";
								}
								else if(tokens[p].equals("\\") || tokens[p].equals("/"))
								{
									SpecificC="-SpecificC5-";
								}
								
								//Chemical Prefix/Suffix
								String ChemPreSuf="__nil__";
								if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";}
								else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";}
								else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";}
								else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";}
								else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";}
								
								//MentionType
								String MentionType="__nil__";
								if(GNormPlus.SimConceptMention2Type_hash.containsKey(tokens[p]))
								{
									MentionType = "-"+GNormPlus.SimConceptMention2Type_hash.get(tokens[p])+"-";
								}
								
								//Protein symbols
								// NOTE(review): the three branches below assign ChemPreSuf, not ProteinSym, so ProteinSym is always
								// written as "__nil__" and a protein match overwrites the chemical feature. Confirm whether this is
								// intended; a trained model may depend on the current column contents.
								String ProteinSym="__nil__";
								if(tokens[p].matches(".*(glutamine|glutamic|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic|asparticacid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine|stop|frameshift).*")){ChemPreSuf="-ProteinSymFull-";}
								else if(tokens[p].matches("(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr|fs|fsx)")){ChemPreSuf="-ProteinSymTri-";}
								else if(tokens[p].matches("[CISQMNPKDTFAGHLRWVEYX]")){ChemPreSuf="-ProteinSymChar-";}
								
								//Repeat
								String Repeat="__nil__";
								if(Token2Num.get(tokens[p])>1 && tokens[p].length()>1 && (!tokens[p].matches("([\\W\\-\\_0-9]+|and|or|alpha|beta|gamma|theta|zeta|delta|kappa|II|VI|IV|III)")))
								{
									Repeat="-Repeat-";
								}
								
								//Patterns
								// P1: character classes kept one-to-one (A / a / 0); tokens containing a non-word character, '-' or '_' get "__nil__".
								String Pattern1 = tokens[p];
								if(Pattern1.matches(".*[\\W\\-\\_].*"))
								{
									Pattern1="__nil__";
								}
								else
								{
									Pattern1=Pattern1.replaceAll("[A-Z]", "A");
									Pattern1=Pattern1.replaceAll("[a-z]", "a");
									Pattern1=Pattern1.replaceAll("[0-9]", "0");
									Pattern1="P1:"+Pattern1;
								}
								// P2: like P1 but without distinguishing letter case.
								String Pattern2 = tokens[p];
								if(Pattern2.matches(".*[\\W\\-\\_].*"))
								{
									Pattern2="__nil__";
								}
								else
								{
									Pattern2=Pattern2.replaceAll("[A-Za-z]", "a");
									Pattern2=Pattern2.replaceAll("[0-9]", "0");
									Pattern2="P2:"+Pattern2;
								}
								// P3: like P1 but each run of the same class is collapsed to one symbol.
								String Pattern3 = tokens[p];
								if(Pattern3.matches(".*[\\W\\-\\_].*"))
								{
									Pattern3="__nil__";
								}
								else
								{
									Pattern3=Pattern3.replaceAll("[A-Z]+", "A");
									Pattern3=Pattern3.replaceAll("[a-z]+", "a");
									Pattern3=Pattern3.replaceAll("[0-9]+", "0");
									Pattern3="P3:"+Pattern3;
								}
								// P4: like P2 but each run is collapsed to one symbol.
								String Pattern4 = tokens[p];
								if(Pattern4.matches(".*[\\W\\-\\_].*"))
								{
									Pattern4="__nil__";
								}
								else
								{
									Pattern4=Pattern4.replaceAll("[A-Za-z]+", "a");
									Pattern4=Pattern4.replaceAll("[0-9]+", "0");
									Pattern4="P4:"+Pattern4;
								}
								
								//prefix
								// Five space-separated columns: the first 1..5 characters, "__nil__" where the token is too short.
								String prefix="";
								tmp=tokens[p];
								if(tmp.length()>=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";}
								if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";}
								if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";}
								if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";}
								if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";}
								
								//suffix
								// Five space-separated columns: the last 1..5 characters, "__nil__" where the token is too short.
								String suffix="";
								tmp=tokens[p];
								if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";}
								if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";}
								if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";}
								if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";}
								if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";}
								
								//Abbreviation & Long Form
								String AbbLF="__nil__";
								if(AbbLFStatus_hash.containsKey(Offset))
								{
									AbbLF=AbbLFStatus_hash.get(Offset);
								}
								
								// The status label of token p is the p-th character of the annotation's id field.
								// NOTE(review): Num_num is written twice in a row below - confirm whether a different feature was
								// meant, and keep the column count in sync with the CRF template before changing it.
								String Status = ID.substring(p, p+1);
			        			FileData.write(tokens[p]+" "+WSB+" "+WSF+" "+stem
			        					+" "+Num_num+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC
			        					+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+Repeat
			        					+" "+Pattern1+" "+Pattern2+" "+Pattern3+" "+Pattern4
			        					+" "+prefix+" "+suffix+" "+AbbLF
			        					+" "+Status+"\n");
			        			// Advance the character offset past this token and the single space that follows it.
			        			Offset=Offset+tokens[p].length()+1;
			        			// Diagnostic only: report ids that have more status characters than the mention has tokens.
			        			if(ID.length()>tokens.length)
			        			{
			        				System.out.println(ID+"\t"+TokenSTR);
			        			}
			                }	
			        		// Blank line terminates the token sequence of one mention.
			        		FileData.write("\n");
		        		}
					}
	        		
				}
			}
			FileData.close();
		}
		// NOTE(review): the message talks about an input file, but the only file opened in the visible code is the
		// output file FilenameData. FileData is also not closed when an exception is thrown.
		catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");}
	}
	/**
	 * Writes the feature file used when applying the SimConcept model.
	 *
	 * <p>For every document and passage in {@code data.getBioCDocobj().Annotations}, each
	 * annotation string is split on tabs; only annotations whose type field equals
	 * {@code "Gene"} are processed. The mention field is split on {@code |} into alternatives,
	 * and one line of space-separated features is written per token, followed by a blank line
	 * after each token sequence. Unlike the training variant, no status column is written.
	 *
	 * <p>NOTE(review): this listing is damaged. Text is missing in several places (marked
	 * below), so names such as {@code TokenSTR}, {@code tokens}, {@code Offset}, {@code WSB},
	 * {@code WSF}, {@code stem} and {@code tmp} are used without a visible declaration and
	 * generic type arguments are gone. Restore the method from the original source before
	 * relying on it.
	 *
	 * @param FilenameData path of the feature file to write (UTF-8)
	 * @throws XMLStreamException declared by the signature; no visible statement throws it
	 */
	public void FeatureExtraction_Test(String FilenameData) throws XMLStreamException
	{
		try 
		{
			/** output files */ 
			BufferedWriter FileData = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameData), "UTF-8")); // .data
			//NLP modules
			SnowballStemmer stemmer = new englishStemmer();
			/** PMIDs : i */
			for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++)
			{
				String Pmid = data.getBioCDocobj().PMIDs.get(i);
				
				/** Paragraphs : j */
				for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++)
				{
					ArrayList Annotation = data.getBioCDocobj().Annotations.get(i).get(j);
					/** Annotations : k 
					 * 0 start
					 * 1 last
					 * 2 mention
					 * 3 type
					 * 4 id
					 */
					for (int k = 0; k < Annotation.size() ; k++)   // k : Annotations
					{
						String anno[]=Annotation.get(k).split("\\t",-1);
						String Mentions = anno[2];
						String Type = anno[3];
		        		// The mention field may hold several alternatives separated by '|'.
		        		String MentionArr[]=Mentions.split("\\|",-1);
		        		if(Type.equals("Gene"))
		        		{
			        		// NOTE(review): text is missing on the next two lines. The loop over MentionArr is fused with the
			        		// declaration of Token2Num, and the loop that fills Token2Num is fused with the declaration of
			        		// AbbLFStatus_hash; the tokenization that defines TokenSTR and tokens is lost.
			        		for(int m=0;m Token2Num = new HashMap ();
								for(int p=0;p AbbLFStatus_hash = new HashMap ();
								// Look for "long form (abbreviation)" or "abbreviation (long form)" pairs of this PMID inside the mention.
								for(String Pmid_LF : data.getPmidLF2Abb_hash().keySet())
								{
									String pf[] = Pmid_LF.split("\\t",-1);
									if(pf[0].equals(Pmid))
									{
										String Abb = data.getPmidLF2Abb_hash().get(Pmid_LF);
										String LF = pf[1];
										
										Abb = Abb.replaceAll("([0-9])([A-Za-z])", "$1 $2");
										Abb = Abb.replaceAll("([A-Za-z])([0-9])", "$1 $2");
						        		Abb = Abb.replaceAll("([A-Z])([a-z])", "$1 $2");
						        		Abb = Abb.replaceAll("([a-z])([A-Z])", "$1 $2");
						        		Abb = Abb.replaceAll("([\\W])", " $1 ");
										Abb = Abb.replaceAll("[ ]+", " ");
						        		Abb = Abb.replaceAll("^[ ]+", "");
						        		
						        		LF = LF.replaceAll("([0-9])([A-Za-z])", "$1 $2");
						        		LF = LF.replaceAll("([A-Za-z])([0-9])", "$1 $2");
						        		LF = LF.replaceAll("([A-Z])([a-z])", "$1 $2");
						        		LF = LF.replaceAll("([a-z])([A-Z])", "$1 $2");
						        		LF = LF.replaceAll("([\\W])", " $1 ");
										LF = LF.replaceAll("[ ]+", " ");
						        		LF = LF.replaceAll("^[ ]+", "");
						        		
										
										// Backslash-escape an explicit list of punctuation so both strings can be embedded in a regex.
										// NOTE(review): the training variant escapes every character outside [A-Za-z0-9@ ] instead;
										// characters not in this list (for example '<', '>', '"') stay unescaped here.
										Abb=Abb.replaceAll("([\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\-\\=\\[\\]\\;\\'\\,\\.\\/\\{\\}\\|\\:\\?])","\\\\$1");
										LF=LF.replaceAll("([\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)\\_\\+\\-\\=\\[\\]\\;\\'\\,\\.\\/\\{\\}\\|\\:\\?])","\\\\$1");
										Abb=Abb.toLowerCase();
										LF=LF.toLowerCase();
										Pattern ptmp1 = Pattern.compile("(.*)"
														+ "("+LF+")"
														+ "([ ]*\\([ ]*)"
														+ "("+Abb+")"
														+ "[ ]*\\).*");
										Matcher mtmp1 = ptmp1.matcher(TokenSTR.toLowerCase());
										Pattern ptmp2 = Pattern.compile("(.*)"
														+ "("+Abb+")"
														+ "([ ]*\\([ ]*)"
														+ "("+LF+")"
														+ "[ ]*\\).*");
										Matcher mtmp2 = ptmp2.matcher(TokenSTR.toLowerCase());
										int start_LF=0;
										int last_LF=0;
										int start_Abb=0;
										int last_Abb=0;
										if(mtmp1.find())
										{
											start_LF = mtmp1.group(1).length();
											last_LF = start_LF+mtmp1.group(2).length();
											start_Abb = last_LF+mtmp1.group(3).length();
											last_Abb = start_Abb+mtmp1.group(4).length();
										}
										else if(mtmp2.find())
										{
											// NOTE(review): in this branch start_LF and last_LF are still 0 when they are read, so last_Abb
											// and start_LF ignore the length of group 1. This looks like a copy of the branch above with
											// the operands not swapped - confirm against the original source.
											start_Abb = mtmp2.group(1).length();
											last_Abb = start_LF+mtmp2.group(2).length();
											start_LF = last_LF+mtmp2.group(3).length();
											last_LF = start_Abb+mtmp2.group(4).length();
										}
										// NOTE(review): text is missing on the next line. The loop over the LF offsets is fused with the
										// tail of a later "if(p>0)"-style test; the code that fills AbbLFStatus_hash, opens the per-token
										// loop and declares Offset, WSB, WSF and stem is lost.
										for(int l=start_LF;l0)
				        			{
				        				String B=tokens[p-1];
				        				B=B.replaceAll("[A-Za-z]+", "A");
				        				B=B.replaceAll("[0-9]+", "0");
				        				WSB="WSB:"+B;
				        			}
				        			// NOTE(review): text is missing on the next line as well (a test on p fused with the digit-count
				        			// threshold); the declarations of Num_num and tmp are lost.
				        			if(p3){Num_num="N:4+";}else{Num_num="N:"+ tmp.length();}
									
									//Number of Uppercase [A-Z]
									String Num_Uc="";
									tmp=tokens[p];
									tmp=tmp.replaceAll("[^A-Z]","");
									if(tmp.length()>3){Num_Uc="U:4+";}else{Num_Uc="U:"+ tmp.length();}
									
									//Number of Lowercase [a-z]
									String Num_lc="";
									tmp=tokens[p];
									tmp=tmp.replaceAll("[^a-z]","");
									if(tmp.length()>3){Num_lc="L:4+";}else{Num_lc="L:"+ tmp.length();}
									
									//Number of ALL char
									String Num_All="";
									if(tokens[p].length()>3){Num_All="A:4+";}else{Num_All="A:"+ tokens[p].length();}
									
									//specific character (;:,.->+_)
									String SpecificC="__nil__";
									if(tokens[p].equals(";") || tokens[p].equals(":") || tokens[p].equals(",") || tokens[p].equals(".") || tokens[p].equals("-") || tokens[p].equals(">") || tokens[p].equals("+") || tokens[p].equals("_"))
									{
										SpecificC="-SpecificC1-";
									}
									else if(tokens[p].equals("(") || tokens[p].equals(")"))
									{
										SpecificC="-SpecificC2-";
									}
									else if(tokens[p].equals("{") || tokens[p].equals("}"))
									{
										SpecificC="-SpecificC3-";
									}
									else if(tokens[p].equals("[") || tokens[p].equals("]"))
									{
										SpecificC="-SpecificC4-";
									}
									else if(tokens[p].equals("\\") || tokens[p].equals("/"))
									{
										SpecificC="-SpecificC5-";
									}
									
									//Chemical Prefix/Suffix
									String ChemPreSuf="__nil__";
									if(tokens[p].matches(".*(yl|ylidyne|oyl|sulfonyl)")){ChemPreSuf="-CHEMinlineSuffix-";}
									else if(tokens[p].matches("(meth|eth|prop|tetracos).*")){ChemPreSuf="-CHEMalkaneStem-";}
									else if(tokens[p].matches("(di|tri|tetra).*")){ChemPreSuf="-CHEMsimpleMultiplier-";}
									else if(tokens[p].matches("(benzen|pyridin|toluen).*")){ChemPreSuf="-CHEMtrivialRing-";}
									else if(tokens[p].matches(".*(one|ol|carboxylic|amide|ate|acid|ium|ylium|ide|uide|iran|olan|inan|pyrid|acrid|amid|keten|formazan|fydrazin)(s|)")){ChemPreSuf="-CHEMsuffix-";}
									
									//MentionType
									String MentionType="__nil__";
									if(GNormPlus.SimConceptMention2Type_hash.containsKey(tokens[p]))
									{
										MentionType = "-"+GNormPlus.SimConceptMention2Type_hash.get(tokens[p])+"-";
									}
									
									//Protein symbols
									// NOTE(review): the three branches below assign ChemPreSuf, not ProteinSym, so ProteinSym is always
									// written as "__nil__" and a protein match overwrites the chemical feature. Confirm whether this
									// is intended; a trained model may depend on the current column contents.
									String ProteinSym="__nil__";
									if(tokens[p].matches(".*(glutamine|glutamic|leucine|valine|isoleucine|lysine|alanine|glycine|aspartate|methionine|threonine|histidine|aspartic|asparticacid|arginine|asparagine|tryptophan|proline|phenylalanine|cysteine|serine|glutamate|tyrosine|stop|frameshift).*")){ChemPreSuf="-ProteinSymFull-";}
									else if(tokens[p].matches("(cys|ile|ser|gln|met|asn|pro|lys|asp|thr|phe|ala|gly|his|leu|arg|trp|val|glu|tyr|fs|fsx)")){ChemPreSuf="-ProteinSymTri-";}
									else if(tokens[p].matches("[CISQMNPKDTFAGHLRWVEYX]")){ChemPreSuf="-ProteinSymChar-";}
									
									//Repeat
									String Repeat="__nil__";
									if(Token2Num.get(tokens[p])>1 && tokens[p].length()>1 && (!tokens[p].matches("([\\W\\-\\_0-9]+|and|or|alpha|beta|gamma|theta|zeta|delta|kappa|II|VI|IV|III)")))
									{
										Repeat="-Repeat-";
									}
									
									//Patterns
									// P1: character classes kept one-to-one (A / a / 0); tokens containing a non-word character, '-' or '_' get "__nil__".
									String Pattern1 = tokens[p];
									if(Pattern1.matches(".*[\\W\\-\\_].*"))
									{
										Pattern1="__nil__";
									}
									else
									{
										Pattern1=Pattern1.replaceAll("[A-Z]", "A");
										Pattern1=Pattern1.replaceAll("[a-z]", "a");
										Pattern1=Pattern1.replaceAll("[0-9]", "0");
										Pattern1="P1:"+Pattern1;
									}
									// P2: like P1 but without distinguishing letter case.
									String Pattern2 = tokens[p];
									if(Pattern2.matches(".*[\\W\\-\\_].*"))
									{
										Pattern2="__nil__";
									}
									else
									{
										Pattern2=Pattern2.replaceAll("[A-Za-z]", "a");
										Pattern2=Pattern2.replaceAll("[0-9]", "0");
										Pattern2="P2:"+Pattern2;
									}
									// P3: like P1 but each run of the same class is collapsed to one symbol.
									String Pattern3 = tokens[p];
									if(Pattern3.matches(".*[\\W\\-\\_].*"))
									{
										Pattern3="__nil__";
									}
									else
									{
										Pattern3=Pattern3.replaceAll("[A-Z]+", "A");
										Pattern3=Pattern3.replaceAll("[a-z]+", "a");
										Pattern3=Pattern3.replaceAll("[0-9]+", "0");
										Pattern3="P3:"+Pattern3;
									}
									// P4: like P2 but each run is collapsed to one symbol.
									String Pattern4 = tokens[p];
									if(Pattern4.matches(".*[\\W\\-\\_].*"))
									{
										Pattern4="__nil__";
									}
									else
									{
										Pattern4=Pattern4.replaceAll("[A-Za-z]+", "a");
										Pattern4=Pattern4.replaceAll("[0-9]+", "0");
										Pattern4="P4:"+Pattern4;
									}
									
									//prefix
									// Five space-separated columns: the first 1..5 characters, "__nil__" where the token is too short.
									String prefix="";
									tmp=tokens[p];
									if(tmp.length()>=1){ prefix=tmp.substring(0, 1);}else{prefix="__nil__";}
									if(tmp.length()>=2){ prefix=prefix+" "+tmp.substring(0, 2);}else{prefix=prefix+" __nil__";}
									if(tmp.length()>=3){ prefix=prefix+" "+tmp.substring(0, 3);}else{prefix=prefix+" __nil__";}
									if(tmp.length()>=4){ prefix=prefix+" "+tmp.substring(0, 4);}else{prefix=prefix+" __nil__";}
									if(tmp.length()>=5){ prefix=prefix+" "+tmp.substring(0, 5);}else{prefix=prefix+" __nil__";}
									
									//suffix
									// Five space-separated columns: the last 1..5 characters, "__nil__" where the token is too short.
									String suffix="";
									tmp=tokens[p];
									if(tmp.length()>=1){ suffix=tmp.substring(tmp.length()-1, tmp.length());}else{suffix="__nil__";}
									if(tmp.length()>=2){ suffix=suffix+" "+tmp.substring(tmp.length()-2, tmp.length());}else{suffix=suffix+" __nil__";}
									if(tmp.length()>=3){ suffix=suffix+" "+tmp.substring(tmp.length()-3, tmp.length());}else{suffix=suffix+" __nil__";}
									if(tmp.length()>=4){ suffix=suffix+" "+tmp.substring(tmp.length()-4, tmp.length());}else{suffix=suffix+" __nil__";}
									if(tmp.length()>=5){ suffix=suffix+" "+tmp.substring(tmp.length()-5, tmp.length());}else{suffix=suffix+" __nil__";}
									
									//Abbreviation & Long Form
									String AbbLF="__nil__";
									if(AbbLFStatus_hash.containsKey(Offset))
									{
										AbbLF=AbbLFStatus_hash.get(Offset);
									}
									
									// Same columns as the training file minus the trailing status label.
									// NOTE(review): Num_num is written twice in a row below - confirm whether a different feature was
									// meant, and keep the column count in sync with the CRF template before changing it.
									FileData.write(tokens[p]+" "+WSB+" "+WSF+" "+stem
				        					+" "+Num_num+" "+Num_num+" "+Num_Uc+" "+Num_lc+" "+Num_All+" "+SpecificC
				        					+" "+ChemPreSuf+" "+MentionType+" "+ProteinSym+" "+Repeat
				        					+" "+Pattern1+" "+Pattern2+" "+Pattern3+" "+Pattern4
				        					+" "+prefix+" "+suffix+" "+AbbLF+"\n");
				        			// Advance the character offset past this token and the single space that follows it.
				        			Offset=Offset+tokens[p].length()+1;
				                }
								// Blank line terminates one token sequence.
								FileData.write("\n");
				        	}
		        		}
					}
	        		
				}
			}
			FileData.close();
		}
		// NOTE(review): the message talks about an input file, but the only file opened in the visible code is the
		// output file FilenameData. FileData is also not closed when an exception is thrown.
		catch(IOException e1){ System.out.println("[MR]: Input file is not exist.");}
	}
	/**
	 * Runs the external {@code CRF/crf_test} program on a feature file.
	 *
	 * <p>The program is started as
	 * {@code CRF/crf_test -m <model> -o <FilenameOutput> <FilenameData>} (path relative to the
	 * current working directory). Every line it prints on standard output is additionally
	 * written to {@code FilenameOutput} as UTF-8.
	 *
	 * <p>NOTE(review): {@code FilenameOutput} is both handed to the program via {@code -o} and
	 * opened (truncated) by this method. The double use is deliberately left unchanged here;
	 * confirm which of the two writers is actually intended.
	 *
	 * <p>I/O failures are reported to the caller through the declared {@link IOException}
	 * instead of terminating the JVM with a success status.
	 *
	 * @param model          path passed to the program after {@code -m}
	 * @param FilenameData   path of the feature file, passed as the last argument
	 * @param FilenameOutput path of the result file
	 * @throws IOException if the program cannot be started, its output cannot be copied, or
	 *                     the wait for its termination is interrupted
	 */
	public void CRF_test(String model, String FilenameData,String FilenameOutput) throws IOException 
	{
		// Pass the arguments as a list: a single command string is split on whitespace,
		// which breaks as soon as one of the paths contains a space.
		ProcessBuilder builder = new ProcessBuilder("CRF/crf_test", "-m", model, "-o", FilenameOutput, FilenameData);
		// Forward the child's stderr to ours; an unread stderr pipe can fill up and stall the child.
		builder.redirectError(ProcessBuilder.Redirect.INHERIT);

		// try-with-resources closes the writer on every path, including failures.
		try (BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(FilenameOutput)), "UTF-8")))
		{
			Process process = builder.start();
			try
			{
				try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream(), "UTF-8")))
				{
					String line;
					while ((line = br.readLine()) != null)
					{
						fr.write(line);
						fr.newLine();
						fr.flush();
					}
				}
				// End of stdout does not guarantee the program has finished writing its -o file.
				int exitCode = process.waitFor();
				if (exitCode != 0)
				{
					System.err.println("[SimConcept] CRF/crf_test exited with status " + exitCode);
				}
			}
			catch (InterruptedException e)
			{
				// Restore the interrupt flag for callers further up the stack.
				Thread.currentThread().interrupt();
				throw new IOException("Interrupted while waiting for CRF/crf_test", e);
			}
			finally
			{
				// No-op after normal termination; avoids leaving an orphan behind on failure.
				process.destroy();
			}
		}
	}
	/**
	 * Runs the external {@code CRF/crf_learn} program to train a model.
	 *
	 * <p>The program is started as
	 * {@code CRF/crf_learn -f 3 -c 4.0 CRF/template_SimConcept <FilenameData> <model>} (paths
	 * relative to the current working directory). Every line it prints on standard output is
	 * echoed to {@code System.out}.
	 *
	 * <p>I/O failures are reported to the caller through the declared {@link IOException}
	 * instead of terminating the JVM with a success status.
	 *
	 * @param model        path passed to the program as its last argument
	 * @param FilenameData path of the feature file, passed after the template
	 * @throws IOException if the program cannot be started, its output cannot be read, or the
	 *                     wait for its termination is interrupted
	 */
	public void CRF_learn(String model,String FilenameData) throws IOException 
	{
		// Pass the arguments as a list: a single command string is split on whitespace,
		// which breaks as soon as one of the paths contains a space.
		ProcessBuilder builder = new ProcessBuilder("CRF/crf_learn", "-f", "3", "-c", "4.0", "CRF/template_SimConcept", FilenameData, model);
		// Forward the child's stderr to ours; an unread stderr pipe can fill up and stall the child.
		builder.redirectError(ProcessBuilder.Redirect.INHERIT);

		Process process = builder.start();
		try
		{
			// try-with-resources closes the reader (and the underlying stream) on every path.
			try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream(), "UTF-8")))
			{
				String line;
				while ((line = br.readLine()) != null)
				{
					System.out.println(line);
					System.out.flush();
				}
			}
			// Make sure the program has terminated before the caller goes on to use the model.
			int exitCode = process.waitFor();
			if (exitCode != 0)
			{
				System.err.println("[SimConcept] CRF/crf_learn exited with status " + exitCode);
			}
		}
		catch (InterruptedException e)
		{
			// Restore the interrupt flag for callers further up the stack.
			Thread.currentThread().interrupt();
			throw new IOException("Interrupted while waiting for CRF/crf_learn", e);
		}
		finally
		{
			// No-op after normal termination; avoids leaving an orphan behind on failure.
			process.destroy();
		}
	}
	public void ReadCRFresult(String Filename,String FilenameOutput,String FilenameBioC) throws XMLStreamException, IOException
	{
		/** load CRF output */
		ArrayList outputArr1 = new ArrayList();
		BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(FilenameOutput), "UTF-8"));
		String line;
		while ((line = inputfile.readLine()) != null)  
		{	
			outputArr1.add(line);
		}
		inputfile.close();
		
		/**
		 * Recognize the mentions which can be simplified
		 */
		int Count_mention=0;
		boolean Simplified=false;
		String Mention="";
		String Mention_NoSpace="";
		String States="";
		HashMap Mentions_hash = new HashMap();
		HashMap States_hash = new HashMap();
		HashMap Output_Split_mention_Ind = new HashMap();
		HashMap Output_Split_mention = new HashMap();
		for(int i=0;i Split_mention = new ArrayList();
			ArrayList Split_state = new ArrayList();
			String tmp_mention="";
			String tmp_state="";
			/**
			 * count = Mentions_count.get(i) : # of the mention in the corpus (543)
			 * Mentions_hash.get(count) : Original Mention (ORP - 1 to ORP - 6)
			 * States_hash.get(count) : States (AASNOOS)
			 */
			
			String TokenArr[]=Mentions_hash.get(MNoSpace).split(" ",-1);
			String StateArr[]=States_hash.get(MNoSpace).split("",-1);
			
			//refinement : isn't used
			Pattern ptmp1 = Pattern.compile("^([S]+)([CN])([S]+)$");
			Matcher mtmp1 = ptmp1.matcher(States_hash.get(MNoSpace));
			if(mtmp1.find())
			{
				States_hash.put(MNoSpace, mtmp1.group(1)+"J"+mtmp1.group(3));
			}

			//Split BE
			int len=TokenArr.length;
			if(StateArr.length0)
					{
						Split_mention.add(tmp_mention);
						Split_state.add(tmp_state);
					}
					tmp_mention = "";
					tmp_state = "";
				}
				else //CNBF
				{
					tmp_mention = tmp_mention + TokenArr[s] + " ";
					tmp_state = tmp_state + StateArr[s];
				}
			}
			if(!tmp_mention.equals(""))
			{	
				Split_mention.add(tmp_mention);
				Split_state.add(tmp_state);
			}
			
			//Split B/F
			for(int m=0;m strainsX = new ArrayList();
				ArrayList STAstrainsX = new ArrayList();
				String each_token[] = Split_mention.get(m).split(" ");
				String each_state[] = Split_state.get(m).split("");
				for(int s=0;s strainsCN = new ArrayList();
				String CorN="";
				
				String each_token[] = Split_mention.get(m).split(" ",-1);
				String each_state[] = Split_state.get(m).split("",-1);
				
				for(int k=0;k=4)
						{
							A=A.replace("s $", "");
						}
						A=A+"STRAINXXX";
						strainCN=strainCN+each_token[k]+" ";
						CNO_continous=0;
					}
					else if(each_state[k].matches("[CN]") && CNO_continous==0)
					{
						CorN=each_state[k];
						strainsCN.add(strainCN);
						strainCN="";
						CNO_continous++;
					}
					else if(each_state[k].equals("J"))
					{
						if(!strainCN.equals("")){strainsCN.add(strainCN);}
						
						A=A.replaceAll("STRAINXXXSTRAINXXX","STRAINXXX");
						A=A.replaceAll("STRAINXXXSTRAINXXX","STRAINXXX");
						
						ptmp1 = Pattern.compile("^(.+)s (.*)$");
						mtmp1 = ptmp1.matcher(A);
						if(mtmp1.find() && mtmp1.group(1).length()>=3 )
						{
							A = mtmp1.group(1)+ " "+mtmp1.group(2);
						}
						
						if(CorN.equals("C"))
						{
							for(int x=0;x2 && (tmp.substring(tmp.length()-2, tmp.length()-2).equals(" ")))
								{
									tmp = tmp.substring(0,tmp.length()-2);
								}
								if(Output_Split_mention_Ind.containsKey(MNoSpace))
								{
									Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
								}
								else
								{
									Output_Split_mention_Ind.put(MNoSpace, tmp);
								}
							}
						}
						else if(CorN.equals("N"))
						{
							if(strainsCN.contains(0) && strainsCN.contains(1))
							{
								String strain1= strainsCN.get(0).replaceAll(" ", "");
								String strain2= strainsCN.get(1).replaceAll(" ", "");
								if(strain1.matches("[0-9]+") && strain2.matches("[0-9]+"))
								{
									if(Integer.parseInt(strain2)-Integer.parseInt(strain1)<=20)
									{
										for(int strCount=Integer.parseInt(strain1);strCount<=Integer.parseInt(strain2);strCount++)
										{
											String tmp=A;
											tmp = tmp.replace("STRAINXXX", Integer.toString(strCount));
											tmp = tmp.replaceAll("[ ]+"," ");
											if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))
											{
												tmp = tmp.substring(0,tmp.length()-2);
											}
											if(Output_Split_mention_Ind.containsKey(MNoSpace))
											{
												Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
											}
											else
											{
												Output_Split_mention_Ind.put(MNoSpace, tmp);
											}
										}
									}
								}
								else if(strain1.matches("[A-Z]+ ") && strain2.matches("[A-Z]+ "))
								{
									int strInt1 = (int) strain1.replaceAll(" ", "").charAt(0);
									int strInt2 = (int) strain2.replaceAll(" ", "").charAt(0);
									if(strInt2-strInt1<=20)
									{
										for(int strCount=strInt1;strCount<=strInt2;strCount++)
										{
											String tmp=A;
											tmp = tmp.replace("STRAINXXX", Integer.toString(strCount));
											tmp = tmp.replaceAll("[ ]+"," ");
											if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))
											{
												tmp = tmp.substring(0,tmp.length()-2);
											}
											if(Output_Split_mention_Ind.containsKey(MNoSpace))
											{
												Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
											}
											else
											{
												Output_Split_mention_Ind.put(MNoSpace, tmp);
											}
										}
									}
								}
								else
								{
									if(Output_Split_mention.containsKey(MNoSpace))
									{
										Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m));
									}
									else
									{
										Output_Split_mention.put(MNoSpace, Split_mention.get(m));
									}
								}
							}
						}
						else
						{
							if(Output_Split_mention.containsKey(MNoSpace))
							{
								Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m));
							}
							else
							{
								Output_Split_mention.put(MNoSpace, Split_mention.get(m));
							}
						}
						
						A="";
						strainCN="";
						CNO_continous=0;
						strainsCN = new ArrayList();
						CorN="";
					}
				}
				if(!strainCN.equals("")){strainsCN.add(strainCN);}
				
				A=A.replaceAll("(STRAINXXX){2,}","STRAINXXX");
				
				ptmp1 = Pattern.compile("^(.+)s (.*)$");
				mtmp1 = ptmp1.matcher(A);
				if(mtmp1.find() && mtmp1.group(1).length()>=3 )
				{
					A = mtmp1.group(1)+ " "+mtmp1.group(2);
				}
				
				if(CorN.equals("C"))
				{
					for(int x=0;x2 && (tmp.substring(tmp.length()-2, tmp.length()-2).equals(" ")))
						{
							tmp = tmp.substring(0,tmp.length()-2);
						}
						if(Output_Split_mention_Ind.containsKey(MNoSpace))
						{
							Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
						}
						else
						{
							Output_Split_mention_Ind.put(MNoSpace, tmp);
						}
					}
				}
				else if(CorN.equals("N"))
				{
					if(strainsCN.size()==2)
					{
						String strain1= strainsCN.get(0).replaceAll(" ", "");
						String strain2= strainsCN.get(1).replaceAll(" ", "");
						if(strain1.matches("[0-9]{1,7}") && strain2.matches("[0-9]{1,7}"))
						{
							if(Integer.parseInt(strain2)-Integer.parseInt(strain1)<=20)
							{
								for(int strCount=Integer.parseInt(strain1);strCount<=Integer.parseInt(strain2);strCount++)
								{
									String tmp=A;
									tmp = tmp.replace("STRAINXXX", Integer.toString(strCount));
									tmp = tmp.replaceAll("[ ]+"," ");
									if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))
									{
										tmp = tmp.substring(0,tmp.length()-2);
									}
									if(Output_Split_mention_Ind.containsKey(MNoSpace))
									{
										Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
									}
									else
									{
										Output_Split_mention_Ind.put(MNoSpace, tmp);
									}
								}
							}
						}
						else if(strain1.matches("[A-Z]+ ") && strain2.matches("[A-Z]+ "))
						{
							int strInt1 = (int) strain1.replaceAll(" ", "").charAt(0);
							int strInt2 = (int) strain2.replaceAll(" ", "").charAt(0);
							if(strInt2-strInt1<=20)
							{
								for(int strCount=strInt1;strCount<=strInt2;strCount++)
								{
									String tmp=A;
									tmp = tmp.replace("STRAINXXX", Integer.toString(strCount));
									tmp = tmp.replaceAll("[ ]+"," ");
									if(tmp.length()>2 && tmp.substring(tmp.length()-2, tmp.length()-2).equals(" "))
									{
										tmp = tmp.substring(0,tmp.length()-2);
									}
									if(Output_Split_mention_Ind.containsKey(MNoSpace))
									{
										Output_Split_mention_Ind.put(MNoSpace, Output_Split_mention_Ind.get(MNoSpace)+"|"+tmp);
									}
									else
									{
										Output_Split_mention_Ind.put(MNoSpace, tmp);
									}
								}
							}
						}
						else
						{
							if(Output_Split_mention.containsKey(MNoSpace))
							{
								Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m));
							}
							else
							{
								Output_Split_mention.put(MNoSpace, Split_mention.get(m));
							}
						}
					}
				}
				else
				{
					if(Output_Split_mention.containsKey(MNoSpace))
					{
						Output_Split_mention.put(MNoSpace, Output_Split_mention.get(MNoSpace)+"|"+Split_mention.get(m));
					}
					else
					{
						Output_Split_mention.put(MNoSpace, Split_mention.get(m));
					}
				}
			}
		}
		
		for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++)
		{
			for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++)
			{
				int Annotation_Num = data.getBioCDocobj().Annotations.get(i).get(j).size();
				for (int k = 0; k < Annotation_Num ; k++)   // k : Annotations
				{
					String anno[]=data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\\t"); //Mention
					String MenArr[]=anno[2].split("\\|");
	        		for(int m=0;m Mentions = new ArrayList();
					for(int m=0;m ii
		// ii --> 2
		for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++)
		{
			for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++)
			{
				int Annotation_Num = data.getBioCDocobj().Annotations.get(i).get(j).size();
				for (int k = 0; k < Annotation_Num ; k++)   // k : Annotations
				{
					String anno[]=data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\\t"); //Mention
					String MenArr[]=anno[2].split("\\|");
					HashMap Mentions = new HashMap();
					for(int m=0;m




© 2015 - 2025 Weber Informatics LLC | Privacy Policy