All downloads are free. The search and download functionality uses the official Maven repository.

GNormPluslib.SR Maven / Gradle / Ivy

/**
 * Project: GNormPlus
 * Function: Species recognition and Species assignment
 */

package GNormPluslib;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.BreakIterator;
import java.time.LocalDate;
import java.time.ZoneId;

import javax.xml.stream.XMLStreamException;

import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.englishStemmer;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;

public class SR 
{
	private GNPProcessingData data;

	/**
	 * Creates an instance bound to the given processing data.
	 *
	 * @param data the processing data stored in this instance; the methods of this
	 *             class read and update its BioC document object through it
	 */
	public SR(GNPProcessingData data) {
		this.data = data;
	}

	public void SpeciesRecognition(String Filename, String FilenameBioC, String StrainFilename, String FilterAntibody) throws IOException, XMLStreamException
	{
		/** Recognizing Species Names: SP */
		for (int i = 0; i < data.getBioCDocobj().PMIDs.size(); i++) /** PMIDs : i */
		{
			String Pmid = data.getBioCDocobj().PMIDs.get(i);
			PrefixTree PT_Genus = new PrefixTree();
			HashMap SPID_hash = new HashMap();
			ArrayList TargetedLocation = new ArrayList();
			HashMap GenusNames = new HashMap();
			HashMap Mention2ID_lc = new HashMap();
			ArrayList IDset = new ArrayList();
			for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++) /** Paragraphs : j */
			{
				String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j); // Passage context
				
				/** Species recognition */
				ArrayList locations = GNormPlus.PT_Species.SearchMentionLocation(PassageContext,"Species"); /** PT_Species */
				for (int k = 0 ; k < locations.size() ; k++)
				{
					String anno[]=locations.get(k).split("\t");
					int start= Integer.parseInt(anno[0]);
	        		int last= Integer.parseInt(anno[1]);
	        		
	        		// For anti-serum filtering
	        		String ForwardSTR="";
	        		String BackwardSTR="";
					try {
						if(start>21)
						{
							ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(start-21,last);
						}
						else
						{
							ForwardSTR = (PassageContext+"ZZZZZZZZZZZZZZZZZZZZZZZZZZZ").substring(0,last);
						}
						if(PassageContext.length()>last+21)
						{
							BackwardSTR = PassageContext.substring(start,last+21);
						}
						else
						{
							BackwardSTR = PassageContext.substring(start,PassageContext.length());
						}
					} catch (Exception e) {
						throw new InconsistentDataException("Exception in document " + Pmid + " in paragraph with offset " + data.getBioCDocobj().PassageOffsets.get(i).get(j) + " and length " + PassageContext.length() + " beginning with " + PassageContext.substring(0, Math.min(PassageContext.length(), 80)), e);
					}

					String mention = anno[2];
	        		String id = anno[3];
	        		String mention_tmp=mention.toLowerCase();
	        		mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
	        		String antibody="";
	        		if(ForwardSTR.toLowerCase().matches(".*(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg)[\\W\\-\\_]+"+mention_tmp)) {antibody="(anti)";}//filtering : antibody
	        		else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
	        		else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[\\W\\-\\_]+[A-Za-z0-9]+[\\W\\-\\_]+(anti|antibody|antibodies|serum|polyclonal|monoclonal|igg).*")){antibody="(anti)";} //filtering : antibody
	        		
					if(mention.matches(".*[\\(\\[\\{].*") && BackwardSTR.toLowerCase().matches(mention_tmp+"\\).*") )
    				{
	        			last=last+1;
	        			mention=mention+")";
    				}
	        		
	        		if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9].*")){} // filtered: Bee1p
	        		else if((mention.matches(".*[;:,].*")) && mention.length()<=10){} // filtered : x, XXX
	        		else if(mention.matches("to[\\W\\-\\_]+[0-9]+")){} // to 7
	        		else if(mention.matches("[a-z][\\)\\]\\}].*") && (!mention.matches(".*[\\(\\[\\{].*")) && mention.length()<=10){} // s). Major
	        		else if(mention.matches(".*[\\(\\[\\{].*") && (!mention.matches(".*[\\)\\]\\}].*")) && mention.length()<=10){} // s). Major
	        		else if(!id.equals("NA"))
	        		{
	        			if(data.getBioCDocobj().Annotations.size()>i && data.getBioCDocobj().Annotations.get(i).size()>j)
						{
	        				if((!mention.matches("^[A-Za-z] [A-Za-z0-9]+$")) && (mention.length()>=3)) // invalid species: "a group/a GAL4/a strain"
	        				{
	        					if(FilterAntibody.equals("False") || (!antibody.equals("(anti)")))
	        					{
	        						String patt="^(.+?) [sS]train";
									Pattern ptmp = Pattern.compile(patt);
									Matcher mtmp = ptmp.matcher(mention);
									if(mtmp.find())
									{
										mention=mtmp.group(1);
										last=last-7;
									}
			        				data.getBioCDocobj().Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id); //+antibody
			        				String mentions_tmp=mention.toLowerCase();
		    						mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
		    						mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
		    						data.getFiltering_hash().put(mentions_tmp,"");
			        				Mention2ID_lc.put(mention.toLowerCase(), id); //+antibody
			        				
			         				String mention_genus = "";
			        				patt="^([A-Za-z]+) ";
									ptmp = Pattern.compile(patt);
									mtmp = ptmp.matcher(mention);
									if(mtmp.find())
									{
										mention_genus=mtmp.group(1); // get genus
									}
									
			        				IDset.add(id);
			        				for(int s=start;s PassageContext.length()) {
						// Erik Faessler: We had offset issues with texts that contain non-ASCII characters
						continue;
					}
	        		String mention = anno[2];
	        		String id = anno[3];
	        		if(data.getBioCDocobj().Annotations.size()>i && data.getBioCDocobj().Annotations.get(i).size()>j)
					{
	        			if(!TargetedLocation.contains(j+"\t"+start)) //already exists
	        			{
	        				int last40=0;
		        			if(PassageContext.length()>=last+40)
		        			{
		        				last40=last+40;
		        			}
		        			else
		        			{
		        				last40=PassageContext.length();
		        			}
		        			
		        			// For anti-serum filtering
			        		String ForwardSTR="";
			        		String BackwardSTR="";
			        		if(start>21)
			        		{
			        			ForwardSTR = PassageContext.substring(start-21,last);
			        		}
			        		else
			        		{
			        			ForwardSTR = PassageContext.substring(0,last);
			        		}
			        		if(PassageContext.length()>last+21)
			        		{
			        			BackwardSTR = PassageContext.substring(start,last+21);
			        		}
			        		else
			        		{
			        			BackwardSTR = PassageContext.substring(start,PassageContext.length());
			        		}
			        		String mention_tmp=mention.toLowerCase();
			        		mention_tmp = mention_tmp.replaceAll("([^A-Za-z0-9@ ])", "\\\\$1");
			        		if(mention_tmp.matches(".*[\\[\\]\\(\\)\\{\\}].*")){}
			        		else if(BackwardSTR.toLowerCase().matches(mention_tmp+"[0-9\\-\\_].*")){} // filtered: Bee1p
			        		else if(ForwardSTR.toLowerCase().matches(".*[0-9\\-\\_]"+mention_tmp)){} // filtered: IL-22RA1
			        		else
			        		{
			        			String patt="[\\W\\-]cell([\\- ]*line|)[s]*[\\W\\-]";
			    				Pattern ptmp = Pattern.compile(patt);
			    				Matcher mtmp = ptmp.matcher(PassageContext.substring(last, last40).toLowerCase());
			    				if(mtmp.find())
			    				{
			    					if(GNormPlus.taxid4gene.contains(id)) // for gene
				        			{
				        				id="*"+id;
					        		}
			    					data.getBioCDocobj().Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tCell\t"+id);
									String mentions_tmp=mention.toLowerCase();
		    						mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
		    						mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
		    						data.getFiltering_hash().put(mentions_tmp,"");
			    					IDset.add(id);
			    					for(int s=start;s=7)
					{
						GenusNames.put(ID,SPID_hash.get(ID));
					}
				}
			}
			
			GenusNames.put("3702", "arabidopsis");
			GenusNames.put("4932", "saccharomyces");
			GenusNames.put("562", "escherichia");
			GenusNames.put("7227", "drosophila");
			GenusNames.put("8355", "xenopus");
			
			PT_Genus.Hash2Tree(GenusNames);
			
			/** Genus recognition */
			for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++) /** Paragraphs : j */
			{
				if(data.getBioCDocobj().PassageContexts.size()>i && 
					data.getBioCDocobj().PassageContexts.get(i).size()>j &&  
					data.getBioCDocobj().Annotations.size()>i &&  
					data.getBioCDocobj().Annotations.get(i).size()>j 
					)
				{
					String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j);
					ArrayList locations_Genus = PT_Genus.SearchMentionLocation(PassageContext,"Genus"); /** PT_Genus*/
					for (int k = 0 ; k < locations_Genus.size() ; k++)
					{
						String anno[]=locations_Genus.get(k).split("\t");
						String start= anno[0];
		        		String last= anno[1];
		        		String mention = anno[2];
		        		String id = anno[3];
		        		if(!TargetedLocation.contains(j+"\t"+start)) //already exists
	        			{
		        			String patt="^\\**([0-9]+)$";
							Pattern ptmp = Pattern.compile(patt);
							Matcher mtmp = ptmp.matcher(id);
							if(mtmp.find())
							{
								id = mtmp.group(1);
							}
							
							if(GNormPlus.taxid4gene.contains(id)) // for gene
		        			{
		        				id="*"+id;
			        		}
							data.getBioCDocobj().Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tGenus\t"+id);
							String mentions_tmp=mention.toLowerCase();
    						mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
    						mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
    						data.getFiltering_hash().put(mentions_tmp,"");
							IDset.add(id);
							for(int s=Integer.parseInt(start);s StrainID_hash = new HashMap();
			BufferedReader br = new BufferedReader(new FileReader(StrainFilename));
			String line="";
			while ((line = br.readLine()) != null)  
			{
				String l[]=line.split("\t");
				String ancestor = l[0];
				String tax_id = l[1];
				String tax_names = l[2];
				if(SPID_hash.containsKey(ancestor))
				{
					StrainID_hash.put(tax_id, tax_names); // tax id -> strain
				}
				else if(SPID_hash.containsKey(tax_id))
				{
					StrainID_hash.put(tax_id, tax_names); // tax id -> strain
				}
			}
			br.close();
			HashMap StrainNames = new HashMap();
			for(String ID: StrainID_hash.keySet())
			{
				StrainNames.put(ID,StrainID_hash.get(ID));
			}
			
			PT_Strain.Hash2Tree(StrainNames);
			
			/** Strain recognition */
			for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++) /** Paragraphs : j */
			{
				if(data.getBioCDocobj().PassageContexts.size()>i && 
					data.getBioCDocobj().PassageContexts.get(i).size()>j &&  
					data.getBioCDocobj().Annotations.size()>i &&  
					data.getBioCDocobj().Annotations.get(i).size()>j 
					)
				{
					String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j); // Passage context
					ArrayList locations_Strain = PT_Strain.SearchMentionLocation(PassageContext,"Strain"); /** PT_Strain*/
					for (int k = 0 ; k < locations_Strain.size() ; k++)
					{
						String anno[]=locations_Strain.get(k).split("\t");
						String start= anno[0];
		        		String last= anno[1];
		        		String mention = anno[2];
		        		String id = anno[3];
		        		if(!TargetedLocation.contains(j+"\t"+start)) //already exists
	        			{
		        			if((!mention.matches(".*[;,\\{\\}\\(\\)\\[\\]].*")) && !mention.matches("[a-z]{1,4} [0-9]{1,3}"))
		        			{
			        			if(GNormPlus.taxid4gene.contains(id)) // for gene
			        			{
			        				id="*"+id;
				        		}
			        			data.getBioCDocobj().Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tStrain\t"+id);
								String mentions_tmp=mention.toLowerCase();
	    						mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
	    						mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
	    						data.getFiltering_hash().put(mentions_tmp,"");
			        			IDset.add(id);
			        			for(int s=Integer.parseInt(start);s OtherNames = new HashMap();
			for(String men : Mention2ID_lc.keySet())
			{
				String men_id= Mention2ID_lc.get(men);
				if(data.getPmidLF2Abb_lc_hash().containsKey(Pmid+"\t"+men))
				{
					String Abb = data.getPmidLF2Abb_lc_hash().get(Pmid+"\t"+men);
					// Abbreviation
					if(OtherNames.containsKey(men_id))
					{
						OtherNames.put(men_id, OtherNames.get(men_id)+"|"+Abb);
					}
					else
					{
						OtherNames.put(men_id,Abb);
					}
				}
				String men_nospace=men.replaceAll(" ", "");
				// no space
				if(OtherNames.containsKey(men_id))
				{
					OtherNames.put(men_id, OtherNames.get(men_id)+"|"+men_nospace);
				}
				else
				{
					OtherNames.put(men_id,men_nospace);
				}
			}
			PrefixTree PT_Others = new PrefixTree();
			PT_Others.Hash2Tree(OtherNames);
			
			/** 
			 *
			 * Others: 
			 * 1) Abbreviation 
			 * 2) no space
			 * 
			 * */
			for (int j = 0; j < data.getBioCDocobj().PassageNames.get(i).size(); j++) /** Paragraphs : j */
			{
				if(data.getBioCDocobj().PassageContexts.size()>i && 
					data.getBioCDocobj().PassageContexts.get(i).size()>j &&  
					data.getBioCDocobj().Annotations.size()>i &&  
					data.getBioCDocobj().Annotations.get(i).size()>j 
					)
				{
					String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j); // Passage context
					ArrayList locations_Abb = PT_Others.SearchMentionLocation(PassageContext,"Species"); /** PT_Abb*/
					for (int k = 0 ; k < locations_Abb.size() ; k++)
					{
						String anno[]=locations_Abb.get(k).split("\t");
						String start= anno[0];
		        		String last= anno[1];
		        		String mention = anno[2];
		        		String id = anno[3];
		        		if(!TargetedLocation.contains(j+"\t"+start)) //already exists
	        			{
		        			if(GNormPlus.taxid4gene.contains(id)) // for gene
		        			{
		        				id="*"+id;
			        		}
		        			data.getBioCDocobj().Annotations.get(i).get(j).add(start+"\t"+last+"\t"+mention+"\tSpecies\t"+id);
							String mentions_tmp=mention.toLowerCase();
    						mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
    						mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
    						data.getFiltering_hash().put(mentions_tmp,"");
		        			Mention2ID_lc.put(mention.toLowerCase(), id);
		        			IDset.add(id);
		        			for(int s=Integer.parseInt(start);si && data.getBioCDocobj().PassageContexts.get(i).size()>j && data.getBioCDocobj().Annotations.size()>i && data.getBioCDocobj().Annotations.get(i).size()>j)
				{
					for (int a = 0; a < data.getBioCDocobj().Annotations.get(i).get(j).size(); a++) /** Annotations : a */
					{
						String SpAnno[]=data.getBioCDocobj().Annotations.get(i).get(j).get(a).split("\t");
						String start= SpAnno[0];
		        		String last= SpAnno[1];
		        		String mention = SpAnno[2];
		        		String type = SpAnno[3];
		        		
		        		/** Abbreviation solution */
		    			if(data.getPmidAbb2LF_lc_hash().containsKey(Pmid+"\t"+mention.toLowerCase()) && Mention2ID_lc.containsKey(data.getPmidAbb2LF_lc_hash().containsKey(Pmid+"\t"+mention.toLowerCase())))
						{
							String LF_lc=data.getPmidAbb2LF_lc_hash().get(Pmid+"\t"+mention.toLowerCase());
							if(Mention2ID_lc.containsKey(LF_lc))
							{
								String LF_ID=Mention2ID_lc.get(LF_lc);
								data.getBioCDocobj().Annotations.get(i).get(j).set(a, start+"\t"+last+"\t"+mention+"\t"+type+"\t"+LF_ID);
								String mentions_tmp=mention.toLowerCase();
	    						mentions_tmp=mentions_tmp.replaceAll("[\\W\\-\\_]","");
	    						mentions_tmp=mentions_tmp.replaceAll("[0-9]","0");
	    						data.getFiltering_hash().put(mentions_tmp,"");
							}
						}
		    			else if (SpAnno.length>4) 
		    			{
		    				String id = SpAnno[4];
		    				String id_split[]=id.split(";");
	    					if(id_split.length>=2)
	    					{
	    						/** Smallest set of tax ids */
			    				boolean found=false;
	    						for(int x=0;x PrefixIDTarget_hash = new HashMap();
			PrefixIDTarget_hash.put("9606", "h");
			PrefixIDTarget_hash.put("10090", "m");
			PrefixIDTarget_hash.put("10116", "r");
			PrefixIDTarget_hash.put("4932", "y");
			PrefixIDTarget_hash.put("7227", "d");
			PrefixIDTarget_hash.put("7955", "z|zf|Zf|dr|Dr");
			PrefixIDTarget_hash.put("3702", "at|At");
			
			HashMap SP2Num_hash = new HashMap();
			for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) /** Paragraphs : j */
			{
				for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) // Annotation : k
				{
					String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
					if(anno.length==5) //Species
	        		{
						String patt="^\\**([0-9]+)$";
						Pattern ptmp = Pattern.compile(patt);
						Matcher mtmp = ptmp.matcher(anno[4]);
						if(mtmp.find())
						{
							String id = mtmp.group(1);
							
							if(!PrefixIDTarget_hash.containsKey(id))
							{
								PrefixIDTarget_hash.put(id,GNormPlus.PrefixID_hash.get(id)); // taxid -> prefix
							}
							if(j == 0)//title
			        		{
			        			if(SP2Num_hash.containsKey(id))
			        			{
			        				SP2Num_hash.put(id, SP2Num_hash.get(id)+2);
			        			}
			        			else
			        			{
			        				if(GNormPlus.TaxFreq_hash.containsKey(id))
			        				{
			        					SP2Num_hash.put(id, GNormPlus.TaxFreq_hash.get(id)+2);
			        				}
			        				else
			        				{
			        					SP2Num_hash.put(id, 2.0);
			        				}
			        			}
			        			// Virus -> Human (not to double weight human to virus)
			        			/*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
		        				{
			        				if(SP2Num_hash.containsKey("9606"))
				        			{
				        				SP2Num_hash.put("9606", SP2Num_hash.get("9606")+2);
				        			}
				        			else
				        			{
				        				SP2Num_hash.put("9606", 2 + GNormPlus.TaxFreq_hash.get("9606")+1);
				        			}
		        				}*/
			        		}
			        		else
			        		{
			        			if(SP2Num_hash.containsKey(id))
			        			{
			        				SP2Num_hash.put(id, SP2Num_hash.get(id)+1);
			        			}
			        			else
			        			{
			        				if(GNormPlus.TaxFreq_hash.containsKey(id))
			        				{
			        					SP2Num_hash.put(id, 1 + GNormPlus.TaxFreq_hash.get(id));
			        				}
			        				else
			        				{
			        					SP2Num_hash.put(id, 1.0);
			        				}
			        			}
			        			// Virus -> Human
			        			/*if(GNormPlus.SP_Virus2Human_hash.containsKey(id))
		        				{
			        				if(SP2Num_hash.containsKey("9606"))
				        			{
				        				SP2Num_hash.put("9606", SP2Num_hash.get("9606")+1);
				        			}
				        			else
				        			{
				        				SP2Num_hash.put("9606", GNormPlus.TaxFreq_hash.get("9606")+1);
				        			}
		        				}*/
			        		}
						}
	        		}
				}
			}
			String MajorSP="9606";
			double MaxSP=0;
			for(String tid : SP2Num_hash.keySet())
			{
				if(SP2Num_hash.get(tid)>MaxSP)
				{
					MajorSP=tid;
					MaxSP=SP2Num_hash.get(tid);
				}
			}
			
			for (int j = 0; j < data.getBioCDocobj().PassageContexts.get(i).size(); j++) /** Paragraphs : j */
			{
				String PassageContext = data.getBioCDocobj().PassageContexts.get(i).get(j); // Passage context
				//int PassageOffset = data.getBioCDocobj().PassageOffsets.get(i).get(j); // Passage offset
				iterator.setText(PassageContext);
				ArrayList Sentence_offsets = new ArrayList();
				int Sent_start = iterator.first();
				for (int Sent_last = iterator.next(); Sent_last != BreakIterator.DONE; Sent_start = Sent_last, Sent_last = iterator.next()) 
				{
					Sentence_offsets.add(Sent_start);
				}
				
				HashMap Annotations_Gene_hash = new HashMap();
				ArrayList Annotations_Species = new ArrayList();
				if(data.getBioCDocobj().Annotations.get(i).size()>j)
				{
					for (int k = 0; k < data.getBioCDocobj().Annotations.get(i).get(j).size(); k++) // Annotation : k
					{
						String anno[] = data.getBioCDocobj().Annotations.get(i).get(j).get(k).split("\t");
						if(anno.length==5) //Species
		        		{
							Annotations_Species.add(data.getBioCDocobj().Annotations.get(i).get(j).get(k));
		        		}
		        		else //Gene : if(anno.length==3)
		        		{
		        			//String mention = PassageContext.substring(Integer.parseInt(anno[0]), Integer.parseInt(anno[1]));
		        			Annotations_Gene_hash.put(k,data.getBioCDocobj().Annotations.get(i).get(j).get(k)); // k -> Gene Annotation
		        		}
					}
	
					//Gene --> Species Inference (PMID:28777492)
					HashMap> mention2Location2Species_hash = new HashMap>(); 
					HashMap Location2Species_hash = new HashMap(); 
					for (int k : Annotations_Gene_hash.keySet()) // k is the index of data.getBioCDocobj().Annotations.get(i).get(j) 
	    			{
						boolean SPfound = false;
						String anno[] = Annotations_Gene_hash.get(k).split("\t");
	    				int G_Start= Integer.parseInt(anno[0]);
		        		int G_Last= Integer.parseInt(anno[1]);
		        		String G_mentions = anno[2];
		        		/**
		        		 *  2. Co-occurring word
		        		 *  boundary : 
		        		 *  Sentence Start: Sentence_offsets.get(Target_Sentence)
		        		 *  Sentence Last: Sentence_offsets.get(Target_Sentence+1)
		        		 */
		        		//Find the target sentence
		        		int Target_Sentence=0;
		        		if(SPfound == false) // 1. left : Closed to start of the gene mention 
		        		{
		        			for(int s=0;s Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
		        		if(SPfound == false) // 1. left : Closed to start of the gene mention 
		        		{
		        			int closet_Sp_Start=0;
		        			for(int sp=0;sp= Sentence_Start && Sp_Start >closet_Sp_Start)
					        		{
					        			closet_Sp_Start=Sp_Start;
					        			Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);

					        			if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
					        			{
					        				mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
					        			}
					        			else
					        			{
					        				mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
					        			}
					        			
					        			SPfound=true;
					        		}
								}
				        	}
			        	}
		        		if(SPfound == false) // 2. right : Closed to last of the gene mention
		        		{
		        			int closet_Sp_Last=1000000;
		        			for(int sp=0;sp= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
					        		{
					        			closet_Sp_Last=Sp_Last;
					        			Location2Species_hash.put(Integer.parseInt(anno[0]), taxid);
					        			
					        			if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
					        			{
					        				mention2Location2Species_hash.get(G_mentions.toLowerCase()).put(Integer.parseInt(anno[0]), taxid);
					        			}
					        			else
					        			{
					        				mention2Location2Species_hash.put(G_mentions.toLowerCase(),Location2Species_hash);
					        			}
					        			
					        			SPfound=true;
					        		}
								}
				        	}
		        		}
	    			}
					
					for (int k : Annotations_Gene_hash.keySet()) // k is the index of data.getBioCDocobj().Annotations.get(i).get(j) 
	    			{
						String anno[] = Annotations_Gene_hash.get(k).split("\t");
	    				int G_Start= Integer.parseInt(anno[0]);
		        		int G_Last= Integer.parseInt(anno[1]);
		        		String G_mentions = anno[2];
		        		String G_type = anno[3];
		        		String G_mention_list[]=G_mentions.split("\\|");
						if (G_mention_list.length == 0) {
							InconsistentDataException e = new InconsistentDataException("There is no gene mention but at least one was expected in document with ID " + data.getBioCDocobj().PMIDs.get(i) + " in paragraph with offset " + data.getBioCDocobj().PassageOffsets.get(i).get(j) + " and length " + PassageContext.length() + " beginning with " + PassageContext.substring(0, Math.min(PassageContext.length(), 80)));
							e.setDocId(data.getBioCDocobj().PMIDs.get(i));
							throw e;
						}
		        		String G_mention=G_mention_list[0]; // only use the first term to detect species ; should be updated after SimConcept
		        		
		        		/** 1. prefix */
		        		boolean SPfound = false;
		        		for(String taxid: PrefixIDTarget_hash.keySet())
		        		{
							if(GNormPlus.GeneWithoutSPPrefix_hash.containsKey(G_mention.toLowerCase()))
							{
								//special case, and no need for prefix - SA
							}
							else
							{
								Pattern ptmp = Pattern.compile("^("+PrefixIDTarget_hash.get(taxid)+")([A-Z].*)$");
								Matcher mtmp = ptmp.matcher(G_mention);
								if(mtmp.find())
								{
									String MentionWoPrefix=mtmp.group(2);
									data.getBioCDocobj().Annotations.get(i).get(j).set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+taxid);
									SPfound=true;
									break;
								}
							}
		        		}
		        		
		        		/**
		        		 *  2. Co-occurring word
		        		 *  boundary : 
		        		 *  Sentence Start: Sentence_offsets.get(Target_Sentence)
		        		 *  Sentence Last: Sentence_offsets.get(Target_Sentence+1)
		        		 */
		        		//Find the target sentence
		        		int Target_Sentence=0;
		        		if(SPfound == false) // 1. left : Closed to start of the gene mention 
		        		{
		        			for(int s=0;s Target_Sentence+1){ Sentence_Last = Sentence_offsets.get(Target_Sentence+1); }
		        		if(SPfound == false) // 1. left : Closed to start of the gene mention 
		        		{
		        			int closet_Sp_Start=0;
		        			for(int sp=0;sp= Sentence_Start && Sp_Start >closet_Sp_Start)
					        		{
					        			closet_Sp_Start=Sp_Start;
										if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
				        				{
					        				data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid+"&9606");
				        				}
					        			else
					        			{
					        				data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tLeft:"+taxid);
					        			}
					        			SPfound=true;
					        		}
								}
				        	}
			        	}
		        		if(SPfound == false) // 2. right : Closed to last of the gene mention
		        		{
		        			int closet_Sp_Last=1000000;
		        			for(int sp=0;sp= G_Last && Sp_Last <= Sentence_Last && Sp_Last < closet_Sp_Last)
					        		{
					        			closet_Sp_Last=Sp_Last;
										if(GNormPlus.SP_Virus2Human_hash.containsKey(taxid))
				        				{
					        				data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid+"&9606");
				        				}
					        			else
					        			{
					        				data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tRight:"+taxid);
					        			}
					        			SPfound=true;
					        		}
								}
				        	}
		        		}
		        		
		    			/** 3. Focus species */
		        		if(SPfound == false) // 2. right : Closed to last of the gene mention
		        		{
		        			// 1. only the mentions appeared earlier are inferred
		        			//
		        			if(mention2Location2Species_hash.containsKey(G_mentions.toLowerCase()))
		        			{
		        				int closed_loca=0;
		        				for (int loca_start : mention2Location2Species_hash.get(G_mentions.toLowerCase()).keySet())
			        			{
		        					if(loca_startclosed_loca)
										{
											closed_loca=loca_start;
										}
									}
			        			}
		        				if(closed_loca>0)
								{
		        					if(GNormPlus.SP_Virus2Human_hash.containsKey(Location2Species_hash.get(closed_loca)))
			        				{
				        				data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca)+"&9606");
			        				}
				        			else
				        			{
					        			data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+Location2Species_hash.get(closed_loca));
				        			}
								}
								else
								{
									if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
			        				{
				        				data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
			        				}
				        			else
				        			{
					        			data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
				        			}
								}
		        			}
		        			else
							{
		        				if(GNormPlus.SP_Virus2Human_hash.containsKey(MajorSP))
		        				{
			        				data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP+"&9606");
		        				}
			        			else
			        			{
				        			data.getBioCDocobj().Annotations.get(i).get(j).set(k, Annotations_Gene_hash.get(k)+"\tFocus:"+MajorSP);
			        			}
							}
		        		}
	    			}
				}
			}
		}
		data.getBioCDocobj().BioCOutput(Filename,FilenameBioC,data.getBioCDocobj().Annotations,false,true);
	}
	/**
	 * Assigns one caller-supplied focus species to every non-species annotation and
	 * normalizes the ids of species annotations, then writes the result via
	 * {@code BioCOutput}.
	 *
	 * <p>Each annotation is a tab-separated record. A record with exactly five fields is
	 * treated as a species annotation: every {@code '*'} is removed from its fifth field.
	 * Any other record is treated as a gene annotation:
	 * <ul>
	 *   <li>if the mention (third field) is not listed in
	 *       {@code GNormPlus.GeneWithoutSPPrefix_hash} and starts with the prefix registered
	 *       for {@code FocusSpecies} in {@code GNormPlus.PrefixID_hash} followed by an
	 *       upper-case letter, the mention without the prefix is appended after a
	 *       {@code '|'} and a {@code Prefix:<FocusSpecies>} field is added;</li>
	 *   <li>otherwise a {@code Focus:<FocusSpecies>} field is appended to the record.</li>
	 * </ul>
	 *
	 * <p>When no prefix is registered for {@code FocusSpecies}, prefix detection is skipped
	 * and every gene annotation receives the {@code Focus:} field.
	 *
	 * @param Filename      passed through unchanged to {@code BioCOutput}
	 * @param FilenameBioC  passed through unchanged to {@code BioCOutput}
	 * @param FocusSpecies  the species id written into the {@code Prefix:}/{@code Focus:} field
	 *                      and used as the lookup key in {@code GNormPlus.PrefixID_hash}
	 * @throws IOException        declared for {@code BioCOutput}
	 * @throws XMLStreamException declared for {@code BioCOutput}
	 */
	public void SpeciesAssignment(String Filename,String FilenameBioC,String FocusSpecies) throws IOException, XMLStreamException
	{
		// The prefix expression depends only on FocusSpecies, so it is compiled once here
		// instead of once per gene annotation. A missing prefix must not be concatenated
		// into the regex: it would become the literal text "null" and match mentions
		// that merely start with "null".
		Pattern prefixPattern = null;
		if(GNormPlus.PrefixID_hash.get(FocusSpecies) != null)
		{
			prefixPattern = Pattern.compile("^("+GNormPlus.PrefixID_hash.get(FocusSpecies)+")([A-Z].*)$");
		}

		for (int i = 0; i < data.getBioCDocobj().Annotations.size(); i++) /** PMIDs : i */
		{
			for (int j = 0; j < data.getBioCDocobj().Annotations.get(i).size(); j++) /** Paragraphs : j */
			{
				List<String> passageAnnotations = data.getBioCDocobj().Annotations.get(i).get(j);
				for (int k = 0; k < passageAnnotations.size(); k++) // Annotation : k
				{
					String annotation = passageAnnotations.get(k);
					String anno[] = annotation.split("\t");
					if(anno.length==5) //Species
					{
						// Drop the '*' markers from the species id.
						String id=anno[4].replaceAll("\\*", "");
						passageAnnotations.set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"\t"+anno[3]+"\t"+id);
					}
					else //Gene
					{
						/** 1. prefix */
						boolean SPfound = false;
						// Mentions listed in GeneWithoutSPPrefix_hash are special cases that
						// never get prefix handling.
						if(!GNormPlus.GeneWithoutSPPrefix_hash.containsKey(anno[2].toLowerCase()) && prefixPattern != null)
						{
							Matcher prefixMatcher = prefixPattern.matcher(anno[2]);
							if(prefixMatcher.find())
							{
								String MentionWoPrefix=prefixMatcher.group(2);
								passageAnnotations.set(k, anno[0]+"\t"+anno[1]+"\t"+anno[2]+"|"+MentionWoPrefix+"\t"+anno[3]+"\tPrefix:"+FocusSpecies);
								SPfound=true;
							}
						}
						/** 2. fall back to the focus species */
						if(!SPfound)
						{
							passageAnnotations.set(k, annotation+"\tFocus:"+FocusSpecies);
						}
					}
				}
			}
		}
		data.getBioCDocobj().BioCOutput(Filename,FilenameBioC,data.getBioCDocobj().Annotations,false,true);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy