All downloads are free. Search and download functionality uses the official Maven repository.

GNormPluslib.PrefixTree Maven / Gradle / Ivy

/**
 * Project: GNormPlus
 * Function: Dictionary lookup by Prefix Tree
 */

package GNormPluslib;

import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PrefixTree
{
	// Prefix tree owned by this instance; the dictionary loaders, insertMention, PrintTree and SaveTree all operate on it.
	private Tree Tr=new Tree();
	
	/*
	 * Read Dictionary and insert Mention into the Prefix Tree
	 */
	// Stop-word table shared by all PrefixTree instances (static). The dictionary loaders add every
	// line of their stop-word file as a key with the constant value "StopWord" and never clear the
	// table, so entries accumulate across calls. Lookups are done with the lower-cased mention.
	public static HashMap StopWord_hash = new HashMap();
	
	public void Hash2Tree(HashMap ID2Names)
	{
		for(String ID : ID2Names.keySet())  
		{
			String NameColumn[]=ID2Names.get(ID).split("\\|");
			for(int i=0;i1)
				{
					Column[0]=Column[0].replace("species:ncbi:","");
					Column[1]=Column[1].replaceAll(" strain=", " ");
					Column[1]=Column[1].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
					Column[1]=Column[1].replaceAll("[\\(\\)]", " ");
					String SpNameColumn[]=Column[1].split("\\|");
					for(int i=0;i=3
							)
						{
							boolean stopword_boolean=false;
							for(String stopword_RegEx : StopWord_hash.keySet())
							{
								Pattern ptmp = Pattern.compile("^"+stopword_RegEx+"$");
								Matcher mtmp = ptmp.matcher(SpNameColumn[i].toLowerCase());
								if(mtmp.find())
								{
									stopword_boolean=true;
								}
							}
							if(stopword_boolean == false)
							{
								Tr.insertMention(SpNameColumn[i],Column[0]);
							}
						}
						/*
						 * Criteria for Gene
						 */
						else if (MentionType.equals("Gene") &&  
								(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
								tmp.length()>=3
								)
						{
							if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
							{
								Tr.insertMention(SpNameColumn[i],Column[0]);
							}
						}
						/*
						 * Criteria for Cell
						 */
						else if (MentionType.equals("Cell") && 
								(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
								tmp.length()>=3
								)
						{
							if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
							{
								Tr.insertMention(SpNameColumn[i],Column[0]);
							}
						}
						/*
						 * others
						 */
						else if ((!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
								tmp.length()>=3
								)
						{
							if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
							{
								Tr.insertMention(SpNameColumn[i],Column[0]);
							}
						}
					}
				}
			}
			inputfile.close();	
		}
		catch(IOException e1){ System.out.println("[Dictionary2Tree_Combine]: Input file is not exist.");}
	}
	/**
	 * Loads a tab-separated gene dictionary into the prefix tree, keeping only the
	 * mentions selected by {@code Preifx}.
	 *
	 * <p>Every line of the stop-word file is first added as a key of the shared
	 * {@code StopWord_hash}. Each dictionary line is then split on tabs: column 0 is
	 * the mention and column 1 the identifier. Lines with fewer than two columns, and
	 * mentions whose lower-cased form is a stop-word key, are skipped.
	 *
	 * <p>Selection rules:
	 * <ul>
	 * <li>{@code ""}: every mention is inserted;</li>
	 * <li>{@code "Num"}: mentions starting with a digit;</li>
	 * <li>{@code "AZNum"}: mentions starting with a lower-case letter followed by a digit;</li>
	 * <li>{@code "lo"}: mentions longer than two characters starting with "lo",
	 *     except those consisting of "loc" followed only by digits;</li>
	 * <li>{@code "un"}: mentions longer than two characters starting with "un",
	 *     except those starting with "unchar";</li>
	 * <li>any other value: mentions longer than two characters whose first two
	 *     characters equal {@code Preifx}.</li>
	 * </ul>
	 *
	 * <p>An {@link IOException} is reported on standard output and not rethrown; mentions
	 * inserted before the failure stay in the tree. Both readers are closed on every path.
	 *
	 * @param Filename  path of the UTF-8 dictionary file (mention TAB identifier)
	 * @param StopWords path of the UTF-8 stop-word file, one entry per line
	 * @param Preifx    selection key described above
	 */
	public void Dictionary2Tree_UniqueGene(String Filename,String StopWords,String Preifx)	
	{
		try 
		{
			/* Stop words: try-with-resources closes the reader even when a read fails. */
			try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8")))
			{
				String line;
				while ((line = br.readLine()) != null)  
				{
					StopWord_hash.put(line, "StopWord");
				}
			}
			
			/* Dictionary */
			try (BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8")))
			{
				String line;
				while ((line = inputfile.readLine()) != null)  
				{
					String Column[]=line.split("\t");
					if(Column.length<2)
					{
						continue; // no identifier column
					}
					String mention=Column[0];
					if(StopWord_hash.containsKey(mention.toLowerCase()))
					{
						continue;
					}
					
					// True when the mention has more than two characters and its first two equal Preifx.
					boolean sharesPrefix = mention.length()>2 && mention.substring(0,2).equals(Preifx);
					
					boolean accept;
					if(Preifx.equals(""))
					{
						accept = true;
					}
					else if(Preifx.equals("Num") && mention.matches("[0-9].*"))
					{
						accept = true;
					}
					else if(Preifx.equals("AZNum") && mention.matches("[a-z][0-9].*"))
					{
						accept = true;
					}
					else if(Preifx.equals("lo") && sharesPrefix)
					{
						accept = !mention.matches("loc[0-9]+");
					}
					else if(Preifx.equals("un") && sharesPrefix)
					{
						accept = !mention.startsWith("unchar"); // remove uncharacterized
					}
					else
					{
						// Also reached by "Num"/"AZNum" when their pattern fails; a two-character
						// prefix can never equal those keys, so nothing is inserted then.
						accept = sharesPrefix;
					}
					
					if(accept)
					{
						Tr.insertMention(mention,Column[1]);
					}
				}
			}
		}
		catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueGene]: Input file is not exist.");}
	}
	/** A strain/variety keyword together with the single delimiter character on each side of it. */
	private static final Pattern SPECIES_STRAIN_KEYWORD = Pattern.compile("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]");
	
	/** Whole-mention form of {@link #SPECIES_STRAIN_KEYWORD}, used with {@code matches()}. */
	private static final Pattern SPECIES_STRAIN_MENTION = Pattern.compile(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*");
	
	/** Any single non-word character, hyphen or underscore. */
	private static final Pattern SPECIES_NON_WORD = Pattern.compile("[\\W\\-\\_]");
	
	/**
	 * Loads a tab-separated species dictionary into the prefix tree, keeping only the
	 * mentions selected by {@code Preifx}.
	 *
	 * <p>Every line of the stop-word file is first added as a key of the shared
	 * {@code StopWord_hash}. Each dictionary line is then split on tabs: column 0 is
	 * the mention and column 1 the identifier. Lines with fewer than two columns, and
	 * mentions whose lower-cased form is a stop-word key, are skipped.
	 *
	 * <p>Selection rules, evaluated in this order:
	 * <ul>
	 * <li>{@code Preifx} is {@code ""}: every mention is selected;</li>
	 * <li>the mention starts with two digits: selected only when {@code Preifx} is {@code "Num"};</li>
	 * <li>the mention starts with two lower-case letters: selected only when it has more than
	 *     two characters and its first two equal {@code Preifx};</li>
	 * <li>any other mention: selected only when {@code Preifx} is {@code "Others"}.</li>
	 * </ul>
	 * A branch for mentions starting with a letter followed by a digit ("AZNum") was
	 * commented out in the original code and remains disabled.
	 *
	 * <p>Selected mentions are inserted through
	 * {@link #insertSpeciesMentionWithoutStrainKeyword(String, String)}.
	 *
	 * <p>An {@link IOException} is reported on standard output and not rethrown; mentions
	 * inserted before the failure stay in the tree. Both readers are closed on every path.
	 *
	 * @param Filename  path of the UTF-8 dictionary file (mention TAB identifier)
	 * @param StopWords path of the UTF-8 stop-word file, one entry per line
	 * @param Preifx    selection key described above
	 */
	public void Dictionary2Tree_UniqueSpecies(String Filename,String StopWords,String Preifx)	
	{
		try 
		{
			/* Stop words: try-with-resources closes the reader even when a read fails. */
			try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8")))
			{
				String line;
				while ((line = br.readLine()) != null)  
				{
					StopWord_hash.put(line, "StopWord");
				}
			}
			
			/* Dictionary */
			try (BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8")))
			{
				String line;
				while ((line = inputfile.readLine()) != null)  
				{
					String Column[]=line.split("\t");
					if(Column.length<2)
					{
						continue; // no identifier column
					}
					String mention=Column[0];
					if(StopWord_hash.containsKey(mention.toLowerCase()))
					{
						continue;
					}
					
					boolean accept;
					if(Preifx.equals("")) //all
					{
						accept = true;
					}
					else if(mention.matches("[0-9][0-9].*"))
					{
						accept = Preifx.equals("Num");
					}
					else if(mention.matches("[a-z][a-z].*"))
					{
						accept = mention.length()>2 && mention.substring(0,2).equals(Preifx);
					}
					else
					{
						accept = Preifx.equals("Others");
					}
					
					if(accept)
					{
						insertSpeciesMentionWithoutStrainKeyword(mention,Column[1]); // mention, id
					}
				}
			}
		}
		catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueSpecies]: Input file is not exist.");}
	}
	
	/**
	 * Inserts one species mention, first stripping strain/variety keywords.
	 *
	 * <p>When the mention contains a keyword (str., strain, var., subsp., ...) with a
	 * non-word character, hyphen or underscore on each side, every such keyword and its
	 * two delimiters are replaced by a single space. The shortened mention is inserted
	 * only if at least 10 characters remain once all non-word characters, hyphens and
	 * underscores are removed; otherwise nothing is inserted. Mentions without a keyword
	 * are inserted unchanged.
	 *
	 * @param mention    dictionary mention (column 0)
	 * @param identifier concept identifier (column 1)
	 */
	private void insertSpeciesMentionWithoutStrainKeyword(String mention, String identifier)
	{
		if(!SPECIES_STRAIN_MENTION.matcher(mention).matches())
		{
			Tr.insertMention(mention,identifier);
			return;
		}
		String mention_rev=SPECIES_STRAIN_KEYWORD.matcher(mention).replaceAll(" ");
		String mention_tmp=SPECIES_NON_WORD.matcher(mention_rev).replaceAll("");
		if(mention_tmp.length()>=10)
		{
			Tr.insertMention(mention_rev,identifier);
		}
	}
	public void TreeFile2Tree(String Filename)	
	{
		try 
		{
			//System.out.println("TreeFile2Tree : " + Filename);
			
			BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
			String line="";
			int count=0;
			while ((line = inputfile.readLine()) != null)  
			{
				String Anno[]=line.split("\t");
				if(Anno.length<2){System.out.println(count+"\t"+line);} //check error
				String LocationInTree = Anno[0];
				String token = Anno[1];
				String identifier="";
				if(Anno.length==3)
				{
					identifier = Anno[2];
				}
				String LocationsInTree[]=LocationInTree.split("-");
				TreeNode tmp = Tr.root;
				for(int i=0;i location = new ArrayList();
		String Menlist[]=Mentions.split("\\|");
		for(int m=0;m=0) //Find Tokens in the links
			{
				if(i == Tkns.length-1){PrefixTranslation = 1;}
				tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); //move point to the link
				find=true;
				i++;
			}
			if(find == true)
			{
				if(i==Tkns.length)
				{
					if(!tmp.Concept.equals(""))
					{
						return tmp.Concept;
					}
					else
					{
						return "-1";
						//gene id is not found.
					}
				}
				else
				{
					return "-2";
					//the gene mention matched a substring in PrefixTree.
				}
			}
			else
			{
				return "-3";
				//mention is not found
			}
		}
		return "-3"; //mention is not found
	}
	
	/*
	 * Search target mention in the Prefix Tree
	 */
	public String MentionMatch_species(String Mentions)
	{
		ArrayList location = new ArrayList();
		String Menlist[]=Mentions.split("\\|");
		for(int m=0;m=0) //Find Tokens in the links
			{
				if(i == Tkns.length-1){PrefixTranslation = 1;}
				tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); //move point to the link
				find=true;
				i++;
			}
			if(find == true)
			{
				if(i==Tkns.length)
				{
					if(!tmp.Concept.equals(""))
					{
						return tmp.Concept;
					}
					else
					{
						return "-1";
						//gene id is not found.
					}
				}
				else
				{
					return "-2";
					//the gene mention matched a substring in PrefixTree.
				}
			}
			else
			{
				return "-3";
				//mention is not found
			}
		}
		return "-3"; //mention is not found
	}
	
	/*
	 * Search target mention in the Prefix Tree
	 * ConceptType: Species|Genus|Cell|CTDGene
	 */
	public ArrayList SearchMentionLocation(String Doc,String ConceptType)
	{
		ArrayList location = new ArrayList();
		Doc=Doc+" XXXX XXXX";
		String Doc_org=Doc;
		Doc=Doc.toLowerCase();
		String Doc_lc=Doc;
		Doc = Doc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
		Doc = Doc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
		Doc = Doc.replaceAll("[\\W^;:,]+", " ");
		
		/* = keep special characters =
		 * 
		String regex="\\s+|(?=\\p{Punct})|(?<=\\p{Punct})";
		String DocTkns[]=Doc.split(regex);
		 */
		
		String DocTkns[]=Doc.split(" ");
		int Offset=0;
		int Start=0;
		int Last=0;
		int FirstTime=0;
		
		while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
		{
			Doc_lc=Doc_lc.substring(1);
			Offset++;
		}
		
		for(int i=0;i=0 ) //Find Tokens in the links
			{
				FirstTime_while++;
				tmp=tmp.links.get(tmp.CheckChild(DocTkns[i],PrefixTranslation)); //move point to the link
				if(Start==0 && FirstTime>0){Start = Offset;} //Start <- Offset 
				if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i]))
				{
					if(DocTkns[i].length()>0)
					{
						Doc_lc=Doc_lc.substring(DocTkns[i].length());
						Offset=Offset+DocTkns[i].length();
					}
				}
				Last = Offset;
				while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
				{
					Doc_lc=Doc_lc.substring(1);
					Offset++;
				}
				i++;
				
				if(ConceptType.equals("Species"))
				{
					if(i0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
						{
							Doc_lc=Doc_lc.substring(1);
							Offset++;
						}
						i++;
					}
				}
				
				if(!tmp.Concept.equals("") && (Last-Start>0)) //Keep found concept
				{
					if(Last=DocTkns.length){break;}
				else if(i==DocTkns.length-1){PrefixTranslation=2;}
				
				//System.out.println(i+"\t"+Start+"\t"+Last+"\t("+FirstTime_while+")\t"+Offset+"\t"+Doc_lc);
				
				if(FirstTime_while==0) // first matched token
				{
					pre_i=i;
					pre_Start=Start;
					pre_Last=Last;
					pre_Doc_lc=Doc_lc;
					pre_Offset=Offset;
				}
			}
			
			if(find == true)
			{
				//System.out.println(find+"\t"+FirstTime_while+"\t"+Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
				if(!tmp.Concept.equals("")) //the last matched token has concept id 
				{
					if(LastStart)
					{
						location.add(Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
					}
				}
				else
				{
					if(!ConceptFound_STR.equals("")) //Keep found concept
					{
						location.add(ConceptFound_STR);
						i = ConceptFound + 1;
					}
					
					if(FirstTime_while>=1)
					{
						i=pre_i;
						Start=pre_Start;
						Last=pre_Last;
						Doc_lc=pre_Doc_lc;
						Offset=pre_Offset;
					}
				}
				Start=0;
				Last=0;
				if(i>0){i--;}
				ConceptFound=i; //Keep found concept
				ConceptFound_STR="";//Keep found concept
			}
			else //if(find == false)
			{
				//System.out.println(find+"\t"+FirstTime_while+"\t"+Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
				
				if(FirstTime_while>=1 && tmp.Concept.equals(""))
				{
					i=pre_i;
					Start=pre_Start;
					Last=pre_Last;
					Doc_lc=pre_Doc_lc;
					Offset=pre_Offset;
				}
				
				if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i]))
				{
					if(DocTkns[i].length()>0)
					{
						Doc_lc=Doc_lc.substring(DocTkns[i].length());
						Offset=Offset+DocTkns[i].length();
					}
				}
			}
			
			while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
			{
				Doc_lc=Doc_lc.substring(1);
				Offset++;
			}
			FirstTime++;
			
			//System.out.println();
		}
		return location;
	}
	
	/*
	 * Print out the Prefix Tree
	 */
	public String PrintTree()
	{
		// Start the pre-order rendering at the root with an empty location prefix.
		final String emptyLocation = "";
		return Tr.PrintTree_preorder(Tr.root, emptyLocation);
	}
	
	/**
	 * Writes the prefix tree to a UTF-8 text file.
	 *
	 * <p>The actual serialization is done by {@code Tr.SaveTree_preorder}, started at
	 * the root with an empty location prefix. The writer is closed on every path,
	 * including when serialization throws.
	 * NOTE(review): the output is presumably the format read by TreeFile2Tree; confirm.
	 *
	 * @param outputfile path of the file to create or overwrite
	 * @throws IOException if the file cannot be opened or written
	 */
	public void SaveTree(String outputfile) throws IOException
	{
		try (BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8")))
		{
			Tr.SaveTree_preorder(Tr.root,"",fr);
		}
	}
	
	
	/**
	 * Adds a single mention to this instance's prefix tree by delegating to the
	 * underlying {@code Tree}.
	 *
	 * @param Mention    mention text to insert
	 * @param Identifier concept identifier to associate with the mention
	 */
	public void insertMention(String Mention, String Identifier)
	{
		this.Tr.insertMention(Mention, Identifier);
	}
}

class Tree 
{
	/*
	 * Prefix Tree - root node
	 */
	public TreeNode root;
	
	public Tree() 
	{ 
		root = new TreeNode("-ROOT-"); 
	}
	
	/*
	 * Insert mention into the tree
	 */
	public void insertMention(String Mention, String Identifier)
	{
		Mention=Mention.toLowerCase();
		
		Mention = Mention.replaceAll("([0-9])([A-Za-z])", "$1 $2");
		Mention = Mention.replaceAll("([A-Za-z])([0-9])", "$1 $2");
		Mention = Mention.replaceAll("[\\W\\-\\_]+", " ");
		/* = keep special characters =
		 * 
		String regex="\\s+|(?=\\p{Punct})|(?<=\\p{Punct})";
		String Tokens[]=Mention.split(regex);
		 */
		String Tokens[]=Mention.split(" ");
		TreeNode tmp = root;
		for(int i=0;i=0)
			{
				tmp=tmp.links.get( tmp.CheckChild(Tokens[i],0) ); //go through next generation (exist node)
				if(i == Tokens.length-1)
				{
					tmp.Concept=Identifier;
				}
			}
			else //not exist
			{
				if(i == Tokens.length-1)
				{
					tmp.InsertToken(Tokens[i],Identifier);
				}
				else
				{
					tmp.InsertToken(Tokens[i]);
				}
				tmp=tmp.links.get(tmp.NumOflinks-1); //go to the next generation (new node)
			}
		}
	}
	
	/*
	 * Print the tree by pre-order
	 */
	public String PrintTree_preorder(TreeNode node, String LocationInTree)
	{
		String opt="";
		if(!node.token.equals("-ROOT-"))//Ignore root
		{
			if(node.Concept.equals(""))
			{
				opt=opt+LocationInTree+"\t"+node.token+"\n";
			}
			else
			{
				opt=opt+LocationInTree+"\t"+node.token+"\t"+node.Concept+"\n";
			}
		} 
		if(!LocationInTree.equals("")){LocationInTree=LocationInTree+"-";}
		for(int i=0;i Hashs;
	ArrayList links;
	
	/**
	 * Creates a childless node for a token that has a concept identifier.
	 *
	 * @param Tok token stored in this node
	 * @param ID  concept identifier stored in this node
	 */
	public TreeNode(String Tok,String ID)
	{
		token = Tok;
		NumOflinks = 0;
		Concept = ID;
		// Diamond operator instead of raw types: element types are inferred from the field declarations.
		links = new ArrayList<>();/*link*/
		Hashs = new HashMap<>();/*hash*/
	}
	/**
	 * Creates a childless node for a token with an empty concept identifier.
	 *
	 * @param Tok token stored in this node
	 */
	public TreeNode(String Tok)
	{
		// Same initialization as the two-argument constructor with an empty concept.
		this(Tok, "");
	}
	/**
	 * Creates a childless node with an empty token and an empty concept identifier.
	 */
	public TreeNode()
	{
		// Same initialization as the two-argument constructor with empty token and concept.
		this("", "");
	}
	
	/**
	 * Renders this node as its token and concept identifier separated by a tab.
	 */
	@Override
	public String toString()
	{
		final String rendered = token + "\t" + Concept;
		return rendered;
	}
	
	/*
	 * Insert an new node under the target node
	 */
	public void InsertToken(String Tok)
	{
		// A node built from a token alone has an empty concept, so this is the
		// two-argument insertion with "" as the identifier.
		InsertToken(Tok, "");
	}
	/**
	 * Appends a new child node holding the given token and concept identifier.
	 *
	 * <p>The child is added at the end of {@code links}, the token is mapped in
	 * {@code Hashs} to the value {@code NumOflinks} had before the call, and
	 * {@code NumOflinks} is then incremented. No check is made for an existing child
	 * with the same token; a repeated token overwrites its entry in {@code Hashs}.
	 * NOTE(review): the mapped value equals the child's index in {@code links} only if
	 * {@code links} is grown exclusively through InsertToken; confirm.
	 *
	 * @param Tok token of the new child
	 * @param ID  concept identifier of the new child
	 */
	public void InsertToken(String Tok,String ID)
	{
		final int slot = NumOflinks;
		
		/*link*/
		links.add(new TreeNode(Tok,ID));
		
		/*hash*/
		Hashs.put(Tok, slot);
		
		NumOflinks = slot + 1;
	}
	
	/*
	 * Check the tokens of children
	 */
	public int CheckChild(String Tok, Integer PrefixTranslation)
	{
		if(Hashs.containsKey(Tok))
		{
			return(Hashs.get(Tok));
		}
		
		if(PrefixTranslation == 1 && Tok.matches("(alpha|beta|gamam|[abg]|[12])")) // SuffixTranslationMap
		{
			if(Hashs.containsKey(GNormPlus.SuffixTranslationMap_hash.get(Tok)))
			{
				return(Hashs.get(GNormPlus.SuffixTranslationMap_hash.get(Tok)));
			}
			
		}
		else if(PrefixTranslation == 2 && Tok.matches("[1-5]")) // for CTDGene feature
		{
			for(int i=0;i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy