GNormPluslib.PrefixTree Maven / Gradle / Ivy
/**
* Project: GNormPlus
* Function: Dictionary lookup by Prefix Tree
*/
package GNormPluslib;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class PrefixTree
{
private Tree Tr=new Tree();
/*
* Read Dictionary and insert Mention into the Prefix Tree
*/
public static HashMap StopWord_hash = new HashMap();
public void Hash2Tree(HashMap ID2Names)
{
for(String ID : ID2Names.keySet())
{
String NameColumn[]=ID2Names.get(ID).split("\\|");
for(int i=0;i1)
{
Column[0]=Column[0].replace("species:ncbi:","");
Column[1]=Column[1].replaceAll(" strain=", " ");
Column[1]=Column[1].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
Column[1]=Column[1].replaceAll("[\\(\\)]", " ");
String SpNameColumn[]=Column[1].split("\\|");
for(int i=0;i=3
)
{
boolean stopword_boolean=false;
for(String stopword_RegEx : StopWord_hash.keySet())
{
Pattern ptmp = Pattern.compile("^"+stopword_RegEx+"$");
Matcher mtmp = ptmp.matcher(SpNameColumn[i].toLowerCase());
if(mtmp.find())
{
stopword_boolean=true;
}
}
if(stopword_boolean == false)
{
Tr.insertMention(SpNameColumn[i],Column[0]);
}
}
/*
* Criteria for Gene
*/
else if (MentionType.equals("Gene") &&
(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
tmp.length()>=3
)
{
if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
{
Tr.insertMention(SpNameColumn[i],Column[0]);
}
}
/*
* Criteria for Cell
*/
else if (MentionType.equals("Cell") &&
(!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
tmp.length()>=3
)
{
if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
{
Tr.insertMention(SpNameColumn[i],Column[0]);
}
}
/*
* others
*/
else if ((!SpNameColumn[i].substring(0, 1).matches("[\\W\\-\\_]")) &&
tmp.length()>=3
)
{
if(!StopWord_hash.containsKey(SpNameColumn[i].toLowerCase()))
{
Tr.insertMention(SpNameColumn[i],Column[0]);
}
}
}
}
}
inputfile.close();
}
catch(IOException e1){ System.out.println("[Dictionary2Tree_Combine]: Input file is not exist.");}
}
public void Dictionary2Tree_UniqueGene(String Filename,String StopWords,String Preifx)
{
try
{
//System.out.println("Dictionary2Tree_UniqueGene : " + Filename);
/** Stop Word */
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8"));
String line="";
while ((line = br.readLine()) != null)
{
StopWord_hash.put(line, "StopWord");
}
br.close();
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
line="";
//int count=0;
while ((line = inputfile.readLine()) != null)
{
//count++;
//if(count%10000==0){ System.out.println(count); }
String Column[]=line.split("\t");
if(Column.length>1)
{
if(!StopWord_hash.containsKey(Column[0].toLowerCase()))
{
if(Preifx.equals(""))
{
Tr.insertMention(Column[0],Column[1]);
}
else if(Preifx.equals("Num") && Column[0].matches("[0-9].*"))
{
Tr.insertMention(Column[0],Column[1]);
}
else if(Preifx.equals("AZNum") && Column[0].matches("[a-z][0-9].*"))
{
Tr.insertMention(Column[0],Column[1]);
}
else if(Preifx.equals("lo") && Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
{
if( ! Column[0].matches("loc[0-9]+"))
{
Tr.insertMention(Column[0],Column[1]);
}
}
else if(Preifx.equals("un") && Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
{
if(Column[0].length()>=6 && Column[0].substring(0,6).equals("unchar"))
{
// remove uncharacterized
}
else
{
Tr.insertMention(Column[0],Column[1]);
}
}
else if(Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
{
Tr.insertMention(Column[0],Column[1]);
}
}
}
}
inputfile.close();
}
catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueGene]: Input file is not exist.");}
}
public void Dictionary2Tree_UniqueSpecies(String Filename,String StopWords,String Preifx)
{
try
{
//System.out.println("Dictionary2Tree_UniqueGene : " + Filename);
/** Stop Word */
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(StopWords), "UTF-8"));
String line="";
while ((line = br.readLine()) != null)
{
StopWord_hash.put(line, "StopWord");
}
br.close();
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
line="";
while ((line = inputfile.readLine()) != null)
{
//count++;
//if(count%10000==0){ System.out.println(count); }
String Column[]=line.split("\t");
if(Column.length>1)
{
if(!StopWord_hash.containsKey(Column[0].toLowerCase()))
{
if(Preifx.equals("")) //all
{
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
{
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
if(mention_tmp.length()>=10)
{
Tr.insertMention(mention_rev,Column[1]);
}
}
else
{
Tr.insertMention(Column[0],Column[1]); // mention, id
}
}
else if(Column[0].matches("[0-9][0-9].*"))
{
if(Preifx.equals("Num"))
{
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
{
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
if(mention_tmp.length()>=10)
{
Tr.insertMention(mention_rev,Column[1]);
}
}
else
{
Tr.insertMention(Column[0],Column[1]); // mention, id
}
}
}
/*
else if(Column[0].matches("[a-z][0-9].*"))
{
if(Preifx.equals("AZNum"))
{
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
{
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
if(mention_tmp.length()>=10)
{
Tr.insertMention(mention_rev,Column[1]);
}
}
else
{
Tr.insertMention(Column[0],Column[1]); // mention, id
}
}
}
*/
else if(Column[0].matches("[a-z][a-z].*"))
{
if(Column[0].length()>2 && Column[0].substring(0,2).equals(Preifx))
{
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
{
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
if(mention_tmp.length()>=10)
{
Tr.insertMention(mention_rev,Column[1]);
}
}
else
{
Tr.insertMention(Column[0],Column[1]); // mention, id
}
}
}
else if(Preifx.equals("Others"))
{
if(Column[0].matches(".*[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_].*"))
{
String mention_rev=Column[0].replaceAll("[\\W\\-\\_](str\\.|strain|substr\\.|substrain|var\\.|variety|variant|subsp\\.|subspecies|pv\\.|pathovars|pathovar|br\\.|biovar)[\\W\\-\\_]", " ");
String mention_tmp=mention_rev.replaceAll("[\\W\\-\\_]","");
if(mention_tmp.length()>=10)
{
Tr.insertMention(mention_rev,Column[1]);
}
}
else
{
Tr.insertMention(Column[0],Column[1]); // mention, id
}
}
}
}
}
inputfile.close();
}
catch(IOException e1){ System.out.println("[Dictionary2Tree_UniqueGene]: Input file is not exist.");}
}
public void TreeFile2Tree(String Filename)
{
try
{
//System.out.println("TreeFile2Tree : " + Filename);
BufferedReader inputfile = new BufferedReader(new InputStreamReader(new FileInputStream(Filename), "UTF-8"));
String line="";
int count=0;
while ((line = inputfile.readLine()) != null)
{
String Anno[]=line.split("\t");
if(Anno.length<2){System.out.println(count+"\t"+line);} //check error
String LocationInTree = Anno[0];
String token = Anno[1];
String identifier="";
if(Anno.length==3)
{
identifier = Anno[2];
}
String LocationsInTree[]=LocationInTree.split("-");
TreeNode tmp = Tr.root;
for(int i=0;i location = new ArrayList();
String Menlist[]=Mentions.split("\\|");
for(int m=0;m=0) //Find Tokens in the links
{
if(i == Tkns.length-1){PrefixTranslation = 1;}
tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); //move point to the link
find=true;
i++;
}
if(find == true)
{
if(i==Tkns.length)
{
if(!tmp.Concept.equals(""))
{
return tmp.Concept;
}
else
{
return "-1";
//gene id is not found.
}
}
else
{
return "-2";
//the gene mention matched a substring in PrefixTree.
}
}
else
{
return "-3";
//mention is not found
}
}
return "-3"; //mention is not found
}
/*
* Search target mention in the Prefix Tree
*/
public String MentionMatch_species(String Mentions)
{
ArrayList location = new ArrayList();
String Menlist[]=Mentions.split("\\|");
for(int m=0;m=0) //Find Tokens in the links
{
if(i == Tkns.length-1){PrefixTranslation = 1;}
tmp=tmp.links.get(tmp.CheckChild(Tkns[i],PrefixTranslation)); //move point to the link
find=true;
i++;
}
if(find == true)
{
if(i==Tkns.length)
{
if(!tmp.Concept.equals(""))
{
return tmp.Concept;
}
else
{
return "-1";
//gene id is not found.
}
}
else
{
return "-2";
//the gene mention matched a substring in PrefixTree.
}
}
else
{
return "-3";
//mention is not found
}
}
return "-3"; //mention is not found
}
/*
* Search target mention in the Prefix Tree
* ConceptType: Species|Genus|Cell|CTDGene
*/
public ArrayList SearchMentionLocation(String Doc,String ConceptType)
{
ArrayList location = new ArrayList();
Doc=Doc+" XXXX XXXX";
String Doc_org=Doc;
Doc=Doc.toLowerCase();
String Doc_lc=Doc;
Doc = Doc.replaceAll("([0-9])([A-Za-z])", "$1 $2");
Doc = Doc.replaceAll("([A-Za-z])([0-9])", "$1 $2");
Doc = Doc.replaceAll("[\\W^;:,]+", " ");
/* = keep special characters =
*
String regex="\\s+|(?=\\p{Punct})|(?<=\\p{Punct})";
String DocTkns[]=Doc.split(regex);
*/
String DocTkns[]=Doc.split(" ");
int Offset=0;
int Start=0;
int Last=0;
int FirstTime=0;
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
{
Doc_lc=Doc_lc.substring(1);
Offset++;
}
for(int i=0;i=0 ) //Find Tokens in the links
{
FirstTime_while++;
tmp=tmp.links.get(tmp.CheckChild(DocTkns[i],PrefixTranslation)); //move point to the link
if(Start==0 && FirstTime>0){Start = Offset;} //Start <- Offset
if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i]))
{
if(DocTkns[i].length()>0)
{
Doc_lc=Doc_lc.substring(DocTkns[i].length());
Offset=Offset+DocTkns[i].length();
}
}
Last = Offset;
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
{
Doc_lc=Doc_lc.substring(1);
Offset++;
}
i++;
if(ConceptType.equals("Species"))
{
if(i0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
{
Doc_lc=Doc_lc.substring(1);
Offset++;
}
i++;
}
}
if(!tmp.Concept.equals("") && (Last-Start>0)) //Keep found concept
{
if(Last=DocTkns.length){break;}
else if(i==DocTkns.length-1){PrefixTranslation=2;}
//System.out.println(i+"\t"+Start+"\t"+Last+"\t("+FirstTime_while+")\t"+Offset+"\t"+Doc_lc);
if(FirstTime_while==0) // first matched token
{
pre_i=i;
pre_Start=Start;
pre_Last=Last;
pre_Doc_lc=Doc_lc;
pre_Offset=Offset;
}
}
if(find == true)
{
//System.out.println(find+"\t"+FirstTime_while+"\t"+Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
if(!tmp.Concept.equals("")) //the last matched token has concept id
{
if(LastStart)
{
location.add(Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
}
}
else
{
if(!ConceptFound_STR.equals("")) //Keep found concept
{
location.add(ConceptFound_STR);
i = ConceptFound + 1;
}
if(FirstTime_while>=1)
{
i=pre_i;
Start=pre_Start;
Last=pre_Last;
Doc_lc=pre_Doc_lc;
Offset=pre_Offset;
}
}
Start=0;
Last=0;
if(i>0){i--;}
ConceptFound=i; //Keep found concept
ConceptFound_STR="";//Keep found concept
}
else //if(find == false)
{
//System.out.println(find+"\t"+FirstTime_while+"\t"+Start+"\t"+Last+"\t"+Doc_org.substring(Start, Last)+"\t"+tmp.Concept);
if(FirstTime_while>=1 && tmp.Concept.equals(""))
{
i=pre_i;
Start=pre_Start;
Last=pre_Last;
Doc_lc=pre_Doc_lc;
Offset=pre_Offset;
}
if(Doc_lc.length()>=DocTkns[i].length() && Doc_lc.substring(0,DocTkns[i].length()).equals(DocTkns[i]))
{
if(DocTkns[i].length()>0)
{
Doc_lc=Doc_lc.substring(DocTkns[i].length());
Offset=Offset+DocTkns[i].length();
}
}
}
while(Doc_lc.length()>0 && Doc_lc.substring(0,1).matches("[\\W]")) //clean the forward whitespace
{
Doc_lc=Doc_lc.substring(1);
Offset++;
}
FirstTime++;
//System.out.println();
}
return location;
}
/*
* Print out the Prefix Tree
*/
public String PrintTree()
{
return Tr.PrintTree_preorder(Tr.root,"");
}
public void SaveTree(String outputfile) throws IOException
{
BufferedWriter fr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputfile), "UTF-8"));
Tr.SaveTree_preorder(Tr.root,"",fr);
fr.close();
}
public void insertMention(String Mention, String Identifier)
{
Tr.insertMention(Mention,Identifier);
}
}
class Tree
{
/*
* Prefix Tree - root node
*/
public TreeNode root;
public Tree()
{
root = new TreeNode("-ROOT-");
}
/*
* Insert mention into the tree
*/
public void insertMention(String Mention, String Identifier)
{
Mention=Mention.toLowerCase();
Mention = Mention.replaceAll("([0-9])([A-Za-z])", "$1 $2");
Mention = Mention.replaceAll("([A-Za-z])([0-9])", "$1 $2");
Mention = Mention.replaceAll("[\\W\\-\\_]+", " ");
/* = keep special characters =
*
String regex="\\s+|(?=\\p{Punct})|(?<=\\p{Punct})";
String Tokens[]=Mention.split(regex);
*/
String Tokens[]=Mention.split(" ");
TreeNode tmp = root;
for(int i=0;i=0)
{
tmp=tmp.links.get( tmp.CheckChild(Tokens[i],0) ); //go through next generation (exist node)
if(i == Tokens.length-1)
{
tmp.Concept=Identifier;
}
}
else //not exist
{
if(i == Tokens.length-1)
{
tmp.InsertToken(Tokens[i],Identifier);
}
else
{
tmp.InsertToken(Tokens[i]);
}
tmp=tmp.links.get(tmp.NumOflinks-1); //go to the next generation (new node)
}
}
}
/*
* Print the tree by pre-order
*/
public String PrintTree_preorder(TreeNode node, String LocationInTree)
{
String opt="";
if(!node.token.equals("-ROOT-"))//Ignore root
{
if(node.Concept.equals(""))
{
opt=opt+LocationInTree+"\t"+node.token+"\n";
}
else
{
opt=opt+LocationInTree+"\t"+node.token+"\t"+node.Concept+"\n";
}
}
if(!LocationInTree.equals("")){LocationInTree=LocationInTree+"-";}
for(int i=0;i Hashs;
ArrayList links;
public TreeNode(String Tok,String ID)
{
token = Tok;
NumOflinks = 0;
Concept = ID;
links = new ArrayList();/*link*/
Hashs = new HashMap();/*hash*/
}
public TreeNode(String Tok)
{
token = Tok;
NumOflinks = 0;
Concept = "";
links = new ArrayList();/*link*/
Hashs = new HashMap();/*hash*/
}
public TreeNode()
{
token = "";
NumOflinks = 0;
Concept = "";
links = new ArrayList();/*link*/
Hashs = new HashMap();/*hash*/
}
public String toString()
{
return (token+"\t"+Concept);
}
/*
* Insert an new node under the target node
*/
public void InsertToken(String Tok)
{
TreeNode NewNode = new TreeNode(Tok);
/*link*/
links.add(NewNode);
/*hash*/
Hashs.put(Tok, NumOflinks);
NumOflinks++;
}
public void InsertToken(String Tok,String ID)
{
TreeNode NewNode = new TreeNode(Tok,ID);
/*link*/
links.add(NewNode);
/*hash*/
Hashs.put(Tok, NumOflinks);
NumOflinks++;
}
/*
* Check the tokens of children
*/
public int CheckChild(String Tok, Integer PrefixTranslation)
{
if(Hashs.containsKey(Tok))
{
return(Hashs.get(Tok));
}
if(PrefixTranslation == 1 && Tok.matches("(alpha|beta|gamam|[abg]|[12])")) // SuffixTranslationMap
{
if(Hashs.containsKey(GNormPlus.SuffixTranslationMap_hash.get(Tok)))
{
return(Hashs.get(GNormPlus.SuffixTranslationMap_hash.get(Tok)));
}
}
else if(PrefixTranslation == 2 && Tok.matches("[1-5]")) // for CTDGene feature
{
for(int i=0;i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy