
// org.apache.ctakes.dictionary.lookup.ae.LookupParseUtilities (Maven / Gradle / Ivy)
// The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.dictionary.lookup.ae;
import org.apache.ctakes.core.resource.FileResource;
import org.apache.ctakes.core.resource.JdbcConnectionResource;
import org.apache.ctakes.core.resource.LuceneIndexReaderResource;
import org.apache.ctakes.dictionary.lookup.Dictionary;
import org.apache.ctakes.dictionary.lookup.DictionaryEngine;
import org.apache.ctakes.dictionary.lookup.algorithms.LookupAlgorithm;
import org.apache.ctakes.dictionary.lookup.filter.StringPreLookupFilterImpl;
import org.apache.ctakes.dictionary.lookup.jdbc.JdbcDictionaryImpl;
import org.apache.ctakes.dictionary.lookup.lucene.LuceneDictionaryImpl;
import org.apache.ctakes.dictionary.lookup.strtable.StringTable;
import org.apache.ctakes.dictionary.lookup.strtable.StringTableDictionaryImpl;
import org.apache.ctakes.dictionary.lookup.strtable.StringTableFactory;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.sql.Connection;
import java.util.*;
/**
* @author Mayo Clinic
*/
public class LookupParseUtilities
{
/**
 * Parses a lookup descriptor file and overrides the maximum list size used
 * when Lucene-backed dictionaries are created.
 * <p>
 * The descriptor's {@code dictionaries} element is parsed first; the resulting
 * dictionary map is then used to resolve the {@code lookupBindings} element.
 * Note that {@code maxListSize} is stored in a static field, so it also
 * applies to later calls of the two-argument overload.
 *
 * @param descFile    XML lookup descriptor
 * @param aContext    UIMA context handed to the dictionary and binding parsers
 * @param maxListSize value stored in {@code maxSizeList} (ohnlp-Bugs-3296301)
 * @return a set of LookupSpec objects
 * @throws JDOMException if the descriptor cannot be parsed as XML
 * @throws IOException   if the descriptor cannot be read
 * @throws Exception     if dictionary or binding parsing fails
 */
public static Set parseDescriptor(File descFile, UimaContext aContext, int maxListSize)
        throws JDOMException, IOException, Exception
{
    final Document descriptor = new SAXBuilder().build(descFile);
    // ohnlp-Bugs-3296301: only record the new limit once the descriptor has been read successfully.
    maxSizeList = maxListSize;
    final Element root = descriptor.getRootElement();
    final Map dictionariesById = parseDictionaries(aContext, root.getChild("dictionaries"));
    return parseLookupBindingXml(aContext, dictionariesById, root.getChild("lookupBindings"));
}
/**
 * Parses a lookup descriptor file using whatever maximum list size is
 * currently held in {@code maxSizeList}.
 * <p>
 * The {@code dictionaries} element is parsed into a dictionary map, which is
 * then used to resolve the {@code lookupBindings} element.
 *
 * @param descFile XML lookup descriptor
 * @param aContext UIMA context handed to the dictionary and binding parsers
 * @return a set of LookupSpec objects
 * @throws JDOMException if the descriptor cannot be parsed as XML
 * @throws IOException   if the descriptor cannot be read
 * @throws Exception     if dictionary or binding parsing fails
 */
public static Set parseDescriptor(File descFile, UimaContext aContext)
        throws JDOMException, IOException, Exception
{
    Element descriptorRoot = new SAXBuilder().build(descFile).getRootElement();
    Element dictionariesEl = descriptorRoot.getChild("dictionaries");
    Map engines = parseDictionaries(aContext, dictionariesEl);
    //ohnlp-Bugs-3296301
    Element bindingsEl = descriptorRoot.getChild("lookupBindings");
    return parseLookupBindingXml(aContext, engines, bindingsEl);
}
/**
 * Builds one {@link DictionaryEngine} for every child of the descriptor's
 * {@code dictionaries} element.
 *
 * @param aContext      UIMA context used to resolve each dictionary's external resource
 * @param dictetteersEl the {@code dictionaries} element of the descriptor
 * @return map from each child's {@code id} attribute to the engine built from it;
 *         a later child with the same id replaces an earlier one
 * @throws AnnotatorContextException declared by the signature
 * @throws Exception if the element is missing or any dictionary cannot be built
 */
private static Map<String, DictionaryEngine> parseDictionaries(UimaContext aContext,
        Element dictetteersEl) throws AnnotatorContextException, Exception
{
    if (dictetteersEl == null)
    {
        // Fail with a descriptive message rather than a bare NullPointerException.
        throw new Exception("Descriptor is missing the dictionaries element");
    }
    Map<String, DictionaryEngine> enginesById = new HashMap<>();
    // JDOM2's getChildren() is already a List<Element>, so no iterator or cast is needed.
    for (Element dictEl : dictetteersEl.getChildren())
    {
        String id = dictEl.getAttributeValue("id");
        DictionaryEngine dictEngine = LookupParseUtilities.parseDictionaryXml(
                aContext,
                dictEl);
        enginesById.put(id, dictEngine);
    }
    return enginesById;
}
/**
 * Builds a {@link DictionaryEngine} from a single dictionary element of the descriptor.
 * <p>
 * The element's {@code externalResourceKey} attribute is resolved through the UIMA
 * context, and the first child of its {@code implementation} element selects the
 * backing store: {@code luceneImpl}, {@code jdbcImpl} or {@code csvImpl}. Every
 * {@code fieldName} listed under {@code metaFields} is retained as metadata, and an
 * optional {@code excludeList} is installed as a pre-lookup filter.
 * <p>
 * The {@code lookupField}, {@code implementation} and {@code metaFields} children are
 * dereferenced without null checks, so the descriptor must provide them.
 *
 * @param annotCtx   UIMA context used to look up the external resource
 * @param rootDictEl the dictionary element to parse
 * @return the configured dictionary engine
 * @throws AnnotatorContextException declared by the signature
 * @throws Exception if the external resource is not found, is not of the type the
 *                   chosen implementation expects, or the implementation type is
 *                   not one of the three supported names
 */
private static DictionaryEngine parseDictionaryXml(UimaContext annotCtx,
        Element rootDictEl) throws AnnotatorContextException, Exception
{
    String extResrcKey = rootDictEl.getAttributeValue("externalResourceKey");
    // parseBoolean treats a missing (null) attribute as false.
    boolean keepCase = Boolean.parseBoolean(rootDictEl.getAttributeValue("caseSensitive"));

    Object extResrc = annotCtx.getResourceObject(extResrcKey);
    if (extResrc == null)
    {
        throw new Exception("Unable to find external resource with key:"
                + extResrcKey);
    }

    Element lookupFieldEl = rootDictEl.getChild("lookupField");
    String lookupFieldName = lookupFieldEl.getAttributeValue("fieldName");

    // Only the first child of <implementation> is consulted.
    Element implEl = rootDictEl.getChild("implementation")
            .getChildren()
            .get(0);
    String implType = implEl.getName();

    Dictionary dict;
    if (implType.equals("luceneImpl"))
    {
        if (!(extResrc instanceof LuceneIndexReaderResource))
        {
            throw new Exception("Expected external resource to be:"
                    + LuceneIndexReaderResource.class);
        }
        IndexReader indexReader = ((LuceneIndexReaderResource) extResrc).getIndexReader();
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        // Added 'MaxListSize' ohnlp-Bugs-3296301
        dict = new LuceneDictionaryImpl(indexSearcher, lookupFieldName, maxSizeList);
    }
    else if (implType.equals("jdbcImpl"))
    {
        String tableName = implEl.getAttributeValue("tableName");
        if (!(extResrc instanceof JdbcConnectionResource))
        {
            throw new Exception("Expected external resource to be:"
                    + JdbcConnectionResource.class);
        }
        Connection conn = ((JdbcConnectionResource) extResrc).getConnection();
        dict = new JdbcDictionaryImpl(conn, tableName, lookupFieldName);
    }
    else if (implType.equals("csvImpl"))
    {
        String fieldDelimiter = implEl.getAttributeValue("delimiter");
        if (!(extResrc instanceof FileResource))
        {
            throw new Exception("Expected external resource to be:"
                    + FileResource.class);
        }
        // indexedFieldNames is a comma-separated list; each name is trimmed.
        String idxFieldNameStr = implEl.getAttributeValue("indexedFieldNames");
        StringTokenizer st = new StringTokenizer(idxFieldNameStr, ",");
        String[] idxFieldNameArr = new String[st.countTokens()];
        int arrIdx = 0;
        while (st.hasMoreTokens())
        {
            idxFieldNameArr[arrIdx++] = st.nextToken().trim();
        }
        File csvFile = ((FileResource) extResrc).getFile();
        // The reader is closed as soon as the table has been built.
        // FileReader decodes the file with the JVM's default charset.
        try (FileReader fr = new FileReader(csvFile))
        {
            StringTable strTable = StringTableFactory.build(
                    fr,
                    fieldDelimiter,
                    idxFieldNameArr,
                    true);
            dict = new StringTableDictionaryImpl(strTable, lookupFieldName);
        }
    }
    else
    {
        throw new Exception("Unsupported impl type:" + implType);
    }

    // JDOM2's getChildren() is a List<Element>, so it can be iterated directly.
    for (Element metaFieldEl : rootDictEl.getChild("metaFields").getChildren())
    {
        String metaFieldName = metaFieldEl.getAttributeValue("fieldName");
        dict.retainMetaData(metaFieldName);
    }

    DictionaryEngine dictEngine = new DictionaryEngine(dict, keepCase);

    // The exclude list is optional; a filter is only installed when it has entries.
    Element excludeList = rootDictEl.getChild("excludeList");
    if (excludeList != null)
    {
        List<Element> excludedItems = excludeList.getChildren();
        if (excludedItems != null && !excludedItems.isEmpty())
        {
            addExcludeList(dictEngine, excludedItems.iterator());
        }
    }
    return dictEngine;
}
/*
* Word(s) not to look up
* TODO Consider adding common words as possible performance improvement
*/
private static void addExcludeList(DictionaryEngine ge, Iterator> itr) {
HashSet hs = new HashSet<>();
while(itr.hasNext()) {
Element item = (Element) itr.next();
String s = item.getAttributeValue("value");
System.out.println("Adding exclude value["+s+"]"); // TODO - use logger
hs.add(s);
}
StringPreLookupFilterImpl plf = new StringPreLookupFilterImpl(hs);
ge.addPreLookupFilter(plf);
}
// Parses the children of the lookupBindings element. For each binding it resolves the
// referenced dictionary engine from dictMap, reflectively builds the configured
// LookupInitializer, and loads the configured lookup consumer class.
// Throws Exception("Dictionary undefined: ...") when a binding's idRef is not in dictMap.
//
// NOTE(review): this text is truncated part-way through the method (see the note before
// the constructor loop below). The rest of this method and the signature of the
// properties-parsing helper are not present here; restore them from the upstream source
// before compiling.
private static Set parseLookupBindingXml(UimaContext annotCtx,
Map dictMap, Element lookupBindingsEl) throws Exception {
// Result set; the code that fills and returns it is in the missing part.
Set lsSet = new HashSet<>();
Iterator> itr = lookupBindingsEl.getChildren().iterator();
while (itr.hasNext())
{
Element bindingEl = (Element) itr.next();
// Resolve the dictionary engine this binding refers to.
Element dictEl = bindingEl.getChild("dictionaryRef");
String dictID = dictEl.getAttributeValue("idRef");
DictionaryEngine dictEngine = dictMap.get(dictID);
if (dictEngine == null)
{
throw new Exception("Dictionary undefined: " + dictID);
}
// Constructor signatures: constrArgs is used for the initializer below; the two
// consumer variants are not used in the visible text (presumably in the missing part).
Class>[] constrArgs = { UimaContext.class, Properties.class };
Class>[] constrArgsConsum = { UimaContext.class, Properties.class, int.class };//ohnlp-Bugs-3296301
Class>[] constrArgsConsumB = { UimaContext.class, Properties.class };
// Instantiate the lookup initializer via its (UimaContext, Properties) constructor.
Element lookupInitEl = bindingEl.getChild("lookupInitializer");
String liClassName = lookupInitEl.getAttributeValue("className");
Element liPropertiesEl = lookupInitEl.getChild("properties");
Properties liProps = parsePropertiesXml(liPropertiesEl);
Class> liClass = Class.forName(liClassName);
Constructor> liConstr = liClass.getConstructor(constrArgs);
Object[] liArgs = { annotCtx, liProps };
LookupInitializer li = (LookupInitializer) liConstr.newInstance(liArgs);
// Load the lookup consumer class and list its public constructors.
Element lookupConsumerEl = bindingEl.getChild("lookupConsumer");
String lcClassName = lookupConsumerEl.getAttributeValue("className");
Element lcPropertiesEl = lookupConsumerEl.getChild("properties");
Properties lcProps = parsePropertiesXml(lcPropertiesEl);
Class> lcClass = Class.forName(lcClassName);
Constructor>[] consts = lcClass.getConstructors();
Constructor> lcConstr = null;
Object[] lcArgs = null;
// NOTE(review): the next line fuses the start of a loop over consts with the iterator
// declaration of a different method. From here on, the code copies each child's
// "key"/"value" attributes into props and returns it — presumably the body of
// parsePropertiesXml, which is called above; confirm against the upstream source.
for(int i=0;i itr = propsEl.getChildren().iterator();
while (itr.hasNext())
{
Element propEl = (Element) itr.next();
String key = propEl.getAttributeValue("key");
String value = propEl.getAttributeValue("value");
props.put(key, value);
}
return props;
}
// Maximum list size passed to LuceneDictionaryImpl when a Lucene-backed dictionary is built.
// Defaults to Integer.MAX_VALUE and is overwritten by parseDescriptor(File, UimaContext, int).
// Being static, the last value set applies to every later descriptor parse.
private static int maxSizeList = Integer.MAX_VALUE; //ohnlp-Bugs-3296301
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy