
org.apache.ctakes.ytex.tools.SetupAuiFirstWord Maven / Gradle / Ivy
The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.ytex.tools;
import gov.nih.nlm.nls.lvg.Api.LvgCmdApi;
import org.apache.ctakes.core.nlp.tokenizer.Token;
import org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB;
import org.apache.ctakes.ytex.kernel.KernelContextHolder;
import org.apache.ctakes.ytex.umls.dao.UMLSDao;
import org.apache.ctakes.ytex.umls.model.UmlsAuiFirstWord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.support.TransactionTemplate;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.net.URL;
import java.util.*;
/**
* Sets up the umls_aui_fword table.
*
* @author vijay
*
*/
public class SetupAuiFirstWord {
static private final Logger LOGGER = LoggerFactory.getLogger(SetupAuiFirstWord.class);
// private static final Pattern nonWord = Pattern.compile("\\W");
private TokenizerPTB tokenizer;
private LvgCmdApi lvgCmd;
private Set exclusionSet = null;
/**
* Creates an instance by running the three initialization steps in order:
* tokenizer, LVG exclusion set, LVG.
*
* Tokenizer: initTokenizer() instantiates a TokenizerPTB.
* NOTE(review): earlier documentation described loading a hyphen map from
* "tokenizer/hyphenated.txt" with a freqCutoff of 0 and warned that changing
* this in the TokenizerAnnotator.xml uima config would make the tokenization
* here differ from the tokenization done during document processing. The
* visible initTokenizer() no longer loads such a map - confirm that the
* tokenization still matches document processing.
*
* Initialize exclusionSet from LvgAnnotator.xml. The exclusion set should
* be case insensitive, but it isn't that way in the LvgAnnotator so we
* retain the same functionality (entries are stored exactly as read).
*
* Initialize LVG. copied from
* edu.mayo.bmi.uima.lvg.resource.LvgCmdApiResourceImpl. LVG initialization
* failures are logged by initLvg() rather than propagated.
*
* @throws Exception
*             if the tokenizer or the exclusion set cannot be initialized
*             (initExclusionSet() declares ParserConfigurationException,
*             SAXException and IOException)
*/
public SetupAuiFirstWord() throws Exception {
initTokenizer();
// initialize exclusion set
initExclusionSet();
// initLvg() catches its own exceptions and only logs a warning
initLvg();
}
/**
 * Initializes {@link #lvgCmd} from the {@code lvg.properties} file found on
 * the classpath at {@code org/apache/ctakes/lvg/data/config/lvg.properties}.
 *
 * <p>The LVG root directory is derived by stripping the trailing
 * {@code data/config} from the directory that contains the properties file;
 * the {@code user.dir} system property is then pointed at that root before
 * the {@link LvgCmdApi} is created (presumably so LVG resolves its data
 * files relative to it - confirm against the LVG documentation).
 *
 * <p>This method never throws: if the resource is missing or anything else
 * goes wrong, a warning is logged and {@code lvgCmd} is left unset.
 */
private void initLvg() {
    // See
    // http://lexsrv2.nlm.nih.gov/SPECIALIST/Projects/lvg/2008/docs/userDoc/index.html
    // See
    // http://lexsrv3.nlm.nih.gov/SPECIALIST/Projects/lvg/2008/docs/designDoc/UDF/flow/index.html
    // Lower-case the terms and then uninflect
    // f = using flow components (in this order)
    // l = lower case
    // b = uninflect a term
    final String lvgPropertiesResource = "org/apache/ctakes/lvg/data/config/lvg.properties";
    try {
        URL uri = this.getClass().getClassLoader().getResource(lvgPropertiesResource);
        if (uri == null) {
            // Report the real cause instead of letting a NullPointerException
            // bubble into the generic handler below.
            LOGGER.warn("could not initialize lvg - will not create a stemmed dictionary. "
                    + lvgPropertiesResource + " not found on classpath");
            return;
        }
        LOGGER.info("loading lvg.properties from:" + uri.getPath());
        File f = toFile(uri);
        LOGGER.info(f.getAbsolutePath());
        String configDir = f.getParentFile().getAbsolutePath();
        // Drop the trailing "data/config" to obtain the LVG root directory.
        String lvgDir = configDir.substring(0, configDir.length()
                - "data/config".length());
        System.setProperty("user.dir", lvgDir);
        lvgCmd = new LvgCmdApi("-f:l:b", f.getAbsolutePath());
    } catch (Exception e) {
        LOGGER.warn("could not initialize lvg - will not create a stemmed dictionary.", e);
    }
}

/**
 * Converts a resource URL to a {@link File}.
 *
 * <p>{@link URL#getPath()} keeps percent-escapes (for example {@code %20}
 * for a space), so a file URL is converted through its URI, which decodes
 * them. Non-file URLs, and file URLs that cannot be converted to a URI, fall
 * back to the raw path, which is what this class used unconditionally before.
 *
 * @param resource location of the resource; must not be {@code null}
 * @return a file for the resource location
 */
private static File toFile(URL resource) {
    if ("file".equals(resource.getProtocol())) {
        try {
            return new File(resource.toURI());
        } catch (Exception conversionFailure) {
            // URISyntaxException (malformed escapes) or IllegalArgumentException
            // (e.g. URL with an authority part): use the raw path instead.
            return new File(resource.getPath());
        }
    }
    return new File(resource.getPath());
}
/**
 * Initializes the LVG exclusion set from the {@code ExclusionSet} parameter
 * of the LvgAnnotator descriptor.
 *
 * <p>The descriptor is looked up on the classpath as
 * {@code ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml}; if it is not
 * there, the relative file
 * {@code ../ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml} is tried. If
 * neither exists a warning is logged and the exclusion set stays empty.
 *
 * <p>For every {@code nameValuePair} element whose {@code name} is
 * {@code ExclusionSet}, the value of the first child node of each nested
 * {@code string} element is added to the set. Entries are stored exactly as
 * read. Elements without a {@code name}, and empty {@code name} or
 * {@code string} elements, are skipped instead of failing with a
 * {@link NullPointerException}.
 *
 * @throws ParserConfigurationException
 *             if no XML document builder can be created
 * @throws SAXException
 *             if the descriptor is not well-formed XML
 * @throws IOException
 *             if the descriptor cannot be read or closed
 */
private void initExclusionSet() throws ParserConfigurationException,
        SAXException, IOException {
    this.exclusionSet = new HashSet<String>();
    InputStream isLvgAnno = null;
    try {
        isLvgAnno = this
                .getClass()
                .getClassLoader()
                .getResourceAsStream(
                        "ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml");
        if (isLvgAnno == null) {
            LOGGER.warn("classpath:ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml not available, attempting to load from file system");
            File f = new File("../ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml");
            if (f.exists()) {
                isLvgAnno = new BufferedInputStream(new FileInputStream(f));
            }
        }
        if (isLvgAnno == null) {
            LOGGER.warn("ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml not available, using empty exclusion set");
            return;
        }
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory
                .newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(isLvgAnno);
        NodeList nList = doc.getElementsByTagName("nameValuePair");
        for (int i = 0; i < nList.getLength(); i++) {
            Element e = (Element) nList.item(i);
            // getElementsByTagName only returns elements, so the cast is safe;
            // item(0) is null when the pair has no <name> at all.
            Element nameElement = (Element) e.getElementsByTagName("name").item(0);
            if (nameElement == null || !nameElement.hasChildNodes()) {
                continue;
            }
            String name = nameElement.getChildNodes().item(0).getNodeValue();
            if (!"ExclusionSet".equals(name)) {
                continue;
            }
            NodeList nListEx = e.getElementsByTagName("string");
            for (int j = 0; j < nListEx.getLength(); j++) {
                Element stringElement = (Element) nListEx.item(j);
                // An empty <string/> has no text child to read.
                if (!stringElement.hasChildNodes()) {
                    continue;
                }
                exclusionSet.add(stringElement.getChildNodes()
                        .item(0).getNodeValue());
            }
        }
    } finally {
        if (isLvgAnno != null) {
            isLvgAnno.close();
        }
    }
}
/**
* Initializes the tokenizer by creating a new TokenizerPTB with its no-argument
* constructor.
*
* NOTE(review): this comment previously said the hyphenated word list is
* loaded here; the visible body does not load any file. The declared checked
* exceptions are presumably left over from that earlier implementation -
* confirm before relying on them.
*
* @throws FileNotFoundException
*             declared, but not thrown by any statement visible in this method
* @throws IOException
*             declared, but not thrown by any statement visible in this method
*/
private void initTokenizer() throws FileNotFoundException, IOException {
this.tokenizer = new TokenizerPTB();
}
/**
 * Command-line entry point: constructs a {@code SetupAuiFirstWord} and runs
 * {@code setupAuiFirstWord()} on it.
 *
 * @param args
 *            command-line arguments; not read
 * @throws Exception
 *             if the instance cannot be constructed or the setup fails
 */
public static void main(String[] args) throws Exception {
    new SetupAuiFirstWord().setupAuiFirstWord();
}
public void setupAuiFirstWord() {
UMLSDao umlsDao = KernelContextHolder.getApplicationContext().getBean(
UMLSDao.class);
TransactionTemplate t = new TransactionTemplate(KernelContextHolder
.getApplicationContext().getBean(
PlatformTransactionManager.class));
t.setPropagationBehavior(TransactionTemplate.PROPAGATION_REQUIRES_NEW);
// delete all records
// umlsDao.deleteAuiFirstWord();
// get all auis and their strings
// restart processing after the last aui we processed.
// if this is null, then just process everything
String lastAui = umlsDao.getLastAui();
List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy