
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.dictionary.assertion;
import org.apache.ctakes.core.nlp.tokenizer.OffsetComparator;
import org.apache.ctakes.core.nlp.tokenizer.Token;
import org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.*;
import java.util.*;
/**
* Driver for populating a Lucene index with assertion cue phrases, so that the
* tokenization of the dictionary entries matches the tokenization applied to
* clinical text during pipeline processing. Just as the pipeline can use a file
* of hyphenated words to control which words are treated as a single token,
* dictionary creation can use the same file so that the entries are tokenized
* exactly as the clinical text will be.
*/
@SuppressWarnings( "unchecked" )
public class CreateAssertionLuceneIndexFromDelimitedFile {
private static TokenizerPTB tokenizer = new TokenizerPTB();
// The path to a directory containing one or more pipe-delimited files
// A new directory "assertion_cue_phrase_index" will be created in the
// parent. This new directory will be the lucene index directory.
private static String directoryOfDelimitedFiles = null;
// directoryOfDelimitedFiles =
// "/temp/pipe-delimited-dictionary-data/RxNorm";
private IndexWriter iwriter = null;
private int idCount = 0;
public static final String CUE_PHRASE_FIELD_NAME = "cuePhrase";
public static final String CUE_PHRASE_CATEGORY_FIELD_NAME = "cuePhraseCategory";
public static final String CUE_PHRASE_FAMILY_FIELD_NAME = "cuePhraseFamily";
public static final String CUE_PHRASE_FIRST_WORD_FIELD_NAME = "cuePhraseFirstWord";
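// Index schema: each cue-phrase document carries the raw phrase as an
// analyzed TextField plus category, family, and first word as verbatim
// StringFields, so the latter three can be matched exactly at query time.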
/**
* Constructor
*
* @param tokenizer -
* Used to tokenize the dictionary entries
*/
public CreateAssertionLuceneIndexFromDelimitedFile(TokenizerPTB tokenizer)
throws Exception {
Analyzer analyzer = new StandardAnalyzer();
String defaultLoc = new File(directoryOfDelimitedFiles)
.getAbsolutePath();
boolean error = false;
long numEntries = 0;
try {
Directory directory = FSDirectory.open(new File(
new File(defaultLoc).getParent() + "/assertion_cue_phrase_index").toPath());
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
iwriter = new IndexWriter(directory, indexWriterConfig);
// Process multiple files in directory
File file = new File(defaultLoc);
if (file.isDirectory()) {
String[] processFiles = file.list();
for (String processFileName : processFiles) {
System.out.println("Processing file " + processFileName + " ...");
File nextFile = new File(directoryOfDelimitedFiles + "/" + processFileName);
BufferedReader br = new BufferedReader(new FileReader(nextFile));
String record;
while ((record = br.readLine()) != null) {
String[] splitRecord = record.split("\\|");
if (splitRecord.length == 0) {
continue;
}
String cuePhrase = splitRecord[0];
String cuePhraseCategory = "default_category";
String cuePhraseFamily = "default_family";
// Guard against short records: String.split() drops trailing empty
// fields, so a line like "phrase|category" yields only two fields.
if (splitRecord.length >= 3) {
cuePhraseCategory = splitRecord[1];
cuePhraseFamily = splitRecord[2];
} else if (splitRecord.length == 2) {
cuePhraseCategory = splitRecord[1];
}
if (cuePhraseCategory == null || cuePhraseCategory.isEmpty()) {
cuePhraseCategory = "category__" + cuePhraseFamily;
}
writeToFormatLucene(cuePhrase, cuePhraseCategory, cuePhraseFamily);
numEntries++;
}
br.close();
}
}
} catch (IOException io) {
System.out.println("IO exception caught: " + io.getLocalizedMessage());
error = true;
} finally {
try {
if (iwriter != null) {
iwriter.maybeMerge();
iwriter.close();
}
if (!error) {
System.out.println("Index created with " + numEntries + " entries.");
}
} catch (IOException io) {
System.out.println("IO exception caught on close: " + io.getLocalizedMessage());
}
}
}
public static void main(String[] args) {
if (args.length == 1 || args.length == 3) {
// The hyphenated-words file and frequency cutoff (args 1 and 2) are no
// longer needed; the PTB tokenizer handles hyphenation, so only the
// directory of delimited files (args[0]) is used.
try {
directoryOfDelimitedFiles = args[0];
tokenizer = new TokenizerPTB();
new CreateAssertionLuceneIndexFromDelimitedFile(tokenizer);
} catch (Exception e) {
e.printStackTrace();
}
} else {
System.out.println(getUsage());
}
}
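// Example invocation (hypothetical path; the index is written to a new
// "assertion_cue_phrase_index" directory next to the input directory):
//
//   java org.apache.ctakes.dictionary.assertion.CreateAssertionLuceneIndexFromDelimitedFile \
//       /temp/pipe-delimited-dictionary-data/cue_phrases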
/**
* Loads text from a file.
*
* @param filename the file to read
* @return the file contents, with each line followed by a newline
* @throws FileNotFoundException if the file does not exist
* @throws IOException if the file cannot be read
*/
public static String load(String filename) throws FileNotFoundException,
IOException {
StringBuilder msg = new StringBuilder();
File f = new File(filename);
BufferedReader br = new BufferedReader(new FileReader(f));
String line = br.readLine();
while (line != null) {
msg.append(line).append('\n');
line = br.readLine();
}
br.close();
return msg.toString();
}
/**
* Loads hyphenated words and a frequency value for each from a
* pipe-delimited file whose lines have the form {@code word|frequency}.
*
* @param filename the hyphen file to read
* @return a map from each lowercased hyphenated word to its frequency
* @throws FileNotFoundException if the file does not exist
* @throws IOException if the file cannot be read
*/
public static Map<String, Integer> loadHyphMap(String filename)
throws FileNotFoundException, IOException {
Map<String, Integer> hyphMap = new HashMap<>();
File f = new File(filename);
BufferedReader br = new BufferedReader(new FileReader(f));
String line = br.readLine();
while (line != null) {
StringTokenizer st = new StringTokenizer(line, "|");
if (st.countTokens() == 2) {
String hyphWord = st.nextToken();
Integer freq = Integer.parseInt(st.nextToken());
hyphMap.put(hyphWord.toLowerCase(), freq);
} else {
System.out.println("Invalid hyphen file line: " + line);
}
line = br.readLine();
}
br.close();
return hyphMap;
}
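// A hypothetical hyphen-file line for loadHyphMap above (illustrative
// values only): "x-ray|25" maps "x-ray" to a frequency of 25.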
/**
* Prints out the tokenized results, for debug use.
*
* @param text the original text that was tokenized
* @param results the list of {@link Token} objects produced by the tokenizer
*/
public static void printResults(String text, List results) {
System.out.println("Text: " + text);
for (int i = 0; i < results.size(); i++) {
Token t = (Token) results.get(i);
String typeStr = "";
switch (t.getType()) {
case Token.TYPE_WORD:
typeStr = "word ";
break;
case Token.TYPE_PUNCT:
typeStr = "punctuation";
break;
case Token.TYPE_NUMBER:
typeStr = "number ";
break;
case Token.TYPE_EOL:
typeStr = "end of line";
break;
case Token.TYPE_CONTRACTION:
typeStr = "contraction";
break;
case Token.TYPE_SYMBOL:
typeStr = "symbol ";
break;
default:
typeStr = "unknown ";
}
String capsStr = "";
switch (t.getCaps()) {
case Token.CAPS_ALL:
capsStr = "A";
break;
case Token.CAPS_NONE:
capsStr = "N";
break;
case Token.CAPS_MIXED:
capsStr = "M";
break;
case Token.CAPS_FIRST_ONLY:
capsStr = "F";
break;
default:
capsStr = "?";
}
String numPosStr = "";
switch (t.getNumPosition()) {
case Token.NUM_FIRST:
numPosStr = "F";
break;
case Token.NUM_MIDDLE:
numPosStr = "M";
break;
case Token.NUM_LAST:
numPosStr = "L";
break;
case Token.NUM_NONE:
numPosStr = "N";
break;
default:
numPosStr = "?";
}
String intStr = "";
if (t.isInteger()) {
intStr = "Y";
} else {
intStr = "N";
}
System.out.println("Token:" + " type=[" + typeStr + "]" + " caps=["
+ capsStr + "]" + " npos=[" + numPosStr + "]" + " int=["
+ intStr + "]" + " offsets=[" + t.getStartOffset() + ","
+ t.getEndOffset() + "]" + "\t\t" + "text=["
+ text.substring(t.getStartOffset(), t.getEndOffset())
+ "]");
}
}
/**
* @return A string showing usage example (parameters)
*/
public static String getUsage() {
return "java LucenePopulateDriver [hyphenfile] [freqcutoff]";
}
protected void writeToFormatLucene(String cuePhrase, String cuePhraseCategory, String cuePhraseFamily) {
Document doc = new Document();
try {
idCount++;
if (idCount % 10000 == 0) {
System.out.println(" " + idCount + " entries processed so far");
}
doc.add(new TextField(CUE_PHRASE_FIELD_NAME, cuePhrase, Field.Store.YES));
doc.add(new StringField(CUE_PHRASE_CATEGORY_FIELD_NAME, cuePhraseCategory, Field.Store.YES));
doc.add(new StringField(CUE_PHRASE_FAMILY_FIELD_NAME, cuePhraseFamily, Field.Store.YES));
List list = tokenizer.tokenize(cuePhrase);
Collections.sort(list, new OffsetComparator());
Iterator tokenItr = list.iterator();
Token t;
int tCount = 0;
String firstTokenText = "";
StringBuilder tokenizedCuePhrase = new StringBuilder();
while (tokenItr.hasNext()) {
tCount++;
t = (Token) tokenItr.next();
if (tCount == 1) {
firstTokenText = t.getText(); // first token (aka "first word")
tokenizedCuePhrase.append(t.getText());
} else { // use a blank to separate tokens
tokenizedCuePhrase.append(' ').append(t.getText());
}
}
doc.add(new StringField(CUE_PHRASE_FIRST_WORD_FIELD_NAME, firstTokenText, Field.Store.YES));
iwriter.addDocument(doc);
String data = cuePhrase + "|" + cuePhraseCategory + "|" + tokenizedCuePhrase.toString() + '\n';
writeToFile(data);
} catch (IOException io) {
System.out.println("IOException in document: " + io.getLocalizedMessage());
} catch (Exception exc) {
System.out.println("Exception in document: " + exc.getLocalizedMessage());
}
}
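/*
* For reference, a minimal sketch of reading the index back, assuming a
* Lucene 5+ style reader/searcher API (the calls below are an assumption
* for illustration, not part of this tool). Because the first-word field
* is a StringField, it is indexed verbatim and can be matched with an
* exact TermQuery:
*
*   Directory dir = FSDirectory.open(Paths.get("assertion_cue_phrase_index"));
*   IndexReader reader = DirectoryReader.open(dir);
*   IndexSearcher searcher = new IndexSearcher(reader);
*   Query query = new TermQuery(new Term(CUE_PHRASE_FIRST_WORD_FIELD_NAME, "no"));
*   TopDocs hits = searcher.search(query, 10);
*   for (ScoreDoc sd : hits.scoreDocs) {
*       Document d = searcher.doc(sd.doc);
*       System.out.println(d.get(CUE_PHRASE_FIELD_NAME));
*   }
*   reader.close();
*/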
public void writeToFile(String str) {
// Append one debug record to a hardcoded sample file in the working directory.
try (BufferedWriter out = new BufferedWriter(new FileWriter("sample.txt", true))) {
out.write(str);
} catch (IOException e) {
e.printStackTrace();
}
}
}