edu.utah.bmi.nlp.context.common.IOUtil Maven / Gradle / Ivy
Show all versions of fastcontext Show documentation
/*
* ******************************************************************************
* * Copyright 2017 Department of Biomedical Informatics, University of Utah
* *
* * Licensed under the Apache License, Version 2.0 (the "License");
* * you may not use this file except in compliance with the License.
* * You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
* ******************************************************************************
*/
package edu.utah.bmi.nlp.context.common;
import edu.utah.bmi.nlp.context.common.ContextValueSet.TriggerTypes;
import edu.utah.bmi.nlp.core.DeterminantValueSet;
import edu.utah.bmi.nlp.core.TypeDefinition;
import edu.utah.bmi.nlp.fastcontext.FastContext;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
* @author Jianlin_Shi on 7/28/15.
*/
public class IOUtil {
// Window size assigned to a rule whose row carries no window-size cell (see parseCells).
// NOTE(review): presumably counted in tokens — confirm against ContextRule.
private static int defaultWindowSize = 8;
/**
 * Loads rules and feature definitions from a rule resource, choosing the reader from the
 * last four characters (case-insensitive) of the trimmed input:
 * <ul>
 * <li>{@code .tsv} - delimited file read with a tab splitter</li>
 * <li>{@code .csv} - delimited file read with a comma splitter</li>
 * <li>{@code .txt} - delimited file read with the supplied {@code splitter}</li>
 * <li>{@code xlsx} - spreadsheet file (any input ending in "xlsx" matches, with or without a dot)</li>
 * <li>{@code .owl} - handed to {@code readOwlFile}</li>
 * <li>anything else - the input itself is parsed as rule text using {@code splitter}</li>
 * </ul>
 * If no concept definition has been registered once reading finishes, the built-in default
 * feature definitions are loaded.
 *
 * @param ruleFileOrString       a rule file path or the rule text itself; must not be null
 * @param splitter               delimiter used for {@code .txt} files and for inline rule text
 * @param rules                  receives the parsed rules keyed by row id
 * @param conceptFeaturesMap     receives concept definitions keyed by concept name
 * @param featureDefaultValueMap receives each feature's default value
 * @param valueFeatureNameMap    receives the feature name owning each value
 */
public static void readAgnosticRuleResource(String ruleFileOrString, String splitter, HashMap rules,
        HashMap conceptFeaturesMap,
        HashMap featureDefaultValueMap,
        HashMap valueFeatureNameMap) {
    String trimmed = ruleFileOrString.trim();
    // Inputs shorter than four characters cannot carry an extension; the empty suffix
    // routes them to the inline-rule branch instead of failing in substring().
    String suffix = trimmed.length() < 4 ? "" : trimmed.substring(trimmed.length() - 4).toLowerCase();
    switch (suffix) {
        case ".tsv":
            readCSVFile(ruleFileOrString, "\t", rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
            break;
        case ".csv":
            readCSVFile(ruleFileOrString, ",", rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
            break;
        case ".txt":
            readCSVFile(ruleFileOrString, splitter, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
            break;
        case "xlsx":
            readXLSXRuleFile(ruleFileOrString, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
            break;
        case ".owl":
            readOwlFile(ruleFileOrString, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
            // Stop here: without this break the OWL file name would also be parsed as rule text.
            break;
        default:
            readCSVString(ruleFileOrString, splitter, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
            break;
    }
    if (conceptFeaturesMap.isEmpty()) {
        readDefaultRules(rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
    }
}
/**
 * Feeds the built-in definition rows to {@code readCSVString} with "|" as the splitter:
 * one {@code @CONCEPT_FEATURES} row for "Concept" and one {@code @FEATURE_VALUES} row each
 * for Negation, Certainty, Temporality and Experiencer.
 *
 * @param rules                  passed through to {@code readCSVString}
 * @param conceptFeaturesMap     passed through to {@code readCSVString}
 * @param featureDefaultValueMap passed through to {@code readCSVString}
 * @param valueFeatureNameMap    passed through to {@code readCSVString}
 */
private static void readDefaultRules(HashMap rules,
        HashMap conceptFeaturesMap,
        HashMap featureDefaultValueMap,
        HashMap valueFeatureNameMap) {
    StringBuilder defaults = new StringBuilder();
    defaults.append("@CONCEPT_FEATURES|Concept|Negation|Certainty|Temporality|Experiencer\n");
    defaults.append("@FEATURE_VALUES|Negation|affirm|negated\n");
    defaults.append("@FEATURE_VALUES|Certainty|certain|uncertain\n");
    defaults.append("@FEATURE_VALUES|Temporality|present|historical|hypothetical\n");
    // The final row deliberately has no trailing newline.
    defaults.append("@FEATURE_VALUES|Experiencer|patient|nonpatient");
    readCSVString(defaults.toString(), "|", rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
}
/**
 * Placeholder for loading rules from an OWL ontology file. The whole body is commented out,
 * so calling this method currently does nothing and none of the arguments are read or modified.
 *
 * TODO: implement. The disabled draft below refers to names that are not in scope here
 * ({@code owlFile}, {@code output}, {@code DomainOntology}, {@code Modifier}) and must be
 * reconciled with this signature before it can be re-enabled.
 *
 * @param ruleFileOrString       path of the OWL file (currently unused)
 * @param rules                  rule map (currently unused)
 * @param conceptFeaturesMap     concept definition map (currently unused)
 * @param featureDefaultValueMap feature default value map (currently unused)
 * @param valueFeatureNameMap    value-to-feature map (currently unused)
 */
private static void readOwlFile(String ruleFileOrString, HashMap rules,
HashMap conceptFeaturesMap,
HashMap featureDefaultValueMap,
HashMap valueFeatureNameMap) {
// int id = 1;
// try {
// DomainOntology domain = new DomainOntology(owlFile, true);
// ArrayList modifierDictionary = domain.createModifierDictionary();
// for (Modifier modifier : modifierDictionary) {
// String modifierName = modifier.getModName();
// id = addLexicalItemsToRules(output, id, modifierName, TriggerTypes.trigger, modifier.getItems());
// for (Modifier pseudos : modifier.getPseudos()) {
// id = addLexicalItemsToRules(output, id, modifierName, TriggerTypes.pseudo, pseudos.getItems());
// }
// for (Modifier terminations : modifier.getClosures()) {
// id = addLexicalItemsToRules(output, id, modifierName, TriggerTypes.termination, terminations.getItems());
// }
// }
// } catch (Exception e) {
// e.printStackTrace();
// }
}
// private static int addLexicalItemsToRules(LinkedHashMap rules, int id, String modifierName,
// TriggerTypes triggerType, ArrayList lexicalItems) {
// for (LexicalItem mitem : lexicalItems) {
// String term = mitem.getPrefTerm();
// String directionStr = mitem.getActionEn(true);
// int windowSize = mitem.getWindowSize();
// TriggerTypes direction;
// switch (directionStr.charAt(2)) {
// case 'o':
// direction = TriggerTypes.forward;
// break;
// case 'a':
// direction = TriggerTypes.backward;
// break;
// default:
//// modifier ontology use "bidirectional"
// direction = TriggerTypes.both;
// }
// rules.put(id, new ContextRule(direction, triggerType, direction + "_" + modifierName,
// modifierName, term, id, windowSize));
// id++;
// id = addTermsToRules(rules, direction, triggerType, id, modifierName, windowSize, mitem.getSynonym());
// id = addTermsToRules(rules, direction, triggerType, id, modifierName, windowSize, mitem.getAbbreviation());
// id = addTermsToRules(rules, direction, triggerType, id, modifierName, windowSize, mitem.getSubjExp());
// id = addTermsToRules(rules, direction, triggerType, id, modifierName, windowSize, mitem.getMisspelling());
// mitem.getSynonym();
//
// }
// return id;
// }
// private static int addTermsToRules(LinkedHashMap rules, TriggerTypes direction, TriggerTypes triggerType, int id, String modifierName, int windowSize, ArrayList terms) {
// for (String term : terms) {
// rules.put(id, new ContextRule(direction, triggerType, direction + "_" + modifierName,
// modifierName, term, id, windowSize));
// id++;
// }
// return id;
// }
/**
 * Reads rules and definitions from the first sheet of an .xlsx workbook.
 * <p>
 * Every row is flattened to a list of cell strings and handed to {@code parseCells} with the
 * row's position in the iteration as its id. Numeric cells are rendered through their double
 * value (so 8 becomes "8.0"); every other cell is read with {@code getStringCellValue()}.
 * Rows without cells, or whose first cell is blank, are skipped but still consume an id.
 * An {@link IOException} while opening or closing the workbook is printed, not rethrown.
 *
 * @param xlsxFileName           path of the workbook to read
 * @param rules                  receives the parsed rules keyed by row id
 * @param conceptFeaturesMap     receives concept definitions
 * @param featureDefaultValueMap receives each feature's default value
 * @param valueFeatureNameMap    receives the feature name owning each value
 */
public static void readXLSXRuleFile(String xlsxFileName, HashMap rules,
        HashMap conceptFeaturesMap,
        HashMap featureDefaultValueMap,
        HashMap valueFeatureNameMap) {
    // try-with-resources releases the workbook and then the stream, even if a row fails to parse.
    try (FileInputStream inputStream = new FileInputStream(new File(xlsxFileName));
         Workbook workbook = new XSSFWorkbook(inputStream)) {
        Sheet firstSheet = workbook.getSheetAt(0);
        int id = 0;
        for (Row row : firstSheet) {
            ArrayList<String> cells = new ArrayList<>();
            for (Cell cell : row) {
                switch (cell.getCellTypeEnum()) {
                    case NUMERIC:
                        cells.add(String.valueOf(cell.getNumericCellValue()));
                        break;
                    default:
                        cells.add(cell.getStringCellValue());
                        break;
                }
            }
            if (!cells.isEmpty() && cells.get(0).trim().length() > 0) {
                parseCells(cells, id, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
            }
            // The id advances for skipped rows too, so ids keep tracking sheet row order.
            id++;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Reads rules and definitions from a delimited text file decoded as UTF-8.
 * <p>
 * The format is derived from {@code splitter} by {@code getCSVFormat}; every record is
 * forwarded to {@code readCSV}. An {@link IOException} is printed, not rethrown.
 *
 * @param csvFileName            path of the delimited file
 * @param splitter               delimiter string used to choose the CSV format
 * @param rules                  receives the parsed rules keyed by record index
 * @param conceptFeaturesMap     receives concept definitions
 * @param featureDefaultValueMap receives each feature's default value
 * @param valueFeatureNameMap    receives the feature name owning each value
 */
public static void readCSVFile(String csvFileName, String splitter, HashMap rules,
        HashMap conceptFeaturesMap,
        HashMap featureDefaultValueMap,
        HashMap valueFeatureNameMap) {
    CSVFormat csvFormat = getCSVFormat(splitter);
    // The parser owns the underlying file reader; closing it releases the file handle.
    try (CSVParser parser = CSVParser.parse(new File(csvFileName), StandardCharsets.UTF_8, csvFormat)) {
        readCSV(parser, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Reads rules and definitions from delimited text held in memory.
 * <p>
 * The format is derived from {@code splitter} by {@code getCSVFormat}; every record is
 * forwarded to {@code readCSV}. An {@link IOException} is printed, not rethrown.
 *
 * @param csvString              the rule text, one record per line
 * @param splitter               delimiter string used to choose the CSV format
 * @param rules                  receives the parsed rules keyed by record index
 * @param conceptFeaturesMap     receives concept definitions
 * @param featureDefaultValueMap receives each feature's default value
 * @param valueFeatureNameMap    receives the feature name owning each value
 */
public static void readCSVString(String csvString, String splitter, HashMap rules,
        HashMap conceptFeaturesMap,
        HashMap featureDefaultValueMap,
        HashMap valueFeatureNameMap) {
    CSVFormat csvFormat = getCSVFormat(splitter);
    // A single IOException handler suffices: no file is opened here, and FileNotFoundException
    // is an IOException anyway.
    try (CSVParser parser = CSVParser.parse(csvString, csvFormat)) {
        readCSV(parser, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Reads rules and definitions from a list of rule lines.
 * <p>
 * Each line is parsed on its own and its first record is handed to {@code parseCells} with
 * the line's list index as the id. Null entries, lines starting with "#" and blank lines are
 * skipped. An {@link IOException} on one line is printed and the remaining lines are still read.
 *
 * @param ruleStringList         the rule lines, one record per element
 * @param splitter               delimiter string used to choose the CSV format
 * @param rules                  receives the parsed rules keyed by list index
 * @param conceptFeaturesMap     receives concept definitions
 * @param featureDefaultValueMap receives each feature's default value
 * @param valueFeatureNameMap    receives the feature name owning each value
 */
public static void readStringList(ArrayList<String> ruleStringList, String splitter, HashMap rules,
        HashMap conceptFeaturesMap,
        HashMap featureDefaultValueMap,
        HashMap valueFeatureNameMap) {
    CSVFormat csvFormat = getCSVFormat(splitter);
    for (int i = 0; i < ruleStringList.size(); i++) {
        String line = ruleStringList.get(i);
        if (line == null || line.startsWith("#") || line.trim().length() == 0) {
            continue;
        }
        try (CSVParser parser = CSVParser.parse(line, csvFormat)) {
            Iterator<CSVRecord> records = parser.iterator();
            // Guard against a format that yields no record for this line.
            if (!records.hasNext()) {
                continue;
            }
            CSVRecord record = records.next();
            ArrayList<String> cells = new ArrayList<>();
            for (String cell : record) {
                cells.add(cell);
            }
            parseCells(cells, i, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Converts each CSV record into a list of cell strings and hands it to {@code parseCells},
 * using the record's zero-based position in the iteration as its id.
 *
 * @param recordsIterator        source of parsed records
 * @param rules                  receives the parsed rules keyed by record index
 * @param conceptFeaturesMap     receives concept definitions
 * @param featureDefaultValueMap receives each feature's default value
 * @param valueFeatureNameMap    receives the feature name owning each value
 */
private static void readCSV(Iterable<CSVRecord> recordsIterator, HashMap rules,
        HashMap conceptFeaturesMap,
        HashMap featureDefaultValueMap,
        HashMap valueFeatureNameMap) {
    int id = 0;
    for (CSVRecord record : recordsIterator) {
        ArrayList<String> cells = new ArrayList<>();
        for (String cell : record) {
            cells.add(cell);
        }
        // A record without any cell has nothing to interpret; it still consumes an id.
        if (!cells.isEmpty()) {
            parseCells(cells, id, rules, conceptFeaturesMap, featureDefaultValueMap, valueFeatureNameMap);
        }
        id++;
    }
}
/**
 * Interprets one row of cells as a concept definition, a feature definition or a context rule.
 * <p>
 * Row layouts, cells in order:
 * <ul>
 * <li>{@code @CONCEPT_FEATURES ConceptName Feature1Name Feature2Name ...}
 *     (any first cell starting with "@C" or "&amp;C")</li>
 * <li>{@code @FEATURE_VALUES FeatureName DefaultValue Value2 ...}
 *     (any first cell starting with "@F" or "&amp;F")</li>
 * <li>{@code RuleString Direction TriggerType Modifier [WindowSize]} - everything else</li>
 * </ul>
 * Rows whose first cell is blank or starts with "#" (optionally behind a double quote) are
 * ignored. Rows with too few cells are reported on {@code System.err} and skipped.
 *
 * @param cells                  the row's cell values
 * @param id                     id stored with a rule created from this row
 * @param rules                  receives context rules keyed by {@code id}
 * @param conceptFeaturesMap     receives concept definitions keyed by concept name
 * @param featureDefaultValueMap receives each feature's default value
 * @param valueFeatureNameMap    receives the feature name owning each value; when non-empty it
 *                               also restricts which modifiers are accepted for ordinary rules
 * @throws IllegalArgumentException if the direction or trigger type is not a {@code TriggerTypes}
 *                                  constant, or the window size is not a number
 */
private static void parseCells(ArrayList<String> cells, int id, HashMap<Integer, ContextRule> rules,
        HashMap<String, TypeDefinition> conceptFeaturesMap,
        HashMap<String, String> featureDefaultValueMap,
        HashMap<String, String> valueFeatureNameMap) {
    if (cells.isEmpty()) {
        return;
    }
    String firstCell = cells.get(0);
    if (firstCell.startsWith("#") || firstCell.startsWith("\"#") || firstCell.trim().length() == 0) {
        return;
    }
    if (firstCell.length() < 2) {
        // A one-character first cell is too short for a two-character directive prefix, so it
        // can only be a rule string. Such rules are stored without the modifier check below.
        if (cells.size() > 3) {
            rules.put(id, newContextRule(cells, id, parseWindowSize(cells)));
        } else {
            System.err.println("Rule format error: " + cells);
        }
        return;
    }
    switch (firstCell.substring(0, 2)) {
        case "@C":
        case "&C":
            // @CONCEPT_FEATURES
            if (cells.size() < 3) {
                System.err.println("Rule format error: " + cells);
            } else {
                String conceptName = DeterminantValueSet.checkNameSpace(cells.get(1));
                if (!conceptFeaturesMap.containsKey(conceptName)) {
                    conceptFeaturesMap.put(conceptName, new TypeDefinition(conceptName, "", new ArrayList<>()));
                }
                TypeDefinition definition = conceptFeaturesMap.get(conceptName);
                for (String feature : cells.subList(2, cells.size())) {
                    if (feature.trim().length() > 0) {
                        definition.getFeatureValuePairs().put(feature, "");
                    }
                }
            }
            break;
        case "@F":
        case "&F":
            // @FEATURE_VALUES
            if (cells.size() < 2) {
                System.err.println("Rule format error: " + cells);
            } else {
                String featureName = cells.get(1);
                if (cells.size() == 2) {
                    // A feature declared without values gets an empty default.
                    featureDefaultValueMap.put(featureName, "");
                } else {
                    String defaultValue = cells.get(2).trim();
                    featureDefaultValueMap.put(featureName, defaultValue);
                    valueFeatureNameMap.put(defaultValue, featureName);
                    // Registers every listed value as written (untrimmed), default included.
                    for (String value : cells.subList(2, cells.size())) {
                        valueFeatureNameMap.put(value, featureName);
                    }
                }
            }
            break;
        default:
            if (cells.size() < 4) {
                System.err.println("Rule format error: " + cells);
                break;
            }
            int windowSize = parseWindowSize(cells);
            String modifier = cells.get(3);
            if (valueFeatureNameMap.size() == 0 || valueFeatureNameMap.containsKey(modifier)) {
                rules.put(id, newContextRule(cells, id, windowSize));
            } else {
                FastContext.logger.finest("Rule " + id + " " + cells + " has the modifier value not defined in the setting, skip this rule.");
            }
            break;
        // TODO need some key-value pair completeness check for the maps
    }
}

/** Returns the window size from the optional fifth cell, or the default when it is absent or blank. */
private static int parseWindowSize(ArrayList<String> cells) {
    if (cells.size() > 4 && cells.get(4).trim().length() > 0) {
        // Parsed as a double because spreadsheet numeric cells arrive as text such as "8.0".
        return (int) Double.parseDouble(cells.get(4));
    }
    return defaultWindowSize;
}

/** Builds a rule from a row laid out as RuleString, Direction, TriggerType, Modifier. */
private static ContextRule newContextRule(ArrayList<String> cells, int id, int windowSize) {
    String ruleString = cells.get(0);
    String direction = cells.get(1);
    String triggerType = cells.get(2);
    String modifier = cells.get(3);
    String determinant = direction + "_" + modifier;
    return new ContextRule(TriggerTypes.valueOf(direction), TriggerTypes.valueOf(triggerType), determinant, modifier,
            ruleString, id, windowSize);
}
/**
 * Writes the given lines to a file, each followed by a single "\n", replacing any existing
 * content. Text is encoded as UTF-8, the charset {@code readCSVFile} uses when reading.
 * An {@link IOException} is printed, not rethrown.
 *
 * @param content  lines to write, in order
 * @param ruleFile path of the file to create or overwrite
 */
public static void writeFile(ArrayList<String> content, String ruleFile) {
    // try-with-resources flushes and closes the writer even when a write fails midway.
    try (BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(new File(ruleFile)), StandardCharsets.UTF_8))) {
        for (String line : content) {
            writer.write(line + "\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Chooses the CSV format for a delimiter string.
 *
 * @param splitter delimiter string; only its first character is used for custom delimiters
 * @return {@code CSVFormat.TDF} for a tab, {@code CSVFormat.DEFAULT} for a comma and for a null
 *         or empty splitter, otherwise a bare format delimited by the splitter's first character
 */
private static CSVFormat getCSVFormat(String splitter) {
    if (splitter == null || splitter.isEmpty()) {
        return CSVFormat.DEFAULT;
    }
    // Each case returns directly; falling through would let the generic branch override
    // the tab and comma presets.
    switch (splitter) {
        case "\t":
            return CSVFormat.TDF;
        case ",":
            return CSVFormat.DEFAULT;
        default:
            return CSVFormat.newFormat(splitter.charAt(0));
    }
}
}
|