// Source listing for edu.utah.bmi.nlp.fastner.IOUtil (Maven / Gradle / Ivy artifact page)
/*
 * Copyright 2017 Department of Biomedical Informatics, University of Utah
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.utah.bmi.nlp.fastner;
import edu.utah.blulab.domainontology.Anchor;
import edu.utah.blulab.domainontology.DomainOntology;
import edu.utah.blulab.domainontology.LogicExpression;
import edu.utah.blulab.domainontology.Variable;
import edu.utah.bmi.nlp.core.DeterminantValueSet;
import edu.utah.bmi.nlp.core.DeterminantValueSet.Determinants;
import edu.utah.bmi.nlp.core.NERRule;
import edu.utah.bmi.nlp.core.TypeDefinition;
import edu.utah.bmi.nlp.fastcner.UnicodeChecker;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import static edu.utah.bmi.nlp.core.DeterminantValueSet.checkNameSpace;
import static edu.utah.bmi.nlp.fastcner.UnicodeChecker.isChinese;
/**
 * @author Jianlin Shi on 4/20/16.
 */
public class IOUtil {
/** Shared logger for this class, obtained from {@code edu.utah.bmi.nlp.core.IOUtil.getLogger}. */
public static Logger logger = edu.utah.bmi.nlp.core.IOUtil.getLogger(IOUtil.class);
public static HashMap parseRuleStr(String ruleStr, String splitter, boolean caseSensitive) {
HashMap rules = new HashMap<>();
int strLength = ruleStr.trim().length();
String testFileStr = ruleStr.trim().substring(strLength - 4).toLowerCase();
boolean[] thisRuleType = new boolean[]{false, false, false};
LinkedHashMap typeDefinition = new LinkedHashMap<>();
if (testFileStr.equals(".tsv") || testFileStr.equals(".csv") || testFileStr.equals("xlsx") || testFileStr.equals(".owl")) {
thisRuleType = IOUtil.readAgnosticFile(ruleStr, rules, typeDefinition, caseSensitive);
} else {
thisRuleType = IOUtil.readCSVString(ruleStr, rules, typeDefinition, splitter, caseSensitive, thisRuleType);
return rules;
public static boolean[] readOwlFile(String owlFileName, HashMap rules, LinkedHashMap typeDefinition, boolean caseSensitive, boolean[] ruleSupports) {
int ruleType = 0;
int id = 0;
try {
DomainOntology domain = new DomainOntology(owlFileName, true);
ArrayList domainVariables = domain.getAllEvents();
for (Variable var : domainVariables) {
ArrayList> logicExpressions = var.getAnchor();
for (LogicExpression logicExpression : logicExpressions) {
if (logicExpression.isSingleExpression()) {
for (Anchor term : logicExpression) {
String preferredTerm = term.getPrefTerm();
if (preferredTerm == null || preferredTerm.trim().length() == 0) {
System.err.println("Error in owl file at: " + logicExpression.toString());
// TODO enable annotating at variable name level and/or semantic type level
String nameEntityClass = term.getSemanticType().get(0);
nameEntityClass = nameEntityClass.replaceAll(" +", "_").toUpperCase();
ruleSupports = addRule(rules, typeDefinition, new NERRule(++id, caseSensitive ? preferredTerm : preferredTerm.toLowerCase(), nameEntityClass, 0, Determinants.ACTUAL), ruleSupports);
if (term.getSynonym().size() > 0) {
for (String s : term.getSynonym()) {
ruleSupports = addRule(rules, typeDefinition, new NERRule(++id, caseSensitive ? s : s.toLowerCase(), nameEntityClass, 0, Determinants.ACTUAL), ruleSupports);
if (term.getAbbreviation().size() > 0) {
for (String s : term.getAbbreviation())
ruleSupports = addRule(rules, typeDefinition, new NERRule(++id, caseSensitive ? s : s.toLowerCase(), nameEntityClass, 0, Determinants.ACTUAL), ruleSupports);
if (term.getMisspelling().size() > 0) {
for (String s : term.getMisspelling())
ruleSupports = addRule(rules, typeDefinition, new NERRule(++id, caseSensitive ? s : s.toLowerCase(), nameEntityClass, 0, Determinants.ACTUAL), ruleSupports);
if (term.getPseudos().size() > 0) {
for (String s : term.getMisspelling())
ruleSupports = addRule(rules, typeDefinition, new NERRule(++id, caseSensitive ? s : s.toLowerCase(), nameEntityClass, 0, Determinants.PSEUDO), ruleSupports);
} else {"Current FastRule does not support complex NER:\n\t" + logicExpression);
} catch (Exception e) {
return ruleSupports;
public static boolean[] readOwlDirectory(String owlFileDirectory, HashMap rules, boolean caseSensitive) {
Collection files = FileUtils.listFiles(new File(owlFileDirectory), new String[]{"owl"}, true);
LinkedHashMap typeDefinition = new LinkedHashMap<>();
boolean[] thisRuleType = new boolean[]{false, false, false};
for (File file : files) {
thisRuleType = readOwlFile(file.getAbsolutePath(), rules, typeDefinition, caseSensitive, thisRuleType);
return thisRuleType;
public static boolean[] readAgnosticFile(String agnosticFileName, HashMap rules, LinkedHashMap typeDefinition, boolean caseSensitive) {
boolean[] thisRuleType = new boolean[]{false, false, false, false, false, false};
readAgnosticFile(agnosticFileName, rules, typeDefinition, caseSensitive, thisRuleType);
return thisRuleType;
public static boolean[] readAgnosticFile(String agnosticFileName, HashMap rules,
LinkedHashMap typeDefinition, boolean caseSensitive,
boolean[] thisRuleType) {
File agnosticFile = new File(agnosticFileName);
if (agnosticFile.exists()) {
if (agnosticFile.isDirectory()) {
thisRuleType = readOwlDirectory(agnosticFileName, rules, caseSensitive);
} else if (FilenameUtils.getExtension(agnosticFileName).equals("owl")) {
thisRuleType = readOwlFile(agnosticFileName, rules, typeDefinition, caseSensitive, thisRuleType);
} else if (FilenameUtils.getExtension(agnosticFileName).equals("xlsx")) {
thisRuleType = readXLSXRuleFile(agnosticFileName, rules, typeDefinition, caseSensitive, thisRuleType);
} else if (FilenameUtils.getExtension(agnosticFileName).equals("csv")) {
thisRuleType = readCSVFile(agnosticFileName, rules, typeDefinition, CSVFormat.DEFAULT, caseSensitive, thisRuleType);
} else if (FilenameUtils.getExtension(agnosticFileName).equals("tsv")) {
thisRuleType = readCSVFile(agnosticFileName, rules, typeDefinition, CSVFormat.TDF, caseSensitive, thisRuleType);
return thisRuleType;
// public static HashMap readXLSXRuleFile(String xlsxFileName) {
// HashMap rules = new HashMap();
// readXLSXRuleFile(xlsxFileName, rules, FASTRULEFILE, true);
// return rules;
// }
public static boolean[] readXLSXRuleFile(String xlsxFileName, HashMap rules, LinkedHashMap typeDefinition, boolean caseSensitive, boolean[] ruleSupports) {
try {
FileInputStream inputStream = new FileInputStream(new File(xlsxFileName));
Workbook workbook = new XSSFWorkbook(inputStream);
Sheet firstSheet = workbook.getSheetAt(0);
Iterator iterator = firstSheet.iterator();
int id = 0;
while (iterator.hasNext()) {
Row nextRow =;
Iterator cellIterator = nextRow.cellIterator();
ArrayList cells = new ArrayList<>();
while (cellIterator.hasNext()) {
Cell cell =;
if (cells.size() > 0)
ruleSupports = parseCells(cells, id, rules, typeDefinition, caseSensitive, ruleSupports);
} catch (IOException e) {
return ruleSupports;
public static boolean[] readCSVFile(String csvFileName, HashMap rules, LinkedHashMap typeDefinition, CSVFormat csvFormat, boolean caseSensitive, boolean[] ruleSupports) {
try {
Iterable recordsIterator = CSVParser.parse(new File(csvFileName), StandardCharsets.UTF_8, csvFormat);
ruleSupports = readCSV(recordsIterator, rules, typeDefinition, caseSensitive, ruleSupports);
} catch (FileNotFoundException e) {
} catch (IOException e) {
return ruleSupports;
public static boolean[] readCSVString(String csvString, HashMap rules, LinkedHashMap typeDefinition, String splitter, boolean caseSensitive, boolean[] ruleSupports) {
CSVFormat csvFormat = CSVFormat.DEFAULT;
if (splitter.equals("\t")) {
csvFormat = CSVFormat.TDF;
ruleSupports = readCSVString(csvString, rules, typeDefinition, csvFormat, caseSensitive, ruleSupports);
return ruleSupports;
public static boolean[] readCSVString(String csvString, HashMap rules, LinkedHashMap typeDefinition, CSVFormat csvFormat, boolean caseSensitive, boolean[] ruleSupports) {
try {
Iterable recordsIterator = CSVParser.parse(csvString, csvFormat);
ruleSupports = readCSV(recordsIterator, rules, typeDefinition, caseSensitive, ruleSupports);
} catch (FileNotFoundException e) {
} catch (IOException e) {
return ruleSupports;
private static boolean[] readCSV(Iterable recordsIterator, HashMap rules,
LinkedHashMap typeDefinition, boolean caseSensitive, boolean[] ruleSupports) {
int id = 0;
for (CSVRecord record : recordsIterator) {
ArrayList cells = new ArrayList<>();
for (String cell : record) {
// to be back compatible
if ((!cells.get(0).startsWith("@") && !cells.get(0).startsWith("&")) && cells.size() > 1 && !UnicodeChecker.isNumber(cells.get(1)))
cells.add(1, "1");
ruleSupports = parseCells(cells, id, rules, typeDefinition, caseSensitive, ruleSupports);
return ruleSupports;
private static boolean[] parseCells(ArrayList cells, int id, HashMap<
Integer, NERRule> rules, LinkedHashMap typeDefinition, boolean caseSensitive,
boolean[] ruleSupports) {
if (cells.get(0).startsWith("#") || cells.get(0).trim().length() == 0)
return ruleSupports;
if (cells.get(0).startsWith("@") || cells.get(0).startsWith("&")) {
// Rule type should be defined in the 1st line that starting with '@': '@fastner' or '@fastcner'
if (cells.size() == 1) {
ruleSupports = checkFastCRule(cells.get(0));
} else if (cells.size() > 1) {
// new UIMA type definition with '@typeName superTypeName'
// or '@typeName superTypeName newFeature1 newFeature2 newFeature3...'
cells.set(0, cells.get(0).substring(1));
typeDefinition.put(getShortName(cells.get(0)), new TypeDefinition(cells));
return ruleSupports;
if (cells.size() >= 2) {
// if (cells.get(2).indexOf(".") == -1)
// cells.set(2, checkNameSpace(cells.get(2)));
String rule = cells.get(0);
if (UnicodeChecker.isNumber(cells.get(1))) {
String conceptShortName = getShortName(cells.get(2).trim());
if (!typeDefinition.containsKey(conceptShortName)) {
typeDefinition.put(conceptShortName, new TypeDefinition(cells.get(2).trim(), DeterminantValueSet.defaultSuperTypeName, new ArrayList<>()));
ruleSupports = addRule(rules, typeDefinition, new NERRule(id, caseSensitive ? rule : rule.toLowerCase(), cells.get(2).trim(), Double.parseDouble(cells.get(1)), cells.size() > 3 ? Determinants.valueOf(cells.get(3)) : Determinants.ACTUAL), ruleSupports);
} else {
String conceptShortName = getShortName(cells.get(1).trim());
if (!typeDefinition.containsKey(conceptShortName)) {
typeDefinition.put(conceptShortName, new TypeDefinition(cells.get(1).trim(), DeterminantValueSet.defaultSuperTypeName, new ArrayList<>()));
ruleSupports = addRule(rules, typeDefinition, new NERRule(id, caseSensitive ? rule : rule.toLowerCase(), cells.get(1).trim(), 0d, cells.size() > 2 ? Determinants.valueOf(cells.get(2)) : Determinants.ACTUAL), ruleSupports);
} else"Definition format error: line " + id + "\t\t" + cells);
return ruleSupports;
public static HashMap readCRuleString(String ruleString, String splitter) {
int id = 0;
HashMap rules = new HashMap<>();
for (String rule : ruleString.split("\n")) {
rule = rule.trim();
if (rule.length() < 1 || rule.startsWith("#"))
String[] definition = rule.split(splitter);
Determinants determinant = Determinants.ACTUAL;
if (definition.length > 3)
determinant = Determinants.valueOf(definition[3]);
if (definition.length > 2) {
definition[2] = checkNameSpace(definition[2]);
} else if (!rule.trim().startsWith("#")) {"Definition format error: line " + id + "\t\t" + rule);
rules.put(id, new NERRule(id, definition[0], definition[2].trim(), Double.parseDouble(definition[1]), determinant));
return rules;
private static boolean[] addRule(HashMap rules, LinkedHashMap typeDefinition, NERRule rule, boolean[] ruleSupports) {
// support grouping
if (ruleSupports[1] == false && rule.rule.indexOf("(") != -1) {
ruleSupports[1] = true;
// support square bracket
if (ruleSupports[2] == false && rule.rule.indexOf("[") != -1) {
ruleSupports[2] = true;
// support replication grammar '+'
if (ruleSupports[3] == false && rule.rule.indexOf("+") != -1) {
ruleSupports[3] = true;
// support numeric handler
if (ruleSupports[4] == false && ((rule.rule.indexOf("\\>") != -1) || (rule.rule.indexOf("\\<") != -1))) {
ruleSupports[4] = true;
if (ruleSupports[5] == false && isChinese(rule.rule.toCharArray()[0])) {
ruleSupports[5] = true;
rules.put(, rule);
return ruleSupports;
* Rule type should be defined in the 1st line that starting with '@': '@fastner' or '@fastcner' or '@fastcnercn'
* @param ruleString input String that contains rule definitions
* @return if the rule is for FastCNER
private static boolean[] checkFastCRule(String ruleString) {
int begin = ruleString.indexOf("@");
if (begin == -1)
begin = ruleString.indexOf("&");
int end = ruleString.indexOf("\n", begin);
boolean[] ruleSupports = new boolean[]{false, false, false, false, false, false};
String definition = ruleString.substring(begin, end == -1 ? ruleString.length() : end).toLowerCase();
if (definition.indexOf("fastcnercn") != -1) {
ruleSupports[5] = true;
} else if (definition.indexOf("fastcner") != -1) {
ruleSupports[0] = true;
return ruleSupports;
private static String getShortName(String fullName) {
int dot = fullName.lastIndexOf(".");
if (dot != -1) {
fullName = fullName.substring(dot + 1);
return fullName;