org.molgenis.semanticmapper.service.impl.UnitResolverImpl Maven / Gradle / Ivy
package org.molgenis.semanticmapper.service.impl;
import static java.util.Collections.singletonList;
import static java.util.Objects.requireNonNull;
import com.google.common.collect.Sets;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.measure.quantity.Quantity;
import javax.measure.unit.Unit;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.molgenis.data.meta.model.Attribute;
import org.molgenis.ontology.core.model.Ontology;
import org.molgenis.ontology.core.model.OntologyTerm;
import org.molgenis.ontology.core.service.OntologyService;
import org.molgenis.semanticmapper.service.UnitResolver;
import org.molgenis.semanticmapper.utils.UnitHelper;
import org.molgenis.semanticsearch.string.NGramDistanceAlgorithm;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class UnitResolverImpl implements UnitResolver {
private static final Logger LOG = LoggerFactory.getLogger(UnitResolverImpl.class);
static final String UNIT_ONTOLOGY_IRI = "http://purl.obolibrary.org/obo/uo.owl";
private final OntologyService ontologyService;
public UnitResolverImpl(OntologyService ontologyService) {
this.ontologyService = requireNonNull(ontologyService);
}
@Override
public Unit extends Quantity> resolveUnit(Attribute attr) {
Set tokens = tokenize(attr.getLabel(), attr.getDescription());
// Option 1: Check if a term matches a unit
Unit extends Quantity> unit = null;
if (!tokens.isEmpty()) {
for (String term : tokens) {
try {
unit = Unit.valueOf(term);
break;
} catch (IllegalArgumentException e) {
// noop
}
}
if (isUnitEmpty(unit)) {
// Option 2: Search unit ontology for a match
OntologyTerm unitOntologyTerm =
resolveUnitOntologyTerm(
tokens.stream()
.map(this::convertNumberToOntologyTermStyle)
.collect(Collectors.toSet()));
if (unitOntologyTerm != null) {
// try label + synonym labels until hit
for (String synonymLabel : unitOntologyTerm.getSynonyms()) {
try {
unit = Unit.valueOf(synonymLabel);
break;
} catch (IllegalArgumentException e) {
// noop
}
}
}
}
}
if (isUnitEmpty(unit)) {
unit = null;
}
return unit;
}
private OntologyTerm resolveUnitOntologyTerm(Set tokens) {
OntologyTerm unitOntologyTerm;
Ontology unitOntology = ontologyService.getOntology(UNIT_ONTOLOGY_IRI);
if (unitOntology != null) {
if (!tokens.isEmpty()) {
List ontologyIds = singletonList(unitOntology.getId());
List ontologyTerms =
ontologyService.findExactOntologyTerms(ontologyIds, tokens, Integer.MAX_VALUE);
if (ontologyTerms != null && !ontologyTerms.isEmpty()) {
unitOntologyTerm = ontologyTerms.get(0);
} else {
unitOntologyTerm = null;
}
} else {
unitOntologyTerm = null;
}
} else {
LOG.warn("Unit resolver is missing required unit ontology [{}]", UNIT_ONTOLOGY_IRI);
unitOntologyTerm = null;
}
return unitOntologyTerm;
}
String convertNumberToOntologyTermStyle(String term) {
term = UnitHelper.superscriptToNumber(term.replaceAll("\\^", StringUtils.EMPTY));
Pattern pattern = Pattern.compile("\\w+(\\d+)");
Matcher matcher = pattern.matcher(term);
if (matcher.find()) {
String group = matcher.group(1);
String modifiedPart = group.trim();
modifiedPart = "^[" + modifiedPart + "]";
term = term.replaceAll(group, modifiedPart);
}
return QueryParser.escape(term);
}
Set tokenize(String... terms) {
Set tokens = new HashSet<>();
if (terms != null && terms.length > 0) {
Sets.newHashSet(terms).stream()
.filter(StringUtils::isNotBlank)
.map(StringUtils::lowerCase)
.map(this::replaceIllegalChars)
.forEach(
term ->
tokens.addAll(
Sets.newHashSet(term.split("\\s+")).stream()
.filter(this::notPureNumberExpression)
.map(UnitHelper::numberToSuperscript)
.collect(Collectors.toSet())));
tokens.removeAll(NGramDistanceAlgorithm.STOPWORDSLIST);
}
return tokens;
}
boolean isUnitEmpty(Unit extends Quantity> unit) {
return unit == null || StringUtils.isBlank(unit.toString());
}
boolean notPureNumberExpression(String str) {
return !str.matches("\\d+");
}
String replaceIllegalChars(String term) {
return UnitHelper.superscriptToNumber(term).replaceAll("[^a-zA-Z0-9 /\\^]", " ");
}
}