All Downloads are FREE. Search and download functionalities are using the official Maven repository.

it.uniroma2.art.lime.profiler.LIMEProfiler Maven / Gradle / Ivy

The newest version!
/* This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL
 * was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* Portions created by ART Group, University of Rome Tor Vergata are Copyright (C) 2013 */

package it.uniroma2.art.lime.profiler;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Stream;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.eclipse.rdf4j.common.exception.RDF4JException;
import org.eclipse.rdf4j.common.iteration.Iterations;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.impl.LinkedHashModel;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.util.Literals;
import org.eclipse.rdf4j.model.vocabulary.DCTERMS;
import org.eclipse.rdf4j.model.vocabulary.FOAF;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.VOID;
import org.eclipse.rdf4j.query.BindingSet;
import org.eclipse.rdf4j.query.QueryResults;
import org.eclipse.rdf4j.query.TupleQuery;
import org.eclipse.rdf4j.query.TupleQueryResult;
import org.eclipse.rdf4j.query.impl.SimpleDataset;
import org.eclipse.rdf4j.queryrender.RenderUtils;
import org.eclipse.rdf4j.repository.RepositoryConnection;
import org.eclipse.rdf4j.repository.RepositoryResult;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;

import it.uniroma2.art.lime.model.language.LanguageTagUtils;
import it.uniroma2.art.lime.model.repo.LIMERepositoryConnectionWrapper;
import it.uniroma2.art.lime.model.vocabulary.LIME;
import it.uniroma2.art.lime.profiler.impl.OWLSemanticModelProfiler;
import it.uniroma2.art.lime.profiler.impl.OntoLexLexicalizationModelProfiler;
import it.uniroma2.art.lime.profiler.impl.RDFSLexicalizationModelProfiler;
import it.uniroma2.art.lime.profiler.impl.ResourceLocationUtilsInternal;
import it.uniroma2.art.lime.profiler.impl.SKOSLexicalizationModelProfiler;
import it.uniroma2.art.lime.profiler.impl.SKOSSemanticModelProfiler;
import it.uniroma2.art.lime.profiler.impl.SKOSXLLexicalizationModelProfiler;

public class LIMEProfiler {

	private LIMERepositoryConnectionWrapper metadataConnection;
	private RepositoryConnection dataConnection;
	private IRI dataGraph;
	private static Map knownSemanticModelProfilers;
	private static Map knownLexicalizationModelProfilers;

	private IRI metadataBaseURI;

	static {
		ValueFactory vf = SimpleValueFactory.getInstance();

		knownSemanticModelProfilers = new LinkedHashMap<>();
		knownSemanticModelProfilers.put(vf.createIRI("http://www.w3.org/2004/02/skos/core"),
				new SKOSSemanticModelProfiler());
		knownSemanticModelProfilers.put(vf.createIRI("http://www.w3.org/2002/07/owl"),
				new OWLSemanticModelProfiler());

		knownLexicalizationModelProfilers = new LinkedHashMap<>();
		knownLexicalizationModelProfilers.put(vf.createIRI("http://www.w3.org/ns/lemon/ontolex"),
				new OntoLexLexicalizationModelProfiler());
		knownLexicalizationModelProfilers.put(vf.createIRI("http://www.w3.org/2008/05/skos-xl"),
				new SKOSXLLexicalizationModelProfiler());
		knownLexicalizationModelProfilers.put(vf.createIRI("http://www.w3.org/2004/02/skos/core"),
				new SKOSLexicalizationModelProfiler());
		knownLexicalizationModelProfilers.put(vf.createIRI("http://www.w3.org/2000/01/rdf-schema"),
				new RDFSLexicalizationModelProfiler());
	}

	public LIMEProfiler(RepositoryConnection metadataConnection, IRI metadataBaseURI,
			RepositoryConnection dataConnection, IRI dataGraph) {
		this.metadataConnection = new LIMERepositoryConnectionWrapper(metadataConnection.getRepository(),
				metadataConnection);
		this.metadataBaseURI = metadataBaseURI;
		this.dataConnection = dataConnection;
		this.dataGraph = dataGraph;
	}

	public void profile() throws ProfilerException {
		profile(new ProfilerOptions());
	}

	public void profile(ProfilerOptions options) throws ProfilerException {

		ProfilerContext profilerContext = createNewProfilerContext(options);

		boolean includeInferred = options.isIncludeInferred();
		IRI[] contexts = options.getContexts();
		Optional mainDatasetHolder = metadataConnection.getMainDataset(includeInferred, contexts);

		Resource mainDataset;

		if (mainDatasetHolder.isPresent()) {
			mainDataset = mainDatasetHolder.get();
		} else {
			mainDataset = profilerContext.mintMainDatasetResource();

			metadataConnection.add(metadataBaseURI, RDF.TYPE, VOID.DATASET_DESCRIPTION);
			metadataConnection.add(metadataBaseURI, FOAF.PRIMARY_TOPIC, mainDataset);
			metadataConnection.add(mainDataset, RDF.TYPE, VOID.DATASET);
		}

		// -- Overall statistics:
		// Triple count
		BigInteger triples = BigInteger.valueOf(dataConnection.size(dataGraph));

		metadataConnection.add(mainDataset, VOID.TRIPLES,
				metadataConnection.getValueFactory().createLiteral(triples));

		// Distinct subjects

		TupleQuery distinctSubjectsQuery = dataConnection.prepareTupleQuery(
		// @formatter:off
		    " SELECT (COUNT(DISTINCT ?s) AS ?c) WHERE {\n " +
		    "   ?s ?p ?o . \n" +
		    " }\n"
			// @formatter:on
		);
		SimpleDataset dataset = new SimpleDataset();
		dataset.addDefaultGraph(dataGraph);
		distinctSubjectsQuery.setIncludeInferred(includeInferred);
		Literal distinctSubjectsLiteral = (Literal) QueryResults
				.singleResult(distinctSubjectsQuery.evaluate()).getValue("c");
		metadataConnection.add(mainDataset, VOID.DISTINCT_SUBJECTS, distinctSubjectsLiteral);

		// Distinct objects

		TupleQuery distinctObjectsQuery = dataConnection.prepareTupleQuery(
		// @formatter:off
		    " SELECT (COUNT(DISTINCT ?o) AS ?c) WHERE {\n " +
		    "   ?s ?p ?o . \n" +
		    " }\n"
			// @formatter:on
		);
		distinctObjectsQuery.setIncludeInferred(includeInferred);
		Literal distinctObjectsLiteral = (Literal) QueryResults.singleResult(distinctObjectsQuery.evaluate())
				.getValue("c");
		metadataConnection.add(mainDataset, VOID.DISTINCT_OBJECTS, distinctObjectsLiteral);

		// -- Reference dataset statistics

		Set semanticModels = Sets
				.intersection(
						QueryResults.asSet(metadataConnection.getPropertyIRIs(mainDataset,
								DCTERMS.CONFORMS_TO, includeInferred, contexts)),
						knownSemanticModelProfilers.keySet());

		if (semanticModels.size() > 1) {
			throw new AmbiguousSemanticModelException(mainDataset, semanticModels);
		}

		for (Map.Entry entry : knownSemanticModelProfilers.entrySet()) {
			IRI semanticModel = entry.getKey();
			SemanticModelProfiler semanticModelProfiler = entry.getValue();

			boolean processed = semanticModelProfiler.profile(profilerContext, metadataConnection,
					dataConnection, dataGraph, mainDataset);

			if (processed)
				break;
		}

		// -- Lexicons statistics

		profileLexicons(this, options, metadataConnection, dataConnection, dataGraph, mainDataset);

		// -- LexicalizationSet statistics

		for (Entry entry : knownLexicalizationModelProfilers.entrySet()) {
			IRI semanticModel = entry.getKey();
			LexicalizationModelProfiler lexicalizationModelProfiler = entry.getValue();

			boolean processed = lexicalizationModelProfiler.profile(profilerContext, metadataConnection,
					dataConnection, dataGraph, mainDataset);

			if (processed)
				break;
		}

		// -- ConceptSet statistics

		profileConceptSets(this, options, metadataConnection, dataConnection, dataGraph, mainDataset);

		// -- ConceptualizationSet statistics

		profileConceptualizationSets(this, profilerContext, metadataConnection, dataConnection, dataGraph,
				mainDataset);

		// -- Recognize void:Linkset(s)

		StringBuilder linksetQueryStringBuilder = new StringBuilder();
		linksetQueryStringBuilder.append(
		// @formatter:off
			" PREFIX rdfs:                                                                            \n" +
			" PREFIX owl:                                                                                    \n" +
			" PREFIX skos:                                                                             \n" +
			"                                                                                                                                \n" +
			" SELECT ?subject_referenceDataset ?subject_referenceDatasetUriSpace                                                             \n" +
			"        ?object_referenceDataset ?object_referenceDatasetUriSpace ?mappingProp (COUNT(?subject) as ?linkCount) {                \n" +
			"     VALUES(?mappingBaseProp) {                                                                                                 \n" +
			" 		  (owl:differentFrom)                                                                                                    \n" +
			"         (owl:sameAs)                                                                                                           \n" +
			"         (rdfs:subClassOf)                                                                                                      \n" +
			"         (rdfs:subPropertyOf)                                                                                                   \n" +
			"         (owl:disjointWith)                                                                                                     \n" +
			"         (owl:equivalentClass)                                                                                                  \n" +
			"         (owl:equivalentProperty)                                                                                               \n" +
			"         (owl:propertyDisjointWith)                                                                                             \n" +
			"         (skos:mappingRelation)                                                                                                 \n" +
			"     }                                                                                                                          \n" +
			"     ?mappingProp rdfs:subPropertyOf* ?mappingBaseProp .                                                                        \n" +
			" 	GRAPH ?dataGraph {                                                                                                           \n" +
			" 		?subject ?mappingProp ?object .                                                                                          \n"
			// @formatter:on
		);

		ResourceLocationUtilsInternal.appendUriSpaceLogic(options, dataGraph, linksetQueryStringBuilder,
				metadataConnection, "subject_", "subject");
		ResourceLocationUtilsInternal.appendUriSpaceLogic(options, dataGraph, linksetQueryStringBuilder,
				metadataConnection, "object_", "object");

		linksetQueryStringBuilder.append(
		// @formatter:off
			" 		FILTER (!(BOUND(?subject_referenceDataset) && BOUND(?object_referenceDataset)) || !sameTerm(?subject_referenceDataset, ?object_referenceDataset)) \n" +
			" 		FILTER (!(BOUND(?subject_referenceDatasetUriSpace) && BOUND(?object_referenceDatasetUriSpace)) || !sameTerm(?subject_referenceDatasetUriSpace, ?object_referenceDatasetUriSpace)) \n" +
			" 		FILTER (BOUND(?subject_referenceDataset) || BOUND(?object_referenceDataset) || BOUND(?subject_referenceDatasetUriSpace) || BOUND(?object_referenceDatasetUriSpace))\n"+
			" 		FILTER (!BOUND(?subject_referenceDatasetUriSpace)                                                                        \n" +
			"                  || ?subject_referenceDatasetUriSpace NOT IN (\"http://www.w3.org/2002/07/owl#\",                              \n" +
			"                                                               \"http://www.w3.org/2000/01/rdf-schema#\"))                      \n" +
			" 		FILTER (!BOUND(?object_referenceDatasetUriSpace)                                                                         \n" +
			"                  || ?object_referenceDatasetUriSpace NOT IN (\"http://www.w3.org/2002/07/owl#\",                               \n" +
			"                                                               \"http://www.w3.org/2000/01/rdf-schema#\"))                      \n" +
			" 	}                                                                                                                            \n" +
			" }                                                                                                                              \n" +
			" GROUP BY ?subject_referenceDataset ?subject_referenceDatasetUriSpace ?object_referenceDataset ?object_referenceDatasetUriSpace  ?mappingProp \n" +
			" HAVING (?linkCount > 0)                                                                                                        \n"
			// @formatter:on
		);

		TupleQuery linksetQuery = dataConnection.prepareTupleQuery(linksetQueryStringBuilder.toString());
		linksetQuery.setBinding("dataGraph", dataGraph);
		linksetQuery.setIncludeInferred(includeInferred);

		Map additionalDatasets = new HashMap<>();

		Multimap, LinksetStats> pair2linksets = HashMultimap.create();

		try (TupleQueryResult results = linksetQuery.evaluate()) {
			while (results.hasNext()) {
				BindingSet bs = results.next();

				Resource subjectDataset = ResourceLocationUtilsInternal.getDatasetOrMintNew(profilerContext,
						additionalDatasets, mainDataset, (IRI) bs.getValue("subject_referenceDataset"),
						Optional.ofNullable(bs.getValue("subject_referenceDatasetUriSpace"))
								.map(Value::stringValue).orElse(null));
				Resource objectDataset = ResourceLocationUtilsInternal.getDatasetOrMintNew(profilerContext,
						additionalDatasets, mainDataset, (IRI) bs.getValue("object_referenceDataset"),
						Optional.ofNullable(bs.getValue("object_referenceDatasetUriSpace"))
								.map(Value::stringValue).orElse(null));

				Pair datasetPair = new ImmutablePair<>(subjectDataset, objectDataset);

				LinksetStats linksetStats = new LinksetStats();
				linksetStats.setSubjectsTarget(subjectDataset);
				linksetStats.setObjectsTarget(objectDataset);
				linksetStats.setLinkPredicate((IRI) bs.getValue("mappingProp"));
				linksetStats.setTriples(Literals.getIntegerValue(bs.getValue("linkCount"), BigInteger.ZERO));

				pair2linksets.put(datasetPair, linksetStats);
			}
		}

		Model linksetStatsModel = new LinkedHashModel();

		for (Pair datasetPair : pair2linksets.keySet()) {
			Collection relevantLinksets = pair2linksets.get(datasetPair);

			Resource parentLinkset = null;
			long cumulativeLinks = 0;

			if (relevantLinksets.size() > 1) {
				parentLinkset = profilerContext.mintLinksetResource(datasetPair.getKey(),
						datasetPair.getValue(), null);
			}

			for (LinksetStats linksetStats : relevantLinksets) {
				Resource linkset = profilerContext.mintLinksetResource(datasetPair.getKey(),
						datasetPair.getValue(),
						parentLinkset != null ? linksetStats.getLinkPredicate() : null);

				linksetStats.serialize(linksetStatsModel, linkset);

				if (parentLinkset != null) {
					linksetStatsModel.add(parentLinkset, VOID.SUBSET, linkset);
					cumulativeLinks += linksetStats.getTriples().longValue();
				} else {
					linksetStatsModel.add(mainDataset, VOID.SUBSET, linkset);
				}

			}

			if (parentLinkset != null) {
				LinksetStats parentLinksetStats = new LinksetStats();
				parentLinksetStats.setSubjectsTarget(datasetPair.getKey());
				parentLinksetStats.setObjectsTarget(datasetPair.getValue());
				parentLinksetStats.setTriples(BigInteger.valueOf(cumulativeLinks));

				parentLinksetStats.serialize(linksetStatsModel, parentLinkset);

				metadataConnection.add(mainDataset, VOID.SUBSET, parentLinkset);
			}
		}

		additionalDatasets.forEach((uriSpace, dat) -> {
			linksetStatsModel.add(dat, RDF.TYPE, VOID.DATASET);
			linksetStatsModel.add(dat, VOID.URI_SPACE,
					SimpleValueFactory.getInstance().createLiteral(uriSpace));
		});

		metadataConnection.add(linksetStatsModel);
	}

	protected ProfilerContext createNewProfilerContext(ProfilerOptions options) {
		ProfilerContext profilerContext = new ProfilerContext();
		profilerContext.setMetadataBaseURI(metadataBaseURI);
		profilerContext.setMetadataConnection(metadataConnection);
		profilerContext.setOptions(options);
		return profilerContext;
	}

	// public void profileReferenceDataset(Resource dataset, boolean includeInferred, Resource... contexts)
	// throws ProfilerException {
	// Optional semanticModelProfilerHolder = QueryResults
	// .stream(metadataConnection.getProperties(dataset, DCTERMS.CONFORMS_TO, includeInferred,
	// contexts))
	// .flatMap(new CastFlatMapper<>(IRI.class)).map(this::getSemanticModelProfiler2)
	// .filter(Optional::isPresent).map(Optional::get).findAny();
	//
	// if (semanticModelProfilerHolder.isPresent()) {
	// SemanticModelProfiler semanticModelProfiler = semanticModelProfilerHolder.get();
	// semanticModelProfiler.profile(metadataConnection, dataConnection, dataset);
	// } else {
	// throw new SemanticModelNotRecognizedException();
	// }
	// }

	public void profileLexicons(LIMEProfiler profiler, ProfilerOptions options,
			LIMERepositoryConnectionWrapper metadataConnection, RepositoryConnection dataConnection,
			IRI dataGraph, Resource mainDataset) throws ProfilerException {
		TupleQuery query = dataConnection.prepareTupleQuery(
		// @formatter:off
				" PREFIX ontolex:                             \n" +
				" PREFIX lime:                                   \n" +
				"                                                                                  \n" +
				" SELECT ?lexiconDataset (GROUP_CONCAT(DISTINCT ?langT) as ?lang)                  \n" +
				"        (COUNT(DISTINCT ?lexicalEntry) as ?lexicalEntries){                       \n" +
				" 	GRAPH ?dataGraph {                                                             \n" +
				" 		?lexiconDataset lime:entry ?lexicalEntry .                                 \n" +
				" 		?lexicalEntry ontolex:canonicalForm/ontolex:writtenRep ?label .            \n" +
				" 		BIND(LANG(?label) as ?langT)                                               \n" +
				" 	}                                                                              \n" +
				" }                                                                                \n" +
				" GROUP BY ?lexiconDataset                                                         \n" +
				" HAVING BOUND(?lexiconDataset)                                                    \n"
				// @formatter:on
		);
		query.setBinding("dataGraph", dataGraph);

		try (TupleQueryResult results = query.evaluate()) {
			while (results.hasNext()) {
				BindingSet lexiconBindingSet = results.next();

				Resource lexiconDataset = (Resource) lexiconBindingSet.getValue("lexiconDataset");
				String lang = lexiconBindingSet.getValue("lang").stringValue();
				BigInteger lexicalEntries = Literals
						.getIntegerValue(lexiconBindingSet.getValue("lexicalEntries"), BigInteger.ZERO);

				if (lang.contains(",")) {
					throw new ProfilerException("Lexicon \"" + lexiconDataset.stringValue()
							+ "\" has ambiguous languages: " + lang);
				}

				if (lang.isEmpty()) {
					throw new ProfilerException(
							"No language information can be computed for lexicon: " + lexiconDataset);
				}

				LexiconStats lexiconStats = new LexiconStats();
				lexiconStats.setLanguageTag(lang);
				LanguageTagUtils.toLexvo(lang).ifPresent(lexiconStats::setLanguageLexvo);
				LanguageTagUtils.toLOC(lang).ifPresent(lexiconStats::setLanguageLOC);
				lexiconStats.setLexicalEntries(lexicalEntries);

				Model model = new LinkedHashModel();
				lexiconStats.serialize(model, lexiconDataset);

				metadataConnection.add(model);

				try (RepositoryResult repositoryResult = dataConnection.getStatements(lexiconDataset, DCTERMS.TITLE, null, dataGraph);
					 Stream stream = QueryResults.stream(repositoryResult)) {
					stream.forEach(s -> metadataConnection.add(s, (Resource) null));
				}
				metadataConnection.add(mainDataset, VOID.SUBSET, lexiconDataset);
			}
		}

	}

	public void profileConceptSets(LIMEProfiler profiler, ProfilerOptions options,
			LIMERepositoryConnectionWrapper metadataConnection, RepositoryConnection dataConnection,
			IRI dataGraph, Resource mainDataset) throws ProfilerException {
		TupleQuery query = dataConnection.prepareTupleQuery(
		// @formatter:off
				" PREFIX ontolex:                             \n" +
				" PREFIX skos:                               \n" +
				"                                                                                  \n" +
				" SELECT ?conceptSet (COUNT(DISTINCT ?concept) as ?concepts) {                     \n" +
				"   ?conceptSetCls rdfs:subClassOf* ontolex:ConceptSet .                           \n" +
				" 	GRAPH ?dataGraph {                                                             \n" +
				"       ?conceptSet a ?conceptSetCls .                                             \n" +
				" 		?concept skos:inScheme|skos:topConceptOf ?conceptSet .                     \n" +
				" 	}                                                                              \n" +
				" }                                                                                \n" +
				" GROUP BY ?conceptSet                                                             \n" +
				" HAVING BOUND(?conceptSet)                                                        \n"
				// @formatter:on
		);
		query.setBinding("dataGraph", dataGraph);

		try (TupleQueryResult results = query.evaluate()) {
			while (results.hasNext()) {
				BindingSet conceptSetBindingSet = results.next();

				Resource conceptSet = (Resource) conceptSetBindingSet.getValue("conceptSet");
				BigInteger concepts = Literals.getIntegerValue(conceptSetBindingSet.getValue("concepts"),
						BigInteger.ZERO);

				ConceptSetStats conceptSetStats = new ConceptSetStats();
				conceptSetStats.setConcepts(concepts);

				Model model = new LinkedHashModel();
				conceptSetStats.serialize(model, conceptSet);

				try (RepositoryResult repositoryResult = dataConnection.getStatements(conceptSet, DCTERMS.TITLE, null, dataGraph);
					 Stream stream = QueryResults.stream(repositoryResult)) {
					stream.forEach(s -> metadataConnection.add(s, (Resource) null));
				}
				metadataConnection.add(model);
				metadataConnection.add(mainDataset, VOID.SUBSET, conceptSet);
			}
		}

	}

	public void profileConceptualizationSets(LIMEProfiler profiler, ProfilerContext profilerContext,
			LIMERepositoryConnectionWrapper metadataConnection, RepositoryConnection dataConnection,
			IRI dataGraph, Resource mainDataset) throws ProfilerException {
		ValueFactory vf = metadataConnection.getValueFactory();

		try {
			StringBuilder sb = new StringBuilder(
			// @formatter:off
				"PREFIX rdfs:                              \n" +
				"PREFIX ontolex:                             \n" +
				"PREFIX lime:                                   \n" +
				"PREFIX skos:                               \n" +
				"SELECT ?conceptSet ?lexiconDataset                                               \n" +
				"       ?conceptualization_referenceDatasetUriSpace                               \n" +
				"       (COUNT(DISTINCT ?concept) as ?concepts)                                   \n" +
				"       (COUNT(?concept) as ?conceptualizations)                                  \n" +
				"       (COUNT(DISTINCT ?lexicalEntry) as ?lexicalEntries) {                      \n" +
				"  {SELECT DISTINCT ?lexicalEntry ?concept ?conceptSet ?lexiconDataset {          \n" +
				"     GRAPH " + RenderUtils.toSPARQL(dataGraph) + " {                             \n" +
				"       ?lexicalEntry                                                             \n" +
				"         (ontolex:sense|^ontolex:isSenseOf)                                      \n" +
				"           /(ontolex:isLexicalizedSenseOf|^ontolex:lexicalizedSense)             \n" +
				"         |(ontolex:evokes|^ontolex:isEvokedBy)                                   \n" +
				"           ?concept .                                                            \n" +
				"       OPTIONAL {                                                                \n" +
				"          ?concept skos:topConceptOf|skos:inScheme ?conceptSet .                 \n" +
				"       }                                                                         \n" +
				"       ?lexiconDataset lime:entry ?lexicalEntry .                                \n" +
				"    }                                                                            \n" +
				"  }}                                                                             \n" +                            
				(!profilerContext.getOptions().isDefaultToLocalReference()
				?
				" bind(IF(BOUND(?conceptSet), ?unboundVariable, IF(not exists {graph " + RenderUtils.toSPARQL(dataGraph) + " {?concept a []}},  \n" +             
				"  				REPLACE(STR(?concept), \"(.+(#|\\\\/)).*\", \"$1\"),              \n" +
				"                                ?unboundVariable)) as ?conceptualization_referenceDatasetUriSpace) \n"   
				:
				"") +
                "                                                                                 \n" +
				" }                                                                               \n" +
				"GROUP BY ?conceptSet ?conceptualization_referenceDatasetUriSpace ?lexiconDataset \n" +
				"HAVING BOUND(?lexiconDataset)"
				// @formatter:on
			);

			Map additionalRefDatasetStats = new HashMap<>();
			Map additionalRefDatasetRes = new HashMap<>();

			TupleQuery query = dataConnection.prepareTupleQuery(sb.toString());

			try (TupleQueryResult statResult = query.evaluate()) {
				while (statResult.hasNext()) {
					BindingSet stats = statResult.next();

					/* @Nullable */ Resource conceptSet = (Resource) stats.getValue("conceptSet");
					/* @Nullable */ String conceptualization_referenceDatasetUriSpace = Optional
							.ofNullable(stats.getValue("conceptualization_referenceDatasetUriSpace"))
							.map(Value::stringValue).orElse(null);
					Resource lexiconDataset = (Resource) stats.getValue("lexiconDataset");
					BigInteger concepts = Literals.getIntegerValue((Literal) stats.getValue("concepts"),
							BigInteger.ZERO);
					BigInteger conceptualizations = Literals
							.getIntegerValue((Literal) stats.getValue("conceptualizations"), BigInteger.ZERO);
					BigInteger lexicalEntries = Literals
							.getIntegerValue((Literal) stats.getValue("lexicalEntries"), BigInteger.ZERO);

					if (conceptSet == null) {
						if (conceptualization_referenceDatasetUriSpace == null) {
							conceptSet = mainDataset;
						} else {
							conceptSet = additionalRefDatasetRes
									.get(conceptualization_referenceDatasetUriSpace);

							if (conceptSet == null) {
								conceptSet = profilerContext.mintDatasetResource();
								additionalRefDatasetRes.put(conceptualization_referenceDatasetUriSpace,
										conceptSet);
								ConceptSetStats missingConceptSetStats = new ConceptSetStats();
								missingConceptSetStats
										.setUriSpace(conceptualization_referenceDatasetUriSpace);
								additionalRefDatasetStats.put(conceptualization_referenceDatasetUriSpace,
										missingConceptSetStats);
							}
						}
					}
					ConceptualizationSetStatistics statsObj = new ConceptualizationSetStatistics();
					statsObj.setConcepts(concepts);
					statsObj.setConceptualizations(conceptualizations);
					statsObj.setLexicalEntries(lexicalEntries);
					statsObj.setConceptualDataset(conceptSet);
					statsObj.setLexiconDataset(lexiconDataset);

					Optional lexiconEntries;

					try (RepositoryResult repositoryResult = metadataConnection.getStatements(lexiconDataset, LIME.LEXICAL_ENTRIES,
							null, profilerContext.getOptions().getContexts());
						 Stream stream = QueryResults.stream(repositoryResult)) {
						lexiconEntries = stream
								.map(Statement::getObject).filter(Literal.class::isInstance)
								.map(l -> Literals.getDecimalValue(l, BigDecimal.ZERO)).findFirst();
					}

					Optional conceptSetConcepts;

					try (RepositoryResult repositoryResult = metadataConnection.getStatements(conceptSet, LIME.CONCEPTS, null,
							profilerContext.getOptions().getContexts());
						 Stream stream = QueryResults.stream(repositoryResult)) {
						conceptSetConcepts = stream
								.map(Statement::getObject).filter(Literal.class::isInstance)
								.map(l -> Literals.getDecimalValue(l, BigDecimal.ZERO)).findFirst();
					}

					BigDecimal conceptualizationsAsDecimal = new BigDecimal(conceptualizations);

					lexiconEntries.ifPresent(le -> statsObj.setAvgAmbiguity(
							conceptualizationsAsDecimal.divide(le, 3, BigDecimal.ROUND_CEILING)));

					conceptSetConcepts.ifPresent(cs -> statsObj.setAvgSynonymy(
							conceptualizationsAsDecimal.divide(cs, 3, BigDecimal.ROUND_CEILING)));

					Resource conceptualizationSetResource = profilerContext
							.mintConceptualizationSetResource(conceptSet, lexiconDataset);

					Model graph = new LinkedHashModel();
					statsObj.serialize(graph, conceptualizationSetResource);
					metadataConnection.add(graph);
					metadataConnection.add(mainDataset, VOID.SUBSET, conceptualizationSetResource);

					metadataConnection.add(graph);
				}
			}

			Model graph = new LinkedHashModel();

			for (Map.Entry entry : additionalRefDatasetStats.entrySet()) {
				Resource refDatRes = additionalRefDatasetRes.get(entry.getKey());
				entry.getValue().serialize(graph, refDatRes);
			}

			metadataConnection.add(graph);
		} catch (RDF4JException e) {
			throw new ProfilerException(e);
		}
	}

	// public void profileLexicalLinksets(RepositoryConnection conn, IRI semanticModel, IRI[] graphs)
	// throws ProfilerException {
	// Map conceptSetsStats = profileConceptSets(conn, graphs);
	// SemanticModelProfiler semanticModelProfiler = getSemanticModelProfiler(semanticModel);
	// ReferenceDatasetStatistics referenceDatasetStats = semanticModelProfiler.profile(conn, graphs);
	//
	// ValueFactory vf = conn.getValueFactory();
	//
	// try {
	// TupleQuery query = conn.prepareTupleQuery(
//			// @formatter:off                       
//				" PREFIX rdfs:                                                                                             \n" +                          
//				" PREFIX ontolex:                                                                                            \n" +                          
//				" PREFIX lime:                                                                                                  \n" +                          
//				" PREFIX skos:                                                                                              \n" +                          
//				"                                                                                                                                                 \n" +                          
//				" SELECT ?conceptSet (COUNT(DISTINCT ?resource) as ?references) (COUNT (DISTINCT ?lexicalConcept) AS ?concepts)(COUNT(?lexicalConcept) as ?links) \n" +
//				" WHERE {                                                                                                                                         \n" +                          
//				" 	?resource ontolex:concept|^ontolex:isConceptOf ?lexicalConcept .																			  \n" +																					
//				"     OPTIONAL {                                                                                                                                  \n" +                          
//				"         ?lexicalConcept skos:inScheme ?conceptSet                                                                                               \n" +                          
//				"     }                                                                                                                                           \n" +                          
//				" }                                                                                                                                               \n" +                          
//				" GROUP BY ?conceptSet                                                                                                                            \n"
//				// @formatter:on
	// );
	//
	// Optional defaultConceptSet = conceptSetsStats.entrySet().stream()
	// .filter(e -> !e.getValue().isEntriesExplicit()).map(e -> e.getKey()).findAny();
	//
	// SimpleDataset dataset = new SimpleDataset();
	// Arrays.stream(graphs).forEach(dataset::addDefaultGraph);
	// query.setIncludeInferred(false);
	//
	// Collection linksetStats = QueryResults.stream(query.evaluate()).map(bs -> {
	// LexicalLinksetStats stats = new LexicalLinksetStats();
	//
	// stats.setLinkPredicate(ONTOLEX.CONCEPT);
	// stats.setReferences(
	// Literals.getIntegerValue((Literal) bs.getValue("references"), BigInteger.ZERO));
	// stats.setConcepts(
	// Literals.getIntegerValue((Literal) bs.getValue("concepts"), BigInteger.ZERO));
	// stats.setLinks(Literals.getIntegerValue((Literal) bs.getValue("links"), BigInteger.ZERO));
	//
	// Value conceptSetValue = bs.getValue("conceptSet");
	// ConceptSetStats singleConceptSetStats;
	//
	// if (conceptSetValue == null) {
	// if (defaultConceptSet.isPresent()) {
	// conceptSetValue = defaultConceptSet.get();
	// }
	// }
	//
	// if (conceptSetValue == null) {
	// return null;
	// }
	//
	// singleConceptSetStats = conceptSetsStats.get(conceptSetValue);
	// if (singleConceptSetStats == null) {
	// return null;
	// }
	//
	// stats.setAvgNumOfLinks(new BigDecimal(
	// Literals.getIntegerValue((Literal) bs.getValue("links"), BigInteger.ZERO)).divide(
	// new BigDecimal(referenceDatasetStats.getEntities()), 3,
	// BigDecimal.ROUND_CEILING));
	// return stats;
	// }).filter(o -> o != null).collect(Collectors.toList());
	//
	// Model statsModel = new TreeModel();
	//
	// referenceDatasetStats.serialize(statsModel, vf.createIRI("http://referenceDataset"));
	// conceptSetsStats.forEach((iri, stats) -> {
	// stats.serialize(statsModel, iri);
	// });
	//
	// int counter = 0;
	//
	// for (LexicalLinksetStats stats : linksetStats) {
	// IRI iri = vf.createIRI("http://lexical_linkset/" + (++counter));
	//
	// stats.serialize(statsModel, iri);
	// }
	//
	// } catch (RDF4JException e) {
	// throw new ProfilerException(e);
	// }
	// }

	/**
	 * Returns the {@link SemanticModelProfiler} registered for the given semantic model IRI,
	 * failing fast when the model is not recognized.
	 *
	 * @param semanticModel the IRI identifying the semantic model (e.g. OWL, SKOS)
	 * @return the matching profiler
	 * @throws UnknownSemanticModelException if no profiler is registered for the given IRI
	 */
	protected SemanticModelProfiler getSemanticModelProfiler(IRI semanticModel)
			throws UnknownSemanticModelException {
		Optional maybeProfiler = getSemanticModelProfiler2(semanticModel);
		if (maybeProfiler.isPresent()) {
			return (SemanticModelProfiler) maybeProfiler.get();
		}
		throw new UnknownSemanticModelException(semanticModel);
	}

	/**
	 * Looks up the {@link SemanticModelProfiler} for the given semantic model IRI.
	 * <p>
	 * Recognized models are OWL ({@code http://www.w3.org/2002/07/owl}) and SKOS
	 * ({@code http://www.w3.org/2004/02/skos/core}); any other IRI yields an empty Optional.
	 *
	 * @param semanticModel the IRI identifying the semantic model
	 * @return the matching profiler, or {@link Optional#empty()} if the model is unknown
	 */
	protected Optional<SemanticModelProfiler> getSemanticModelProfiler2(IRI semanticModel) {
		// Compare by string value: the model IRIs are ontology namespaces without a trailing
		// fragment, so a plain string match against the canonical namespace is sufficient.
		switch (semanticModel.stringValue()) {
		case "http://www.w3.org/2002/07/owl":
			return Optional.of(new OWLSemanticModelProfiler());
		case "http://www.w3.org/2004/02/skos/core":
			return Optional.of(new SKOSSemanticModelProfiler());
		default:
			return Optional.empty();
		}
	}

	/**
	 * Returns the {@link LexicalizationModelProfiler} registered for the given lexicalization
	 * model IRI, failing fast when the model is not recognized.
	 *
	 * @param lexicalizationModel the IRI identifying the lexicalization model
	 * @return the matching profiler
	 * @throws UnknownLexicalizationModelException if no profiler is registered for the given IRI
	 */
	protected LexicalizationModelProfiler getLexicalizationModelProfiler(IRI lexicalizationModel)
			throws UnknownLexicalizationModelException {
		Optional maybeProfiler = getLexicalizationModelProfiler2(lexicalizationModel);
		if (maybeProfiler.isPresent()) {
			return (LexicalizationModelProfiler) maybeProfiler.get();
		}
		throw new UnknownLexicalizationModelException(lexicalizationModel);
	}

	/**
	 * Looks up the {@link LexicalizationModelProfiler} for the given lexicalization model IRI.
	 * <p>
	 * Recognized models are RDFS, SKOS, SKOS-XL and OntoLex, identified by the constants
	 * declared on the corresponding profiler classes; any other IRI yields an empty Optional.
	 *
	 * @param lexicalizationModel the IRI identifying the lexicalization model
	 * @return the matching profiler, or {@link Optional#empty()} if the model is unknown
	 */
	protected Optional<LexicalizationModelProfiler> getLexicalizationModelProfiler2(
			IRI lexicalizationModel) {
		// IRI equality here is value-based (rdf4j IRIs compare by string form), so comparing
		// against the per-profiler model constants is a reliable dispatch.
		if (lexicalizationModel.equals(RDFSLexicalizationModelProfiler.RDFS_LEXICALIZATION_MODEL)) {
			return Optional.of(new RDFSLexicalizationModelProfiler());
		} else if (lexicalizationModel.equals(SKOSLexicalizationModelProfiler.SKOS_LEXICALIZATION_MODEL)) {
			return Optional.of(new SKOSLexicalizationModelProfiler());
		} else if (lexicalizationModel
				.equals(SKOSXLLexicalizationModelProfiler.SKOSXL_LEXICALIZATION_MODEL)) {
			return Optional.of(new SKOSXLLexicalizationModelProfiler());
		} else if (lexicalizationModel
				.equals(OntoLexLexicalizationModelProfiler.ONTOLEX_LEXICALIZATION_MODEL)) {
			return Optional.of(new OntoLexLexicalizationModelProfiler());
		} else {
			return Optional.empty();
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy