All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.metreeca.rdf4j.actions.Microdata Maven / Gradle / Ivy

The newest version!


/*
 * Copyright © 2013-2022 Metreeca srl
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.metreeca.rdf4j.actions;

import com.metreeca.json.Values;
import com.metreeca.rdf.schemas.Schema;
import com.metreeca.rest.services.Logger;

import org.eclipse.rdf4j.model.*;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.w3c.dom.*;

import java.net.URI;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import static com.metreeca.json.Values.*;
import static com.metreeca.rest.Toolbox.service;
import static com.metreeca.rest.Xtream.entry;
import static com.metreeca.rest.services.Logger.logger;

import static java.lang.String.format;
import static java.util.Arrays.asList;
import static java.util.Collections.singleton;
import static java.util.stream.Collectors.toCollection;
import static java.util.stream.Collectors.toList;

/**
 * Microdata parser.
 *
 * 

Converts microdata annotations in an XML document into a stream of equivalent RDF statements.

* * @see HTML Living Standard - § 5 Microdata */ public final class Microdata implements Function> { private static final String Id="id"; private static final String ItemScope="itemscope"; private static final String ItemId="itemid"; private static final String ItemType="itemtype"; private static final String ItemProp="itemprop"; private static final String ItemRef="itemref"; private static final Pattern SpacePattern=Pattern.compile("[\t\n\f\r ]+"); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private final Logger logger=service(logger()); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @Override public Stream apply(final Node node) { return node instanceof Document ? items(((Document)node).getDocumentElement()) : node instanceof Element ? items((Element)node) : Stream.empty(); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private Stream items(final Element element) { return items(element, false, ids(element)); } private Stream items(final Element element, final boolean scoped, final Map ids) { if ( element.hasAttribute(ItemScope) ) { return item(element, ids).getValue(); } else { final boolean scoping=scoped || attribute(element, Id).filter(ids::containsKey).isPresent(); if ( element.hasAttribute(ItemId) ) { logger.warning(this, format("%s requires %s", ItemId, ItemScope)); // !!! report open tag } if ( element.hasAttribute(ItemType) ) { logger.warning(this, format("%s requires %s", ItemType, ItemScope)); // !!! report open tag } if ( element.hasAttribute(ItemRef) ) { logger.warning(this, format("%s outside %s", ItemRef, ItemScope)); // !!! report open tag } if ( element.hasAttribute(ItemProp) && !(scoping) ) { logger.warning(this, format("%s outside %s", ItemProp, ItemScope)); // !!! report open tag } return children(element).flatMap(child -> items(child, scoping, ids)); } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private Map.Entry> item(final Element element, final Map ids) { if ( element.hasAttribute(ItemId) && !element.hasAttribute(ItemType) ) { logger.warning(this, format("%s requires %s", ItemId, ItemType)); // !!! report open tag } final Resource id=id(element); final List types=types(element).collect(toList()); final String vocabulary=types.stream().findFirst().map(IRI::getNamespace).orElse(Terms.stringValue()); Stream model=types.stream().map(type -> statement(id, RDF.TYPE, Schema.normalize(type))); final Collection visited=new HashSet<>(singleton(element)); final Queue pending=Stream.concat(children(element), attribute(element, ItemRef).stream() .flatMap(refs -> Arrays.stream(SpacePattern.split(refs))) .map(ref -> { final Element target=ids.get(ref); if ( target == null ) { logger.warning(this, format("undefined %s <%s>", ItemRef, ref)); } return target; }) .filter(Objects::nonNull) ).collect(toCollection(ArrayDeque::new)); while ( !pending.isEmpty() ) { final Element current=pending.remove(); final List props=props(current, vocabulary).collect(toList()); if ( !visited.add(current) ) { logger.warning(this, format("%s cycle", ItemRef)); // !!! report open tag } else if ( current.hasAttribute(ItemScope) ) { final Entry> value=item(current, ids); model=Stream.concat(model, props.stream() .map(prop -> statement(id, Schema.normalize(prop), Schema.normalize(value.getKey()))) ); model=Stream.concat(model, value.getValue()); } else { model=Stream.concat(model, value(current).stream() .flatMap(value -> props.stream() .map(prop -> statement(id, Schema.normalize(prop), Schema.normalize(value)))) ); pending.addAll(children(current).collect(toList())); } } return entry(id, model); } private Resource id(final Element element) { // !!! refactor return attribute(element, ItemId) .map(id -> Optional.ofNullable(element.getBaseURI()) .map(base -> { try { return URI.create(base); } catch ( final IllegalArgumentException e ) { logger.warning(this, format("malformed base URI <%s>", base)); return null; } }) .map(base -> { try { return base.resolve(id); } catch ( final IllegalArgumentException e ) { logger.warning(this, format("malformed %s <%s>", ItemId, id)); // !!! report open tag return null; } }) .map(URI::toString) .orElse(id) ) .filter(id -> { if ( AbsoluteIRIPattern.matcher(id).matches() ) { return true; } else { logger.warning(this, format("malformed %s <%s>", ItemId, id)); // !!! report open tag return false; } }) .map(Values::iri) .map(Resource.class::cast) .orElseGet(Values::bnode); } private Stream types(final Element element) { final AtomicReference vocabulary=new AtomicReference<>(); final Set types=ConcurrentHashMap.newKeySet(); return attribute(element, ItemType).stream() .flatMap(type -> Arrays.stream(SpacePattern.split(type))) .map(type -> { try { return iri(URI.create(type)); } catch ( final IllegalArgumentException e ) { logger.warning(this, format("malformed %s <%s>", ItemType, type)); return null; } }) .filter(Objects::nonNull) .peek(type -> { if ( !types.add(type) ) { logger.warning(this, format("repeated %s value <%s>", ItemType, type)); } }) .peek(type -> vocabulary.accumulateAndGet(type.getNamespace(), (current, merged) -> { if ( current == null || current.equals(merged) ) { return merged; } else { logger.warning(this, format("multiple %s vocabularies <%s>", ItemType, merged)); return current; } })) .distinct(); } private Stream props(final Element element, final String vocabulary) { final Set props=ConcurrentHashMap.newKeySet(); return attribute(element, ItemProp).stream() .flatMap(type -> Arrays.stream(SpacePattern.split(type))) .map(prop -> { try { return iri(URI.create(vocabulary).resolve(prop)); } catch ( final IllegalArgumentException e ) { logger.warning(this, format("malformed %s <%s>", ItemProp, prop)); return null; } }) .filter(Objects::nonNull) .peek(prop -> { if ( !props.add(prop) ) { logger.warning(this, format("repeated %s value <%s>", ItemProp, prop)); } }); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private Optional value(final Element element) { return element.getTagName().equalsIgnoreCase("meta") ? text(element, "content") : element.getTagName().equalsIgnoreCase("audio") ? link(element, "src") : element.getTagName().equalsIgnoreCase("embed") ? link(element, "src") : element.getTagName().equalsIgnoreCase("iframe") ? link(element, "src") : element.getTagName().equalsIgnoreCase("img") ? link(element, "src") : element.getTagName().equalsIgnoreCase("source") ? link(element, "src") : element.getTagName().equalsIgnoreCase("track") ? link(element, "src") : element.getTagName().equalsIgnoreCase("video") ? link(element, "src") : element.getTagName().equalsIgnoreCase("a") ? link(element, "href") : element.getTagName().equalsIgnoreCase("area") ? link(element, "href") : element.getTagName().equalsIgnoreCase("link") ? link(element, "href") : element.getTagName().equalsIgnoreCase("object") ? link(element, "data") : element.getTagName().equalsIgnoreCase("data") ? text(element, "value") : element.getTagName().equalsIgnoreCase("meter") ? number(element, "value") : element.getTagName().equalsIgnoreCase("time") ? temporal(element, "datetime") : text(element.getTextContent()).map(Values::literal); } private Optional link(final Element element, final String attribute) { return attribute(element, attribute, true) .map(value -> { try { return URI.create(Optional.ofNullable(element.getBaseURI()).orElse(Values.Base)).resolve(value); } catch ( final IllegalArgumentException e ) { logger.warning(this, format("malformed %s URL attribute <%s>", attribute, value)); return null; } }) .map(Values::iri); } private Optional number(final Element element, final String attribute) { return attribute(element, attribute, true) .map(value -> { try { return Double.parseDouble(value); } catch ( final NumberFormatException e ) { logger.warning(this, format("malformed %s float attribute <%s>", attribute, value)); return null; } }) .map(Values::literal); } private Optional temporal(final Element element, final String attribute) { return attribute(element, attribute, true) .map(Optional::of) .orElseGet(() -> text(element.getTextContent())) .map(value -> { final List> parsers=asList( v -> literal(literal(v).temporalAccessorValue()), v -> literal(literal(v).temporalAmountValue()) ); for (final Function parser : parsers) { try { return parser.apply(value); } catch ( final RuntimeException ignored ) { } } logger.warning(this, format("malformed %s time value <%s>", attribute, value)); return literal(value); // support custom post-processing converters }); } private Optional text(final Element element, final String attribute) { return attribute(element, attribute, true).map(Values::literal); } private Optional text(final String value) { return Optional.of(value).map(String::trim).filter(v -> !v.isEmpty()); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private String tag(final Element element) { final StringBuilder builder=new StringBuilder(100); builder.append("<").append(element.getTagName().toLowerCase(Locale.ROOT)); final NamedNodeMap attributes=element.getAttributes(); for (int i=0; i < attributes.getLength(); ++i) { builder.append(" ").append(attributes.item(i).getNodeName()) .append("='").append(attributes.item(i).getNodeValue()).append("'"); } builder.append("/>"); return builder.toString(); } private Optional attribute(final Element element, final String attribute) { return attribute(element, attribute, false); } private Optional attribute(final Element element, final String attribute, final boolean required) { return Optional.of(element.getAttribute(attribute)).map(String::trim).filter(value -> { final boolean empty=value.isEmpty(); final boolean defined=element.hasAttribute(attribute); if ( empty && defined ) { logger.warning(this, format("empty %s attribute", attribute)); // !!! report open tag } if ( required && !defined ) { logger.warning(this, format("missing %s attribute", attribute)); // !!! report open tag } return !empty; }); } private Stream children(final Node node) { return Stream.of(node.getChildNodes()) .flatMap(list -> IntStream.range(0, list.getLength()).mapToObj(list::item)) .filter(Element.class::isInstance) .map(Element.class::cast); } private Map ids(final Element element) { final Map ids=new HashMap<>(); final Set refs=new HashSet<>(); final Queue pending=new ArrayDeque<>(singleton(element)); while ( !pending.isEmpty() ) { final Element current=pending.remove(); attribute(current, Id).ifPresent(id -> ids.put(id, current)); attribute(current, ItemRef).ifPresent(refs::add); children(current).forEach(pending::add); } ids.keySet().retainAll(refs); // retain ids only if actually referenced from itemref return ids; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy