// org.apache.commons.rdf.experimental.RDFParser (from the commons-rdf-api artifact)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.rdf.experimental;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Optional;
import java.util.concurrent.Future;
import java.util.function.Consumer;
import org.apache.commons.rdf.api.BlankNode;
import org.apache.commons.rdf.api.Dataset;
import org.apache.commons.rdf.api.Graph;
import org.apache.commons.rdf.api.IRI;
import org.apache.commons.rdf.api.Quad;
import org.apache.commons.rdf.api.RDFSyntax;
import org.apache.commons.rdf.api.RDFTerm;
import org.apache.commons.rdf.api.RDF;
import org.apache.commons.rdf.api.Triple;
/**
* Parse an RDF source into a target (e.g. a Graph/Dataset).
* <h2>Experimental</h2>
* This interface (and its implementations) should be
* considered <strong>at risk</strong>; they might change or be removed in the
* next minor update of Commons RDF. It may move to the
* {@link org.apache.commons.rdf.api} package when it has stabilized.
* <h2>Description</h2>
* <p>
* This interface follows the
* <a href="https://en.wikipedia.org/wiki/Builder_pattern">Builder pattern</a>,
* allowing callers to set parser settings like {@link #contentType(RDFSyntax)}
* and {@link #base(IRI)}. A caller MUST call one of the {@code source} methods
* (e.g. {@link #source(IRI)}, {@link #source(Path)},
* {@link #source(InputStream)}), and MUST call one of the {@code target}
* methods (e.g. {@link #target(Consumer)}, {@link #target(Dataset)},
* {@link #target(Graph)}) before calling {@link #parse()} on the returned
* RDFParser - however methods can be called in any order.
*
* The call to {@link #parse()} returns a {@link Future}, allowing asynchronous
* parse operations. Callers are recommended to check {@link Future#get()} to
* ensure parsing completed successfully, or catch exceptions thrown during
* parsing.
*
* Setting a method that has already been set will override any existing value
* in the returned builder - regardless of the parameter type (e.g.
* {@link #source(IRI)} will override a previous {@link #source(Path)}). Settings
* can be unset by passing {@code null} - note that this may require
* casting, e.g. {@code contentType( (RDFSyntax) null )} to undo a previous
* call to {@link #contentType(RDFSyntax)}.
*
* It is undefined whether an RDFParser is mutable or thread-safe, so callers
* should always use the returned modified RDFParser from the builder methods.
* The builder may return itself after modification, or a cloned builder with
* the modified settings applied. Implementations are however encouraged to be
* immutable, thread-safe and document this. As an example starting point, see
* {@code org.apache.commons.rdf.simple.AbstractRDFParser}.
*
* Example usage:
*
* <pre>{@code
* Graph g1 = rDFTermFactory.createGraph();
* new ExampleRDFParserBuilder().source(Paths.get("/tmp/graph.ttl")).contentType(RDFSyntax.TURTLE).target(g1).parse()
*         .get(30, TimeUnit.SECONDS);
* }</pre>
*
*/
public interface RDFParser {
/**
* The result of {@link RDFParser#parse()} indicating parsing completed.
*
* This is a marker interface that may be subclassed to include parser
* details, e.g. warning messages or triple counts.
*/
public interface ParseResult {
}
/**
* Specify which {@link RDF} to use for generating {@link RDFTerm}s.
*
* This option may be used together with {@link #target(Graph)} to override
* the implementation's default factory and graph.
*
* Warning: Using the same {@link RDF} for multiple
* {@link #parse()} calls may accidentally merge {@link BlankNode}s having
* the same label, as the parser may use the
* {@link RDF#createBlankNode(String)} method from the parsed blank node
* labels.
*
* @see #target(Graph)
* @param rdfTermFactory
* {@link RDF} to use for generating RDFTerms.
* @return An {@link RDFParser} that will use the specified rdfTermFactory
*/
RDFParser rdfTermFactory(RDF rdfTermFactory);
/**
* Specify the content type of the RDF syntax to parse.
*
* This option can be used to select the RDFSyntax of the source, overriding
* any Content-Type
headers or equivalent.
*
* The character set of the RDFSyntax is assumed to be
* {@link StandardCharsets#UTF_8} unless overridden within the document
* (e.g. {@code } in
* {@link RDFSyntax#RDFXML}).
*
* This method will override any contentType set with
* {@link #contentType(String)}.
*
* @see #contentType(String)
* @param rdfSyntax
* An {@link RDFSyntax} to parse the source according to, e.g.
* {@link RDFSyntax#TURTLE}.
* @throws IllegalArgumentException
* If this RDFParser does not support the specified RDFSyntax.
* @return An {@link RDFParser} that will use the specified content type.
*/
RDFParser contentType(RDFSyntax rdfSyntax) throws IllegalArgumentException;
/**
* Specify the content type of the RDF syntax to parse.
*
* This option can be used to select the RDFSyntax of the source, overriding
* any Content-Type
headers or equivalent.
*
* The content type MAY include a charset
parameter if the RDF
* media types permit it; the default charset is
* {@link StandardCharsets#UTF_8} unless overridden within the document.
*
* This method will override any contentType set with
* {@link #contentType(RDFSyntax)}.
*
* @see #contentType(RDFSyntax)
* @param contentType
* A content-type string, e.g. application/ld+json
* or text/turtle;charset="UTF-8"
as specified by
*
* RFC7231.
* @return An {@link RDFParser} that will use the specified content type.
* @throws IllegalArgumentException
* If the contentType has an invalid syntax, or this RDFParser
* does not support the specified contentType.
*/
RDFParser contentType(String contentType) throws IllegalArgumentException;
/**
* Specify a {@link Graph} to add parsed triples to.
*
* If the source supports datasets (e.g. the {@link #contentType(RDFSyntax)}
* set has {@link RDFSyntax#supportsDataset} is true)), then only quads in
* the default graph will be added to the Graph as {@link Triple}s.
*
* It is undefined if any triples are added to the specified {@link Graph}
* if {@link #parse()} throws any exceptions. (However implementations are
* free to prevent this using transaction mechanisms or similar). If
* {@link Future#get()} does not indicate an exception, the parser
* implementation SHOULD have inserted all parsed triples to the specified
* graph.
*
* Calling this method will override any earlier targets set with
* {@link #target(Graph)}, {@link #target(Consumer)} or
* {@link #target(Dataset)}.
*
* The default implementation of this method calls {@link #target(Consumer)}
* with a {@link Consumer} that does {@link Graph#add(Triple)} with
* {@link Quad#asTriple()} if the quad is in the default graph.
*
* @param graph
* The {@link Graph} to add triples to.
* @return An {@link RDFParser} that will insert triples into the specified
* graph.
*/
default RDFParser target(final Graph graph) {
return target(q -> {
if (!q.getGraphName().isPresent()) {
graph.add(q.asTriple());
}
});
}
/**
* Specify a {@link Dataset} to add parsed quads to.
*
* It is undefined if any quads are added to the specified {@link Dataset}
* if {@link #parse()} throws any exceptions. (However implementations are
* free to prevent this using transaction mechanisms or similar). On the
* other hand, if {@link #parse()} does not indicate an exception, the
* implementation SHOULD have inserted all parsed quads to the specified
* dataset.
*
* Calling this method will override any earlier targets set with
* {@link #target(Graph)}, {@link #target(Consumer)} or
* {@link #target(Dataset)}.
*
* The default implementation of this method calls {@link #target(Consumer)}
* with a {@link Consumer} that does {@link Dataset#add(Quad)}.
*
* @param dataset
* The {@link Dataset} to add quads to.
* @return An {@link RDFParser} that will insert triples into the specified
* dataset.
*/
default RDFParser target(final Dataset dataset) {
return target(dataset::add);
}
/**
* Specify a consumer for parsed quads.
*
* The quads will include triples in all named graphs of the parsed source,
* including any triples in the default graph. When parsing a source format
* which do not support datasets, all quads delivered to the consumer will
* be in the default graph (e.g. their {@link Quad#getGraphName()} will be
* as {@link Optional#empty()}), while for a source
*
* It is undefined if any quads are consumed if {@link #parse()} throws any
* exceptions. On the other hand, if {@link #parse()} does not indicate an
* exception, the implementation SHOULD have produced all parsed quads to
* the specified consumer.
*
* Calling this method will override any earlier targets set with
* {@link #target(Graph)}, {@link #target(Consumer)} or
* {@link #target(Dataset)}.
*
* The consumer is not assumed to be thread safe - only one
* {@link Consumer#accept(Object)} is delivered at a time for a given
* {@link RDFParser#parse()} call.
*
* This method is typically called with a functional consumer, for example:
*
*
* {@code
* List quads = new ArrayList;
* parserBuilder.target(quads::add).parse();
* }
*
*
* @param consumer
* A {@link Consumer} of {@link Quad}s
* @return An {@link RDFParser} that will call the consumer for into the
* specified dataset.
*/
RDFParser target(Consumer consumer);
/**
* Specify a base IRI to use for parsing any relative IRI references.
*
* Setting this option will override any protocol-specific base IRI (e.g.
* Content-Location
header) or the {@link #source(IRI)} IRI,
* but does not override any base IRIs set within the source document (e.g.
* @base
in Turtle documents).
*
* If the source is in a syntax that does not support relative IRI
* references (e.g. {@link RDFSyntax#NTRIPLES}), setting the
* base
has no effect.
*
* This method will override any base IRI set with {@link #base(String)}.
*
* @see #base(String)
* @param base
* An absolute IRI to use as a base.
* @return An {@link RDFParser} that will use the specified base IRI.
*/
RDFParser base(IRI base);
/**
* Specify a base IRI to use for parsing any relative IRI references.
*
* Setting this option will override any protocol-specific base IRI (e.g.
* Content-Location
header) or the {@link #source(IRI)} IRI,
* but does not override any base IRIs set within the source document (e.g.
* @base
in Turtle documents).
*
* If the source is in a syntax that does not support relative IRI
* references (e.g. {@link RDFSyntax#NTRIPLES}), setting the
* base
has no effect.
*
* This method will override any base IRI set with {@link #base(IRI)}.
*
* @see #base(IRI)
* @param base
* An absolute IRI to use as a base.
* @return An {@link RDFParser} that will use the specified base IRI.
* @throws IllegalArgumentException
* If the base is not a valid absolute IRI string
*/
RDFParser base(String base) throws IllegalArgumentException;
/**
* Specify a source {@link InputStream} to parse.
*
* The source set will not be read before the call to {@link #parse()}.
*
* The InputStream will not be closed after parsing. The InputStream does
* not need to support {@link InputStream#markSupported()}.
*
* The parser might not consume the complete stream (e.g. an RDF/XML parser
* may not read beyond the closing tag of
* </rdf:Description>
).
*
* The {@link #contentType(RDFSyntax)} or {@link #contentType(String)}
* SHOULD be set before calling {@link #parse()}.
*
* The character set is assumed to be {@link StandardCharsets#UTF_8} unless
* the {@link #contentType(String)} specifies otherwise or the document
* declares its own charset (e.g. RDF/XML with a
* <?xml encoding="iso-8859-1">
header).
*
* The {@link #base(IRI)} or {@link #base(String)} MUST be set before
* calling {@link #parse()}, unless the RDF syntax does not permit relative
* IRIs (e.g. {@link RDFSyntax#NTRIPLES}).
*
* This method will override any source set with {@link #source(IRI)},
* {@link #source(Path)} or {@link #source(String)}.
*
* @param inputStream
* An InputStream to consume
* @return An {@link RDFParser} that will use the specified source.
*/
RDFParser source(InputStream inputStream);
/**
* Specify a source file {@link Path} to parse.
*
* The source set will not be read before the call to {@link #parse()}.
*
* The {@link #contentType(RDFSyntax)} or {@link #contentType(String)}
* SHOULD be set before calling {@link #parse()}.
*
* The character set is assumed to be {@link StandardCharsets#UTF_8} unless
* the {@link #contentType(String)} specifies otherwise or the document
* declares its own charset (e.g. RDF/XML with a
* <?xml encoding="iso-8859-1">
header).
*
* The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
* {@link #parse()}, otherwise {@link Path#toUri()} will be used as the base
* IRI.
*
* This method will override any source set with {@link #source(IRI)},
* {@link #source(InputStream)} or {@link #source(String)}.
*
* @param file
* A Path for a file to parse
* @return An {@link RDFParser} that will use the specified source.
*/
RDFParser source(Path file);
/**
* Specify an absolute source {@link IRI} to retrieve and parse.
*
* The source set will not be read before the call to {@link #parse()}.
*
* If this builder does not support the given IRI protocol (e.g.
* urn:uuid:ce667463-c5ab-4c23-9b64-701d055c4890
), this method
* should succeed, while the {@link #parse()} should throw an
* {@link IOException}.
*
* The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} MAY
* be set before calling {@link #parse()}, in which case that type MAY be
* used for content negotiation (e.g. Accept
header in HTTP),
* and SHOULD be used for selecting the RDFSyntax.
*
* The character set is assumed to be {@link StandardCharsets#UTF_8} unless
* the protocol's equivalent of Content-Type
specifies
* otherwise or the document declares its own charset (e.g. RDF/XML with a
* <?xml encoding="iso-8859-1">
header).
*
* The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
* {@link #parse()}, otherwise the source IRI will be used as the base IRI.
*
* This method will override any source set with {@link #source(Path)},
* {@link #source(InputStream)} or {@link #source(String)}.
*
* @param iri
* An IRI to retrieve and parse
* @return An {@link RDFParser} that will use the specified source.
*/
RDFParser source(IRI iri);
/**
* Specify an absolute source IRI to retrieve and parse.
*
* The source set will not be read before the call to {@link #parse()}.
*
* If this builder does not support the given IRI (e.g.
* urn:uuid:ce667463-c5ab-4c23-9b64-701d055c4890
), this method
* should succeed, while the {@link #parse()} should throw an
* {@link IOException}.
*
* The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} MAY
* be set before calling {@link #parse()}, in which case that type MAY be
* used for content negotiation (e.g. Accept
header in HTTP),
* and SHOULD be used for selecting the RDFSyntax.
*
* The character set is assumed to be {@link StandardCharsets#UTF_8} unless
* the protocol's equivalent of Content-Type
specifies
* otherwise or the document declares its own charset (e.g. RDF/XML with a
* <?xml encoding="iso-8859-1">
header).
*
* The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
* {@link #parse()}, otherwise the source IRI will be used as the base IRI.
*
* This method will override any source set with {@link #source(Path)},
* {@link #source(InputStream)} or {@link #source(IRI)}.
*
* @param iri
* An IRI to retrieve and parse
* @return An {@link RDFParser} that will use the specified source.
* @throws IllegalArgumentException
* If the base is not a valid absolute IRI string
*
*/
RDFParser source(String iri) throws IllegalArgumentException;
/**
* Parse the specified source.
*
* A source method (e.g. {@link #source(InputStream)}, {@link #source(IRI)},
* {@link #source(Path)}, {@link #source(String)} or an equivalent subclass
* method) MUST have been called before calling this method, otherwise an
* {@link IllegalStateException} will be thrown.
*
* A target method (e.g. {@link #target(Consumer)},
* {@link #target(Dataset)}, {@link #target(Graph)} or an equivalent
* subclass method) MUST have been called before calling parse(), otherwise
* an {@link IllegalStateException} will be thrown.
*
* It is undefined if this method is thread-safe, however the
* {@link RDFParser} may be reused (e.g. setting a different source) as soon
* as the {@link Future} has been returned from this method.
*
* The RDFParser SHOULD perform the parsing as an asynchronous operation,
* and return the {@link Future} as soon as preliminary checks (such as
* validity of the {@link #source(IRI)} and {@link #contentType(RDFSyntax)}
* settings) have finished. The future SHOULD not mark
* {@link Future#isDone()} before parsing is complete. A synchronous
* implementation MAY be blocking on the parse()
call and
* return a Future that is already {@link Future#isDone()}.
*
* The returned {@link Future} contains a {@link ParseResult}.
* Implementations may subclass this interface to provide any parser
* details, e.g. list of warnings. null
is a possible return
* value if no details are available, but parsing succeeded.
*
* If an exception occurs during parsing, (e.g. {@link IOException} or
* org.apache.commons.rdf.simple.experimental.RDFParseException
),
* it should be indicated as the
* {@link java.util.concurrent.ExecutionException#getCause()} in the
* {@link java.util.concurrent.ExecutionException} thrown on
* {@link Future#get()}.
*
* @return A Future that will return the populated {@link Graph} when the
* parsing has finished.
* @throws IOException
* If an error occurred while starting to read the source (e.g.
* file not found, unsupported IRI protocol). Note that IO
* errors during parsing would instead be the
* {@link java.util.concurrent.ExecutionException#getCause()} of
* the {@link java.util.concurrent.ExecutionException} thrown on
* {@link Future#get()}.
* @throws IllegalStateException
* If the builder is in an invalid state, e.g. a
* source
has not been set.
*/
Future extends ParseResult> parse() throws IOException, IllegalStateException;
}