org.chelona.NTriplesParser.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of chelona_2.12 Show documentation
W3C RDF 1.1 Turtle-, TriG-, Quad- and NTriples-Parser
The newest version!
/*
* Copyright (C) 2014, 2015, 2016 Juergen Pfundt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.chelona
import org.parboiled2._
import scala.collection.mutable
import scala.io.BufferedSource
import scala.util.{ Failure, Success }
/**
 * Companion object: entry points for parsing N-Triples input.
 */
object NTriplesParser {

  /** Create a parser instance for a single chunk of N-Triples input. */
  def apply(input: ParserInput, output: (Term, Term, Term, Term) ⇒ Int, validate: Boolean = false, basePath: String = "http://chelona.org", label: String = "", verbose: Boolean = true, trace: Boolean = false) = {
    new NTriplesParser(input, output, validate, basePath, label, verbose, trace)
  }

  /**
   * Parse a whole N-Triples file.
   *
   * N-Triples is line based, so the input is cut into chunks of complete lines.
   * One parser per chunk is enqueued for a worker thread; after the worker is
   * shut down, any chunks it did not get to are parsed on the calling thread.
   *
   * @param filename    name of the input (used in error and statistics messages only)
   * @param inputBuffer source the lines are read from
   * @param output      callback receiving the rendered terms of each statement
   * @param validate    when true only validate and count, produce no output
   * @param base        base IRI handed to each chunk parser
   * @param label       blank-node label prefix handed to each chunk parser
   * @param verbose     when true print timing/throughput statistics to stderr
   * @param trace       when true include rule traces in parse-error reports
   * @param n           caller's hint for the chunk size in lines (clamped below)
   */
  def parseAll(filename: String, inputBuffer: BufferedSource, output: (Term, Term, Term, Term) ⇒ Int, validate: Boolean, base: String, label: String, verbose: Boolean, trace: Boolean, n: Int): Unit = {
    val ms: Double = System.currentTimeMillis
    // Queue shared with the worker thread; writers go through `asynchronous`,
    // which synchronizes on the queue and notifies the waiting worker.
    var parserQueue = mutable.Queue[NTriplesParser]()
    val worker = new NTriplesThreadWorker(parserQueue, filename, validate, verbose, trace)
    worker.setName("NTriplesParser")
    worker.start()
    // Chunk size in lines: hints below 50000 become 100000, hints above 1000000 are capped.
    val lines = if (n < 50000) 100000 else if (n > 1000000) 1000000 else n
    var tripleCount: Long = 0L
    val iterator = inputBuffer.getLines()
    while (iterator.hasNext) {
      // Crude back-pressure: pause reading while the worker is far behind.
      if (parserQueue.length > 1024) Thread.sleep(100) // a more sophisticated throttling should be supplied
      asynchronous(NTriplesParser(iterator.take(lines).mkString("\n"), output, validate, base, label))
    }
    worker.shutdown()
    worker.join()
    tripleCount += worker.tripleCount
    // Drain chunks the worker did not process before shutdown; after join()
    // no other thread touches the queue, so plain access is safe here.
    while (parserQueue.nonEmpty) {
      val parser = parserQueue.dequeue()
      val res = parser.ntriplesDoc.run()
      res match {
        case Success(count) ⇒ tripleCount += count
        case Failure(e: ParseError) ⇒
          if (!trace) System.err.println("File '" + filename + "': " + parser.formatError(e, new ChelonaErrorFormatter(block = tripleCount))) else System.err.println("File '" + filename + "': " + parser.formatError(e, new ChelonaErrorFormatter(block = tripleCount, showTraces = true)))
        case Failure(e) ⇒ System.err.println("File '" + filename + "': Unexpected error during parsing run: " + e)
      }
    }
    if (verbose) {
      val me: Double = System.currentTimeMillis - ms
      if (!validate) {
        System.err.println("Input file '" + filename + "' converted in " + (me / 1000.0) + "sec " + tripleCount + " triples (triples per second = " + ((tripleCount * 1000) / me + 0.5).toInt + ")")
      } else {
        System.err.println("Input file '" + filename + "' composed of " + tripleCount + " statements successfully validated in " + (me / 1000.0) + "sec (statements per second = " + ((tripleCount * 1000) / me + 0.5).toInt + ")")
      }
    }
    // Hand a chunk parser to the worker thread and wake it up.
    def asynchronous(parser: NTriplesParser) = parserQueue.synchronized {
      parserQueue.enqueue(parser)
      if (parserQueue.nonEmpty) parserQueue.notify()
    }
  }
}
/**
 * W3C RDF 1.1 N-Triples parser built on parboiled2.
 *
 * @param input    the character stream to parse
 * @param output   callback invoked with the rendered terms of each statement; returns a count
 * @param validate when true the input is only validated and counted, nothing is rendered
 * @param basePath base IRI used when rendering statements
 * @param label    blank-node label prefix used when rendering statements
 * @param verbose  verbosity flag (carried for callers; not read inside this class)
 * @param trace    trace flag (carried for callers; not read inside this class)
 */
class NTriplesParser(val input: ParserInput, val output: (Term, Term, Term, Term) ⇒ Int, validate: Boolean = false, val basePath: String = "http://chelona.org", val label: String = "", val verbose: Boolean = true, val trace: Boolean = false) extends Parser with StringBuilding {

  import org.chelona.CharPredicates._

  import org.parboiled2.CharPredicate.{ Alpha, AlphaNum, HexDigit }

  import NTripleAST._

  // Decode a run of hex digits (4 digits per group) into the characters they denote.
  // Groups decoding to U+0000 are dropped, so an 8-digit \U escape of a BMP code
  // point collapses to its single character.
  // NOTE(review): code points above the BMP would need surrogate-pair handling here — confirm
  // whether supplementary-plane escapes are expected in practice.
  private def hexStringToCharString(s: String) = s.grouped(4).map(cc ⇒ (Character.digit(cc(0), 16) << 12 | Character.digit(cc(1), 16) << 8 | Character.digit(cc(2), 16) << 4 | Character.digit(cc(3), 16)).toChar).filter(_ != '\u0000').mkString("")

  /*
   Parsing of the turtle data is done in the main thread.
   Evaluation of the abstract syntax tree for each turtle statement is passed to a separate thread "TurtleASTWorker".
   The ast evaluation procedure eval and the ast for a statement are placed in a queue.
   The abstract syntax trees of the Turtle statements are evaluated in sequence!
   Parsing continues immediately.

   ---P--- denotes the time for parsing a Turtle statement
   A       denotes administration time for the worker thread
   Q       denotes the time for enqueueing or dequeueing an ast of a Turtle statement
   ++E++   denotes the time for evaluating an ast of a Turtle statement

   Without worker thread parsing and evaluation of Turtle ast statements is done sequentially in one thread:

   main thread:   ---P---++E++---P---++E++---P---++E++---P---++E++---P---++E++---P---++E++...

   The main thread enqueues an ast of a parsed Turtle statement.
   The worker thread dequeues an ast of a Turtle statement and evaluates it.

   main thread:   AAAAA---P---Q---P---Q---P---Q---P---Q---P---Q---P---Q---P---...
   worker thread:      Q++E++ Q++E++ Q++E++Q++E++Q++E++ Q++E++

   Overhead for administration, e.g. waiting, notifying, joining and shutting down of the worker thread is not shown
   in the schematic illustration. Only some initial administrative effort is depicted. For small Turtle data it is
   usually faster to not use a worker thread due to the overhead involved to create, manage and dispose it.
   It takes some statements until catching up of the delay caused by the worker thread overhead is successful.

   For simple Turtle data, which consists mostly of simple s-p-o triples, the ast evaluation is rather short. The
   overhead for managing a worker thread compensates the time gain of evaluating the ast in a separate thread.

   +E+ denotes the time for evaluating an ast of a simple s-p-o Turtle statement

   main thread:   ---P---+E+---P---+E+---P---+E+---P---+E+---P---+E+---P---+E+...

   Use the 'thread' option for Turtle data which actually uses explicit Turtle syntax like prefixes,
   predicate object-lists, collections, etc.
   */

  // Queue of (evaluator, ast) pairs consumed by the AST worker thread.
  var astQueue = mutable.Queue[(NTripleType ⇒ Int, NTripleType)]()
  val worker = new ASTThreadWorker(astQueue)

  // In conversion mode the ASTs are evaluated concurrently by a worker thread;
  // in validation mode nothing is rendered, so no worker is started.
  if (!validate) {
    import scala.util._
    worker.setName("NTripleASTWorker" + Random.nextInt())
    worker.start()
  }

  /*
   Enqueue the evaluation closure and ast of one N-Triples statement and wake the worker.
   */
  def asynchronous(ast: (NTripleType ⇒ Int, NTripleType)) = astQueue.synchronized {
    astQueue.enqueue(ast)
    if (astQueue.nonEmpty) astQueue.notify()
  }

  // Statement renderer; lazy because it is never needed in validation mode.
  lazy val renderStatement = EvalNTriples(output, basePath, label).renderStatement _

  // Whitespace inside a statement: space and tab only (EOL is significant in N-Triples).
  def ws = rule {
    quiet(anyOf(" \t").*)
  }

  //
  def comment = rule {
    quiet('#' ~ capture(noneOf("\n").*)) ~> ASTComment
  }

  //[1] ntriplesDoc ::= triple? (EOL triple)* EOL?
  // Entry point. On success yields the statement count: the parse-time tally in
  // validation mode, otherwise the worker's evaluation sum after draining the queue.
  def ntriplesDoc: Rule1[Long] = rule {
    (ntriples).*(EOL) ~ EOL.? ~ EOI ~> ((v: Seq[Int]) ⇒ {
      if (!validate) {
        // Stop the worker, then evaluate whatever it had not dequeued yet.
        worker.shutdown()
        worker.join()
        while (astQueue.nonEmpty) {
          val (renderStatement, ast) = astQueue.dequeue()
          worker.sum += renderStatement(ast)
        }
      }
      worker.quit()
      if (validate) v.sum
      else worker.sum
    })
  }

  // One statement. Yields 1 for a counted triple, 0 for comments/blank lines
  // or while parboiled2 is in error analysis (where side effects must be avoided).
  def ntriples = rule {
    (triple ~> ((ast: NTripleType) ⇒
      if (!__inErrorAnalysis) {
        if (!validate) {
          // Conversion mode: hand the ast to the worker thread for rendering.
          asynchronous((renderStatement, ast)); 1
        } else
          ast match {
            case ASTComment(s) ⇒ 0
            case _             ⇒ 1
          }
      } else {
        if (!validate) {
          // Error analysis after a failed parse: flush pending asts so counts stay consistent.
          if (astQueue.nonEmpty) {
            worker.shutdown()
            worker.join()
            while (astQueue.nonEmpty) {
              val (renderStatement, ast) = astQueue.dequeue()
              worker.sum += renderStatement(ast)
            }
          }
        }
        0
      }))
  }

  //[2] triple ::= subject predicate object '.'
  // Also accepts a pure comment line and an all-whitespace (blank) line.
  def triple: Rule1[NTripleType] = rule {
    ws ~ (subject ~ predicate ~ `object` ~ '.' ~ ws ~ comment.? ~> ASTTriple | comment ~> ASTTripleComment) | quiet(anyOf(" \t").+) ~ push("") ~> ASTBlankLine
  }

  //[3] subject ::= IRIREF | BLANK_NODE_LABEL
  // Dispatch on the first character instead of backtracking through alternatives.
  def subject = rule {
    run {
      cursorChar match {
        case '<' ⇒ IRIREF
        case '_' ⇒ BLANK_NODE_LABEL
        case _   ⇒ MISMATCH
      }
    } ~> ASTSubject
  }

  //[4] predicate ::= IRIREF
  def predicate: Rule1[ASTPredicate] = rule {
    IRIREF ~> ASTPredicate
  }

  //[5] object ::= IRIREF | BLANK_NODE_LABEL | literal
  def `object` = rule {
    run {
      cursorChar match {
        case '<' ⇒ IRIREF
        case '"' ⇒ literal
        case '_' ⇒ BLANK_NODE_LABEL
        case _   ⇒ MISMATCH
      }
    } ~> ASTObject
  }

  //[6] literal ::= STRING_LITERAL_QUOTE ('^^' IRIREF | LANGTAG)?
  def literal = rule {
    STRING_LITERAL_QUOTE ~ ws ~ (LANGTAG | '^' ~ '^' ~ ws ~ IRIREF).? ~> ASTLiteral ~ ws
  }

  //[144s] LANGTAG ::= '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
  def LANGTAG = rule {
    atomic("@" ~ capture(Alpha.+ ~ ('-' ~ AlphaNum.+).*)) ~ ws ~> ASTLangTag
  }

  //[7] EOL ::= [#xD#xA]+
  def EOL = rule {
    ('\r'.? ~ '\n').+
  }

  //[8] IRIREF ::= '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>'
  // The negative lookahead rejects UCHAR escapes that would encode a character
  // not permitted in an IRIREF: controls (#x00-#x1F), space, '"', '<', '>',
  // '\', '^', '`', '{', '|', '}'.
  // BUG FIX: the disallowed list used to contain \u0034/\U00000034 (U+0034 = '4',
  // a perfectly legal IRI character) instead of \u0022/\U00000022 (U+0022 = '"',
  // which the production forbids) — a decimal 34 / hex 0x22 mix-up. Also, the
  // long-form lowercase escape of '^' was listed as \U0000005E twice; the second
  // occurrence is now \U0000005e, matching every other upper/lowercase pair.
  def IRIREF = rule {
    atomic('<' ~ clearSB ~ (IRIREF_CHAR ~ appendSB |
      !((("\\u000" | "\\u001" | "\\U0000000" | "\\U0000001") ~ HexDigit) |
        "\\u0020" | "\\U00000020" | "\\u0022" | "\\U00000022" |
        "\\u003C" | "\\u003c" | "\\U0000003C" | "\\U0000003c" |
        "\\u003E" | "\\u003e" | "\\U0000003E" | "\\U0000003e" |
        "\\u005C" | "\\u005c" | "\\U0000005C" | "\\U0000005c" |
        "\\u005E" | "\\u005e" | "\\U0000005E" | "\\U0000005e" |
        "\\u0060" | "\\U00000060" |
        "\\u007B" | "\\u007b" | "\\U0000007B" | "\\U0000007b" |
        "\\u007C" | "\\u007c" | "\\U0000007C" | "\\U0000007c" |
        "\\u007D" | "\\u007d" | "\\U0000007D" | "\\U0000007d") ~ UCHAR(false)).*) ~ push(sb.toString) ~ '>' ~>
      ((iri: String) ⇒ (test(isAbsoluteIRIRef(iri)) | run(ChelonaErrorFormatter().WarningMessage(
        "relative IRI not allowed", iri, "Use absolute IRI instead", cursor, input
      ))) ~ push(iri)) ~> ASTIriRef ~ ws
  }

  //[9] STRING_LITERAL_QUOTE ::= '"' ([^#x22#x5C#xA#xD] | ECHAR | UCHAR)* '"'
  def STRING_LITERAL_QUOTE = rule {
    atomic('"' ~ clearSB ~ (noneOf("\"\\\r\n") ~ appendSB | UCHAR(true) | ECHAR).* ~ '"') ~ push(sb.toString) ~ ws ~> ASTStringLiteralQuote
  }

  //[141s] BLANK_NODE_LABEL ::= '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?
  // Interior dots are only consumed when followed by a PN_CHARS, so a label never ends in '.'.
  def BLANK_NODE_LABEL = rule {
    atomic("_:" ~ capture(PN_CHARS_U_DIGIT ~ (PN_CHARS | &(ch('.').+ ~ PN_CHARS) ~ ch('.').+ ~ PN_CHARS | isHighSurrogate ~ isLowSurrogate).*)) ~> ASTBlankNodeLabel ~ ws
  }

  //[10] UCHAR ::= '\\u' HEX HEX HEX HEX | '\\U' HEX HEX HEX HEX HEX HEX HEX HEX
  // `flag` is passed through to maskQuotes: inside a string literal (true) a decoded
  // quote is re-escaped as \", inside an IRI (false) it is kept as the text \u0022.
  def UCHAR(flag: Boolean) = rule {
    atomic("\\u" ~ capture(4.times(HexDigit))) ~> (maskQuotes(flag, _)) |
      atomic("\\U" ~ capture(8.times(HexDigit))) ~> (maskQuotes(flag, _))
  }

  //[153s] ECHAR ::= '\' [tbnrf"'\]
  def ECHAR = rule {
    atomic("\\" ~ appendSB ~ ECHAR_CHAR ~ appendSB)
  }

  // An IRI reference is treated as absolute when it is protocol-relative or carries a scheme.
  private def isAbsoluteIRIRef(iriRef: String): Boolean = {
    iriRef.startsWith("//") || hasScheme(iriRef)
  }

  private def hasScheme(iri: String) = SchemeIdentifier(iri)

  // Append the character denoted by hex escape `s` to the parser's string builder,
  // keeping a decoded '"' in escaped form so it cannot terminate the literal early.
  private def maskQuotes(flag: Boolean, s: String) = {
    val c = hexStringToCharString(s)
    if (c.compare("\"") != 0)
      appendSB(c)
    else {
      if (flag)
        appendSB("\\" + c)
      else
        appendSB("\\u0022")
    }
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy