All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.marklogic.contentpump.RDFReader Maven / Gradle / Ivy

There is a newer version: 11.3.1
Show newest version
/*
 * Copyright (c) 2019 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.contentpump;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Vector;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RiotReader;
import org.apache.jena.riot.lang.LangRIOT;
import org.apache.jena.riot.lang.PipedQuadsStream;
import org.apache.jena.riot.lang.PipedRDFIterator;
import org.apache.jena.riot.lang.PipedRDFStream;
import org.apache.jena.riot.lang.PipedTriplesStream;
import org.apache.jena.riot.system.ErrorHandler;
import org.apache.jena.riot.system.ParserProfile;
import org.apache.jena.riot.system.RiotLib;
import org.apache.jena.riot.system.StreamRDF;
import org.apache.jena.riot.system.StreamRDFLib;

import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.query.Dataset;
import com.hp.hpl.jena.query.DatasetFactory;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.sparql.core.Quad;
import com.marklogic.contentpump.utilities.FileIterator;
import com.marklogic.contentpump.utilities.IdGenerator;
import com.marklogic.contentpump.utilities.PermissionUtil;
import com.marklogic.mapreduce.LinkedMapWritable;
import com.marklogic.mapreduce.MarkLogicConstants;
import com.marklogic.mapreduce.utilities.InternalUtilities;
import com.marklogic.xcc.AdhocQuery;
import com.marklogic.xcc.ContentCapability;
import com.marklogic.xcc.ContentPermission;
import com.marklogic.xcc.ContentSource;
import com.marklogic.xcc.RequestOptions;
import com.marklogic.xcc.ResultSequence;
import com.marklogic.xcc.Session;
import com.marklogic.xcc.exceptions.RequestException;
import com.marklogic.xcc.exceptions.XccConfigException;

/**
 * Reader for RDF quads/triples. Uses Jena library to parse RDF and sends triples
 * to the database in groups of MAXTRIPLESPERDOCUMENT.
 * 
 * @author nwalsh
 *
 * @param 
 */
public class RDFReader extends ImportRecordReader {
    public static final Log LOG = LogFactory.getLog(RDFReader.class);
    public static final String HASHALGORITHM = "SHA-256";
    public static final String DEFAULT_GRAPH = "http://marklogic.com/semantics#default-graph";
    public static final String JENA_DEFAULT_GRAPH = "urn:x-arq:DefaultGraphNode";
    protected static Pattern[] patterns = new Pattern[] {
            Pattern.compile("&"), Pattern.compile("<"), Pattern.compile(">") };

    protected int MAXTRIPLESPERDOCUMENT = 100;
    protected int MAXGRAPHSPERREQUEST = 100;
    protected int countPerBatch = 0;
    protected long INMEMORYTHRESHOLD = 1 * 1024 * 1000; // 1Mb
    protected long INGESTIONNOTIFYSTEP = 10000;

    protected Dataset dataset = null;
    protected StmtIterator statementIter = null;
    protected Iterator graphNameIter = null;
    protected String collection = null;
    protected RunnableParser jenaStreamingParser = null;

    protected PipedRDFIterator rdfIter;
    protected PipedRDFStream rdfInputStream;
    protected Lang lang;

    protected Hashtable collectionHash = 
            new Hashtable ();
    protected int collectionCount = 0;
    private static final int MAX_COLLECTIONS = 100;
    protected boolean ignoreCollectionQuad = false;
    protected boolean hasOutputCol = false;
    protected String outputGraph;
    protected String outputOverrideGraph;
    protected StringBuilder buffer;
    protected boolean hasNext = true;
    protected IdGenerator idGen;

    protected Random random;
    protected long randomValue;
    protected long milliSecs;
    private long HASH64_STEP = 15485863L;

    protected String origFn;
    // Tracks input filename even in the CompressedRDFReader case
    protected String inputFn; 
    protected long splitStart;
    protected long start;
    /*stays 0 until we're done*/
    protected long pos;
    protected long end;
    protected boolean compressed;
    protected long ingestedTriples = 0;
    /* new graphs identified within a RDFReader */
    protected HashSet newGraphs;
    protected HashMap existingMapPerms;
    protected Iterator graphItr;
    /* server version */
    protected String version;
    protected LinkedMapWritable roleMap;
    protected ContentPermission[] defaultPerms;
    protected StringBuilder graphQry;
    /* hadoop:get-role-map() only exists in ML 8.0-1~8.0-3 */
    protected boolean roleMapExists;
    protected boolean graphSupported;
    
    private static final Object jenaLock = new Object();
    
    public RDFReader(String version, LinkedMapWritable roleMap) {
        random = new Random();
        randomValue = random.nextLong();
        Calendar cal = Calendar.getInstance();
        milliSecs = cal.getTimeInMillis();
        compressed = false;
        this.version = version;
        this.roleMap = roleMap;
        roleMapExists = roleMap!=null && roleMap.size()>0 ;
        graphQry = new StringBuilder();
        existingMapPerms = new HashMap();
        newGraphs = new HashSet();
    }

    @Override
    public void close() throws IOException {
        if(rdfIter!=null) {
            rdfIter.close();
        }
        dataset = null;
        if(graphQry.length()==0) 
            return;
        //create graph doc in a batch
        submitGraphQuery();
    }

    protected void submitGraphQuery() throws IOException{
        Session session = null;
        ContentSource cs;
        try {
            cs = InternalUtilities.getOutputContentSource(conf,
                conf.getStrings(MarkLogicConstants.OUTPUT_HOST)[0]);
            session = cs.newSession();
            RequestOptions options = new RequestOptions();
            options.setDefaultXQueryVersion("1.0-ml");
            session.setDefaultRequestOptions(options);
            AdhocQuery query = session.newAdhocQuery(graphQry.toString());
            if(LOG.isDebugEnabled()) {
                LOG.debug(graphQry.toString());
            }
            query.setOptions(options);
            session.submitRequest(query);
        } catch (RequestException e) {
            throw new IOException(e);
        } catch (XccConfigException e) {
            throw new IOException(e);
        } finally {
            if (session != null) {
                session.close();
            }
        }
    }
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (!hasNext) {
            return 1;
        }
        return (pos > end) ? 1 : ((float) (pos - start)) / (end - start);
    }

    @Override
    public void initialize(InputSplit inSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        if (version == null)
            throw new IOException("Server Version is null");
        String majorVersion = version.substring(0, version.indexOf('.'));
        graphSupported = Integer.valueOf(majorVersion) >= 8;
        conf = context.getConfiguration();

        String rdfopt = conf.get(ConfigConstants.RDF_STREAMING_MEMORY_THRESHOLD);
        if (rdfopt != null) {
            INMEMORYTHRESHOLD = Long.parseLong(rdfopt);
        }

        rdfopt = conf.get(ConfigConstants.RDF_TRIPLES_PER_DOCUMENT);
        if (rdfopt != null) {
            MAXTRIPLESPERDOCUMENT = Integer.parseInt(rdfopt);
        }

        String fnAsColl = conf.get(ConfigConstants.CONF_OUTPUT_FILENAME_AS_COLLECTION);
        if (fnAsColl != null) {
            LOG.warn("The -filename_as_collection has no effect with input_type RDF, use -output_collections instead.");
        }

        String[] collections = conf.getStrings(MarkLogicConstants.OUTPUT_COLLECTION);
        outputGraph = conf.get(MarkLogicConstants.OUTPUT_GRAPH);
        outputOverrideGraph = conf.get(MarkLogicConstants.OUTPUT_OVERRIDE_GRAPH);
        //if no defulat-graph set and output_collections is set
        ignoreCollectionQuad = (outputGraph == null && collections != null)
            || outputOverrideGraph != null;
        hasOutputCol = (collections != null);

        Class valueClass = RDFWritable.class;

        @SuppressWarnings("unchecked")
        VALUEIN localValue = (VALUEIN) ReflectionUtils.newInstance(valueClass, 
                conf);

        value = localValue;
        encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                DEFAULT_ENCODING);

        setFile(((FileSplit) inSplit).getPath());
        fs = file.getFileSystem(context.getConfiguration());
        
        FileStatus status = fs.getFileStatus(file);
        if(status.isDirectory()) {
            iterator = new FileIterator((FileSplit)inSplit, context);
            inSplit = iterator.next();
        }

        try {
            initStream(inSplit);
        } catch (IOException e ){
            LOG.error("Invalid input: " + file.getName() + ": " + e.getMessage());
            throw e;
        }
        String[] perms = conf.getStrings(MarkLogicConstants.OUTPUT_PERMISSION);
        if(perms!=null) {
            defaultPerms = PermissionUtil.getPermissions(perms).toArray(
                new ContentPermission[perms.length>>1]);
        } else {
            List tmp = PermissionUtil.getDefaultPermissions(conf,roleMap);
            if(tmp!=null)
                defaultPerms = tmp.toArray(new ContentPermission[tmp.size()]);
        }
            
        if (roleMapExists) 
            initExistingMapPerms();
    }

    protected void initStream(InputSplit inSplit)
            throws IOException, InterruptedException {
        FSDataInputStream in = openFile(inSplit, false);
        if (in == null) {
            return;
        }
        long size = inSplit.getLength();
        initParser(file.toUri().toASCIIString(), size);
        parse(file.getName(), in);
    }

    protected void initParser(String fsname, long size) throws IOException {
        start = 0;
        pos = 0;
        end = 1;

        jenaStreamingParser = null;
        dataset = null;
        statementIter = null;
        graphNameIter = null;

        String ext = null;
        if (fsname.contains(".")) {
            int pos = fsname.lastIndexOf(".");
            ext = fsname.substring(pos);
            if (".gz".equals(ext)) {
                fsname = fsname.substring(0, pos);
                pos = fsname.lastIndexOf(".");
                if (pos >= 0) {
                    ext = fsname.substring(pos);
                } else {
                    ext = null;
                }
            }
        }
        origFn = fsname;
        inputFn = Long.toHexString(fuse(scramble(random.nextLong()),
                fuse(scramble(milliSecs), random.nextLong())));
        idGen = new IdGenerator(inputFn + "-" + splitStart);
        lang = null;
        if (".rdf".equals(ext)) {
            lang = Lang.RDFXML;
        } else if (".ttl".equals(ext)) {
            lang = Lang.TURTLE;
        } else if (".json".equals(ext)) {
            lang = Lang.RDFJSON;
        } else if (".n3".equals(ext)) {
            lang = Lang.N3;
        } else if (".nt".equals(ext)) {
            lang = Lang.NTRIPLES;
        } else if (".nq".equals(ext)) {
            lang = Lang.NQUADS;
        } else if (".trig".equals(ext)) {
            lang = Lang.TRIG;
        } else {
            lang = Lang.RDFXML; // We have to default to something!
        }
        synchronized (jenaLock) {
            if (size < INMEMORYTHRESHOLD) {
                dataset = DatasetFactory.createMem();
            } 
        }
    }

    protected void parse(String fsname, FSDataInputStream in)
            throws IOException {
        try {
            loadModel(fsname, in);
        } catch (Exception e) {
            LOG.error("Failed to parse(please check intactness and encoding): " + origFn);
        }
    }

    protected void loadModel(final String fsname, final InputStream in) throws IOException {
        if (dataset == null) {
            if (lang == Lang.NQUADS || lang == Lang.TRIG) {
                rdfIter = new PipedRDFIterator();
                @SuppressWarnings("unchecked")
                PipedQuadsStream stream = new PipedQuadsStream(rdfIter);
                rdfInputStream = stream;
            } else {
                rdfIter = new PipedRDFIterator();
                @SuppressWarnings("unchecked")
                PipedTriplesStream stream = new PipedTriplesStream(rdfIter);
                rdfInputStream = stream;
            }

            // Create a runnable for our parser thread
            jenaStreamingParser = new RunnableParser(origFn, fsname, in);

            // Run it
            new Thread(jenaStreamingParser).start();
        } else {
            StreamRDF dest = StreamRDFLib.dataset(dataset.asDatasetGraph());
            LangRIOT parser = RiotReader.createParser(in, lang, fsname, dest);
            ErrorHandler handler = new ParserErrorHandler(fsname);
            ParserProfile prof = RiotLib.profile(lang, fsname, handler);
            parser.setProfile(prof);
            try {
                parser.parse();
            } catch (Throwable e) {
                LOG.error("Parse error in RDF document(please check intactness and encoding); processing partial document:"
                    + fsname + " " + e.getMessage());
            }
            in.close();
            graphNameIter = dataset.listNames();
            statementIter = dataset.getDefaultModel().listStatements();
        }

        // We don't know how many statements are in the model; we could count them, but that's
        // possibly expensive. So we just say 0 until we're done.
        pos = 0;
        end = 1;
    }

    protected void write(String str) {
        if (buffer == null) {
            buffer = new StringBuilder();
        }
        buffer.append(str);
    }


    private long rotl(long x, long y)
    {
        return (x<>(64-y));
    }

    private long fuse(long a, long b)
    {
        return rotl(a,8)^b;
    }

    private long scramble(long x)
    {
        return x^rotl(x,20)^rotl(x,40);
    }

    private long hash64(long value, String str) {
        char[] arr = str.toCharArray();
        for (int i = 0; i < str.length(); i++) {
            value = (value + Character.getNumericValue(arr[i])) * HASH64_STEP;
        }
        return value;
    }

    protected String resource(Node rsrc) {
        if (rsrc.isBlank()) {
            return "http://marklogic.com/semantics/blank/" + Long.toHexString(
                        hash64(fuse(scramble(milliSecs),randomValue), rsrc.getBlankNodeLabel()));
        } else {
            return escapeXml(rsrc.toString());
        }
    }

    protected String resource(Node rsrc, String tag) {
        String uri = resource(rsrc);
        return "" + uri + "";
    }

    private String resource(Resource rsrc) {
        if (rsrc.isAnon()) {
            return "http://marklogic.com/semantics/blank/" + Long.toHexString(
                    hash64(fuse(scramble(milliSecs),randomValue), rsrc.getId().getLabelString()));
        } else {
            return escapeXml(rsrc.toString());
        }
    }

    protected String resource(Resource rsrc, String tag) {
        String uri = resource(rsrc);
        return "" + uri + "";
    }

    protected String subject(Node subj) {
        return resource(subj, "subject");
    }

    protected String subject(Resource subj) {
        return resource(subj, "subject");
    }

    protected String predicate(Node subj) {
        return resource(subj, "predicate");
    }

    protected String predicate(Resource subj) {
        return resource(subj, "predicate");
    }

    protected String object(Node node) {
        if (node.isLiteral()) {
            String text = node.getLiteralLexicalForm();
            String type = node.getLiteralDatatypeURI();
            String lang = node.getLiteralLanguage();

            if (lang == null || "".equals(lang)) {
                lang = "";
            } else {
                lang = " xml:lang='" + escapeXml(lang) + "'";
            }

            if ("".equals(lang)) {
                if (type == null) {
                    type = "http://www.w3.org/2001/XMLSchema#string";
                }
                type = " datatype='" + escapeXml(type) + "'";
            } else {
                type = "";
            }

            return "" + escapeXml(text) + "";
        } else if (node.isBlank()) {
            return "http://marklogic.com/semantics/blank/" + Long.toHexString(
                    hash64(fuse(scramble(milliSecs),randomValue), node.getBlankNodeLabel()))
                    +"";
        } else {
            return "" + escapeXml(node.toString()) + "";
        }
    }

    private String object(RDFNode node) {
        if (node.isLiteral()) {
            Literal lit = node.asLiteral();
            String text = lit.getString();
            String lang = lit.getLanguage();
            String type = lit.getDatatypeURI();

            if (lang == null || "".equals(lang)) {
                lang = "";
            } else {
                lang = " xml:lang='" + escapeXml(lang) + "'";
            }

            if ("".equals(lang)) {
                if (type == null) {
                    type = "http://www.w3.org/2001/XMLSchema#string";
                }
                type = " datatype='" + escapeXml(type) + "'";
            } else {
                type = "";
            }

            return "" + escapeXml(text) + "";
        } else if (node.isAnon()) {
            return "http://marklogic.com/semantics/blank/" + Long.toHexString(
                    hash64(fuse(scramble(milliSecs),randomValue), node.toString()))
                    +"";
        } else {
            return "" + escapeXml(node.toString()) + "";
        }
    }

    protected static String escapeXml(String _in) {
        if (null == _in){
            return "";
        }
        return patterns[2].matcher(
                    patterns[1].matcher(
                        patterns[0].matcher(_in).replaceAll("&"))
                            .replaceAll("<")).replaceAll(">");
    }

    protected void setKey() {
        setKey(idGen.incrementAndGet() +".xml", 0, 0, true);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        boolean result = false;
        if (jenaStreamingParser == null || !jenaStreamingParser.failed()) {
            if (statementIter == null) {
                result = nextStreamingKeyValue();
            } else {
                result = nextInMemoryKeyValue();
            }
        }

        return result;
    }
    
    public void initExistingMapPerms() throws IOException {
        Session session = null;
        ResultSequence result = null;
        ContentSource cs;
        try {
            cs = InternalUtilities.getOutputContentSource(conf,
                conf.getStrings(MarkLogicConstants.OUTPUT_HOST)[0]);
            session = cs.newSession();
            RequestOptions options = new RequestOptions();
            options.setDefaultXQueryVersion("1.0-ml");
            session.setDefaultRequestOptions(options);
            StringBuilder sb = new StringBuilder();
            sb.append("xquery version \"1.0-ml\";\n");
            sb.append("for $doc in fn:collection(\"http://marklogic.com/semantics#graphs\")");
            sb.append("return (fn:base-uri($doc),for $p in $doc/*:graph/*:permissions/*:permission return ($p/*:role-id/text(),$p/*:capability/text()),\"0\")");
            
            if(LOG.isDebugEnabled()) {
                LOG.debug(sb.toString());
            }
            AdhocQuery query = session.newAdhocQuery(sb.toString());
            query.setOptions(options);
            result = session.submitRequest(query);
            while (result.hasNext()) {
                String uri = result.next().asString();
                String tmp = result.next().asString();
                ArrayList perms = new ArrayList();
                while(!tmp.equals("0")) {
                    Text roleid = new Text(tmp);
                    if (!result.hasNext()) {
                        throw new IOException("Invalid role map");
                    }
                    String roleName = roleMap.get(roleid).toString();
                    String cap = result.next().asString();
                    ContentCapability capability = PermissionUtil
                        .getCapbility(cap);
                    perms.add(new ContentPermission(capability, roleName));
                    tmp = result.next().asString();
                }
                
                existingMapPerms.put(uri, perms.toArray(new ContentPermission[perms.size()]));
                
            }
        } catch (XccConfigException e) {
            throw new IOException(e);
        } catch (RequestException e) {
            throw new IOException(e);
        } finally {
            if (result != null) {
                result.close();
            }
            if (session != null) {
                session.close();
            }
        }
    }
    
    /* 
     * create graph doc
     * 
     * return ContentPermission[] for the graph
     */
    public ContentPermission[] insertGraphDoc(String graph) throws IOException {
        ArrayList perms = new ArrayList();
            ContentPermission[] permissions = defaultPerms;
            StringBuilder sb = graphQry;
            if (countPerBatch >= MAXGRAPHSPERREQUEST) {
                countPerBatch = 0;
                submitGraphQuery();
                graphQry.setLength(0);
            }
            String escapedGraph = escapeXml(graph);
            sb.append("if(fn:empty(fn:doc(\"").append(escapedGraph)
                .append("\"))) then sem:create-graph-document(sem:iri(\"")
                .append(escapedGraph).append("\"),(");
            if (permissions != null && permissions.length > 0) {
                for (int i = 0; i < permissions.length; i++) {
                    ContentPermission cp = permissions[i];
                    if (i > 0)
                        sb.append(",");
                    sb.append("xdmp:permission(\"");
                    sb.append(cp.getRole());
                    sb.append("\",\"");
                    sb.append(cp.getCapability());
                    sb.append("\")");
                }
                sb.append(")");
            } else {
                sb.append("xdmp:default-permissions())");
            }
            sb.append(") else ();\n");
            countPerBatch++;
        return perms.toArray(new ContentPermission[0]);
    }

    public boolean nextInMemoryKeyValue() throws IOException, InterruptedException {
        if (lang == Lang.NQUADS || lang == Lang.TRIG) {
            return nextInMemoryQuadKeyValue();
        } else {
            return nextInMemoryTripleKeyValue();
        }
    }

    public boolean nextInMemoryTripleKeyValue() throws IOException, InterruptedException {
        if(statementIter == null) return false;
        if (!statementIter.hasNext()) {
            hasNext = false;
            return false;
        }
        setKey();
        write("\n");
        write("" + origFn + "\n");
        int max = MAXTRIPLESPERDOCUMENT;
        while (max > 0 && statementIter.hasNext()) {
            Statement stmt = statementIter.nextStatement();
            write("");
            write(subject(stmt.getSubject()));
            write(predicate(stmt.getPredicate()));
            write(object(stmt.getObject()));
            write("\n");
            notifyUser();
            max--;
        }
        write("\n");

        if (!statementIter.hasNext()) {
            pos = 1;
        }

        writeValue();
        return true;
    }

    public boolean nextInMemoryQuadKeyValue() throws IOException, InterruptedException {
        if (ignoreCollectionQuad) {
            return nextInMemoryQuadKeyValueIgnoreCollections();
        } else {
            return nextInMemoryQuadKeyValueWithCollections();
        }
    }

    public boolean nextInMemoryQuadKeyValueWithCollections() throws IOException, InterruptedException {
        if(statementIter == null) return false;
        while (!statementIter.hasNext()) {
            if (graphNameIter.hasNext()) {
                collection = graphNameIter.next();
                statementIter = dataset.getNamedModel(collection).listStatements();
            } else {
                hasNext = false;
                collection = null;
                return false;
            }
        }
        setKey();
        write("");
        int max = MAXTRIPLESPERDOCUMENT;
        while (max > 0 && statementIter.hasNext()) {
            Statement stmt = statementIter.nextStatement();
            write("");
            write(subject(stmt.getSubject()));
            write(predicate(stmt.getPredicate()));
            write(object(stmt.getObject()));
            write("");
            max--;
            notifyUser();
        }
        write("\n");

        if (!statementIter.hasNext()) {
            pos = 1;
        }

        writeValue(collection);
        return true;
    }

    public boolean nextInMemoryQuadKeyValueIgnoreCollections() throws IOException, InterruptedException {
        if(statementIter == null) return false;
        while (!statementIter.hasNext()) {
            if (graphNameIter.hasNext()) {
                collection = graphNameIter.next();
                statementIter = dataset.getNamedModel(collection).listStatements();
            } else {
                hasNext = false;
                return false;
            }
        }
        setKey();
        write("");
        int max = MAXTRIPLESPERDOCUMENT;
        while (max > 0 && statementIter.hasNext()) {
            Statement stmt = statementIter.nextStatement();
            write("");
            write(subject(stmt.getSubject()));
            write(predicate(stmt.getPredicate()));
            write(object(stmt.getObject()));
            write("");
            notifyUser();
            max--;

            boolean moreTriples = statementIter.hasNext();
            while (!moreTriples) {
                moreTriples = true; // counter-intuitive; get out of the loop if we really are finished
                if (graphNameIter.hasNext()) {
                    collection = graphNameIter.next();
                    statementIter = dataset.getNamedModel(collection).listStatements();
                    moreTriples = statementIter.hasNext();
                }
            }
        }
        write("\n");

        if (!statementIter.hasNext()) {
            pos = 1;
        }

        writeValue();
        return true;
    }

    public boolean nextStreamingKeyValue() throws IOException, InterruptedException {
        if(rdfIter == null) return false;
        if (!rdfIter.hasNext() && collectionHash.size() == 0) {
            if(compressed) {
                hasNext = false;
                return false;
            } else {
                if (iterator!=null && iterator.hasNext()) {
                    close();
                    initStream(iterator.next());
                } else {
                    hasNext = false;
                    return false;
                }
            }
        }

        if (lang == Lang.NQUADS || lang == Lang.TRIG) {
            return nextStramingQuadKeyValue();
        } else {
            return nextStreamingTripleKeyValue();
        }
    }

    protected boolean nextStreamingTripleKeyValue() throws IOException, InterruptedException {
        if(rdfIter == null) return false;
        setKey();
        write("");
        int max = MAXTRIPLESPERDOCUMENT;
        while (max > 0 && rdfIter.hasNext()) {
            Triple triple = (Triple) rdfIter.next();
            write("");
            write(subject(triple.getSubject()));
            write(predicate(triple.getPredicate()));
            write(object(triple.getObject()));
            write("");
            notifyUser();
            max--;
        }
        write("\n");

        if (!rdfIter.hasNext()) {
            pos = 1;
        }

        writeValue();
        return true;
    }

    public boolean nextStramingQuadKeyValue() throws IOException, InterruptedException {
        if (ignoreCollectionQuad) {
            return nextStreamingQuadKeyValueIgnoreCollections();
        } else {
            return nextStreamingQuadKeyValueWithCollections();
        }
    }

    protected boolean nextStreamingQuadKeyValueIgnoreCollections() throws IOException, InterruptedException {
        if(rdfIter == null) return false;
        setKey();
        write("");
        int max = MAXTRIPLESPERDOCUMENT;
        while (max > 0 && rdfIter.hasNext()) {
            Quad quad = (Quad) rdfIter.next();
            write("");
            write(subject(quad.getSubject()));
            write(predicate(quad.getPredicate()));
            write(object(quad.getObject()));
            write("");
            notifyUser();
            max--;
        }
        write("\n");

        if (!rdfIter.hasNext()) {
            pos = 1;
        }

        writeValue();
        return true;
    }

    public boolean nextStreamingQuadKeyValueWithCollections() throws IOException, InterruptedException {
        if(rdfIter == null) return false;
        if (!rdfIter.hasNext() && collectionHash.isEmpty()) {
            hasNext = false;
            return false;
        }

        String collection = null;
        boolean overflow = false;
        while (!overflow && rdfIter.hasNext()) {
            Quad quad = (Quad) rdfIter.next();

            Node graph = quad.getGraph();
            if (graph == null) {
                collection = DEFAULT_GRAPH;
            } else {
                collection = resource(quad.getGraph());
            }

            String triple = subject(quad.getSubject())
                    + predicate(quad.getPredicate())
                    + object(quad.getObject());

            if (!collectionHash.containsKey(collection)) {
                collectionHash.put(collection, new Vector());
                collectionCount++;
                //System.err.println("Added " + collection + " (" + collectionHash.keySet().size() + ")");
            } else {
                //System.err.println("      " + collection + " (" + collectionHash.get(collection).size() + ")");
            }

            @SuppressWarnings("unchecked")
            Vector triples = collectionHash.get(collection);

            triples.add("" + triple + "");

            //System.err.println(triple);

            if (triples.size() == MAXTRIPLESPERDOCUMENT) {
                //System.err.println("Full doc " + collection + " (" + triples.size() + ")");
                overflow = true;
            } else if (collectionCount > MAX_COLLECTIONS) {
                collection = largestCollection();
                //System.err.println("Full hsh " + collection + " (" + collectionHash.get(collection).size() + ")");
                overflow = true;
            }
        }

        if (!overflow) {
            // HACK: fix this!
            for (String c : collectionHash.keySet()) {
                collection = c;
                //System.err.println("Flushing " + collection + " (" + collectionHash.get(collection).size() + ")");
                break;
            }
        }
        Vector triples = collectionHash.get(collection);
        setKey();
        write("");
        for (String t : triples) {
            write(t);
            notifyUser();
        }
        write("\n");

        collectionHash.remove(collection);
        collectionCount--;

        if (!rdfIter.hasNext()) {
            pos = 1;
        }

        writeValue(collection);
        return true;
    }

    public void writeValue() throws IOException {
        writeValue(null);
    }

    public void writeValue(String collection) throws IOException {
        if (value instanceof Text) {
            ((Text) value).set(buffer.toString());
        } else if (value instanceof RDFWritable) {
            ((RDFWritable)value).set(buffer.toString());
            
            if (collection != null) {
                if(collection.equals(JENA_DEFAULT_GRAPH)) {
                    collection = null;
                }
            }
            
            if(hasOutputCol){// output_collections is set
               if(outputOverrideGraph!=null) {
                    collection = outputOverrideGraph;
                } else if(outputGraph != null) {
                    if (collection == null) {
                        //no graph specified in quad, use output_graph
                        collection = outputGraph;
                    }
                } else { // no output_graph or output_override_graph
                    String[] outCols = conf
                    .getStrings(MarkLogicConstants.OUTPUT_COLLECTION);
                collection = outCols[0];
                }
            } else {//no output_collections
                if (collection == null) { //no quad in data
                    collection = outputGraph != null ? outputGraph : outputOverrideGraph;
                    if(collection == null)
                        collection = DEFAULT_GRAPH;
                }
            }
            if (roleMapExists && existingMapPerms.containsKey(collection)) {
                ((RDFWritable)value).setPermissions(existingMapPerms.get(collection));
            } else {
                ((RDFWritable)value).setPermissions(defaultPerms);
            }
            if (graphSupported && !newGraphs.contains(collection)) {
                newGraphs.add(collection);
                insertGraphDoc(collection);
            }
            ((RDFWritable)value).setGraph(collection);
        } else {
            ((Text)((ContentWithFileNameWritable)
                    value).getValue()).set(buffer.toString());
        }

        buffer.setLength(0);
    }

    protected String largestCollection() {
        String collection = "";
        int count = -1;

        for (String c : collectionHash.keySet()) {
            if (collectionHash.get(c).size() > count) {
                count = collectionHash.get(c).size();
                collection = c;
            }
        }

        return collection;
    }

    protected void notifyUser() {
        ingestedTriples++;
        if (ingestedTriples % INGESTIONNOTIFYSTEP == 0) {
            LOG.info("Ingested " + ingestedTriples + " triples from " + origFn);
        }
    }

    protected class ParserErrorHandler implements ErrorHandler {
        String inputfn = "";

        public ParserErrorHandler(String inputfn) {
            this.inputfn = inputfn;
        }

        private String formatMessage(String message, long line, long col) {
            String msg = inputfn + ":";
            if (line >= 0) {
                msg += line;
            }
            if (line >= 0 && col >= 0) {
                msg += ":" + col;
            }
            return msg += " " + message;
        }

        @Override
        public void warning(String message, long line, long col) {
            if (message.contains("Bad IRI:")) {
                LOG.debug(formatMessage(message, line, col));
            } else {
                LOG.warn(formatMessage(message, line, col));
            }
        }

        @Override
        public void error(String message, long line, long col) {
            LOG.error(formatMessage(message, line, col));
        }

        @Override
        public void fatal(String message, long line, long col) {
            LOG.fatal(formatMessage(message, line, col));
        }
    }
    
    protected class RunnableParser implements Runnable {
        final String fsname;
        final InputStream in;
        final String origFn;
        private boolean failed = false;
        
        public RunnableParser(String origFn, String fsname, InputStream in) {
            super();
            this.fsname = fsname;
            this.in = in;
            this.origFn = origFn;
            if(LOG.isDebugEnabled())
                LOG.debug("O:" + origFn + " : " + fsname);
        }

        public boolean failed() {
            return failed;
        }

        @Override
        public void run() {
            LangRIOT parser;
            try {
                parser = RiotReader.createParser(in, lang, fsname, 
                        rdfInputStream);
            } catch (Exception ex) {
                // Yikes something went horribly wrong, bad encoding maybe?
                LOG.error("Failed to parse(please check intactness and encoding): " + origFn, ex);

                byte[] b = new byte[0] ;
                InputStream emptyBAIS = new ByteArrayInputStream(b) ;
                parser = RiotReader.createParser(emptyBAIS, lang, fsname, 
                        rdfInputStream);
            }
            try {
                ErrorHandler handler = new ParserErrorHandler(fsname);
                ParserProfile prof = RiotLib.profile(lang, fsname, handler);
                parser.setProfile(prof);
                parser.parse();
            } catch (Exception ex) {
                failed = true;
                LOG.error("Parse error in RDF document(please check intactness and encoding); processing partial document:"
                    + origFn + " " + ex.getMessage());
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy