All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.marklogic.spark.reader.document.DocumentRowBuilder Maven / Gradle / Ivy

There is a newer version: 2.4.2
Show newest version
/*
 * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
 */
package com.marklogic.spark.reader.document;

import com.marklogic.client.document.DocumentManager;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.spark.ConnectorException;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData;
import org.apache.spark.sql.catalyst.util.ArrayData;
import org.apache.spark.unsafe.types.ByteArray;
import org.apache.spark.unsafe.types.UTF8String;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.XMLOutputter;

import java.io.ByteArrayInputStream;
import java.util.*;

/**
 * Knows how to build a Spark row conforming to our {@code DocumentRowSchema}.
 * 

* This has to support two different ways of specifying which metadata to include. {@code ForestReader} needs to * capture the requested metadata in one way, while other approaches can just capture the metadata categories as a * simple list of strings. */ public class DocumentRowBuilder { private final List metadataCategories; private final Set requestedMetadata; // For handling XML document properties private final SAXBuilder saxBuilder; private final XMLOutputter xmlOutputter; private static final Namespace PROPERTIES_NAMESPACE = Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"); private String uri; private byte[] content; private String format; private DocumentMetadataHandle metadata; public DocumentRowBuilder(List metadataCategories) { this.saxBuilder = new SAXBuilder(); this.xmlOutputter = new XMLOutputter(); this.metadataCategories = metadataCategories != null ? metadataCategories : new ArrayList<>(); this.requestedMetadata = null; } public DocumentRowBuilder(Set requestedMetadata) { this.saxBuilder = new SAXBuilder(); this.xmlOutputter = new XMLOutputter(); this.requestedMetadata = requestedMetadata; this.metadataCategories = null; } public DocumentRowBuilder withUri(String uri) { this.uri = uri; return this; } public DocumentRowBuilder withContent(byte[] content) { this.content = content; return this; } public DocumentRowBuilder withFormat(String format) { this.format = format; return this; } public DocumentRowBuilder withMetadata(DocumentMetadataHandle metadata) { this.metadata = metadata; return this; } public GenericInternalRow buildRow() { Object[] row = new Object[8]; row[0] = UTF8String.fromString(uri); row[1] = ByteArray.concat(content); if (format != null) { row[2] = UTF8String.fromString(format); } if (metadata != null) { if (includeCollections()) { populateCollectionsColumn(row, metadata); } if (includePermissions()) { populatePermissionsColumn(row, metadata); } if (includeQuality()) { populateQualityColumn(row, metadata); } if (includeProperties()) { populatePropertiesColumn(row, metadata); } if (includeMetadataValues()) { populateMetadataValuesColumn(row, metadata); } } return new GenericInternalRow(row); } private boolean includeCollections() { return includeMetadata("collections", DocumentManager.Metadata.COLLECTIONS); } private boolean includePermissions() { return includeMetadata("permissions", DocumentManager.Metadata.PERMISSIONS); } private boolean includeQuality() { return includeMetadata("quality", DocumentManager.Metadata.QUALITY); } private boolean includeProperties() { return includeMetadata("properties", DocumentManager.Metadata.PROPERTIES); } private boolean includeMetadataValues() { return includeMetadata("metadatavalues", DocumentManager.Metadata.METADATAVALUES); } private boolean includeMetadata(String categoryName, DocumentManager.Metadata metadataType) { return metadataCategories != null ? metadataCategories.contains(categoryName) || metadataCategories.isEmpty() : requestedMetadata.contains(metadataType) || requestedMetadata.contains(DocumentManager.Metadata.ALL); } private void populateCollectionsColumn(Object[] row, DocumentMetadataHandle metadata) { UTF8String[] collections = new UTF8String[metadata.getCollections().size()]; Iterator iterator = metadata.getCollections().iterator(); for (int i = 0; i < collections.length; i++) { collections[i] = UTF8String.fromString(iterator.next()); } row[3] = ArrayData.toArrayData(collections); } private void populatePermissionsColumn(Object[] row, DocumentMetadataHandle metadata) { DocumentMetadataHandle.DocumentPermissions perms = metadata.getPermissions(); UTF8String[] roles = new UTF8String[perms.size()]; Object[] capabilityArrays = new Object[perms.size()]; int i = 0; for (Map.Entry> entry : perms.entrySet()) { roles[i] = UTF8String.fromString(entry.getKey()); UTF8String[] capabilities = new UTF8String[entry.getValue().size()]; int j = 0; Iterator iterator = entry.getValue().iterator(); while (iterator.hasNext()) { capabilities[j++] = UTF8String.fromString(iterator.next().name()); } capabilityArrays[i++] = ArrayData.toArrayData(capabilities); } row[4] = ArrayBasedMapData.apply(roles, capabilityArrays); } private void populateQualityColumn(Object[] row, DocumentMetadataHandle metadata) { row[5] = metadata.getQuality(); } /** * The properties fragment can be a complex XML structure with mixed content and attributes and thus cannot be * defined as a map of particular types. Instead, as of the 2.3.0 release of the connector, the properties column * is of type String and is expected to contain a serialized string of XML representing the contents of the * properties fragment. To obtain that, this method serializes the metadata object into its REST API XML * serialization and then extracts the portion containing the document properties. * * @param row * @param metadata */ private void populatePropertiesColumn(Object[] row, DocumentMetadataHandle metadata) { if (metadata.getProperties() == null || metadata.getProperties().size() == 0) { return; } try { Document doc = this.saxBuilder.build(new ByteArrayInputStream(metadata.toBuffer())); Element properties = doc.getRootElement().getChild("properties", PROPERTIES_NAMESPACE); if (properties != null) { row[6] = UTF8String.fromString(this.xmlOutputter.outputString(properties)); } } catch (Exception e) { throw new ConnectorException(String.format( "Unable to process XML document properties for row with URI %s; cause: %s", row[0], e.getMessage()), e); } } private void populateMetadataValuesColumn(Object[] row, DocumentMetadataHandle metadata) { DocumentMetadataHandle.DocumentMetadataValues metadataValues = metadata.getMetadataValues(); UTF8String[] keys = new UTF8String[metadataValues.size()]; UTF8String[] values = new UTF8String[metadataValues.size()]; int index = 0; for (Map.Entry entry : metadataValues.entrySet()) { keys[index] = UTF8String.fromString(entry.getKey()); values[index++] = UTF8String.fromString(entry.getValue()); } row[7] = ArrayBasedMapData.apply(keys, values); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy