
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Authors: Stefan Irimescu, Can Berker Cikis
*
*/
package sparksoniq.spark.iterator.flowr;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.rumbledb.api.Item;
import sparksoniq.exceptions.IteratorFlowException;
import sparksoniq.exceptions.SparksoniqRuntimeException;
import sparksoniq.jsoniq.runtime.iterator.RuntimeIterator;
import sparksoniq.jsoniq.runtime.iterator.primary.VariableReferenceIterator;
import sparksoniq.jsoniq.runtime.metadata.IteratorMetadata;
import sparksoniq.jsoniq.runtime.tupleiterator.RuntimeTupleIterator;
import sparksoniq.jsoniq.runtime.tupleiterator.SparkRuntimeTupleIterator;
import sparksoniq.jsoniq.tuple.FlworTuple;
import sparksoniq.semantics.DynamicContext;
import sparksoniq.spark.DataFrameUtils;
import sparksoniq.spark.closures.OLD_LetClauseMapClosure;
import sparksoniq.spark.udf.LetClauseUDF;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
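/**
 * Runtime iterator for the FLWOR let clause: it binds a variable to the
 * materialized result of its expression in every incoming tuple. Three
 * execution modes are supported: local tuple-by-tuple iteration (open/next),
 * RDD execution via a map closure (getRDD), and DataFrame execution via a
 * Spark SQL UDF (getDataFrame).
 *
 * For example, in a JSONiq query such as
 *   for $x in ... let $y := $x.foo return $y
 * this iterator evaluates the let clause against each tuple produced by the
 * preceding for clause, extending the tuple with a binding for $y.
 */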
public class LetClauseSparkIterator extends SparkRuntimeTupleIterator {
private static final long serialVersionUID = 1L;
private String _variableName; // for efficient use in local iteration
private RuntimeIterator _expression;
private DynamicContext _tupleContext; // re-use same DynamicContext object for efficiency
private FlworTuple _nextLocalTupleResult;
private Map<String, DynamicContext.VariableDependency> _dependencies;
public LetClauseSparkIterator(RuntimeTupleIterator child, VariableReferenceIterator variableReference, RuntimeIterator expression, IteratorMetadata iteratorMetadata) {
super(child, iteratorMetadata);
_variableName = variableReference.getVariableName();
_expression = expression;
_dependencies = _expression.getVariableDependencies();
}
@Override
public boolean isRDD() {
if (this._child == null) {
return false;
} else {
return _child.isRDD();
}
}
@Override
public boolean isDataFrame() {
if (this._child == null) {
return false;
} else {
return _child.isDataFrame();
}
}
@Override
public FlworTuple next() {
if (_hasNext) {
FlworTuple result = _nextLocalTupleResult; // save the result to be returned
setNextLocalTupleResult(); // calculate and store the next result
return result;
}
throw new IteratorFlowException("Invalid next() call in let flwor clause", getMetadata());
}
private void setNextLocalTupleResult() {
// if this is the first clause (no child), the single start tuple was already produced in open(), so there are no more tuples
if (this._child == null) {
this._hasNext = false;
return;
}
if (_child.hasNext()) {
FlworTuple inputTuple = _child.next();
_tupleContext.removeAllVariables(); // clear the previous variables
_tupleContext.setBindingsFromTuple(inputTuple); // assign new variables from new tuple
List<Item> results = new ArrayList<>();
_expression.open(_tupleContext);
while (_expression.hasNext())
results.add(_expression.next());
_expression.close();
FlworTuple newTuple = new FlworTuple(inputTuple, _variableName, results);
_nextLocalTupleResult = newTuple;
this._hasNext = true;
} else {
_child.close();
this._hasNext = false;
}
}
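// open() covers the local execution mode only: with a child clause, result tuples
// are computed lazily one at a time; without one, the let clause starts the tuple
// stream itself and produces a single tuple (see the else branch below).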
@Override
public void open(DynamicContext context) {
super.open(context);
// no isRDD check is needed: open() is only used for local (non-RDD) execution
if (this._child != null) { //if it's not a start clause
_child.open(_currentDynamicContext);
_tupleContext = new DynamicContext(_currentDynamicContext); // assign current context as parent
setNextLocalTupleResult();
} else { //if it's a start clause, it returns only one tuple
// expression is materialized
List<Item> results = new ArrayList<>();
_expression.open(this._currentDynamicContext);
while (_expression.hasNext())
results.add(_expression.next());
_expression.close();
FlworTuple newTuple = new FlworTuple(_variableName, results);
_nextLocalTupleResult = newTuple;
}
}
@Override
public void close() {
this._isOpen = false;
_nextLocalTupleResult = null; // release the buffered tuple
if (_child != null) {
_child.close();
}
}
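// RDD execution mode: the child clause supplies an RDD of tuples, and each tuple
// is extended with the new variable binding by a map closure that evaluates the
// let expression.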
@Override
public JavaRDD<FlworTuple> getRDD(DynamicContext context) {
if (this._child != null) {
this._rdd = _child.getRDD(context);
this._rdd = this._rdd.map(new OLD_LetClauseMapClosure(_variableName, _expression));
return _rdd;
}
throw new SparksoniqRuntimeException("Initial letClauses don't support RDDs");
}
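// DataFrame execution mode: the let expression is wrapped in a UDF that receives
// only the columns the expression depends on and returns the new binding in
// serialized (binary) form; the output DataFrame carries the previous columns
// plus one new column named after the let variable.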
@Override
public Dataset<Row> getDataFrame(DynamicContext context, Map<String, DynamicContext.VariableDependency> parentProjection)
{
//if it's not a start clause
if (this._child != null) {
Dataset<Row> df = _child.getDataFrame(context, getProjection(parentProjection));
StructType inputSchema = df.schema();
int duplicateVariableIndex = Arrays.asList(inputSchema.fieldNames()).indexOf(_variableName);
List<String> allColumns = DataFrameUtils.getColumnNames(inputSchema, duplicateVariableIndex, null);
List<String> UDFcolumns = DataFrameUtils.getColumnNames(inputSchema, -1, _dependencies);
df.sparkSession().udf().register("letClauseUDF",
new LetClauseUDF(_expression, UDFcolumns), DataTypes.BinaryType);
String selectSQL = DataFrameUtils.getSQL(allColumns, true);
String udfSQL = DataFrameUtils.getSQL(UDFcolumns, false);
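// The generated query keeps the previous columns and appends the UDF result as a
// new column named after the let variable:
// select <allColumns>, letClauseUDF(array(<UDFcolumns>)) as `<variableName>` from input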
df.createOrReplaceTempView("input");
df = df.sparkSession().sql(
String.format("select %s letClauseUDF(array(%s)) as `%s` from input",
selectSQL, udfSQL, _variableName)
);
return df;
}
throw new SparksoniqRuntimeException("Initial letClauses don't support DataFrames");
}
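// The dependencies of a let clause are those of its expression, minus the variables
// bound upstream in the same FLWOR expression (these are satisfied internally),
// plus the dependencies of the child clause itself.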
@Override
public Map<String, DynamicContext.VariableDependency> getVariableDependencies()
{
Map<String, DynamicContext.VariableDependency> result = new TreeMap<>();
result.putAll(_expression.getVariableDependencies());
if(_child != null)
{
for (String var : _child.getVariablesBoundInCurrentFLWORExpression())
{
result.remove(var);
}
result.putAll(_child.getVariableDependencies());
}
return result;
}
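// Variables visible downstream of this clause: everything bound upstream plus the
// let variable itself.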
@Override
public Set<String> getVariablesBoundInCurrentFLWORExpression()
{
Set<String> result = new HashSet<>();
if(_child != null)
{
result.addAll(_child.getVariablesBoundInCurrentFLWORExpression());
}
result.add(_variableName);
return result;
}
@Override
public void print(StringBuffer buffer, int indent)
{
super.print(buffer, indent);
for (int i = 0; i < indent + 1; ++i)
{
buffer.append(" ");
}
buffer.append("Variable " + _variableName);
buffer.append("\n");
_expression.print(buffer, indent+1);
}
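// Computes the projection (variable name -> dependency mode) to push down to the
// child clause, so that columns never used downstream can be pruned early.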
public Map<String, DynamicContext.VariableDependency> getProjection(Map<String, DynamicContext.VariableDependency> parentProjection)
{
if(_child == null)
{
return null;
}
// start with an empty projection.
Map<String, DynamicContext.VariableDependency> projection = new TreeMap<>();
// copy over the projection needed by the parent clause.
projection.putAll(parentProjection);
// remove the variable that this clause binds.
projection.remove(_variableName);
// add the variable dependencies needed by this let clause's expression.
Map<String, DynamicContext.VariableDependency> exprDependency = _expression.getVariableDependencies();
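// If the parent and the expression need the same variable in different dependency
// modes, conservatively fall back to requiring the full variable.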
for(String variable : exprDependency.keySet())
{
if(projection.containsKey(variable)) {
if(projection.get(variable) != exprDependency.get(variable))
{
projection.put(variable, DynamicContext.VariableDependency.FULL);
}
} else {
projection.put(variable, exprDependency.get(variable));
}
}
return projection;
}
}