Please wait. This can take a few minutes...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1.
You can buy this project and then download or modify it as often as you want.
org.apache.parquet.cli.util.Expressions Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.cli.util;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;
/**
 * Static helpers that parse simple path expressions (for example {@code a.b[3]}) and use them
 * either to select a value out of an Avro datum or to project an Avro schema down to the
 * referenced parts.
 *
 * <p>A path is a sequence of field references separated by {@code .} and dereferences written
 * inside {@code [ ]}. Malformed paths are rejected with {@link IllegalStateException}; paths that
 * do not fit the schema or datum are rejected with {@link IllegalArgumentException}.
 */
public class Expressions {

  /** Matches a string made only of decimal digits; used to validate array and union positions. */
  private static final Pattern NUMERIC_RE = Pattern.compile("^\\d+$");

  /**
   * Selects the value addressed by {@code path} from {@code datum}.
   *
   * @param schema the Avro schema describing {@code datum}
   * @param datum the value to navigate
   * @param path a path expression; an empty path selects {@code datum} itself
   * @return the selected value (may be {@code null} when a map has no entry for the key)
   * @throws IllegalStateException if {@code path} is malformed
   * @throws IllegalArgumentException if {@code path} cannot be applied to the schema or datum
   */
  public static Object select(Schema schema, Object datum, String path) {
    // parse() returns null for an empty path; wrapping that null in a list would make the
    // recursive select() dereference a null token, so use an empty token list instead.
    PathExpr expr = parse(path);
    List<PathExpr> tokens = Lists.newArrayList();
    if (expr != null) {
      tokens.add(expr);
    }
    return select(schema, datum, tokens);
  }

  /**
   * Recursive worker for {@link #select(Schema, Object, String)}.
   *
   * @param schema schema of {@code datum}
   * @param datum the current value
   * @param tokens remaining path tokens; empty means {@code datum} is the result, and more than
   *     one token is rejected because only a single value can be returned
   * @return the selected value
   */
  private static Object select(Schema schema, Object datum, List<PathExpr> tokens) {
    if (tokens.isEmpty()) {
      return datum;
    }

    Preconditions.checkArgument(tokens.size() == 1, "Cannot return multiple values");
    PathExpr token = tokens.get(0);

    switch (schema.getType()) {
      case RECORD:
        if (!(datum instanceof GenericRecord) && "json".equals(schema.getName())) {
          // skip the placeholder record schema: the datum is the wrapped value itself
          return select(schema.getField("value").schema(), datum, tokens);
        }
        Preconditions.checkArgument(token.type == PathExpr.Type.FIELD, "Cannot dereference records");
        Preconditions.checkArgument(datum instanceof GenericRecord, "Not a record: %s", datum);
        GenericRecord record = (GenericRecord) datum;
        Schema.Field field = schema.getField(token.value);
        Preconditions.checkArgument(field != null, "No such field '%s' in schema: %s", token.value, schema);
        return select(field.schema(), record.get(token.value), token.children);

      case MAP:
        Preconditions.checkArgument(datum instanceof Map, "Not a map: %s", datum);
        Map<?, ?> map = (Map<?, ?>) datum;
        Object value = map.get(token.value);
        if (value == null) {
          // keys may be stored as Avro Utf8 instead of String, so retry with a Utf8 key
          value = map.get(new Utf8(token.value));
        }
        return select(schema.getValueType(), value, token.children);

      case ARRAY:
        Preconditions.checkArgument(token.type == PathExpr.Type.DEREF, "Cannot access fields of an array");
        // Elements are fetched by position, so a List (not merely a Collection) is required;
        // checking for List here reports a clear error instead of a ClassCastException below.
        Preconditions.checkArgument(datum instanceof List, "Not an array: %s", datum);
        Preconditions.checkArgument(
            NUMERIC_RE.matcher(token.value).matches(), "Not an array index: %s", token.value);
        List<?> list = (List<?>) datum;
        return select(schema.getElementType(), list.get(Integer.parseInt(token.value)), token.children);

      case UNION:
        // unions are transparent for selection: continue with the branch that matches the datum
        int branch = GenericData.get().resolveUnion(schema, datum);
        return select(schema.getTypes().get(branch), datum, tokens);

      default:
        throw new IllegalArgumentException("Cannot access child of primitive value: " + datum);
    }
  }

  /**
   * Projects {@code schema} down to the parts referenced by the given paths, which look like
   * {@code a.2.b[3]["key"]}.
   *
   * <ul>
   *   <li>optional (union with null) should be ignored
   *   <li>unions should match by position number or short name (e.g. 2, user)
   *   <li>fields should match by name
   *   <li>arrays are dereferenced by position [n] =&gt; schema is the element schema
   *   <li>maps are dereferenced by key =&gt; schema is the value schema
   * </ul>
   *
   * @param schema an Avro schema
   * @param fieldPaths selected field paths; {@code null} returns {@code schema} unchanged
   * @return a filtered schema
   */
  public static Schema filterSchema(Schema schema, String... fieldPaths) {
    if (fieldPaths == null) {
      // behave like the List overload instead of failing inside Lists.newArrayList
      return schema;
    }
    return filterSchema(schema, Lists.newArrayList(fieldPaths));
  }

  /**
   * Projects {@code schema} down to the parts referenced by the given paths.
   *
   * @param schema an Avro schema
   * @param fieldPaths selected field paths; {@code null} returns {@code schema} unchanged, and
   *     empty path strings are ignored
   * @return a filtered schema
   * @throws IllegalStateException if a path is malformed
   * @throws IllegalArgumentException if a path cannot be applied to the schema
   */
  public static Schema filterSchema(Schema schema, List<String> fieldPaths) {
    if (fieldPaths == null) {
      return schema;
    }
    List<PathExpr> paths = merge(Lists.newArrayList(fieldPaths));
    return filter(schema, paths);
  }

  /**
   * Parses a path string into a chain of {@link PathExpr} tokens, each token holding the next
   * one as its only child.
   *
   * <p>Text between separators becomes a FIELD token; text between {@code [} and {@code ]}
   * becomes a DEREF token whose value is the raw bracket content.
   * NOTE(review): quotes inside brackets are kept as part of the value — confirm that is intended.
   *
   * @param path the path expression
   * @return the first token of the chain, or {@code null} when {@code path} is empty
   * @throws IllegalStateException if the path has empty references or unbalanced brackets
   */
  private static PathExpr parse(String path) {
    // Sentinel parent: lets every token be appended the same way. It is never returned.
    PathExpr root = PathExpr.field(null);
    PathExpr last = root;
    boolean inDeref = false; // currently between '[' and ']'
    boolean afterDeref = false; // the previous character was ']'
    int valueStart = 0; // index where the token currently being read begins

    for (int i = 0; i < path.length(); i += 1) {
      switch (path.charAt(i)) {
        case '.':
          Preconditions.checkState(valueStart != i || afterDeref, "Empty reference: ''");
          if (!inDeref) {
            // a '.' inside brackets is part of the dereference value, not a separator
            if (valueStart != i) {
              last = append(last, PathExpr.field(path.substring(valueStart, i)));
            }
            valueStart = i + 1;
            afterDeref = false;
          }
          break;

        case '[':
          Preconditions.checkState(!inDeref, "Cannot nest [ within []");
          Preconditions.checkState(valueStart != i || afterDeref, "Empty reference: ''");
          if (valueStart != i) {
            last = append(last, PathExpr.field(path.substring(valueStart, i)));
          }
          valueStart = i + 1;
          inDeref = true;
          afterDeref = false;
          break;

        case ']':
          Preconditions.checkState(inDeref, "Cannot use ] without a starting [");
          Preconditions.checkState(valueStart != i, "Empty reference: ''");
          last = append(last, PathExpr.deref(path.substring(valueStart, i)));
          valueStart = i + 1;
          inDeref = false;
          afterDeref = true;
          break;

        default:
          Preconditions.checkState(!afterDeref, "Fields after [] must start with .");
      }
    }

    Preconditions.checkState(!inDeref, "Fields after [ must end with ]");

    if (valueStart < path.length()) {
      // trailing field name that was not terminated by a separator
      append(last, PathExpr.field(path.substring(valueStart)));
    }

    return root.children.isEmpty() ? null : root.children.get(0);
  }

  /** Adds {@code child} under {@code parent} and returns {@code child} as the new chain tail. */
  private static PathExpr append(PathExpr parent, PathExpr child) {
    parent.children.add(child);
    return child;
  }

  /**
   * Parses every path and merges the results into a forest in which paths with a common prefix
   * share tokens.
   *
   * @param fields path strings
   * @return the merged top-level tokens
   */
  private static List<PathExpr> merge(List<String> fields) {
    List<PathExpr> paths = Lists.newArrayList();
    for (String field : fields) {
      PathExpr expr = parse(field);
      if (expr != null) {
        // parse() yields null for an empty path; skip it so no null token reaches filter()
        merge(paths, expr);
      }
    }
    return paths;
  }

  /**
   * Merges {@code toAdd} into {@code tokens}. A token matches when it has the same type and either
   * both are dereferences (regardless of index or key) or both name the same field; the children
   * of {@code toAdd} are then merged recursively. If nothing matches, {@code toAdd} is appended.
   *
   * @param tokens the sibling tokens to merge into (modified in place)
   * @param toAdd the token to merge
   * @return {@code tokens}
   */
  private static List<PathExpr> merge(List<PathExpr> tokens, PathExpr toAdd) {
    boolean merged = false;
    for (PathExpr token : tokens) {
      if ((token.type == toAdd.type)
          && (token.type == PathExpr.Type.DEREF || token.value.equals(toAdd.value))) {
        for (PathExpr child : toAdd.children) {
          merge(token.children, child);
        }
        merged = true;
      }
    }
    if (!merged) {
      tokens.add(toAdd);
    }
    return tokens;
  }

  /**
   * Projects {@code schema} using a set of sibling expressions.
   *
   * @param schema the schema to filter
   * @param exprs sibling tokens; empty means the whole schema is kept
   * @return the filtered schema
   * @throws IllegalArgumentException if an expression cannot be applied to {@code schema}
   */
  private static Schema filter(Schema schema, List<PathExpr> exprs) {
    if (exprs.isEmpty()) {
      return schema;
    }

    switch (schema.getType()) {
      case RECORD:
        // rebuild the record with only the requested fields, each filtered recursively
        List<Schema.Field> fields = Lists.newArrayList();
        for (PathExpr expr : exprs) {
          Schema.Field field = schema.getField(expr.value);
          Preconditions.checkArgument(
              field != null, "Cannot find field '%s' in schema: %s", expr.value, schema);
          fields.add(new Schema.Field(
              expr.value,
              filter(field.schema(), expr.children),
              field.doc(),
              field.defaultVal(),
              field.order()));
        }
        return Schema.createRecord(
            schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError(), fields);

      case UNION:
        // Ignore schemas that are a union with null because there is another token
        if (schema.getTypes().size() == 2) {
          if (schema.getTypes().get(0).getType() == Schema.Type.NULL) {
            return filter(schema.getTypes().get(1), exprs);
          } else if (schema.getTypes().get(1).getType() == Schema.Type.NULL) {
            return filter(schema.getTypes().get(0), exprs);
          }
        }

        List<Schema> schemas = Lists.newArrayList();
        for (PathExpr expr : exprs) {
          schemas.add(filter(schema, expr));
        }

        if (schemas.size() > 1) {
          return Schema.createUnion(schemas);
        } else {
          return schemas.get(0);
        }

      case MAP:
        Preconditions.checkArgument(
            exprs.size() == 1, "Cannot find multiple children of map schema: %s", schema);
        return filter(schema, exprs.get(0));

      case ARRAY:
        Preconditions.checkArgument(
            exprs.size() == 1, "Cannot find multiple children of array schema: %s", schema);
        return filter(schema, exprs.get(0));

      default:
        throw new IllegalArgumentException(String.format("Cannot find child of primitive schema: %s", schema));
    }
  }

  /**
   * Projects {@code schema} using a single expression.
   *
   * @param schema the schema to filter
   * @param expr the token to apply; {@code null} keeps the whole schema
   * @return the filtered schema
   * @throws IllegalArgumentException if {@code expr} cannot be applied to {@code schema}
   */
  private static Schema filter(Schema schema, PathExpr expr) {
    if (expr == null) {
      return schema;
    }

    switch (schema.getType()) {
      case RECORD:
        Preconditions.checkArgument(
            expr.type == PathExpr.Type.FIELD, "Cannot index a record: [%s]", expr.value);
        Schema.Field field = schema.getField(expr.value);
        if (field != null) {
          return filter(field.schema(), expr.children);
        } else {
          throw new IllegalArgumentException(
              String.format("Cannot find field '%s' in schema: %s", expr.value, schema.toString(true)));
        }

      case MAP:
        // the key itself does not affect the schema; only the value schema is filtered
        return Schema.createMap(filter(schema.getValueType(), expr.children));

      case ARRAY:
        Preconditions.checkArgument(
            expr.type == PathExpr.Type.DEREF, "Cannot find field '%s' in an array", expr.value);
        Preconditions.checkArgument(
            NUMERIC_RE.matcher(expr.value).matches(),
            "Cannot index array by non-numeric value '%s'",
            expr.value);
        return Schema.createArray(filter(schema.getElementType(), expr.children));

      case UNION:
        // TODO: this should only return something if the type can match rather than explicitly
        // accessing parts of a union. when selecting data, unions are ignored.
        Preconditions.checkArgument(
            expr.type == PathExpr.Type.DEREF, "Cannot find field '%s' in a union", expr.value);
        List<Schema> options = schema.getTypes();
        if (NUMERIC_RE.matcher(expr.value).matches()) {
          // look up the option by position
          int i = Integer.parseInt(expr.value);
          if (i < options.size()) {
            return filter(options.get(i), expr.children);
          }
        } else {
          // look up the option by name
          for (Schema option : options) {
            if (expr.value.equalsIgnoreCase(option.getName())) {
              return filter(option, expr.children);
            }
          }
        }
        throw new IllegalArgumentException(
            String.format("Invalid union index '%s' for schema: %s", expr.value, schema));

      default:
        throw new IllegalArgumentException(
            String.format("Cannot find '%s' in primitive schema: %s", expr.value, schema));
    }
  }

  /** One token of a parsed path: a field reference or a bracketed dereference, plus children. */
  private static class PathExpr {
    /** Kind of token: {@code DEREF} for {@code [value]}, {@code FIELD} for a plain name. */
    enum Type {
      DEREF,
      FIELD
    }

    /** Creates a dereference token with no children. */
    static PathExpr deref(String value) {
      return new PathExpr(Type.DEREF, value);
    }

    /** Creates a dereference token with a single child. */
    static PathExpr deref(String value, PathExpr child) {
      return new PathExpr(Type.DEREF, value, Lists.newArrayList(child));
    }

    /** Creates a field token with no children. */
    static PathExpr field(String value) {
      return new PathExpr(Type.FIELD, value);
    }

    /** Creates a field token with a single child. */
    static PathExpr field(String value, PathExpr child) {
      return new PathExpr(Type.FIELD, value, Lists.newArrayList(child));
    }

    private final Type type;
    private final String value;
    // mutable on purpose: parsing and merging append to this list
    private final List<PathExpr> children;

    PathExpr(Type type, String value) {
      this.type = type;
      this.value = value;
      this.children = Lists.newArrayList();
    }

    PathExpr(Type type, String value, List<PathExpr> children) {
      this.type = type;
      this.value = value;
      this.children = children;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }
      PathExpr that = (PathExpr) o;
      return type == that.type
          && (value != null ? value.equals(that.value) : that.value == null)
          && (children != null ? children.equals(that.children) : that.children == null);
    }

    @Override
    public int hashCode() {
      int result = type != null ? type.hashCode() : 0;
      result = 31 * result + (value != null ? value.hashCode() : 0);
      result = 31 * result + (children != null ? children.hashCode() : 0);
      return result;
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper(this)
          .add("type", type)
          .add("value", value)
          .add("children", children)
          .toString();
    }
  }
}