All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.parquet.glob.GlobParser Maven / Gradle / Ivy

There is a newer version: 1.15.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.glob;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.parquet.glob.GlobNode.Atom;
import org.apache.parquet.glob.GlobNode.GlobNodeSequence;
import org.apache.parquet.glob.GlobNode.OneOf;

final class GlobParser {
  private GlobParser() {}

  /**
   * Parse a String into a {@link GlobNodeSequence}
   * 

* See {@link org.apache.parquet.Strings#expandGlob(String)} */ public static GlobNodeSequence parse(String pattern) { /* * The parse algorithm works as follows, assuming we are parsing: * "apache{one,pre{x,y}post,two}parquet{a,b}" * * 1) Begin scanning the string until we find the first { * * 2) Now that we've found the beginning of a glob group, scan forwards * until the end of this glob group (by counting { and } we see until we find * the closing } for the group we found in step 1). * * 3) Once the matching closing } is found we need to do two things. First, everything * from the end of the last group up to start of this group is an Atom, so in the example * above, once we've found that "{one,pre{x,y}post,two}" is the first group, we need to grab * "apache" and treat it as an atom and add it to our sequence. * Then, we parse "{one,pre{x,y}post,two}" using a similar but slightly different function (parseOneOf) * and add the result from that to our sequence. * * 4) Repeat until the end of the string -- so next we find {a,b} and add "parquet" as an Atom and parse * {a,b} using parseOneOf. */ if (pattern.isEmpty() || pattern.equals("{}")) { return new GlobNodeSequence(Collections.singletonList(new Atom(""))); } // the outer parse method needs to parse the pattern into a // GlobNodeSequence, though it may end up being a singleton sequence List children = new ArrayList(); int unmatchedBraces = 0; // count of unmatched braces int firstBrace = 0; // open brace of current group being processsed int anchor = 0; // first un-parsed character position for (int i = 0; i < pattern.length(); i++) { char c = pattern.charAt(i); switch (c) { case ',': if (unmatchedBraces == 0) { // commas not allowed in the top level expression // TODO: maybe turn this check off? 
throw new GlobParseException( "Unexpected comma outside of a {} group:\n" + annotateMessage(pattern, i)); } break; case '{': if (unmatchedBraces == 0) { // this is the first brace of an outermost {} group firstBrace = i; } unmatchedBraces++; break; case '}': unmatchedBraces--; if (unmatchedBraces < 0) { throw new GlobParseException("Unexpected closing }:\n" + annotateMessage(pattern, i)); } if (unmatchedBraces == 0) { // grab everything from the end of the last group up to here, // not including the close brace, it is an Atom in our sequence // (assuming it's not empty) if (anchor != firstBrace) { // not empty! // (substring's end param is exclusive) children.add(new Atom(pattern.substring(anchor, firstBrace))); } // grab the group, parse it, add it to our sequence, and then continue // note that we skip the braces on both sides (substring's end param is exclusive) children.add(parseOneOf(pattern.substring(firstBrace + 1, i))); // we have now parsed all the way up to here, the next un-parsed char is i + 1 anchor = i + 1; } break; } } if (unmatchedBraces > 0) { throw new GlobParseException("Not enough close braces in: " + pattern); } if (anchor != pattern.length()) { // either there were no {} groups, or there were some characters after the // last }, either way whatever is left (could be the entire input) is an Atom // in our sequence children.add(new Atom(pattern.substring(anchor, pattern.length()))); } return new GlobNodeSequence(children); } private static OneOf parseOneOf(String pattern) { /* * This method is only called when parsing the inside of a {} expression. * So in the example above, of calling parse("apache{one,pre{x,y}post,two}parquet{a,b}") * this method will get called on first "one,pre{x,y}post,two", then on "x,y" and then on "a,b" * * The inside of a {} expression essentially means "one of these comma separated expressions". * So this gets parsed slightly differently than the top level string passed to parse(). 
* * The algorithm works as follows: * 1) Split the string on ',' -- but only commas that are not inside of {} expressions * 2) Each of the splits can be parsed via the parse() method above * 3) Add all parsed splits to a single parent OneOf. */ // this inner parse method needs to parse the pattern into a // OneOf, though it may end up being a singleton OneOf List children = new ArrayList(); int unmatchedBraces = 0; // count of unmatched braces int anchor = 0; // first un-parsed character position for (int i = 0; i < pattern.length(); i++) { char c = pattern.charAt(i); switch (c) { case ',': // only "split" on commas not nested inside of {} if (unmatchedBraces == 0) { // ok, this comma is not inside of a {}, so // grab everything from anchor to here, parse it, and add it // as one of the options in this OneOf children.add(parse(pattern.substring(anchor, i))); // we have now parsed up to this comma, the next un-parsed char is i + 1 anchor = i + 1; } break; case '{': unmatchedBraces++; break; case '}': unmatchedBraces--; if (unmatchedBraces < 0) { throw new GlobParseException("Unexpected closing }:\n" + annotateMessage(pattern, i)); } break; } } if (unmatchedBraces > 0) { throw new GlobParseException("Not enough close braces in: " + pattern); } if (anchor != pattern.length()) { // either there were no commas outside of {} groups, or there were some characters after the // last comma, either way whatever is left (could be the entire input) is an Atom // in our sequence children.add(parse(pattern.substring(anchor, pattern.length()))); } if (pattern.length() > 0 && pattern.charAt(pattern.length() - 1) == ',') { // the above loop won't handle a trailing comma children.add(parse("")); } return new OneOf(children); } // for pretty printing which character had the error private static String annotateMessage(String message, int pos) { StringBuilder sb = new StringBuilder(message); sb.append('\n'); for (int i = 0; i < pos; i++) { sb.append('-'); } sb.append('^'); return 
sb.toString(); } public static class GlobParseException extends RuntimeException { public GlobParseException() {} public GlobParseException(String message) { super(message); } public GlobParseException(String message, Throwable cause) { super(message, cause); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy