org.opensearch.dissect.DissectParser Maven / Gradle / Ivy

Go to download
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.dissect;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Splits (dissects) a string into its parts based on a pattern.
A dissect pattern is composed of a set of keys and delimiters.
 * For example the dissect pattern: 
%{a} %{b},%{c} has 3 keys (a,b,c) and two delimiters (space and comma). This pattern will
 * match a string of the form: foo bar,baz
 and will result a key/value pairing of a=foo, b=bar, and c=baz.
 * Matches are all or nothing. For example, the same pattern will NOT match 
foo bar baz since all of the delimiters did not
 * match. (the comma did not match)
 * Dissect patterns can optionally have modifiers. These modifiers instruct the parser to change it's behavior. For example the
 * dissect pattern of 
%{a},%{b}:%{c}
 would not match foo,bar,baz since there the colon never matches.
 * Modifiers appear to the left or the right of the key name. The supported modifiers are:
 * 

 * {@code ->} Instructs the parser to ignore repeating delimiters to the right of the key. Example:  * pattern: {@code %{a->} %{b} %{c}}
 * string: {@code foo         bar baz}
 * result: {@code a=foo, b=bar, c=baz}
 * 
 * {@code +} Instructs the parser to appends this key's value to value of prior key with the same name.
 * Example:  * pattern: {@code %{a} %{+a} %{+a}}
 * string: {@code foo bar baz}
 * result: {@code a=foobarbaz}
 * 
 * {@code /} Instructs the parser to appends this key's value to value of a key based based on the order specified after the
 * {@code /}. Requires the {@code +} modifier to also be present in the key. Example:  * pattern: {@code %{a} %{+a/2} %{+a/1}}
 * string: {@code foo bar baz}
 * result: {@code a=foobazbar}
 * 
 * 
 * {@code *} Instructs the parser to ignore the name of this key, instead use the value of key as the key name.
 * Requires another key with the same name and the {@code &} modifier to be the value. Example:  * pattern: {@code %{*a} %{b} %{&a}}
 * string: {@code foo bar baz}
 * result: {@code foo=baz, b=bar}
 * 
 * {@code &} Instructs the parser to ignore this key and place the matched value to a key of the same name with the {@code *} modifier.
 * Requires another key with the same name and the {@code *} modifier.
 * Example:  * pattern: {@code %{*a} %{b} %{&a}}
 * string: {@code foo bar baz}
 * result: {@code foo=baz, b=bar}
 * 
 * {@code ?} Instructs the parser to ignore this key. The key name exists only for the purpose of human readability. Example
 *  *  pattern: {@code %{a} %{?skipme} %{c}}
 *  string: {@code foo bar baz}
 *  result: {@code a=foo, c=baz}
 * 
 * 
 * Empty key names patterns are also supported. They behave just like the {@code ?} modifier, except the name is not required.
 * The result will simply be ignored. Example
 * 
 * pattern: {@code %{a} %{} %{c}}
 * string: {@code foo bar baz}
 * result: {@code a=foo, c=baz}
 * 

 * 
 * Inspired by the Logstash Dissect Filter by Guy Boertje
 */
public final class DissectParser {
    private static final Pattern LEADING_DELIMITER_PATTERN = Pattern.compile("^(.*?)%");
    private static final Pattern KEY_DELIMITER_FIELD_PATTERN = Pattern.compile("%\\{([^}]*?)}([^%]*)", Pattern.DOTALL);
    private static final EnumSet ASSOCIATE_MODIFIERS = EnumSet.of(
        DissectKey.Modifier.FIELD_NAME,
        DissectKey.Modifier.FIELD_VALUE
    );
    private static final EnumSet APPEND_MODIFIERS = EnumSet.of(
        DissectKey.Modifier.APPEND,
        DissectKey.Modifier.APPEND_WITH_ORDER
    );
    private static final Function KEY_NAME = val -> val.getKey().getName();
    private final List matchPairs;
    private final String pattern;
    private String leadingDelimiter = "";
    private final int maxMatches;
    private final int maxResults;
    private final int appendCount;
    private final int referenceCount;
    private final String appendSeparator;

    public DissectParser(String pattern, String appendSeparator) {
        this.pattern = pattern;
        this.appendSeparator = appendSeparator == null ? "" : appendSeparator;
        Matcher matcher = LEADING_DELIMITER_PATTERN.matcher(pattern);
        while (matcher.find()) {
            leadingDelimiter = matcher.group(1);
        }
        List matchPairs = new ArrayList<>();
        matcher = KEY_DELIMITER_FIELD_PATTERN.matcher(pattern.substring(leadingDelimiter.length()));
        while (matcher.find()) {
            DissectKey key = new DissectKey(matcher.group(1));
            String delimiter = matcher.group(2);
            matchPairs.add(new DissectPair(key, delimiter));
        }
        this.maxMatches = matchPairs.size();
        this.maxResults = Long.valueOf(
            matchPairs.stream().filter(dissectPair -> !dissectPair.getKey().skip()).map(KEY_NAME).distinct().count()
        ).intValue();
        if (this.maxMatches == 0 || maxResults == 0) {
            throw new DissectException.PatternParse(pattern, "Unable to find any keys or delimiters.");
        }
        // append validation - look through all of the keys to see if there are any keys that need to participate in an append operation
        // but don't have the '+' defined
        Set appendKeyNames = matchPairs.stream()
            .filter(dissectPair -> APPEND_MODIFIERS.contains(dissectPair.getKey().getModifier()))
            .map(KEY_NAME)
            .distinct()
            .collect(Collectors.toSet());
        if (appendKeyNames.size() > 0) {
            List modifiedMatchPairs = new ArrayList<>(matchPairs.size());
            for (DissectPair p : matchPairs) {
                if (p.getKey().getModifier().equals(DissectKey.Modifier.NONE) && appendKeyNames.contains(p.getKey().getName())) {
                    modifiedMatchPairs.add(new DissectPair(new DissectKey(p.getKey(), DissectKey.Modifier.APPEND), p.getDelimiter()));
                } else {
                    modifiedMatchPairs.add(p);
                }
            }
            matchPairs = modifiedMatchPairs;
        }
        appendCount = appendKeyNames.size();

        // reference validation - ensure that '*' and '&' come in pairs
        Map> referenceGroupings = matchPairs.stream()
            .filter(dissectPair -> ASSOCIATE_MODIFIERS.contains(dissectPair.getKey().getModifier()))
            .collect(Collectors.groupingBy(KEY_NAME));
        for (Map.Entry> entry : referenceGroupings.entrySet()) {
            if (entry.getValue().size() != 2) {
                throw new DissectException.PatternParse(
                    pattern,
                    "Found invalid key/reference associations: '"
                        + entry.getValue().stream().map(KEY_NAME).collect(Collectors.joining(","))
                        + "' Please ensure each '*' is matched with a matching '&"
                );
            }
        }

        referenceCount = referenceGroupings.size() * 2;
        this.matchPairs = Collections.unmodifiableList(matchPairs);
    }

    /**
     * 
Entry point to dissect a string into it's parts.
     *
     * @param inputString The string to dissect
     * @return the key/value Map of the results
     * @throws DissectException if unable to dissect a pair into it's parts.
     */
    public Map parse(String inputString) {
        /**
         *
         * This implements a naive string matching algorithm. The string is walked left to right, comparing each byte against
         * another string's bytes looking for matches. If the bytes match, then a second cursor looks ahead to see if all the bytes
         * of the other string matches. If they all match, record it and advances the primary cursor to the match point. If it can not match
         * all of the bytes then progress the main cursor. Repeat till the end of the input string. Since the string being searching for
         * (the delimiter) is generally small and rare the naive approach is efficient.
         *
         * In this case the string that is walked is the input string, and the string being searched for is the current delimiter.
         * For example for a dissect pattern of {@code %{a},%{b}:%{c}} the delimiters (comma then colon) are searched for in the
         * input string. At class construction the list of keys+delimiters are found (dissectPairs), which allows the use of that ordered
         * list to know which delimiter to use for the search. The delimiters is progressed once the current delimiter is matched.
         *
         * There are two special cases that requires additional parsing beyond the standard naive algorithm. Consecutive delimiters should
         * results in a empty matches unless the {@code ->} is provided. For example given the dissect pattern of
         * {@code %{a},%{b},%{c},%{d}} and input string of {@code foo,,,} the match should be successful with empty values for b,c and d.
         * However, if the key modifier {@code ->}, is present it will simply skip over any delimiters just to the right of the key
         * without assigning any values. For example {@code %{a->},{%b}} will match the input string of {@code foo,,,,,,bar} with a=foo and
         * b=bar.
         *
         */
        DissectMatch dissectMatch = new DissectMatch(appendSeparator, maxMatches, maxResults, appendCount, referenceCount);
        Iterator it = matchPairs.iterator();
        // ensure leading delimiter matches
        if (inputString != null
            && inputString.length() > leadingDelimiter.length()
            && leadingDelimiter.equals(inputString.substring(0, leadingDelimiter.length()))) {
            byte[] input = inputString.getBytes(StandardCharsets.UTF_8);
            // grab the first key/delimiter pair
            DissectPair dissectPair = it.next();
            DissectKey key = dissectPair.getKey();
            byte[] delimiter = dissectPair.getDelimiter().getBytes(StandardCharsets.UTF_8);
            // start dissection after the first delimiter
            int i = leadingDelimiter.length();
            int valueStart = i;
            int lookAheadMatches;
            // start walking the input string byte by byte, look ahead for matches where needed
            // if a match is found jump forward to the end of the match
            for (; i < input.length; i++) {
                lookAheadMatches = 0;
                // potential match between delimiter and input string
                if (delimiter.length > 0 && input[i] == delimiter[0]) {
                    // look ahead to see if the entire delimiter matches the input string
                    for (int j = 0; j < delimiter.length; j++) {
                        if (i + j < input.length && input[i + j] == delimiter[j]) {
                            lookAheadMatches++;
                        }
                    }
                    // found a full delimiter match
                    if (lookAheadMatches == delimiter.length) {
                        // record the key/value tuple
                        byte[] value = Arrays.copyOfRange(input, valueStart, i);
                        dissectMatch.add(key, new String(value, StandardCharsets.UTF_8));
                        // jump to the end of the match
                        i += lookAheadMatches;
                        // look for consecutive delimiters (e.g. a,,,,d,e)
                        while (i < input.length) {
                            lookAheadMatches = 0;
                            for (int j = 0; j < delimiter.length; j++) {
                                if (i + j < input.length && input[i + j] == delimiter[j]) {
                                    lookAheadMatches++;
                                }
                            }
                            // found consecutive delimiters
                            if (lookAheadMatches == delimiter.length) {
                                // jump to the end of the match
                                i += lookAheadMatches;
                                if (!key.skipRightPadding()) {
                                    // progress the keys/delimiter if possible
                                    if (!it.hasNext()) {
                                        break; // the while loop
                                    }
                                    dissectPair = it.next();
                                    key = dissectPair.getKey();
                                    // add the key with an empty value for the empty delimiter
                                    dissectMatch.add(key, "");
                                }
                            } else {
                                break; // the while loop
                            }
                        }
                        // progress the keys/delimiter if possible
                        if (!it.hasNext()) {
                            break; // the for loop
                        }
                        dissectPair = it.next();
                        key = dissectPair.getKey();
                        delimiter = dissectPair.getDelimiter().getBytes(StandardCharsets.UTF_8);
                        // i is always one byte after the last found delimiter, aka the start of the next value
                        valueStart = i;
                    }
                }
            }
            // the last key, grab the rest of the input (unless consecutive delimiters already grabbed the last key)
            // and there is no trailing delimiter
            if (!dissectMatch.fullyMatched() && delimiter.length == 0) {
                byte[] value = Arrays.copyOfRange(input, valueStart, input.length);
                String valueString = new String(value, StandardCharsets.UTF_8);
                dissectMatch.add(key, valueString);
            }
        }
        Map results = dissectMatch.getResults();

        if (!dissectMatch.isValid(results)) {
            throw new DissectException.FindMatch(pattern, inputString);
        }
        return results;
    }

    /**
     * A tuple class to hold the dissect key and delimiter
     */
    private class DissectPair {

        private final DissectKey key;
        private final String delimiter;

        private DissectPair(DissectKey key, String delimiter) {
            this.key = key;
            this.delimiter = delimiter;
        }

        private DissectKey getKey() {
            return key;
        }

        private String getDelimiter() {
            return delimiter;
        }
    }

}