All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.hadoop.serialization.ParsingUtils Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.hadoop.serialization;

import java.util.ArrayList;
import java.util.List;

import org.elasticsearch.hadoop.serialization.Parser.Token;
import org.elasticsearch.hadoop.serialization.bulk.RawJson;
import org.elasticsearch.hadoop.serialization.json.JacksonJsonGenerator;
import org.elasticsearch.hadoop.util.FastByteArrayOutputStream;
import org.elasticsearch.hadoop.util.StringUtils;

public abstract class ParsingUtils {

    public static final String NOT_FOUND = "(not found)";

    /**
     * Seeks the field with the given name in the stream and positions (and returns) the parser to the next available token (value or not).
     * Return null if no token is found.
     *
     * @param path
     * @param parser
     * @return token associated with the given path or null if not found
     */
    public static Token seek(Parser parser, String path) {
        // return current token if no path is given
        if (!StringUtils.hasText(path)) {
            return null;
        }

        List tokens = StringUtils.tokenize(path, ".");
        return seek(parser, tokens.toArray(new String[tokens.size()]));
    }

    public static Token seek(Parser parser, String[] path1) {
        return seek(parser, path1, null);
    }

    public static Token seek(Parser parser, String[] path1, String[] path2) {
        return doSeekToken(parser, path1, 0, path2, 0);
    }

    private static Token doSeekToken(Parser parser, String[] path1, int index1, String[] path2, int index2) {
        Token token = null;

        String currentName;
        token = parser.currentToken();
        if (token == null) {
            token = parser.nextToken();
        }

        while ((token = parser.nextToken()) != null) {
            if (token == Token.START_OBJECT) {
                token = parser.nextToken();
            }
            if (token == Token.FIELD_NAME) {
                // found a node, go one level deep
                currentName = parser.currentName();
                if (path1 != null && currentName.equals(path1[index1])) {
                    if (index1 + 1 < path1.length) {
                        return doSeekToken(parser, path1, index1 + 1, null, 0);
                    }
                    else {
                        return parser.nextToken();
                    }
                }
                else if (path2 != null && currentName.equals(path2[index2])) {
                    if (index2 + 1 < path2.length) {
                        return doSeekToken(parser, null, 0, path2, index2 + 1);
                    }
                    else {
                        return parser.nextToken();
                    }
                }
                else {
                    // get field token (can be value, object or array)
                    parser.nextToken();
                    parser.skipChildren();
                }
            }
            else {
                break;
            }
        }

        return null;
    }

    private static class Matcher {
        private final List tokens;
        private final String path;
        private boolean matched = false;
        private Object value;

        Matcher(String path) {
            this.path = path;
            tokens = StringUtils.tokenize(path, ".");
        }

        // number of levels required for the matcher
        int nesting() {
            return tokens.size() - 1;
        }

        boolean matches(String key, int level) {
            if (level < tokens.size()) {
                return tokens.get(level).equals(key);
            }
            return false;
        }

        void value(Object value) {
            matched = true;
            this.value = value;
        }

        @Override
        public String toString() {
            return path;
        }
    }

    public static List values(Parser parser, String... paths) {
        List matchers = new ArrayList(paths.length);
        int maxNesting = 0;
        for (String path : paths) {
            Matcher matcher = new Matcher(path);
            matchers.add(matcher);
            if (matcher.nesting() > maxNesting) {
                maxNesting = matcher.nesting();
            }
        }

        doFind(parser, matchers, 0, maxNesting);

        List matches = new ArrayList();
        for (Matcher matcher : matchers) {
            matches.add(matcher.matched ? matcher.value : NOT_FOUND);
        }

        return matches;
    }

    private static void doFind(Parser parser, List currentMatchers, int level, int maxNesting) {
        Token token = parser.currentToken();
        if (token == null) {
            // advance to the initial START_OBJECT token
            parser.nextToken();
        }

        while ((token = parser.nextToken()) != null && token != Token.END_OBJECT) {
            if (token == Token.FIELD_NAME) {
                String currentName = parser.currentName();
                Object value = null;
                boolean valueRead = false;
                List nextLevel = null;

                for (Matcher matcher : currentMatchers) {
                    if (matcher.matches(currentName, level)) {
                        // found a match
                        if (matcher.nesting() == level) {
                            if (!valueRead) {
                                valueRead = true;
                                switch (parser.nextToken()) {
                                case VALUE_NUMBER:
                                    value = parser.numberValue();
                                    break;
                                case VALUE_BOOLEAN:
                                    value = Boolean.valueOf(parser.booleanValue());
                                    break;
                                case VALUE_NULL:
                                    value = null;
                                    break;
                                case VALUE_STRING:
                                    value = parser.text();
                                    break;
                                default:
                                    value = new RawJson(readValueAsString(parser));
                                }
                            }
                            matcher.value(value);
                        }
                        // partial match - keep it for the next level
                        else {
                            if (nextLevel == null) {
                                nextLevel = new ArrayList(currentMatchers.size());
                            }
                            nextLevel.add(matcher);
                        }
                    }
                }

                if (!valueRead) {
                    // must parse or skip the value
                    switch (parser.nextToken()) {
                        case START_OBJECT:
                            if (level < maxNesting && nextLevel != null) {
                                doFind(parser, nextLevel, level + 1, maxNesting);
                            } else {
                                parser.skipChildren();
                            }
                            break;
                        case START_ARRAY:
                            // arrays are not handled; simply ignore
                            parser.skipChildren();
                            break;
                    }
                }
            }
        }
    }

    private static String readValueAsString(Parser parser) {
        FastByteArrayOutputStream out = new FastByteArrayOutputStream(256);
        JacksonJsonGenerator generator = new JacksonJsonGenerator(out);
        traverse(parser, generator);
        generator.close();
        return out.toString();
    }

    private static void traverse(Parser parser, Generator generator) {
        Token t = parser.currentToken();
        switch (t) {
        case START_OBJECT:
            traverseMap(parser, generator);
            break;
        case START_ARRAY:
            traverseArray(parser, generator);
            break;
        case FIELD_NAME:
            generator.writeFieldName(parser.currentName());
            parser.nextToken();
            traverse(parser, generator);
            break;
        case VALUE_STRING:
            generator.writeString(parser.text());
            parser.nextToken();
            break;
        case VALUE_BOOLEAN:
            generator.writeBoolean(parser.booleanValue());
            parser.nextToken();
            break;
        case VALUE_NULL:
            generator.writeNull();
            parser.nextToken();
            break;
        case VALUE_NUMBER:
            switch (parser.numberType()) {
            case INT:
                generator.writeNumber(parser.intValue());
                break;
            case LONG:
                generator.writeNumber(parser.longValue());
                break;
            case DOUBLE:
                generator.writeNumber(parser.doubleValue());
                break;
            case FLOAT:
                generator.writeNumber(parser.floatValue());
                break;
            }
            parser.nextToken();
            break;
        }

    }

    private static void traverseMap(Parser parser, Generator generator) {
        generator.writeBeginObject();
        parser.nextToken();

        for (; parser.currentToken() != Token.END_OBJECT;) {
            traverse(parser, generator);
        }

        generator.writeEndObject();
        parser.nextToken();
    }

    private static void traverseArray(Parser parser, Generator generator) {
        generator.writeBeginArray();
        parser.nextToken();

        for (; parser.currentToken() != Token.END_ARRAY;) {
            traverse(parser, generator);
        }

        generator.writeEndArray();
        parser.nextToken();
    }
    
    public static void skipCurrentBlock(Parser parser) {
        int open = 1;

        while (true) {
            Token t = parser.nextToken();
            if (t == null) {
                // handle EOF?
                return;
            }
            switch (t) {
            case START_OBJECT:
            case START_ARRAY:
                ++open;
                break;
            case END_OBJECT:
            case END_ARRAY:
                if (--open == 0) {
                    return;
                }
                break;
            }
        }
    }
}