// com.bazaarvoice.jolt.Shiftr Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of jolt-core Show documentation
// There is a newer version: 0.1.8
/*
 * Copyright 2013 Bazaarvoice, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bazaarvoice.jolt;

import com.bazaarvoice.jolt.common.Optional;
import com.bazaarvoice.jolt.common.tree.MatchedElement;
import com.bazaarvoice.jolt.common.tree.WalkedPath;
import com.bazaarvoice.jolt.exception.SpecException;
import com.bazaarvoice.jolt.shiftr.spec.ShiftrCompositeSpec;

import javax.inject.Inject;
import java.util.HashMap;
import java.util.Map;

/**
 *
 * Shiftr is a kind of JOLT transform that specifies where "data" from the input JSON should be placed in the
 * output JSON, aka how the input JSON/data should be shifted around to make the output JSON/data.
 *
 * At a base level, a single Shiftr "command" is a mapping from an input path to an output path,
 *  similar to the "mv" command in Unix, "mv /var/data/mysql/data /media/backup/mysql".
 *
 * In Shiftr, the input path is a JSON tree structure, and the output path is flattened "dot notation" path notation.
 *
 * The idea is that you can start with a copy of your JSON input data and modify it into a Shiftr spec by
 *  supplying a "dot notation" output path for each piece of data that you care about.
 *
 * For example, given this simple input JSON:
 *  * {
 *   "rating": {
 *       "quality": {
 *           "value": 3,
 *           "max": 5
 *       }
 *    }
 * }
 * 
 * A simple Shiftr spec could be constructed by copying that input, and modifying it to supply an output path for each piece of data:
 *  * {
 *   "rating": {
 *     "quality": {
 *         "value": "SecondaryRatings.quality.Value",     // copy 3 to "SecondaryRatings.quality.Value"
 *         "max": "SecondaryRatings.quality.RatingRange"  // copy 5 to "SecondaryRatings.quality.RatingRange"
 *     }
 *   }
 * }
 * 
 * would produce the following output JSON:
 *  * {
 *   "SecondaryRatings": {
 *     "quality": {
 *       "Value": 3,
 *       "RatingRange": 5
 *     }
 *   }
 * }
 * 
 *
 * As shown above, Shiftr specs can be entirely made up of literal string values, but its real power comes from its wildcards.
 * Using wildcards, you can leverage the fact that you know, not just the data and its immediate key, but the whole input
 *  path to that data.
 *
 * Expanding the example above, say we have the following expanded Input JSON:
 *  * {
 *   "rating": {
 *       "primary": {
 *           "value": 3,   // want this value to goto output path "Rating"
 *           "max": 5      // want this value to goto output path "RatingRange"
 *       },
 *       "quality": {      // want output path "SecondaryRatings.quality.Id" = "quality", aka we want the value of the key to be used
 *           "value": 3,   // want this value to goto output path "SecondaryRatings.quality.Value"
 *           "max": 5      // want this value to goto output path "SecondaryRatings.quality.Range"
 *       },
 *       "sharpness": {   // want output path "SecondaryRatings.sharpness.Id" = "sharpness"
 *           "value": 7,  // want this value to goto output path "SecondaryRatings.sharpness.Value"
 *           "max": 10    // want this value to goto output path "SecondaryRatings.sharpness.Range"
 *       }
 *   }
 * }
 * 
 * The Spec would be:
 *  * {
 *   "rating": {
 *     "primary": {
 *         "value": "Rating",                       // output -> "Rating" : 3
 *         "max": "RatingRange"                     // output -> "RatingRange" : 5
 *     },
 *     "*": {                                       // match input data like "rating.[anything-other-than-primary]"
 *         "value": "SecondaryRatings.&1.Value",    // the data at "rating.*.value" goes to "SecondaryRatings.*.Value"
 *                                                  // the "&1" means use the value one level up the tree ( "quality" or "sharpness" )
 *                                                  // output -> "SecondaryRatings.quality.Value" : 3 AND
 *                                                  //           "SecondaryRatings.sharpness.Value" : 7
 *
 *         "max": "SecondaryRatings.&1.Range",      // the data at "rating.*.max" goes to "SecondaryRatings.*.Range"
 *                                                  // the "&1" means use the value one level up the tree ( "quality" or "sharpness" )
 *                                                  // output -> "SecondaryRatings.quality.Range" : 5 AND
 *                                                  //           "SecondaryRatings.sharpness.Range" : 10
 *
 *         "$": "SecondaryRatings.&1.Id"            // Special operator $ means, use the value of the input key itself as the data
 *                                                  // output -> "SecondaryRatings.quality.Id" : "quality"
 *                                                  // output -> "SecondaryRatings.sharpness.Id" : "sharpness"
 *     }
 *   }
 * }
 * 
 * Yielding the following output:
 *  * {
 *   "Rating": 3,
 *   "RatingRange": 5,
 *   "SecondaryRatings": {
 *      "quality": {
 *         "Range": 5,
 *         "Value": 3,
 *         "Id": "quality"     // the special $ operator allows us to use input key the text value of "quality", as the "Id" of the output
 *      },
 *      "sharpness": {
 *         "Range": 10,
 *         "Value": 7,
 *         "Id": "sharpness"   // the special $ operator allows us to use input key the text value of "sharpness", as the "Id" of the output
 *      }
 *   }
 * }
 * 
 *
 *
 * Shiftr Wildcards
 *
 * '*' Wildcard
 *   Valid only on the LHS ( input JSON keys ) side of a Shiftr Spec
 *   The '*' wildcard can be used by itself or to match part of a key.
 *
 *   '*' wildcard by itself:
 *    As illustrated in the example above, the '*' wildcard by itself is useful for "templating" JSON maps,
 *      where each key / value has the same "format".
 *     *    // example input
 *    {
 *      "rating" : {
 *        "quality": {
 *          "value": 3,
 *          "max": 5
 *        },
 *        "sharpness" : {
 *          "value" : 7,
 *          "max" : 10
 *        }
 *      }
 *    }
 *    
 *    In this example, "rating.quality" and "rating.sharpness" both have the same structure/format, and thus we can use the '*'
 *     to allow us to write more compact rules and avoid having to explicitly write very similar rules for both "quality" and "sharpness".
 *
 *   '*' wildcard as part of a key:
 *    This is useful for working with input JSON with keys that are "prefixed".
 *    Ex: if you had an input document like
 *     *    {
 *       "tag-Pro": "Awesome",
 *       "tag-Con": "Bogus"
 *    }
 *    
 *    A 'tag-*' would match both keys, and make the whole key and "matched" part of the key available.
 *    Ex, input key of "tag-Pro" with LHS spec "tag-*", would make "tag-Pro" and "Pro" available to reference.
 *    Note the '*' wildcard is as non-greedy as possible, hence you can use more than one '*' in a key.
 *    For example, "tag-*-*" would match "tag-Foo-Bar", making "tag-Foo-Bar", "Foo", and "Bar" all available to reference.
 *
 * '&' Wildcard
 *   Valid on the LHS (left hand side - input JSON keys) and RHS (output data path)
 *   Means, dereference against a "path" to get a value and use that value as if it were a literal key.
 *   The canonical form of the wildcard is "&(0,0)".
 *   The first parameter is where in the input path to look for a value, and the second parameter is which part of the key to use (used with * key).
 *   There are syntactic sugar versions of the wildcard, all of the following mean the same thing.
 *     Sugar : '&' = '&0' = '&(0)' = '&(0,0)'
 *   The syntactic sugar versions are nice, as there are a set of data transforms that do not need to use the canonical form,
 *    eg if your input data does not have any "prefixed" keys.
 *
 *   '&' Path lookup
 *    As Shiftr processes data and walks down the spec, it maintains a data structure describing the path it has walked.
 *    The '&' wildcard can access data from that path in a 0 major, upward oriented way.
 *    Example:
 *     *    {
 *        "foo" : {
 *            "bar": {
 *                "baz":  // &0 = baz, &1 = bar, &2 = foo
 *            }
 *        }
 *    }
 *    
 *
 *   '&' Subkey lookup
 *    '&' subkey lookup allows us to reference the values captured by the '*' wildcard.
 *   Example, "tag-*-*" would match "tag-Foo-Bar", making
 *     &(0,0) = "tag-Foo-Bar"
 *     &(0,1) = "Foo"
 *     &(0,2) = "Bar"
 *
 * '$' Wildcard
 *   Valid only on the LHS of the spec.
 *   The existence of this wildcard is a reflection of the fact that the "data" of the input JSON, can be both in the "values"
 *    and the "keys" of the input JSON
 *
 *   The base case operation of Shiftr is to copy input JSON "values", thus we need a way to specify that we want to copy the input JSON "key" instead.
 *
 *   Thus '$' specifies that we want to use an input key, or input key derived value, as the data to be placed in the output JSON.
 *   '$' has the same syntax as the '&' wildcard, and can be read as, dereference to get a value, and then use that value as the data to be output.
 *
 *   There are two cases where this is useful
 *     1) when a "key" in the input JSON needs to be a "id" value in the output JSON, see the ' "$": "SecondaryRatings.&1.Id" ' example above.
 *     2) you want to make a list of all the input keys.
 *
 *   Example of "a list of the input keys":
 *    *   // input
 *   {
 *     "rating": {
 *       "primary": {
 *         "value": 3,
 *         "max": 5
 *       },
 *       "quality": {
 *         "value": 3,
 *         "max": 7
 *       }
 *     }
 *   }
 *
 *   // desired output
 *   {
 *     "ratings" : [ "primary", "quality" ]    // Aside: this is an example of implicit JSON array creation in the output which is detailed further down.
 *                                             // For now just observe that the input keys "primary" and "quality" have both made it to the output.
 *   }
 *
 *   // spec
 *   {
 *     "rating": {
 *       "*": {               // match all keys below "rating"
 *         "$": "ratings"     // output each of the "keys" to "ratings" in the output
 *       }
 *     }
 *   }
 *   
 *
 * '#' Wildcard
 *   Valid both on the LHS and RHS, but has different behavior / format on either side.
 *   The way to think of it, is that it allows you to specify a "synthetic" value, aka a value not found in the input data.
 *
 *   On the RHS of the spec, # is only valid in the context of an array, like "[#2]".
 *   What "[#2]" means is, go up the tree 2 levels and ask that node how many matches it has had, and then use that as an index
 *    in the arrays.
 *   This means that, while Shiftr is doing its parallel tree walk of the input data and the spec, it tracks how many matches it
 *    has processed at each level of the spec tree.
 *
 *   This is useful if you want to take a JSON map and turn it into a JSON array, and you do not care about the order of the array.
 *
 *   On the LHS of the spec, # allows you to specify a hard coded String to be placed as a value in the output.
 *
 *   The initial use-case for this feature was to be able to process a Boolean input value, and if the value is
 *    boolean true write out the string "enabled".  Note, this was possible before, but it required two Shiftr steps.
 *
 *    *      Example
 *      "hidden" : {
 *          "true" : {                             // if the value of "hidden" is true
 *              "#disabled" : "clients.clientId"   // write the word "disabled" to the path "clients.clientId"
 *          }
 *      }
 *   
 *
 *
 * '|' Wildcard
 *   Valid only on the LHS of the spec.
 *   This 'or' wildcard allows you to match multiple input keys.   Useful if you don't always know exactly what your input data will be.
 *   Example Spec :
 *    *   {
 *     "rating|Rating" : "rating-primary"   // match "rating" or "Rating" copy the data to "rating-primary"
 *   }
 *   
 *   This is really just syntactic sugar, as the implementation really just treats the key "rating|Rating" as two keys when processing.
 *
 *
 * '@' Wildcard
 *   Valid on both sides of the spec.
 *
 *   The basic '@' on the LHS.
 *
 *   This wildcard is necessary if you want to put both the input value and the input key somewhere in the output JSON.
 *
 *  Example '@' wildcard usage :
 *   *  // Say we have a spec that just operates on the value of the input key "foo"
 *  {
 *     "foo" : "place.to.put.value",  // leveraging the implicit operation of Shiftr which is to operate on input JSON values
 *  }
 *
 *  // if we want to do something with the "key" as well as the value
 *  {
 *     "foo" : {
 *       "$" : "place.to.put.key",
 *       "@" : "place.to.put.value"    // '@' explicitly tell Shiftr to operate on the input JSON value of the parent key "foo"
 *     }
 *  }
 *  
 *  Thus the '@' wildcard means "copy the value of the data at this level in the tree, to the output".
 *
 *  Advanced '@' sign wildcard.
 *  The format looks like "@(3,title)", where
 *    "3" means go up the tree 3 levels and then lookup the key
 *    "title" and use the value at that key.
 *
 *  See the filter*.json and transpose*.json Unit Test fixtures.
 *
 *
 * JSON Arrays :
 *
 *  Reading from (input) and writing to (output) JSON Arrays is fully supported.
 *
 * 1) Handling Arrays in the input JSON
 *  Shiftr treats JSON arrays in the input data as Maps with numeric keys.
 *  Example :
 *   *    // input
 *    {
 *       "Photos": [ "AAA.jpg", "BBB.jpg" ]
 *    }
 *
 *    // spec
 *    {
 *       "Photos" :
 *       {
 *         "1" : "photo-&-url"      // Specify that we only want to operate on the 1-th index of the "Photos" input array
 *       }
 *    }
 *
 *   // output
 *   {
 *       "photo-1-url": "BBB.jpg"
 *   }
 *  
 *
 *
 * 2) Handling Arrays in the output JSON
 *  Traditional array brackets, [ ], are used to specify array index in the output JSON.
 *  []'s are only valid on the RHS of the Shiftr spec.
 *
 *  Example :
 *   *    // input
 *    {
 *      "photo-1-id": "327704",
 *      "photo-1-url": "http://bob.com/0001/327704/photo.jpg"
 *    }
 *
 *    // spec
 *    {
 *      "photo-1-id": "Photos[1].Id",   // Declare the "Photos" in the output to be an array,
 *      "photo-1-url": "Photos[1].Url"  // that the 1-th array location should have data
 *
 *      // same as above but more powerful
 *      // note '&' logic can be used inside the '[ ]' notation
 *      "photo-*-url": "Photos[&(0,1)].Url"
 *    }
 *
 *    // output
 *    {
 *      "Photos": [
 *        null ,                // note Photos[0] is null, because no data was pushed to it
 *        {
 *          "Id":"327704",
 *          "Url":"http://bob.com/0001/327704/photo.jpg"
 *        }
 *      ]
 *    }
 *  
 *
 *
 * 3) JSON arrays in the spec file
 * JSON Arrays in a Shiftr spec are used to specify that a piece of input data should be copied to two places in the output JSON.
 * Example :
 *  *   // input
 *   { "foo" : 3 }
 *
 *   // spec
 *   { "foo" : [ "bar", "baz" ] }    // push the 3 to both of the output paths
 *
 *   // output
 *   {
 *     "bar" : 3,
 *     "baz" : 3
 *   }
 * 
 *
 *
 * 4) Implicit Array creation in the output JSON
 *  If a spec file is configured to output multiple pieces of data to the same output location, the
 *  output location will be turned into a JSON array.
 *  Example :
 *   *    // input
 *    {
 *        "foo" : "bar",
 *        "tuna" : "marlin"
 *    }
 *
 *    // spec
 *    {
 *        "foo"  : "baz",
 *        "tuna" : "baz"
 *    }
 *
 *    // output
 *    {
 *        "baz" : [ "bar", "marlin" ]     // Note the order of this Array should not be relied upon
 *    }
 *  
 *
 *
 *
 *
 *
 * Algorithm High Level
 *  Walk the input data, and Shiftr spec simultaneously, and execute the Shiftr command/mapping each time
 *  there is a match.
 *
 * Algorithm Low Level
 * - Simultaneously walk the spec and input JSON, and maintain a walked "input" path data structure.
 * - Determine a match between input JSON key and LHS spec, by matching LHS spec keys in the following order :
 * -- Note that '|' keys are split into their subkeys, eg "literal", '*', or '&' LHS keys
 *
 * 1) Try to match the input key with "literal" spec key values
 * 2) If no literal match is found, try to match against LHS '&' computed values.
 * 2.1) For deterministic behavior, if there is more than one '&' LHS key, they are applied/matched in alphabetical order,
 *   after the '&' syntactic sugar is replaced with its canonical form.
 * 3) If no match is found, try to match against LHS keys with '*' wildcard values.
 * 3.1) For deterministic behavior, '*' wildcard keys are sorted and applied/matched in alphabetical order.
 *
 * Note, processing of the '@' and '$' LHS keys always occurs if their parent matches, and does not block any other matching.
 *
 *
 * Implementation
 *
 * Instances of this class execute Shiftr transformations given a transform spec of Jackson-style maps of maps
 * and a Jackson-style map-of-maps input.
 */
public class Shiftr implements SpecDriven, Transform {

    /** Spec tree built from the constructor argument; every call to transform applies it. */
    private final ShiftrCompositeSpec rootSpec;

    /**
     * Builds a Shiftr transform from the supplied spec.
     *
     * @param spec the Shiftr spec; must be a non-null {@link Map}
     * @throws com.bazaarvoice.jolt.exception.SpecException for a malformed spec, including a null spec or
     * a spec that is not a Map
     */
    @Inject
    public Shiftr( Object spec ) {

        // instanceof evaluates to false for null, so this single guard covers both rejected cases;
        // only the tail of the error message differs between them.
        if ( ! ( spec instanceof Map ) ) {
            String actual = ( spec == null ) ? "'null'." : spec.getClass().getSimpleName();
            throw new SpecException( "Shiftr expected a spec of Map type, got " + actual );
        }

        rootSpec = new ShiftrCompositeSpec( ROOT_KEY, (Map) spec );
    }


    /**
     * Runs the Shiftr transform against the given input.
     *
     * @param input the JSON object to transform
     * @return whatever the spec placed under the root key of the output, or null if nothing was placed there
     * @throws com.bazaarvoice.jolt.exception.TransformException for a malformed spec or if there are issues during
     * the transform
     */
    @Override
    public Object transform( Object input ) {

        // Seed the walked path with a root-level element so that # is useful at the root level
        WalkedPath walkedPath = new WalkedPath();
        walkedPath.add( input, new MatchedElement( ROOT_KEY ) );

        // The spec writes its results into this map; the caller only gets the entry stored under ROOT_KEY
        Map output = new HashMap<>();
        rootSpec.apply( ROOT_KEY, Optional.of( input ), walkedPath, output, null );

        return output.get( ROOT_KEY );
    }
}