All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bazaarvoice.jolt.Shiftr Maven / Gradle / Ivy

There is a newer version: 0.1.8
Show newest version
/*
 * Copyright 2013 Bazaarvoice, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bazaarvoice.jolt;

import com.bazaarvoice.jolt.common.Optional;
import com.bazaarvoice.jolt.common.tree.MatchedElement;
import com.bazaarvoice.jolt.common.tree.WalkedPath;
import com.bazaarvoice.jolt.exception.SpecException;
import com.bazaarvoice.jolt.shiftr.spec.ShiftrCompositeSpec;

import javax.inject.Inject;
import java.util.HashMap;
import java.util.Map;

/**
 *
 * Shiftr is a kind of JOLT transform that specifies where "data" from the input JSON should be placed in the
 * output JSON, aka how the input JSON/data should be shifted around to make the output JSON/data.
 *
 * At a base level, a single Shiftr "command" is a mapping from an input path to an output path,
 *  similar to the "mv" command in Unix, "mv /var/data/mysql/data /media/backup/mysql".
 *
 * In Shiftr, the input path is a JSON tree structure, and the output path is flattened "dot notation" path notation.
 *
 * The idea is that you can start with a copy of your JSON input data and modify it into a Shiftr spec by
 *  supplying a "dot notation" output path for each piece of data that you care about.
 *
 * For example, given this simple input JSON :
 * 
 * {
 *   "rating": {
 *       "quality": {
 *           "value": 3,
 *           "max": 5
 *       }
 *    }
 * }
 * 
* A simple Shiftr spec could be constructed by copying that input, and modifying it to supply an output path for each piece of data : *
 * {
 *   "rating": {
 *     "quality": {
 *         "value": "SecondaryRatings.quality.Value",     // copy 3 to "SecondaryRatings.quality.Value"
 *         "max": "SecondaryRatings.quality.RatingRange"  // copy 5 to "SecondaryRatings.quality.RatingRange"
 *     }
 *   }
 * }
 * 
* would produce the following output JSON : *
 * {
 *   "SecondaryRatings" : {
 *     "quality" : {
 *       "Value" : 3,
 *       "RatingRange" : 5
 *     }
 *   }
 * }
 * 
* * As shown above, Shiftr specs can be entirely made up of literal string values, but its real power comes from its wildcards. * Using wildcards, you can leverage the fact that you know, not just the data and its immediate key, but the whole input * path to that data. * * Expanding the example above, say we have the following expanded Input JSON: *
 * {
 *   "rating": {
 *       "primary": {
 *           "value": 3,   // want this value to goto output path "Rating"
 *           "max": 5      // want this value to goto output path "RatingRange"
 *       },
 *       "quality": {      // want output path "SecondaryRatings.quality.Id" = "quality", aka we want the value of the key to be used
 *           "value": 3,   // want this value to goto output path "SecondaryRatings.quality.Value"
 *           "max": 5      // want this value to goto output path "SecondaryRatings.quality.Range"
 *       },
 *       "sharpness" : {   // want output path "SecondaryRatings.sharpness.Id" = "sharpness"
 *           "value" : 7,  // want this value to goto output path "SecondaryRatings.sharpness.Value"
 *           "max" : 10    // want this value to goto output path "SecondaryRatings.sharpness.Range"
 *       }
 *   }
 * }
 * 
* The Spec would be : *
 * {
 *   "rating": {
 *     "primary": {
 *         "value": "Rating",                       // output -> "Rating" : 3
 *         "max": "RatingRange"                     // output -> "RatingRange" : 5
 *     },
 *     "*": {                                       // match input data like "rating.[anything-other-than-primary]"
 *         "value": "SecondaryRatings.&1.Value",    // the data at "rating.*.value" goes to "SecondaryRatings.*.Value"
 *                                                  // the "&1" means use the value one level up the tree ( "quality" or "sharpness" )
 *                                                  // output -> "SecondaryRatings.quality.Value" : 3 AND
 *                                                  //           "SecondaryRatings.sharpness.Value" : 7
 *
 *         "max": "SecondaryRatings.&1.Range",      // the data at "rating.*.max" goes to "SecondaryRatings.*.Range"
 *                                                  // the "&1" means use the value one level up the tree ( "quality" or "sharpness" )
 *                                                  // output -> "SecondaryRatings.quality.Range" : 5 AND
 *                                                  //           "SecondaryRatings.sharpness.Range" : 10
 *
 *         "$": "SecondaryRatings.&1.Id"            // Special operator $ means, use the value of the input key itself as the data
 *                                                  // output -> "SecondaryRatings.quality.Id" : "quality"
 *                                                  // output -> "SecondaryRatings.sharpness.Id" : "sharpness"
 *     }
 *   }
 * }
 * 
* Yielding the following output: *
 * {
 *   "Rating": 3,
 *   "RatingRange": 5,
 *   "SecondaryRatings": {
 *      "quality": {
 *         "Range": 5,
 *         "Value": 3,
 *         "Id": "quality"     // the special $ operator allows us to use input key the text value of "quality", as the "Id" of the output
 *      },
 *      "sharpness": {
 *         "Range": 10,
 *         "Value": 7,
 *         "Id": "sharpness"   // the special $ operator allows us to use input key the text value of "sharpness", as the "Id" of the output
 *      }
 *   }
 * }
 * 
* * * Shiftr Wildcards * * '*' Wildcard * Valid only on the LHS ( input JSON keys ) side of a Shiftr Spec * The '*' wildcard can be used by itself or to match part of a key. * * '*' wildcard by itself : * As illustrated in the example above, the '*' wildcard by itself is useful for "templating" JSON maps, * where each key / value has the same "format". *
 *    // example input
 *    {
 *      "rating" : {
 *        "quality": {
 *          "value": 3,
 *          "max": 5
 *        },
 *        "sharpness" : {
 *          "value" : 7,
 *          "max" : 10
 *        }
 *      }
 *    }
 *    
* In this example, "rating.quality" and "rating.sharpness" both have the same structure/format, and thus we can use the '*' * to allow us to write more compact rules and avoid having to explicitly write very similar rules for both "quality" and "sharpness". * * '*' wildcard as part of a key : * This is useful for working with input JSON with keys that are "prefixed". * Ex : if you had an input document like *
 *    {
 *       "tag-Pro" : "Awesome",
 *       "tag-Con" : "Bogus"
 *    }
 *    
* A 'tag-*' would match both keys, and make the whole key and "matched" part of the key available. * Ex, input key of "tag-Pro" with LHS spec "tag-*", would make "tag-Pro" and "Pro" available to reference. * Note the '*' wildcard is as non-greedy as possible, hence you can use more than one '*' in a key. * For example, "tag-*-*" would match "tag-Foo-Bar", making "tag-Foo-Bar", "Foo", and "Bar" all available to reference. * * '&' Wildcard * Valid on the LHS (left hand side - input JSON keys) and RHS (output data path) * Means, dereference against a "path" to get a value and use that value as if it were a literal key. * The canonical form of the wildcard is "&(0,0)". * The first parameter is where in the input path to look for a value, and the second parameter is which part of the key to use (used with * key). * There are syntactic sugar versions of the wildcard, all of the following mean the same thing. * Sugar : '&' = '&0' = '&(0)' = '&(0,0)' * The syntactic sugar versions are nice, as there are a set of data transforms that do not need to use the canonical form, * eg if your input data does not have any "prefixed" keys. * * '&' Path lookup * As Shiftr processes data and walks down the spec, it maintains a data structure describing the path it has walked. * The '&' wildcard can access data from that path in a 0 major, upward oriented way. * Example : *
 *    {
 *        "foo" : {
 *            "bar" : {
 *                "baz" :  // &0 = baz, &1 = bar, &2 = foo
 *            }
 *        }
 *    }
 *    
* * '&' Subkey lookup * '&' subkey lookup allows us to reference the values captured by the '*' wildcard. * Example, "tag-*-*" would match "tag-Foo-Bar", making * &(0,0) = "tag-Foo-Bar" * &(0,1) = "Foo" * &(0,2) = "Bar" * * '$' Wildcard * Valid only on the LHS of the spec. * The existence of this wildcard is a reflection of the fact that the "data" of the input JSON, can be both in the "values" * and the "keys" of the input JSON * * The base case operation of Shiftr is to copy input JSON "values", thus we need a way to specify that we want to copy the input JSON "key" instead. * * Thus '$' specifies that we want to use an input key, or input key derived value, as the data to be placed in the output JSON. * '$' has the same syntax as the '&' wildcard, and can be read as, dereference to get a value, and then use that value as the data to be output. * * There are two cases where this is useful * 1) when a "key" in the input JSON needs to be an "id" value in the output JSON, see the ' "$": "SecondaryRatings.&1.Id" ' example above. * 2) you want to make a list of all the input keys. * * Example of "a list of the input keys" : *
 *   // input
 *   {
 *     "rating": {
 *       "primary": {
 *         "value": 3,
 *         "max": 5
 *       },
 *       "quality": {
 *         "value": 3,
 *         "max": 7
 *       }
 *     }
 *   }
 *
 *   // desired output
 *   {
 *     "ratings" : [ "primary", "quality" ]    // Aside : this is an example of implicit JSON array creation in the output which is detailed further down.
 *                                             // For now just observe that the input keys "primary" and "quality" have both made it to the output.
 *   }
 *
 *   // spec
 *   {
 *     "rating": {
 *       "*": {               // match all keys below "rating"
 *         "$": "ratings"     // output each of the "keys" to "ratings" in the output
 *       }
 *     }
 *   }
 *   
* * '#' Wildcard * Valid both on the LHS and RHS, but has different behavior / format on either side. * The way to think of it, is that it allows you to specify a "synthetic" value, aka a value not found in the input data. * * On the RHS of the spec, # is only valid in the context of an array, like "[#2]". * What "[#2]" means is, go up the tree 2 levels and ask that node how many matches it has had, and then use that as an index * in the arrays. * This means that, while Shiftr is doing its parallel tree walk of the input data and the spec, it tracks how many matches it * has processed at each level of the spec tree. * * This is useful if you want to take a JSON map and turn it into a JSON array, and you do not care about the order of the array. * * On the LHS of the spec, # allows you to specify a hard coded String to be placed as a value in the output. * * The initial use-case for this feature was to be able to process a Boolean input value, and if the value is * boolean true write out the string "enabled". Note, this was possible before, but it required two Shiftr steps. * *
 *      Example
 *      "hidden" : {
 *          "true" : {                             // if the value of "hidden" is true
 *              "#disabled" : "clients.clientId"   // write the word "disabled" to the path "clients.clientId"
 *          }
 *      }
 *   
* * * '|' Wildcard * Valid only on the LHS of the spec. * This 'or' wildcard allows you to match multiple input keys. Useful if you don't always know exactly what your input data will be. * Example Spec : *
 *   {
 *     "rating|Rating" : "rating-primary"   // match "rating" or "Rating" copy the data to "rating-primary"
 *   }
 *   
* This is really just syntactic sugar, as the implementation really just treats the key "rating|Rating" as two keys when processing. * * * '@' Wildcard * Valid on both sides of the spec. * * The basic '@' on the LHS. * * This wildcard is necessary if you want to put both the input value and the input key somewhere in the output JSON. * * Example '@' wildcard usage : *
 *  // Say we have a spec that just operates on the value of the input key "foo"
 *  {
 *     "foo" : "place.to.put.value",  // leveraging the implicit operation of Shiftr which is to operate on input JSON values
 *  }
 *
 *  // if we want to do something with the "key" as well as the value
 *  {
 *     "foo" : {
 *       "$" : "place.to.put.key",
 *       "@" : "place.to.put.value"    // '@' explicitly tell Shiftr to operate on the input JSON value of the parent key "foo"
 *     }
 *  }
 *  
* Thus the '@' wildcard means "copy the value of the data at this level in the tree, to the output". * * Advanced '@' sign wildcard. * The format looks like "@(3,title)", where * "3" means go up the tree 3 levels and then lookup the key * "title" and use the value at that key. * * See the filter*.json and transpose*.json Unit Test fixtures. * * * JSON Arrays : * * Reading from (input) and writing to (output) JSON Arrays is fully supported. * * 1) Handling Arrays in the input JSON * Shiftr treats JSON arrays in the input data as Maps with numeric keys. * Example : *
 *    // input
 *    {
 *       "Photos": [ "AAA.jpg", "BBB.jpg" ]
 *    }
 *
 *    // spec
 *    {
 *       "Photos" :
 *       {
 *         "1" : "photo-&-url"      // Specify that we only want to operate on the 1-th index of the "Photos" input array
 *       }
 *    }
 *
 *   // output
 *   {
 *       "photo-1-url": "BBB.jpg"
 *   }
 *  
* * * 2) Handling Arrays in the output JSON * Traditional array brackets, [ ], are used to specify array index in the output JSON. * []'s are only valid on the RHS of the Shiftr spec. * * Example : *
 *    // input
 *    {
 *      "photo-1-id": "327704",
 *      "photo-1-url": "http://bob.com/0001/327704/photo.jpg"
 *    }
 *
 *    // spec
 *    {
 *      "photo-1-id": "Photos[1].Id",   // Declare the "Photos" in the output to be an array,
 *      "photo-1-url": "Photos[1].Url"  // that the 1-th array location should have data
 *
 *      // same as above but more powerful
 *      // note '&' logic can be used inside the '[ ]' notation
 *      "photo-*-url": "Photos[&(0,1)].Url"
 *    }
 *
 *    // output
 *    {
 *      "Photos": [
 *        null ,                // note Photos[0] is null, because no data was pushed to it
 *        {
 *          "Id":"327704",
 *          "Url":"http://bob.com/0001/327704/photo.jpg"
 *        }
 *      ]
 *    }
 *  
* * * 3) JSON arrays in the spec file * JSON Arrays in Shiftr spec are used to specify that a piece of input data should be copied to two places in the output JSON. * Example : *
 *   // input
 *   { "foo" : 3 }
 *
 *   // spec
 *   { "foo" : [ "bar", "baz" ] }    // push the 3 to both of the output paths
 *
 *   // output
 *   {
 *     "bar" : 3,
 *     "baz" : 3
 *   }
 * 
* * * 4) Implicit Array creation in the output JSON * If a spec file is configured to output multiple pieces of data to the same output location, the * output location will be turned into a JSON array. * Example : *
 *    // input
 *    {
 *        "foo" : "bar",
 *        "tuna" : "marlin"
 *    }
 *
 *    // spec
 *    {
 *        "foo"  : "baz",
 *        "tuna" : "baz"
 *    }
 *
 *    // output
 *    {
 *        "baz" : [ "bar", "marlin" ]     // Note the order of this Array should not be relied upon
 *    }
 *  
*
 *
 * Algorithm High Level
 *  Walk the input data, and Shiftr spec simultaneously, and execute the Shiftr command/mapping each time
 *  there is a match.
 *
 * Algorithm Low Level
 * - Simultaneously walk the spec and input JSON, and maintain a walked "input" path data structure.
 * - Determine a match between input JSON key and LHS spec, by matching LHS spec keys in the following order :
 * -- Note that '|' keys are split into their subkeys, eg "literal", '*', or '&' LHS keys
 *
 * 1) Try to match the input key with "literal" spec key values
 * 2) If no literal match is found, try to match against LHS '&' computed values.
 * 2.1) For deterministic behavior, if there is more than one '&' LHS key, they are applied/matched in alphabetical order,
 *   after the '&' syntactic sugar is replaced with its canonical form.
 * 3) If no match is found, try to match against LHS keys with '*' wildcard values.
 * 3.1) For deterministic behavior, '*' wildcard keys are sorted and applied/matched in alphabetical order.
 *
 * Note, processing of the '@' and '$' LHS keys always occur if their parent's match, and do not block any other matching.
 *
 *
 * Implementation
 *
 * Instances of this class execute Shiftr transformations given a transform spec of Jackson-style maps of maps
 * and a Jackson-style map-of-maps input.
 */
public class Shiftr implements SpecDriven, Transform {

    /** Spec tree built once from the user-supplied spec and reused by every call to {@link #transform}. */
    private final ShiftrCompositeSpec rootSpec;

    /**
     * Builds a Shiftr transform from a spec.
     *
     * The spec must be a {@link Map}; it is wrapped in a {@link ShiftrCompositeSpec} keyed by ROOT_KEY.
     *
     * @param spec the Shiftr spec, expected to be a Map
     * @throws com.bazaarvoice.jolt.exception.SpecException if the spec is null or is not a Map
     */
    @Inject
    public Shiftr(Object spec) {
        // "null instanceof Map" is false, so this single guard covers both the null and the wrong-type case.
        if (!(spec instanceof Map)) {
            String problem = (spec == null)
                    ? "Shiftr expected a spec of Map type, got 'null'."
                    : "Shiftr expected a spec of Map type, got " + spec.getClass().getSimpleName();
            throw new SpecException(problem);
        }

        this.rootSpec = new ShiftrCompositeSpec(ROOT_KEY, (Map) spec);
    }

    /**
     * Applies the Shiftr transform.
     *
     * The root spec writes into a fresh map, and whatever ends up stored under ROOT_KEY in that map is returned.
     *
     * @param input the JSON object to transform
     * @return the value found under ROOT_KEY in the output map after the spec has been applied
     * @throws com.bazaarvoice.jolt.exception.TransformException if there are issues during the transform
     */
    @Override
    public Object transform(Object input) {
        Map result = new HashMap<>();

        // Seed the walked path with a root element so that '#' is useful at the root level.
        MatchedElement rootElement = new MatchedElement(ROOT_KEY);
        WalkedPath pathSoFar = new WalkedPath();
        pathSoFar.add(input, rootElement);

        rootSpec.apply(ROOT_KEY, Optional.of(input), pathSoFar, result, null);

        return result.get(ROOT_KEY);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy