com.datatorrent.lib.logs.LineToTokenHashMap Maven / Gradle / Ivy

Go to download
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.lib.logs;

import java.util.ArrayList;
import java.util.HashMap;

import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.annotation.OperatorAnnotation;
import com.datatorrent.api.annotation.Stateless;

import com.datatorrent.lib.util.BaseLineTokenizer;
import com.datatorrent.lib.util.UnifierHashMap;

/**
 * This operator splits string objects into tokens. 
 * A key value pair is emitted where the key is the first token in an input tuple
 * and the value is a list of the other tokens in an input tuple., and emits as a HashMap where the first token.
 * 
 * This module is a pass through

 * 

 * StateFull : No,  tokens are processed in current window. 

 * Partitions : Yes,  output port unifier operator. 

 * 

 * Ports:

 * data: Input port, expects String

 * tokens: Output port, emits HashMap<String, ArrayList<String>>

 * 

 * Properties:

 * splitby: The characters used to split the line. Default is ";\t "

 * splittokenby: The characters used to split a token into key,val1,val2,.... Default is "", i.e. tokens are not split, and key=token, val=""

 * 
 *
 * @displayName Line To Token (HashMap)
 * @category Tuple Converters
 * @tags string, hashmap
 *
 * @since 0.3.2
 */
@Stateless
@OperatorAnnotation(partitionable = true)
public class LineToTokenHashMap extends BaseLineTokenizer
{
  /**
   * This output port emits the split strings.
   */
  public final transient DefaultOutputPort>> tokens = new DefaultOutputPort>>()
  {
    @Override
    public Unifier>> getUnifier()
    {
      return new UnifierHashMap>();
    }
  };

  protected transient HashMap> otuple = null;
  protected transient ArrayList vals = null;
  protected transient String tok = "";

  /**
   * sets up output tuple
   */
  @Override
  public void beginProcessTokens()
  {
    otuple = new HashMap>();
  }


  /**
   * clears data for subtokens
   */
  @Override
  public void beginProcessSubTokens()
  {
    vals = null;
    tok = "";
  }

  /**
   * first token is key, the rest are added to ArrayList
   * @param subtok
   */
  @Override
  public void processSubToken(String subtok)
  {
    if (vals == null) {
      tok = subtok;
      vals = new ArrayList();
    } else {
      vals.add(subtok);
    }
  }


  /**
   * Adds key,Arraylist pair to output tuple.
   */
  @Override
  public void endProcessSubTokens()
  {
    addSubToken(tok, vals);
    tok = "";
    vals = null;
  }

  /**
   * If you have multiple subtokens with same value, override and aggregate the values and then put
   * in the map
   * @param stok subtoken
   * @param svals subtoken val list
   */
  public void addSubToken(String stok, ArrayList svals)
  {
    otuple.put(stok, svals);
  }

  /**
   * emits output tuple
   */
  @Override
  public void endProcessTokens()
  {
    if (tokens.isConnected()) {
      tokens.emit(otuple);
      otuple = null;
    }
  }
}