All downloads are free. Search and download functionality uses the official Maven repository.

net.snowflake.client.tool.TwitterDataConverter Maven / Gradle / Ivy

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package net.snowflake.client.tool;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.snowflake.gscommon.util.StringUtil;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;

/**
 * Command-line tool that converts a file of line-delimited JSON records
 * (one JSON document per line) into a delimited text file.
 *
 * <p>Only records that contain a {@code created_at} attribute are written.
 * For each such record the attributes requested with {@code -a} are
 * extracted, in order, and joined with the field delimiter. String values
 * are wrapped in double quotes with embedded double quotes doubled; missing
 * and JSON-null values produce an empty field.
 *
 * @author jhuang
 */
public class TwitterDataConverter
{
  // Two-digit hex code in the 7-bit range, e.g. 0x09. The alternatives are
  // deliberately fully anchored: the previous pattern's top-level '|' also
  // matched a bare hex digit, and its "[X|x]" class accepted a literal '|'.
  static Pattern hexCode =
  Pattern.compile("^0[Xx][0-7][0-9A-Fa-f]$");

  // Backslash followed by a one to three digit octal code, e.g. \011.
  static Pattern octalCode =
  Pattern.compile("^\\\\(([0-7])|([0-7][0-7])|([0-3][0-7][0-7]))$");

  /**
   * Parses the command line, validates it and runs the conversion.
   *
   * <p>Prints usage and exits with status 0 for {@code -h}; prints the
   * problem plus usage and exits with status -1 when a required option is
   * missing or an option value is invalid.
   *
   * @param args command-line arguments ({@code -i}, {@code -o}, {@code -a},
   *             optional {@code -d}, {@code -h})
   * @throws Exception if the command line cannot be parsed or the conversion
   *                   fails with an I/O error
   */
  public static void main(String[] args) throws Exception
  {
    Options options = new Options();
    options.addOption("d", true, "field delimiter for output csv file, unquoted "
            + "one character only, hex/octal accepted, e.g. 0x09 or \\\\011 for "
            + "tab");
    options.addOption("i", true, "input file path");
    options.addOption("o", true, "output file path");
    options.addOption("h", false, "help");
    options.addOption("a", true,
                      "comma separated pairs of attribute name and type. "
            + "Each pair is separated by a colon. Use . as json path separator. "
            + "Valid types are string/long/int/bool. E.g. name:string,age:int");

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = parser.parse(options, args);

    if (cmd.hasOption("h"))
    {
      exitWithUsage(options, 0);
    }

    // Collect every problem first so the user sees all of them at once.
    boolean invalidArgs = false;

    if (!cmd.hasOption("i"))
    {
      System.err.println("input file path not specified");
      invalidArgs = true;
    }

    if (!cmd.hasOption("o"))
    {
      System.err.println("output file path not specified");
      invalidArgs = true;
    }

    if (!cmd.hasOption("a"))
    {
      System.err.println("attribute names and types not specified");
      invalidArgs = true;
    }

    // default field delimiter in output file is ,
    String fieldDelimiter = ",";

    if (cmd.hasOption("d"))
    {
      fieldDelimiter = resolveDelimiter(cmd.getOptionValue("d"));
      if (fieldDelimiter == null)
      {
        System.err.println("invalid delimiter, must be one character");
        invalidArgs = true;
      }
    }

    if (invalidArgs)
    {
      exitWithUsage(options, -1);
    }

    String inputFilePath = cmd.getOptionValue("i");
    String outputFilePath = cmd.getOptionValue("o");

    String [] attrNameTypePairs = cmd.getOptionValue("a").split(",");

    // split() drops trailing empty tokens, so e.g. "," yields no pairs at all
    if (attrNameTypePairs.length == 0)
    {
      System.err.println("attr name and types not specified");
      exitWithUsage(options, -1);
    }

    String [] attrNames = new String[attrNameTypePairs.length];
    Class [] attrTypes = new Class[attrNameTypePairs.length];

    for (int idx = 0; idx < attrNameTypePairs.length; idx++)
    {
      String attrNameTypePair = attrNameTypePairs[idx];
      String [] attrNameAndType = attrNameTypePair.split(":");

      System.out.println("attr name and type: " + attrNameTypePair);

      if (attrNameAndType.length != 2)
      {
        System.err.println("invalid attr name and type pair: " + attrNameTypePair);
        exitWithUsage(options, -1);
      }

      Class<?> attrType = toAttrType(attrNameAndType[1]);

      if (attrType == null)
      {
        System.err.println("invalid type:" + attrNameAndType[1]);
        exitWithUsage(options, -1);
      }

      attrNames[idx] = attrNameAndType[0];
      attrTypes[idx] = attrType;
    }

    convert(fieldDelimiter, inputFilePath,
            outputFilePath, attrNames, attrTypes);
  }

  /**
   * Reads the input file line by line, parses each line as JSON and writes
   * one delimited row to the output file for every record that has a
   * {@code created_at} attribute. Blank lines are skipped.
   *
   * <p>NOTE(review): both files are opened with the platform default
   * charset ({@link FileReader}/{@link FileWriter}); if the input is UTF-8
   * and the platform default is not, non-ASCII text will be corrupted.
   * Left unchanged here to keep output compatible for existing users.
   *
   * @param fieldDelimiter string written between columns
   * @param inputFilePath  path of the line-delimited JSON input file
   * @param outputFilePath path of the output file (overwritten)
   * @param attrNames      attribute paths to extract, using {@code .} as the
   *                       path separator; empty segments and {@code $} are
   *                       ignored
   * @param attrTypes      output type for each entry of {@code attrNames}:
   *                       {@code String.class}, {@code Long.class},
   *                       {@code Integer.class} or {@code Boolean.class}
   * @throws FileNotFoundException    if the input file cannot be opened
   * @throws IOException              if reading, JSON parsing or writing fails
   * @throws IllegalArgumentException if {@code attrTypes} has fewer entries
   *                                  than {@code attrNames}
   */
  public static void convert(String fieldDelimiter,
                      String inputFilePath,
                      String outputFilePath,
                      String [] attrNames,
                      Class [] attrTypes)
          throws FileNotFoundException, IOException
  {
    // Fail before touching any file instead of part-way through the output.
    if (attrTypes.length < attrNames.length)
    {
      throw new IllegalArgumentException(
              "attrTypes has " + attrTypes.length + " entries but attrNames has "
              + attrNames.length);
    }

    ObjectMapper mapper = new ObjectMapper();

    // Nested try/finally so both streams are closed even when parsing or
    // I/O fails half-way through the file.
    BufferedReader br = new BufferedReader(new FileReader(new File(inputFilePath)));
    try
    {
      BufferedWriter bw = new BufferedWriter(new FileWriter(new File(outputFilePath)));
      try
      {
        String line;

        while ((line = br.readLine()) != null)
        {
          // Blank lines carry no record; depending on the Jackson version
          // readTree() may return null for them, so skip them up front.
          if (line.trim().isEmpty())
          {
            continue;
          }

          JsonNode tweetNode = mapper.readTree(line);

          if (tweetNode == null || tweetNode.path("created_at").isMissingNode())
          {
            continue;
          }

          bw.write(toDelimitedRow(tweetNode, fieldDelimiter, attrNames, attrTypes));
          bw.newLine();
        }

        bw.flush();
      }
      finally
      {
        bw.close();
      }
    }
    finally
    {
      br.close();
    }
  }

  /** Prints the usage text for the given options and terminates the JVM. */
  private static void exitWithUsage(Options options, int status)
  {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("TwitterDataConverter", options);
    System.exit(status);
  }

  /**
   * Turns the raw {@code -d} value into a one-character delimiter, decoding
   * an octal or hex code if one is given; returns null when the value is
   * not usable as a single-character delimiter.
   */
  private static String resolveDelimiter(String raw)
  {
    if (raw == null)
    {
      return null;
    }

    String resolved = raw;

    if (raw.length() > 1)
    {
      try
      {
        // The two encodings are mutually exclusive: decode at most once so
        // the result of one conversion is never fed into the other.
        if (octalCode.matcher(raw).matches())
        {
          resolved = StringUtil.convertOctalToChar(raw);
        }
        else if (hexCode.matcher(raw).matches())
        {
          resolved = StringUtil.convertHexToChar(raw);
        }
      }
      catch (NumberFormatException ex)
      {
        return null;
      }
    }

    if (resolved == null || resolved.length() != 1)
    {
      return null;
    }

    return resolved;
  }

  /**
   * Maps a type token from the {@code -a} option (case-insensitive) to the
   * class used by {@link #convert}; returns null for an unsupported token.
   */
  private static Class<?> toAttrType(String attrTypeToken)
  {
    // For types, we only support String, Long, Integer and Boolean for now
    if ("string".equalsIgnoreCase(attrTypeToken))
    {
      return String.class;
    }

    if ("long".equalsIgnoreCase(attrTypeToken))
    {
      return Long.class;
    }

    if ("int".equalsIgnoreCase(attrTypeToken) ||
        "integer".equalsIgnoreCase(attrTypeToken) ||
        "number".equalsIgnoreCase(attrTypeToken))
    {
      return Integer.class;
    }

    if ("bool".equalsIgnoreCase(attrTypeToken) ||
        "boolean".equalsIgnoreCase(attrTypeToken))
    {
      return Boolean.class;
    }

    return null;
  }

  /** Builds one output row by extracting every requested attribute from the record. */
  private static String toDelimitedRow(JsonNode tweetNode,
                                       String fieldDelimiter,
                                       String [] attrNames,
                                       Class [] attrTypes)
  {
    StringBuilder row = new StringBuilder();

    for (int col = 0; col < attrNames.length; col++)
    {
      // The delimiter is written even when the value below is absent, so
      // every row has the same number of columns.
      if (col > 0)
      {
        row.append(fieldDelimiter);
      }

      JsonNode leafNode = tweetNode;

      for (String nodeName : attrNames[col].split("\\."))
      {
        // empty segments and "$" refer to the current node
        if (!nodeName.isEmpty() && !"$".equals(nodeName))
        {
          leafNode = leafNode.path(nodeName);
        }
      }

      // missing attribute or JSON null: leave the field empty
      if (leafNode.isMissingNode() || leafNode.isNull())
      {
        continue;
      }

      Class<?> attrType = attrTypes[col];

      if (attrType == String.class)
      {
        // scalar values are written as plain text, containers as JSON text
        String value = leafNode.isValueNode() ? leafNode.asText()
                                              : leafNode.toString();
        appendQuoted(row, value);
      }
      else if (attrType == Long.class)
      {
        row.append(leafNode.asLong());
      }
      else if (attrType == Integer.class)
      {
        row.append(leafNode.asInt());
      }
      else if (attrType == Boolean.class)
      {
        row.append(leafNode.asBoolean());
      }
    }

    return row.toString();
  }

  /** Appends the value wrapped in double quotes, doubling any embedded double quote. */
  private static void appendQuoted(StringBuilder row, String value)
  {
    row.append('"');

    for (int idx = 0; idx < value.length(); idx++)
    {
      char ch = value.charAt(idx);

      if (ch == '"')
      {
        row.append('"');
      }

      row.append(ch);
    }

    row.append('"');
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy