
// net.snowflake.client.tool.TwitterDataConverter Maven / Gradle / Ivy
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package net.snowflake.client.tool;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.snowflake.gscommon.util.StringUtil;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
/**
 * Command line tool that reads a file containing one JSON document per line
 * and writes a delimited text file holding a chosen set of attributes from
 * each document.
 *
 * <p>Only documents that contain a top-level {@code created_at} attribute are
 * written to the output; every other input line is skipped.
 *
 * <p>Command line options: {@code -i} input path, {@code -o} output path,
 * {@code -a} comma separated {@code name:type} pairs, {@code -d} optional
 * field delimiter (defaults to {@code ,}), {@code -h} help.
 *
 * @author jhuang
 */
public class TwitterDataConverter
{
  /**
   * Matches a hexadecimal character code: {@code 0x} or {@code 0X} followed
   * by either two hex digits whose first digit is in 0-7, or a single hex
   * digit. The alternatives are grouped so that the prefix and the anchors
   * apply to both of them.
   */
  static final Pattern hexCode =
      Pattern.compile("^0[Xx](([0-7][0-9A-Fa-f])|([0-9A-Fa-f]))$");

  /**
   * Matches an octal character code: a backslash followed by one to three
   * octal digits (the first of three digits limited to 0-3).
   */
  static final Pattern octalCode =
      Pattern.compile("^\\\\(([0-7])|([0-7][0-7])|([0-3][0-7][0-7]))$");

  /**
   * Entry point: parses the command line, validates it and runs
   * {@link #convert}.
   *
   * <p>Prints usage and exits with status 0 when {@code -h} is given, and
   * with status -1 when a required option is missing or an option value is
   * invalid.
   *
   * @param args command line arguments
   * @throws Exception if the command line cannot be parsed or the conversion
   *                   fails
   */
  public static void main(String[] args) throws Exception
  {
    Options options = new Options();
    options.addOption("d", true, "field delimiter for output csv file, unquoted "
        + "one character only, hex/octal accepted, e.g. 0x09 or \\\\011 for "
        + "tab");
    options.addOption("i", true, "input file path");
    options.addOption("o", true, "output file path");
    options.addOption("h", false, "help");
    options.addOption("a", true,
        "comma separated pairs of attribute name and type. "
        + "Each pair is separated by a colon. Use . as json path separator. "
        + "Valid types are string/long/int/bool. E.g. name:string,age:int");

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = parser.parse(options, args);

    if (cmd.hasOption("h"))
    {
      printUsage(options);
      System.exit(0);
    }

    boolean invalidArgs = false;
    if (!cmd.hasOption("i"))
    {
      System.err.println("input file path not specified");
      invalidArgs = true;
    }
    if (!cmd.hasOption("o"))
    {
      System.err.println("output file path not specified");
      invalidArgs = true;
    }
    if (!cmd.hasOption("a"))
    {
      System.err.println("attribute names and types not specified");
      invalidArgs = true;
    }

    // default field delimiter in output file is ,
    String fieldDelimiter = ",";
    if (cmd.hasOption("d"))
    {
      fieldDelimiter = decodeDelimiter(cmd.getOptionValue("d"));
      if (fieldDelimiter == null)
      {
        System.err.println("invalid delimiter, must be one character");
        invalidArgs = true;
      }
    }

    if (invalidArgs)
    {
      printUsage(options);
      System.exit(-1);
    }

    String inputFilePath = cmd.getOptionValue("i");
    String outputFilePath = cmd.getOptionValue("o");

    // String.split drops trailing empty strings, so an argument consisting
    // only of commas produces an empty array.
    String[] attrNameTypePairs = cmd.getOptionValue("a").split(",");
    if (attrNameTypePairs.length == 0)
    {
      System.err.println("attr name and types not specified");
      printUsage(options);
      System.exit(-1);
    }

    String[] attrNames = new String[attrNameTypePairs.length];
    Class<?>[] attrTypes = new Class<?>[attrNameTypePairs.length];

    for (int idx = 0; idx < attrNameTypePairs.length; idx++)
    {
      String attrNameTypePair = attrNameTypePairs[idx];
      String[] attrNameAndType = attrNameTypePair.split(":");

      System.out.println("attr name and type: " + attrNameTypePair);

      if (attrNameAndType.length != 2)
      {
        System.err.println("invalid attr name and type pair: " + attrNameTypePair);
        printUsage(options);
        System.exit(-1);
      }

      attrNames[idx] = attrNameAndType[0];

      String attrTypeToken = attrNameAndType[1];
      attrTypes[idx] = resolveType(attrTypeToken);
      if (attrTypes[idx] == null)
      {
        System.err.println("invalid type:" + attrTypeToken);
        printUsage(options);
        System.exit(-1);
      }
    }

    convert(fieldDelimiter, inputFilePath,
            outputFilePath, attrNames, attrTypes);
  }

  /**
   * Reads {@code inputFilePath} line by line, parses every line as JSON and,
   * for each document that has a top-level {@code created_at} attribute,
   * writes one output line with the requested attributes separated by
   * {@code fieldDelimiter}.
   *
   * <p>Attribute names use {@code .} as path separator; empty path segments
   * and a segment equal to {@code $} are ignored. Attributes that are missing
   * or JSON null produce an empty field. String fields are wrapped in double
   * quotes with embedded double quotes doubled; non-value nodes requested as
   * string are written using their {@code toString()} form.
   *
   * <p>Both files are closed before this method returns, whether it completes
   * normally or throws.
   *
   * @param fieldDelimiter text written between two fields of a record
   * @param inputFilePath  path of the file holding one JSON document per line
   * @param outputFilePath path of the file to (over)write
   * @param attrNames      attribute paths to extract, one per output column
   * @param attrTypes      column types, parallel to {@code attrNames}; one of
   *                       {@code String.class}, {@code Long.class},
   *                       {@code Integer.class} or {@code Boolean.class}
   * @throws FileNotFoundException if the input file cannot be opened
   * @throws IOException           if reading, parsing or writing fails
   */
  public static void convert(String fieldDelimiter,
                             String inputFilePath,
                             String outputFilePath,
                             String [] attrNames,
                             Class [] attrTypes)
      throws FileNotFoundException, IOException
  {
    // The attribute paths do not depend on the input, so split them once
    // instead of once per input line.
    String[][] attrPaths = new String[attrNames.length][];
    for (int col = 0; col < attrNames.length; col++)
    {
      attrPaths[col] = attrNames[col].split("\\.");
    }

    ObjectMapper mapper = new ObjectMapper();

    // The input is opened first so that a missing input file does not leave
    // an empty output file behind.
    BufferedReader reader =
        new BufferedReader(new FileReader(new File(inputFilePath)));
    try
    {
      BufferedWriter writer =
          new BufferedWriter(new FileWriter(new File(outputFilePath)));
      try
      {
        String line;
        while ((line = reader.readLine()) != null)
        {
          // parse the new line as json
          JsonNode tweetNode = mapper.readTree(line);

          // NOTE(review): readTree may return null for blank input depending
          // on the Jackson version in use -- treat that like any other line
          // without a created_at attribute.
          if (tweetNode == null
              || tweetNode.path("created_at").isMissingNode())
          {
            continue;
          }

          writer.write(
              formatRecord(tweetNode, fieldDelimiter, attrPaths, attrTypes));
          writer.newLine();
        }
        writer.flush();
      }
      finally
      {
        writer.close();
      }
    }
    finally
    {
      reader.close();
    }
  }

  /** Prints the command line usage for the given options. */
  private static void printUsage(Options options)
  {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("TwitterDataConverter", options);
  }

  /**
   * Turns the raw {@code -d} option value into the delimiter to use.
   *
   * <p>Values of at most one character are returned unchanged. Longer values
   * must be an octal or a hex character code; the two forms are mutually
   * exclusive, so a decoded octal value is never re-interpreted as hex.
   *
   * @return the delimiter, or {@code null} if the value is not valid
   */
  private static String decodeDelimiter(String raw)
  {
    if (raw.length() <= 1)
    {
      return raw;
    }

    String decoded = raw;
    try
    {
      if (octalCode.matcher(raw).matches())
      {
        decoded = StringUtil.convertOctalToChar(raw);
      }
      else if (hexCode.matcher(raw).matches())
      {
        decoded = StringUtil.convertHexToChar(raw);
      }
    }
    catch (NumberFormatException ex)
    {
      // the code looked well-formed but could not be converted
      return null;
    }

    if (decoded == null || decoded.length() > 1)
    {
      return null;
    }
    return decoded;
  }

  /**
   * Maps a type name from the command line (case-insensitive) to the class
   * used to tag a column.
   *
   * @return the matching class, or {@code null} for an unsupported name
   */
  private static Class<?> resolveType(String typeToken)
  {
    // For types, we only support String, Long, Integer and Boolean for now
    if ("string".equalsIgnoreCase(typeToken))
    {
      return String.class;
    }
    if ("long".equalsIgnoreCase(typeToken))
    {
      return Long.class;
    }
    if ("int".equalsIgnoreCase(typeToken)
        || "integer".equalsIgnoreCase(typeToken)
        || "number".equalsIgnoreCase(typeToken))
    {
      return Integer.class;
    }
    if ("bool".equalsIgnoreCase(typeToken)
        || "boolean".equalsIgnoreCase(typeToken))
    {
      return Boolean.class;
    }
    return null;
  }

  /**
   * Builds one output line for a parsed document.
   *
   * @param tweetNode      root node of the parsed document
   * @param fieldDelimiter text written between two fields
   * @param attrPaths      pre-split attribute paths, one per column
   * @param attrTypes      column types, parallel to {@code attrPaths}
   * @return the formatted record without a line terminator
   */
  private static String formatRecord(JsonNode tweetNode,
                                     String fieldDelimiter,
                                     String[][] attrPaths,
                                     Class[] attrTypes)
  {
    StringBuilder record = new StringBuilder();

    for (int col = 0; col < attrPaths.length; col++)
    {
      Class<?> attrType = attrTypes[col];

      // walk down the document following the attribute path
      JsonNode leafNode = tweetNode;
      for (String nodeName : attrPaths[col])
      {
        if (!nodeName.isEmpty() && !"$".equals(nodeName))
        {
          leafNode = leafNode.path(nodeName);
        }
      }

      if (col > 0)
      {
        record.append(fieldDelimiter);
      }

      // missing attributes and JSON null both become an empty field
      if (leafNode.isMissingNode() || leafNode.isNull())
      {
        continue;
      }

      if (attrType == String.class)
      {
        String value =
            leafNode.isValueNode() ? leafNode.asText() : leafNode.toString();

        // quote the value and double any embedded quote character
        record.append('"')
              .append(value.replace("\"", "\"\""))
              .append('"');
      }
      else if (attrType == Long.class)
      {
        record.append(leafNode.asLong());
      }
      else if (attrType == Integer.class)
      {
        record.append(leafNode.asInt());
      }
      else if (attrType == Boolean.class)
      {
        record.append(leafNode.asBoolean());
      }
    }

    return record.toString();
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy