
gate.corpora.twitter.TweetUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of format-twitter Show documentation
Show all versions of format-twitter Show documentation
Document Format plugin to support reading and writing Twitter style JSON files
The newest version!
/*
* Copyright (c) 1995-2014, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* $Id: TweetUtils.java 18496 2014-12-12 15:13:48Z ian_roberts $
*/
package gate.corpora.twitter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.commons.lang.StringUtils;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import gate.Factory;
import gate.FeatureMap;
/* REFERENCES
* Jackson API
* http://wiki.fasterxml.com/JacksonHome
* Standard: RFC 4627
* https://tools.ietf.org/html/rfc4627
* */
public class TweetUtils {
/**
 * Separator string {@code ":"}.
 * NOTE(review): presumably joins the components of a nested JSON key
 * path into a single key; confirm against the usages of this constant.
 */
public static final String PATH_SEPARATOR = ":";
/**
 * MIME type string {@code "text/x-json-twitter"}.
 * NOTE(review): presumably the type under which the Twitter JSON
 * document format is registered; confirm against the format class.
 */
public static final String MIME_TYPE = "text/x-json-twitter";
/** Name of the default character encoding, {@code "UTF-8"}. */
public static final String DEFAULT_ENCODING = "UTF-8";
/**
 * Annotation type name {@code "Tweet"}.
 * NOTE(review): presumably used for the annotation covering one tweet;
 * confirm against the code that creates the annotations.
 */
public static final String TWEET_ANNOTATION_TYPE = "Tweet";
/**
 * Annotation type name for a segment of a tweet.
 * NOTE(review): the value is spelled "TweetSegement" (sic). It is a
 * runtime string, so correcting the spelling would change the emitted
 * annotation type; confirm with consumers before touching it.
 */
public static final String TWEET_SEGMENT_ANNOTATION_TYPE = "TweetSegement";
/**
 * The JSON property representing entities (e.g. hashtags).
 */
public static final String ENTITIES_ATTRIBUTE = "entities";
/**
 * Date parser that understands the "created_at" timestamp format.
 * The pattern is: abbreviated day of week, abbreviated month name,
 * day of month, 24-hour time, numeric time zone offset and year,
 * e.g. {@code Wed Aug 27 13:08:45 +0000 2008}.
 * <p>
 * The parser can cope with dates in any timezone but the returned
 * DateTime objects will always be anchored in UTC
 * (see {@code withZoneUTC()} below).
 * <p>
 * Month names in Twitter API responses are English, so the locale is
 * forced to {@link Locale#ENGLISH} instead of the JVM default.
 */
public static final DateTimeFormatter CREATED_AT_FORMAT = DateTimeFormat.forPattern(
"EEE MMM dd HH:mm:ss Z yyyy").withZoneUTC().withLocale(Locale.ENGLISH);
/**
 * Parses tweet JSON into {@link Tweet} objects without supplying any
 * content or feature keys.
 * <p>
 * This is a convenience overload: it delegates to
 * {@link #readTweets(String, List, List)} with {@code null} for both key
 * lists, so the decision between "single JSON array" and "one JSON value
 * per line" is made in exactly one place.
 *
 * @param string the JSON text to parse; must not be {@code null}
 * @return the tweets read from {@code string}, in input order
 * @throws IOException if the JSON cannot be parsed
 */
public static List<Tweet> readTweets(String string) throws IOException {
  // Typed local instead of a bare null literal, so the call below cannot
  // become ambiguous if further three-argument overloads exist.
  final List<String> noKeys = null;
  return readTweets(string, noKeys, noKeys);
}
/**
 * Parses tweet JSON supplied either as a single JSON array or as
 * line-separated JSON values.
 * <p>
 * The first non-whitespace character decides the format: {@code '['}
 * routes the input to {@link #readTweetList(String, List, List)};
 * anything else, including empty or all-whitespace input, goes to
 * {@link #readTweetLines(String, List, List)}.
 *
 * @param string the JSON text to parse; must not be {@code null}
 * @param contentKeys passed through unchanged to the chosen reader
 * @param featureKeys passed through unchanged to the chosen reader
 * @return the tweets produced by the chosen reader
 * @throws IOException if the chosen reader fails to parse the input
 */
public static List<Tweet> readTweets(String string, List<String> contentKeys, List<String> featureKeys) throws IOException {
  // Skip leading whitespace so that an array preceded by a blank line or
  // indentation is still recognised as an array rather than being split
  // into lines.
  final int length = string.length();
  int first = 0;
  while (first < length && Character.isWhitespace(string.charAt(first))) {
    first++;
  }
  if (first < length && string.charAt(first) == '[') {
    return readTweetList(string, contentKeys, featureKeys);
  }
  return readTweetLines(string, contentKeys, featureKeys);
}
public static ListreadTweetLines(String string, List contentKeys, List featureKeys) throws IOException {
String[] lines = string.split("[\\n\\r]+");
return readTweetStrings(lines, contentKeys, featureKeys);
}
public static ListreadTweetStrings(String[] lines, List contentKeys, List featureKeys) throws IOException {
ObjectMapper mapper = new ObjectMapper();
List tweets = new ArrayList();
for (String line : lines) {
if (line.length() > 0) {
JsonNode jnode = mapper.readTree(line);
tweets.add(new Tweet(jnode, true));
}
}
return tweets;
}
public static ListreadTweetStrings(List lines, List contentKeys, List featureKeys) throws IOException {
ObjectMapper mapper = new ObjectMapper();
List tweets = new ArrayList();
for (String line : lines) {
if (line.length() > 0) {
JsonNode jnode = mapper.readTree(line);
tweets.add(new Tweet(jnode, true));
}
}
return tweets;
}
/**
 * Parses a string holding a single JSON array and wraps each array
 * element in a {@link Tweet}.
 *
 * @param string JSON text whose top-level value is an array; must not
 *          be {@code null}
 * @param contentKeys not used by this method
 * @param featureKeys not used by this method
 * @return one {@link Tweet} per array element, in array order
 * @throws IOException if the text is not valid JSON or its top-level
 *           value is not an array
 */
public static List<Tweet> readTweetList(String string, List<String> contentKeys, List<String> featureKeys) throws IOException {
  ObjectMapper mapper = new ObjectMapper();
  JsonNode root = mapper.readTree(string);
  // Report malformed input through the declared IOException rather than
  // letting an unchecked cast or a null dereference escape to callers.
  if (root == null || !root.isArray()) {
    throw new IOException("Expected a JSON array of tweets");
  }
  List<Tweet> tweets = new ArrayList<Tweet>(root.size());
  // Iterating a JsonNode that is an array visits its elements in order.
  for (JsonNode jnode : root) {
    // NOTE(review): the meaning of the boolean flag is defined by the
    // Tweet constructor, which is not visible here.
    tweets.add(new Tweet(jnode, true));
  }
  return tweets;
}
public static Object process(JsonNode node) {
/* JSON types: number, string, boolean, array, object (dict/map),
* null. All map keys are strings.
*/
if (node.isBoolean()) {
return node.asBoolean();
}
if (node.isIntegralNumber()) {
// use Long even if the number is representable as an Integer,
// since Long is better supported in JAPE etc.
if(node.canConvertToLong()) {
return node.asLong();
} else {
return node.bigIntegerValue();
}
}
if (node.isNumber()) {
// fractional number, as integers would have been caught by
// the previous test. The numberValue will be a Double
// unless the parser was specifically configured to use
// BigDecimal instead
return node.numberValue();
}
if (node.isTextual()) {
return node.asText();
}
if (node.isNull()) {
return null;
}
if (node.isArray()) {
List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy