com.twitter.twittertext.TwitterTextParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of twitter-text Show documentation
Text processing routines for Twitter Tweets
There is a newer version: 3.1.0
package com.twitter.twittertext;

import com.twitter.twittertext.TwitterTextConfiguration.TwitterTextWeightedRange;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.text.Normalizer;
import java.util.List;
import java.util.ListIterator;

/**
 * A class to parse tweet text with a {@link TwitterTextConfiguration} and returns a
 * {@link TwitterTextParseResults} object
 */
public class TwitterTextParser {

  public static final TwitterTextParseResults EMPTY_TWITTER_TEXT_PARSE_RESULTS =
      new TwitterTextParseResults(0, 0, false, Range.EMPTY, Range.EMPTY);

  public static final TwitterTextConfiguration TWITTER_TEXT_DEFAULT_CONFIG =
      TwitterTextConfiguration.configurationFromJson("v1.json", true);

  /**
   * v2.json has the following unicode code point blocks defined
   * 0x0000 (0)    - 0x10FF (4351) Basic Latin to Georgian block: Weight 100
   * 0x2000 (8192) - 0x200D (8205) Spaces in the General Punctuation Block: Weight 100
   * 0x2010 (8208) - 0x201F (8223) Hyphens & Quotes in the General Punctuation Block: Weight 100
   * 0x2032 (8242) - 0x2037 (8247) Quotes in the General Punctuation Block: Weight 100
   */
  public static final TwitterTextConfiguration TWITTER_TEXT_WEIGHTED_CHAR_COUNT_CONFIG =
      TwitterTextConfiguration.configurationFromJson("v2.json", true);

  private static final Extractor EXTRACTOR = new Extractor();

  /**
   * Parses a given tweet text with the weighted character count configuration (v2.json).
   *
   * @param tweet which is to be parsed
   * @return {@link TwitterTextParseResults} object
   */
  @Nonnull
  public static TwitterTextParseResults parseTweet(@Nullable final String tweet) {
    return parseTweet(tweet, TWITTER_TEXT_WEIGHTED_CHAR_COUNT_CONFIG);
  }

  /**
   * Parses a given tweet text with the given {@link TwitterTextConfiguration}
   *
   * @param tweet which is to be parsed
   * @param config {@link TwitterTextConfiguration}
   * @return {@link TwitterTextParseResults} object
   */
  @Nonnull
  public static TwitterTextParseResults parseTweet(@Nullable final String tweet,
      @Nonnull final TwitterTextConfiguration config) {
    return parseTweet(tweet, config, true);
  }

  /**
   * Returns the weighted length of a tweet without doing any URL processing.
   * Used by Twitter Backend to validate the visible tweet in the final step of tweet creation.
   */
  @Nonnull
  public static TwitterTextParseResults parseTweetWithoutUrlExtraction(@Nullable final String tweet) {
    return parseTweet(tweet, TWITTER_TEXT_WEIGHTED_CHAR_COUNT_CONFIG, false);
  }

  /**
   * Parses a given tweet text with the given {@link TwitterTextConfiguration} and optionally control
   * if urls should/shouldn't be normalized to {@link TwitterTextConfiguration.DEFAULT_TRANSFORMED_URL_LENGTH}
   *
   * @param tweet which is to be parsed
   * @param config {@link TwitterTextConfiguration}
   * @param extractURLs boolean indicating if URLs should be extracted for counting
   * @return {@link TwitterTextParseResults} object
   */
  @Nonnull
  private static TwitterTextParseResults parseTweet(@Nullable final String tweet,
      @Nonnull final TwitterTextConfiguration config, boolean extractURLs) {
    if (tweet == null || tweet.trim().length() == 0) {
      return EMPTY_TWITTER_TEXT_PARSE_RESULTS;
    }

    final String normalizedTweet = Normalizer.normalize(tweet, Normalizer.Form.NFC);
    final int tweetLength = normalizedTweet.length();

    if (tweetLength == 0) {
      return EMPTY_TWITTER_TEXT_PARSE_RESULTS;
    }

    final int scale = config.getScale();
    final int maxWeightedTweetLength = config.getMaxWeightedTweetLength();
    final int scaledMaxWeightedTweetLength = maxWeightedTweetLength * scale;
    final int transformedUrlWeight = config.getTransformedURLLength() * scale;
    final List ranges = config.getRanges();

    final List urlEntities = EXTRACTOR.extractURLsWithIndices(normalizedTweet);

    boolean hasInvalidCharacters = false;
    int weightedCount = 0;
    int offset = 0;
    int validOffset = 0;

    while (offset < tweetLength) {
      int charWeight = config.getDefaultWeight();

      if (extractURLs) {
        final ListIterator urlEntityIterator = urlEntities.listIterator();
        while (urlEntityIterator.hasNext()) {
          final Extractor.Entity urlEntity = urlEntityIterator.next();
          if (urlEntity.start == offset) {
            final int urlLength = urlEntity.end - urlEntity.start;
            weightedCount += transformedUrlWeight;
            offset += urlLength;
            if (weightedCount <= scaledMaxWeightedTweetLength) {
              validOffset += urlLength;
            }
            urlEntityIterator.remove();
            break;
          }
        }
      }

      if (offset < tweetLength) {
        final int codePoint = normalizedTweet.codePointAt(offset);

        for (final TwitterTextWeightedRange weightedRange : ranges) {
          if (weightedRange.getRange().isInRange(codePoint)) {
            charWeight = weightedRange.getWeight();
            break;
          }
        }

        weightedCount += charWeight;

        hasInvalidCharacters = hasInvalidCharacters ||
            Validator.hasInvalidCharacters(normalizedTweet.substring(offset, offset + 1));

        final int charCount = Character.charCount(codePoint);
        offset += charCount;

        if (!hasInvalidCharacters && weightedCount <= scaledMaxWeightedTweetLength) {
          validOffset += charCount;
        }
      }
    }
    final int normalizedTweetOffset = tweet.length() - normalizedTweet.length();
    final int scaledWeightedLength = weightedCount / scale;
    final boolean isValid = !hasInvalidCharacters && scaledWeightedLength <= maxWeightedTweetLength;
    final int permillage = scaledWeightedLength * 1000 / maxWeightedTweetLength;
    return new TwitterTextParseResults(scaledWeightedLength, permillage, isValid,
        new Range(0, offset + normalizedTweetOffset - 1), new Range(0, validOffset + normalizedTweetOffset - 1));
  }
}