org.apache.flink.util.StringValueUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of flink-core Show documentation
There is a newer version: 1.5.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.util;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.types.StringValue;

import java.io.Serializable;

/**
 * Collection of static helpers for efficient, in-place manipulation of {@link StringValue} objects.
 *
 * <p>The methods here operate directly on the character array obtained from
 * {@link StringValue#getCharArray()} instead of creating copies of the string contents.
 * To keep the implementation simple and fast, every character is treated as an independent
 * {@code char}; the methods are therefore only meant for strings without surrogate
 * characters, i.e. strings in which each character fits into a single {@code char}.
 */
@PublicEvolving
public final class StringValueUtils {

	/**
	 * Lower-cases the contents of the given StringValue in place.
	 *
	 * <p>Each of the first {@code string.length()} chars is replaced by the result of
	 * {@link Character#toLowerCase(char)}.
	 *
	 * @param string The string to convert to lower case.
	 */
	public static void toLowerCase(StringValue string) {
		final char[] buffer = string.getCharArray();
		final int length = string.length();

		int index = 0;
		while (index < length) {
			buffer[index] = Character.toLowerCase(buffer[index]);
			index++;
		}
	}

	/**
	 * Overwrites every non-word character of the string with the given replacement character.
	 *
	 * <p>A character is kept if it is a letter according to {@link Character#isLetter(char)},
	 * a digit according to {@link Character#isDigit(char)}, or the underscore character.
	 * All other characters are replaced in place.
	 *
	 * <p>This operation is intended to simplify strings for counting distinct words.
	 *
	 * @param string The string value to have the non-word characters replaced.
	 * @param replacement The character to use as the replacement.
	 */
	public static void replaceNonWordChars(StringValue string, char replacement) {
		final char[] buffer = string.getCharArray();
		final int length = string.length();

		for (int index = 0; index < length; index++) {
			if (isWordChar(buffer[index])) {
				continue;
			}
			buffer[index] = replacement;
		}
	}

	/**
	 * Checks whether the given char counts as a word character, meaning a letter,
	 * a digit, or the underscore.
	 */
	private static boolean isWordChar(char candidate) {
		return candidate == '_' || Character.isLetter(candidate) || Character.isDigit(candidate);
	}

	// ============================================================================================

	/**
	 * Splits a string value into tokens, treating every character for which
	 * {@link Character#isWhitespace(char)} holds as a delimiter.
	 *
	 * <p>The tokenizer keeps a resettable position inside the string and writes each token
	 * into a caller-supplied mutable {@link StringValue}, so that it can be reused across
	 * many strings without allocating new objects per token.
	 */
	public static final class WhitespaceTokenizer implements Serializable {
		private static final long serialVersionUID = 1L;

		/** The string that is currently being tokenized. */
		private StringValue toTokenize;

		/** The current read position in the string. */
		private int pos;

		/** The exclusive end of the string's character data. */
		private int limit;

		/**
		 * Creates a new tokenizer with an undefined internal state. A string must be supplied
		 * via {@link #setStringToTokenize(StringValue)} before tokens are requested.
		 */
		public WhitespaceTokenizer() {}

		/**
		 * Assigns the string to be tokenized and rewinds the tokenizer to the start of it.
		 *
		 * @param string The string value to be tokenized.
		 */
		public void setStringToTokenize(StringValue string) {
			this.toTokenize = string;
			this.pos = 0;
			this.limit = string.length();
		}

		/**
		 * Advances to the next token in the string. If a token is found, its characters
		 * are stored in the given target StringValue object.
		 *
		 * @param target The StringValue object to store the next token in.
		 * @return True, if there was another token, false if not.
		 */
		public boolean next(StringValue target) {
			final char[] chars = this.toTokenize.getCharArray();
			final int end = this.limit;
			int cursor = this.pos;

			// move past any whitespace in front of the next token
			while (cursor < end && Character.isWhitespace(chars[cursor])) {
				cursor++;
			}

			// only delimiters (or nothing) were left: remember the position and report exhaustion
			if (cursor >= end) {
				this.pos = cursor;
				return false;
			}

			// consume the token up to the next whitespace char or the end of the data
			final int tokenStart = cursor;
			while (cursor < end && !Character.isWhitespace(chars[cursor])) {
				cursor++;
			}

			// store the position first, then hand the token to the target
			this.pos = cursor;
			final int tokenLength = cursor - tokenStart;
			target.setValue(this.toTokenize, tokenStart, tokenLength);
			return true;
		}
	}

	// ============================================================================================

	/**
	 * Private constructor to prevent instantiation; this class only hosts static utilities.
	 */
	private StringValueUtils() {}
}