
com.cinchapi.common.base.StringSplitter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of accent4j Show documentation
Show all versions of accent4j Show documentation
Accent4J is a suite of libraries, helpers and data structures that make Java programming idioms more fluent.
/*
* Copyright (c) 2013-2017 Cinchapi Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cinchapi.common.base;
import static com.cinchapi.common.base.SplitOption.*;
import java.util.NoSuchElementException;
/**
* An in-place utility to traverse and split a string into substrings.
*
* Unlike the {@link String#split(String)} method, this utility returns tokens
* as they are split on the fly so the caller can process them in place. The
* traditional {@link String#split(String)} approach must make at least two
* passes over the string [O(n^2)] whereas this approach is guaranteed to make
* a single pass [O(n)].
*
* <h2>Usage</h2>
*
* <pre>
* String string = "Please split this string by space";
* StringSplitter splitter = new StringSplitter(string);
* while (splitter.hasNext()) {
*     String next = splitter.next();
* }
* </pre>
*
*
* @author Jeff Nelson
*/
public class StringSplitter {
/**
 * An integer that contains bits representing {@link SplitOption split
 * options} that have been enabled. The bit for an option is
 * {@code 1 << option.mask()}, so to check whether an option is enabled do
 * {@code (options & (1 << option.mask())) != 0}.
 */
protected final int options;

/**
 * The current position of the splitter: the index into {@link #chars} of
 * the next character that will be examined.
 */
protected int pos = 0;

/**
 * The char array of the string that is being split. It is assigned once
 * during construction and never replaced.
 */
private final char[] chars;

/**
 * The delimiter to use for splitting.
 */
private final char delimiter;

/**
 * A flag that controls whether an attempt to split on a newline character
 * sequence should ignore the line feed character ('\n') because the
 * previous character was a carriage return ('\r'). Typically, a sequence
 * of \r\n is used by Windows to signify a newline.
 * <p>
 * This flag is only relevant if {@link SplitOption#SPLIT_ON_NEWLINE} is
 * enabled.
 * </p>
 */
private boolean ignoreLF = false;

/**
 * A flag that is set in the {@link #next()} method whenever the token it
 * hands back was determined to be at the end of a line. This means that
 * calls to {@link #atEndOfLine()} will return {@code true} until the next
 * call to {@link #next()}.
 */
private boolean lastEOL = false;

/**
 * The next string to return, or {@code null} if there are no more tokens.
 */
private String next = null;

/**
 * A flag that is set while locating the {@link #next} token whenever that
 * token is determined to be at the end of a line.
 */
private boolean nextEOL = false;

/**
 * A flag that controls whether the search for the {@link #next} token is
 * allowed to produce an empty string. Normally, whenever two delimiters
 * appear back to back, the splitter will return an empty string (i.e.
 * "foo,,bar,car" means that there is an empty token in the 2nd column).
 * However, when additional {@link #options} are passed to the splitter, it
 * may be unintuitive to return an empty string when a character that is
 * relevant for one of the options and the delimiter appear back-to-back.
 */
private boolean overrideEmptyNext = false;

/**
 * The index into {@link #chars} at which the next token starts.
 */
private int start = 0;
/**
 * Create a splitter that breaks {@code string} apart on the space
 * character.
 *
 * @param string the string to split
 */
public StringSplitter(String string) {
    this(string, ' ');
}

/**
 * Create a splitter that breaks {@code string} apart on {@code delimiter}
 * using only {@link SplitOption#NONE}.
 *
 * @param string the string to split
 * @param delimiter the delimiter upon which to split
 */
public StringSplitter(String string, char delimiter) {
    this(string, delimiter, SplitOption.NONE);
}

/**
 * Create a splitter that breaks {@code string} apart on {@code delimiter}
 * while honoring the supplied {@code options}.
 * <p>
 * The bit {@code 1 << option.mask()} of every supplied option is folded
 * into {@link #options}, and the first token is located before the
 * constructor returns so that {@link #hasNext()} can answer immediately.
 * Because that first search runs during construction, the protected hook
 * methods of this class are invoked before a subclass constructor body has
 * executed.
 * </p>
 *
 * @param string the string to split
 * @param delimiter the delimiter upon which to split
 * @param options an array of {@link SplitOption options} to supplement the
 *            split behaviour
 */
public StringSplitter(String string, char delimiter,
        SplitOption... options) {
    this.chars = string.toCharArray();
    this.delimiter = delimiter;
    int enabled = 0;
    for (int i = 0; i < options.length; ++i) {
        enabled = enabled | (1 << options[i].mask());
    }
    this.options = enabled;
    findNext();
}

/**
 * Create a splitter that breaks {@code string} apart on the space
 * character while honoring the supplied {@code options}.
 *
 * @param string the string to split
 * @param options an array of {@link SplitOption options} to supplement the
 *            split behaviour
 */
public StringSplitter(String string, SplitOption... options) {
    this(string, ' ', options);
}
/**
* Return {@code true} if {@link SplitOption#SPLIT_ON_NEWLINE} is
* {@link SplitOption#isEnabled(StringSplitter) enabled} and the last token
* returned by {@link #next()} is followed immediately by a line break.
* Otherwise, return {@code false}.
*
* The answer is a stored flag that {@link #next()} refreshes each time it
* hands back a token; it is {@code false} on a newly constructed splitter
* because no token has been returned yet.
*
* @return {@code true} if the last token returned was at the end of line
*/
public boolean atEndOfLine() {
return lastEOL;
}
/**
* Return {@code true} if this splitter has any remaining substrings.
*
* The upcoming token is located ahead of time (during construction, after
* each {@link #next()} and after {@link #reset()}), so this method only
* checks whether one is pending and does not advance the splitter.
*
* @return {@code true} if there is another element
*/
public boolean hasNext() {
return next != null;
}
/**
 * Hand back the token that was located ahead of time and then advance the
 * splitter so that the following token (if any) is ready for the
 * subsequent call.
 *
 * @return the next substring that results from splitting the original
 *         source string
 * @throws NoSuchElementException if there are no more tokens
 */
public String next() {
    if(next == null) {
        throw new NoSuchElementException();
    }
    String token = next;
    // The end-of-line status recorded while this token was being located
    // becomes the status reported by #atEndOfLine() until the next call.
    lastEOL = nextEOL;
    nextEOL = false;
    findNext();
    return token;
}
/**
 * Reset the splitter so that traversal starts over from the beginning of
 * the original string.
 * <p>
 * In addition to rewinding the position pointers, this clears the flags
 * that are carried from one token search to the next. Otherwise, state left
 * over from the previous traversal would leak into the new one: a stale
 * end-of-line flag would make {@link #atEndOfLine()} report {@code true}
 * before any token has been returned, a stale carriage-return flag would
 * cause a leading line feed to be skipped, and a stale parenthesis flag
 * would cause a leading empty token to be dropped.
 * </p>
 */
public void reset() {
    pos = 0;
    start = 0;
    ignoreLF = false;
    lastEOL = false;
    overrideEmptyNext = false;
    // The search below clears #next and #nextEOL itself before scanning.
    findNext();
}
/**
 * Return an array that contains all the tokens that have not yet been
 * returned, in the order {@link #next()} would have produced them.
 * <p>
 * This method consumes the splitter: once it returns, {@link #hasNext()}
 * is {@code false} until {@link #reset()} is called.
 * </p>
 *
 * @return the remaining tokens; when there are none, the result of calling
 *         {@code Array.containing()} with no arguments (presumably an
 *         empty array)
 */
public String[] toArray() {
    // The builder must be parameterized; with a raw type the built array
    // is not assignable to String[].
    ArrayBuilder<String> builder = ArrayBuilder.builder();
    while (hasNext()) {
        builder.add(next());
    }
    return builder.length() > 0 ? builder.build() : Array.containing();
}
/**
* Before an attempt is made to {@link #setNext() set the next token} do
* some analysis on the internal state of the splitter to see if its
* actually appropriate to do so. If the next token should not be set,
* return {@code false} from this method and also optionally change the
* {@link #pos} pointer to rewind the splitter.
*
* This hook is consulted both when a split character has been found and
* when the end of the string has been reached. Whenever it returns
* {@code false}, the search for the next token is started again. The
* default implementation always returns {@code true}.
*
* @return {@code true} if the splitter is indeed ready to set the next
* token
*/
protected boolean confirmSetNext() {
return true;
}
/**
* Determine, based on state factors that are recorded within the class, if
* the splitter is actually ready to split the string on an instance of the
* delimiter. By default, this method always returns {@code true}, but a
* subclass can use it for awareness of certain conditions that would mean a
* string should not be split on an instance of the delimiter (i.e. if the
* delimiter occurs within quotes).
*
* The same check also guards the line break and parenthesis handling that
* is turned on by {@link SplitOption split options}, not only the
* delimiter.
*
* @return {@code true} if the splitter is actually ready to perform a split
*/
protected boolean isReadyToSplit() {
return true;
}
/**
* Given a character {@code c} that is processed by the splitter, update the
* state that determines whether the splitter would actually be ready to
* split in the event that it encounters a delimiter character.
*
* This hook is invoked once for every character that is examined, after
* any split handling for that character has run. The default
* implementation does nothing.
*
* @param c the character that was just examined
*/
protected void updateIsReadyToSplit(char c) {/* noop */}
/**
* Find the next element to return and store it in {@link #next}, or leave
* {@link #next} as {@code null} if there are no more tokens.
*
* The search proceeds in stages:
*
* 1. Characters are consumed one at a time, starting at {@link #pos}, until
* a split character produces a token or the input is exhausted. The split
* characters are the delimiter and, when the corresponding options are
* enabled, line breaks and parentheses.
* 2. If the end of the input is reached without producing a token, the
* remaining characters become the token.
* 3. An empty token that is followed by nothing except delimiters is
* discarded so that trailing empty tokens are not returned.
* 4. An empty token that directly follows a single parenthesis token is
* skipped by searching again.
* 5. If {@code DROP_QUOTES} is enabled and this is a
* {@link QuoteAwareStringSplitter}, the first and last characters of a
* token that is within quotes are removed.
*
* This method may call itself, either directly or via {@link #setNext()}.
*/
private void findNext() {
nextEOL = false;
next = null;
// Whether #overrideEmptyNext should be cleared before this call returns.
boolean resetOverrideEmptyNext = true;
// Whether an #overrideEmptyNext left over from the previous call should be
// acted upon; turned off if this call itself produces a parenthesis token.
boolean processOverrideEmptyNext = true;
// Stage 1: consume characters until a token is produced or input runs out.
while (pos < chars.length && next == null) {
boolean resetIgnoreLF = true;
char c = chars[pos];
++pos;
if(c == delimiter && isReadyToSplit()) {
setNext();
}
else if(SPLIT_ON_NEWLINE.isEnabled(this) && c == '\n'
&& isReadyToSplit()) {
if(ignoreLF) {
// This '\n' directly follows a '\r' that was already treated as
// the line break, so skip over it instead of producing a token.
start = pos;
}
else {
setNext();
nextEOL = true;
}
}
else if(SPLIT_ON_NEWLINE.isEnabled(this) && c == '\r'
&& isReadyToSplit()) {
// Remember the '\r' so that a '\n' that comes immediately after is
// not treated as a second line break.
ignoreLF = true;
resetIgnoreLF = false;
setNext();
nextEOL = true;
}
else if(TOKENIZE_PARENTHESIS.isEnabled(this)
&& (c == '(' || c == ')') && isReadyToSplit()) {
setNext();
// NOTE(review): if #setNext() declined to set a token and its nested
// search found nothing, #next could be null here -- confirm that no
// subclass can reach this state.
if(next.isEmpty()) {
// Nothing precedes the parenthesis, so the parenthesis itself is
// the token.
next = AnyStrings.valueOfCached(c);
overrideEmptyNext = true;
processOverrideEmptyNext = false;
resetOverrideEmptyNext = false;
}
else {
// Need to undo the modifications from #setNext() in order
// to look at the parenthesis char again so it can be
// returned as a single token via the if block above
pos--;
start = pos;
}
}
// For SPLIT_ON_NEWLINE, we must reset #ignoreLF if the current char
// is not == '\r'
ignoreLF = resetIgnoreLF ? false : ignoreLF;
updateIsReadyToSplit(c);
}
// Stage 2: If we reach the end of the string without finding the
// delimiter, then set next to be all the remaining chars.
if(pos == chars.length && next == null) {
if(confirmSetNext()) {
int length = pos - start;
if(length == 0) {
next = "";
}
else {
length = trim(length);
next = String.valueOf(chars, start, length);
}
// Move #pos past the end of the input so that this branch is not
// entered again on later calls.
++pos;
}
else {
findNext();
}
}
// Stage 3
if(next != null && next.isEmpty()) {
// For compatibility with String#split, we must detect if an empty
// token occurs at the end of a string by trying to find the next
// occurrence of a non delimiter char.
boolean atEnd = true;
for (int i = pos; i < chars.length; ++i) {
if(chars[i] != delimiter) {
atEnd = false;
break;
}
}
next = atEnd ? null : next;
}
// Stage 4: FOR TOKENIZE_PARENTHESIS, we must #overrideEmptyNext if the
// last next was a single parenthesis in case the next char is a delimiter.
// This prevents the appearance of having back-to-back delimiters.
if(overrideEmptyNext && processOverrideEmptyNext) {
if(next != null && next.isEmpty()) {
findNext();
}
resetOverrideEmptyNext = true;
}
overrideEmptyNext = resetOverrideEmptyNext ? false : overrideEmptyNext;
// Stage 5: strip the first and last characters of a quoted token.
if(next != null && DROP_QUOTES.isEnabled(this)
&& AnyStrings.isWithinQuotes(next)
&& this instanceof QuoteAwareStringSplitter) {
next = next.substring(1, next.length() - 1);
}
}
/**
 * Assign the {@link #next} token using the characters that lie between
 * {@link #start} and the character that was just consumed.
 * <p>
 * If {@link #confirmSetNext()} declines, no token is assigned here and the
 * search for the next token is started again instead. Otherwise:
 * </p>
 * <ul>
 * <li>{@code next} becomes the characters from {@link #start} through
 * {@link #pos} - 2 (inclusive), after any trimming; it is the empty string
 * if there are no such characters</li>
 * <li>{@code start} is moved to {@link #pos}</li>
 * </ul>
 * <p>
 * The character at {@link #pos} - 1 is "dropped". That character is usually
 * the delimiter, so dropping it is fine; in any corner case where it is
 * not, the caller must deal with that character explicitly.
 * </p>
 */
private void setNext() {
    if(!confirmSetNext()) {
        findNext();
        return;
    }
    int span = pos - start - 1;
    if(span == 0) {
        next = "";
    }
    else {
        // Trimming may advance #start, so it has to run before #start is
        // read for the copy below.
        int trimmed = trim(span);
        next = String.valueOf(chars, start, trimmed);
    }
    start = pos;
}
/**
 * Work out how long the {@link #next} token should be once leading and
 * trailing white space has been removed. Nothing is removed unless
 * {@link SplitOption#TRIM_WHITESPACE}
 * {@link SplitOption#isEnabled(StringSplitter) is enabled}.
 * <p>
 * Leading white space is removed by advancing the shared {@link #start}
 * position, so callers must read {@link #start} only after this method
 * returns. Trimming never shrinks the token below one character, which
 * means a token made up solely of white space keeps a single white space
 * character.
 * </p>
 *
 * @param length the length of the untrimmed {@link #next} string.
 * @return the appropriate length after the trimming
 */
private int trim(int length) {
    if(!SplitOption.TRIM_WHITESPACE.isEnabled(this)) {
        return length;
    }
    int remaining = length;
    // Leading white space: slide the start of the token forward.
    while (Character.isWhitespace(chars[start]) && remaining > 1) {
        ++start;
        --remaining;
    }
    // Trailing white space: pull the end of the token backward.
    int last = start + remaining - 1;
    while (Character.isWhitespace(chars[last]) && remaining > 1) {
        --last;
        --remaining;
    }
    return remaining;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy