// com.cinchapi.common.base.StringSplitter Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of accent4j Show documentation
// Accent4J is a suite of libraries, helpers and data structures that make Java programming idioms more fluent.
// There is a newer version: 1.13.1
/*
 * Copyright (c) 2013-2017 Cinchapi Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cinchapi.common.base;

import static com.cinchapi.common.base.SplitOption.*;

import java.util.NoSuchElementException;

/**
 * An in-place utility to traverse and split a string into substrings.
 * <p>
 * Unlike the {@link String#split(String)} method, this utility returns tokens
 * as they are split on the fly so the caller can process them in place. The
 * traditional {@link String#split(String)} approach requires at least two
 * passes over the data (one to build the complete array of tokens and another
 * for the caller to process that array), whereas this approach hands each
 * token to the caller as soon as it is found during a single forward
 * traversal of the string.
 * </p>
 * <h2>Usage</h2>
 *
 * <pre>
 * String string = "Please split this string by space";
 * StringSplitter splitter = new StringSplitter(string);
 * while (splitter.hasNext()) {
 *     String next = splitter.next();
 * }
 * </pre>
 *
 * @author Jeff Nelson
 */
public class StringSplitter {

    /**
     * An integer that contains bits representing {@link SplitOption split
     * options} that have been enabled. To check whether an option is enabled
     * evaluate {@code (options & (1 << option.mask())) != 0}.
     */
    protected final int options;

    /**
     * The current position of the splitter.
     */
    protected int pos = 0;

    /**
     * The char array of the string that is being split.
     */
    private final char[] chars;

    /**
     * The delimiter to use for splitting.
     */
    private final char delimiter;

    /**
     * A flag that controls whether an attempt to split on a newline character
     * sequence should ignore the line feed character ('\n') because the
     * previous character was a carriage return ('\r'). Typically, a sequence
     * of \r\n is used by Windows to signify a newline.
     * <p>
     * This flag is only relevant if {@link SplitOption#SPLIT_ON_NEWLINE} is
     * enabled.
     * </p>
     */
    private boolean ignoreLF = false;

    /**
     * A flag that is set in the {@link #next()} method whenever it grabs a
     * {@link #next} token that was determined to be at the end of line. This
     * means that calls to {@link #atEndOfLine()} will return {@code true} until
     * the next call to {@link #next()}.
     */
    private boolean lastEOL = false;

    /**
     * The next string to return.
     */
    private String next = null;

    /**
     * A flag that is set in the {@link #findNext()} method whenever it
     * determines that the {@link #next} token to be returned is at the end of
     * line.
     */
    private boolean nextEOL = false;

    /**
     * A flag that controls whether we should allow {@link #findNext()} to set
     * {@link #next} to an empty string. Normally, whenever two delimiters
     * appear back to back, the splitter will return an empty string (i.e.
     * "foo,,bar,car" means that there is an empty token in the 2nd column).
     * However, when additional {@link #options} are passed to the splitter, it
     * may be unintuitive to return an empty string when a character that is
     * relevant for one of the options and the delimiter appear back-to-back.
     */
    private boolean overrideEmptyNext = false;

    /**
     * The start of the next token.
     */
    private int start = 0;

    /**
     * Construct a new instance that splits on a single space character.
     * 
     * @param string the string to split
     */
    public StringSplitter(String string) {
        this(string, ' ');
    }

    /**
     * Construct a new instance with no additional {@link SplitOption options}.
     * 
     * @param string the string to split
     * @param delimiter the delimiter upon which to split
     */
    public StringSplitter(String string, char delimiter) {
        this(string, delimiter, SplitOption.NONE);
    }

    /**
     * Construct a new instance.
     * <p>
     * The first token is located eagerly, so the overridable hooks
     * ({@link #isReadyToSplit()}, {@link #updateIsReadyToSplit(char)} and
     * {@link #confirmSetNext()}) may be invoked before this constructor
     * returns.
     * </p>
     * 
     * @param string the string to split
     * @param delimiter the delimiter upon which to split
     * @param options an array of {@link SplitOption options} to supplement the
     *            split behaviour
     */
    public StringSplitter(String string, char delimiter,
            SplitOption... options) {
        this.chars = string.toCharArray();
        this.delimiter = delimiter;
        // Fold every requested option into a single bit field
        int opts = 0;
        for (SplitOption option : options) {
            opts |= 1 << option.mask();
        }
        this.options = opts;
        findNext();
    }

    /**
     * Construct a new instance that splits on a single space character.
     * 
     * @param string the string to split
     * @param options an array of {@link SplitOption options} to supplement the
     *            split behaviour
     */
    public StringSplitter(String string, SplitOption... options) {
        this(string, ' ', options);
    }

    /**
     * Return {@code true} if {@link SplitOption#SPLIT_ON_NEWLINE} is
     * {@link SplitOption#isEnabled(StringSplitter) enabled} and the last token
     * returned by {@link #next()} is followed immediately by a line break.
     * Otherwise, return {@code false}.
     * 
     * @return {@code true} if the last token returned was at the end of line
     */
    public boolean atEndOfLine() {
        return lastEOL;
    }

    /**
     * Return {@code true} if this splitter has any remaining substrings.
     * 
     * @return {@code true} if there is another element
     */
    public boolean hasNext() {
        return next != null;
    }

    /**
     * Return the next substring that results from splitting the original source
     * string.
     * 
     * @return the new substring
     * @throws NoSuchElementException if there are no remaining substrings
     */
    public String next() {
        if(next == null) {
            throw new NoSuchElementException();
        }
        String result = next;
        // The token being handed out now becomes the "last" token, so its
        // end-of-line status is promoted. #findNext() clears #nextEOL before
        // it searches for the following token.
        lastEOL = nextEOL;
        findNext();
        return result;
    }

    /**
     * Reset the splitter so that traversal starts again from the beginning of
     * the string.
     * <p>
     * In addition to rewinding the position pointers, all of the transient
     * flags that were derived from the previous traversal are cleared so that
     * they cannot leak into the new one (i.e. {@link #atEndOfLine()} returns
     * {@code false} until a token is consumed again).
     * </p>
     */
    public void reset() {
        pos = 0;
        start = 0;
        ignoreLF = false;
        lastEOL = false;
        overrideEmptyNext = false;
        findNext();
    }

    /**
     * Return an array that contains all the tokens after traversing through the
     * entire split process. Only the tokens that have not yet been consumed via
     * {@link #next()} are included, and the splitter is exhausted afterwards.
     * 
     * @return the tokens
     */
    public String[] toArray() {
        ArrayBuilder<String> builder = ArrayBuilder.builder();
        while (hasNext()) {
            builder.add(next());
        }
        return builder.length() > 0 ? builder.build() : Array.containing();
    }

    /**
     * Before an attempt is made to {@link #setNext() set the next token} do
     * some analysis on the internal state of the splitter to see if its
     * actually appropriate to do so. If the next token should not be set,
     * return {@code false} from this method and also optionally change the
     * {@link #pos} pointer to rewind the splitter.
     * 
     * @return {@code true} if the splitter is indeed ready to set the next
     *         token
     */
    protected boolean confirmSetNext() {
        return true;
    }

    /**
     * Determine, based on state factors that are recorded within the class, if
     * the splitter is actually ready to split the string on an instance of the
     * delimiter. By default, this method always returns {@code true}, but a
     * subclass can use it for awareness of certain conditions that would mean a
     * string should not be split on an instance of the delimiter (i.e. if the
     * delimiter occurs within quotes).
     * 
     * @return {@code true} if the splitter is actually ready to perform a split
     */
    protected boolean isReadyToSplit() {
        return true;
    }

    /**
     * Given a character {@code c} that is processed by the splitter, update the
     * state that determines whether the splitter would actually be ready to
     * split in the event that it encounters a delimiter character. By default,
     * this method does nothing.
     * 
     * @param c the character that was just processed
     */
    protected void updateIsReadyToSplit(char c) {/* noop */}

    /**
     * Find the next element to return and store it in {@link #next}, or set
     * {@link #next} to {@code null} if there are no more elements.
     */
    private void findNext() {
        nextEOL = false;
        next = null;
        boolean resetOverrideEmptyNext = true;
        boolean processOverrideEmptyNext = true;
        // Consume one char at a time until a token is produced or the input is
        // exhausted
        while (pos < chars.length && next == null) {
            boolean resetIgnoreLF = true;
            char c = chars[pos];
            ++pos;
            if(c == delimiter && isReadyToSplit()) {
                setNext();
            }
            else if(SPLIT_ON_NEWLINE.isEnabled(this) && c == '\n'
                    && isReadyToSplit()) {
                if(ignoreLF) {
                    // This '\n' completes a "\r\n" pair whose '\r' already
                    // produced a token, so just skip over it
                    start = pos;
                }
                else {
                    setNext();
                    nextEOL = true;
                }
            }
            else if(SPLIT_ON_NEWLINE.isEnabled(this) && c == '\r'
                    && isReadyToSplit()) {
                ignoreLF = true;
                resetIgnoreLF = false;
                setNext();
                nextEOL = true;
            }
            else if(TOKENIZE_PARENTHESIS.isEnabled(this)
                    && (c == '(' || c == ')') && isReadyToSplit()) {
                setNext();
                // NOTE(review): if a subclass's #confirmSetNext() declines and
                // the recursive search in #setNext() finds nothing, #next
                // could be null here -- confirm against subclasses.
                if(next.isEmpty()) {
                    next = AnyStrings.valueOfCached(c);
                    overrideEmptyNext = true;
                    processOverrideEmptyNext = false;
                    resetOverrideEmptyNext = false;
                }
                else {
                    // Need to undo the modifications from #setNext() in order
                    // to look at the parenthesis char again so it can be
                    // returned as a single token via the if block above
                    pos--;
                    start = pos;
                }
            }
            // For SPLIT_ON_NEWLINE, we must reset #ignoreLF if the current char
            // is not == '\r'
            ignoreLF = resetIgnoreLF ? false : ignoreLF;
            updateIsReadyToSplit(c);
        }
        if(pos == chars.length && next == null) { // If we reach the end of the
                                                  // string without finding
                                                  // the delimiter, then set
                                                  // next to be all the
                                                  // remaining chars.
            if(confirmSetNext()) {
                int length = pos - start;
                if(length == 0) {
                    next = "";
                }
                else {
                    length = trim(length);
                    next = String.valueOf(chars, start, length);
                }
                // Move past the end so that this branch is not taken again on
                // the following call
                ++pos;
            }
            else {
                findNext();
            }
        }
        if(next != null && next.isEmpty()) {
            // For compatibility with String#split, we must detect if an empty
            // token occurs at the end of a string by trying to find the next
            // occurrence of a non delimiter char.
            boolean atEnd = true;
            for (int i = pos; i < chars.length; ++i) {
                if(chars[i] != delimiter) {
                    atEnd = false;
                    break;
                }
            }
            next = atEnd ? null : next;
        }
        // FOR TOKENIZE_PARENTHESIS, we must #overrideEmptyNext if the last
        // next was a single parenthesis in case the next char is a delimiter.
        // This prevents the appearance of having back-to-back delimiters.
        if(overrideEmptyNext && processOverrideEmptyNext) {
            if(next != null && next.isEmpty()) {
                findNext();
            }
            resetOverrideEmptyNext = true;
        }
        overrideEmptyNext = resetOverrideEmptyNext ? false : overrideEmptyNext;
        // Strip the surrounding quote characters from a quoted token, but only
        // when this instance is a quote aware splitter
        if(next != null && DROP_QUOTES.isEnabled(this)
                && AnyStrings.isWithinQuotes(next)
                && this instanceof QuoteAwareStringSplitter) {
            next = next.substring(1, next.length() - 1);
        }
    }

    /**
     * Set the {@link #next} element based on the current {@link #pos} and the
     * {@link #start} of the search.
     * <p>
     * The side effects of this method are:
     * <ul>
     * <li>{@code next} is set equal to all the chars from {@link #start} and
     * {@link #pos} - 2</li>
     * <li>{@code start} is set equal to {@link #pos}</li>
     * <li>The char at {@link #pos} - 1 is "dropped". This character is usually
     * the delimiter, so it is okay to do this, but if there is a corner case,
     * the caller must explicitly handle that character</li>
     * </ul>
     * If {@link #confirmSetNext()} returns {@code false}, none of the above
     * happens and {@link #findNext()} is invoked again instead.
     * </p>
     */
    private void setNext() {
        if(confirmSetNext()) {
            // Exclude the char at pos - 1 (usually the delimiter)
            int length = pos - start - 1;
            if(length == 0) {
                next = "";
            }
            else {
                length = trim(length);
                next = String.valueOf(chars, start, length);
            }
            start = pos;
        }
        else {
            findNext();
        }
    }

    /**
     * Given the desired {@code length} for the {@link #next} token, perform any
     * trimming of leading and trailing white space if
     * {@link SplitOption#TRIM_WHITESPACE}
     * {@link SplitOption#isEnabled(StringSplitter) is enabled}.
     * <p>
     * This method will modify the global {@link #start} position for the
     * {@link #next} string. It returns the appropriate length to assign after
     * the trimming has been done.
     * </p>
     * <p>
     * Trimming never reduces the length below 1, so a token that consists
     * entirely of whitespace is reduced to a single whitespace character
     * instead of an empty string.
     * </p>
     * 
     * @param length the length of the untrimmed {@link #next} string; callers
     *            in this class only pass a non-zero value
     * @return the appropriate length after the trimming
     */
    private int trim(int length) {
        if(SplitOption.TRIM_WHITESPACE.isEnabled(this)) {
            // Drop leading whitespace by advancing the start of the token
            while (Character.isWhitespace(chars[start]) && length > 1) {
                start++;
                length--;
            }
            // Drop trailing whitespace by shortening the token
            while (Character.isWhitespace(chars[(start + length) - 1])
                    && length > 1) {
                length--;
            }
        }
        return length;
    }

}