org.apache.sysml.runtime.util.FastStringTokenizer Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.util;

import java.io.Serializable;
import java.util.NoSuchElementException;

/**
 * A simplified, resettable alternative to {@link java.util.StringTokenizer}.
 * <p>
 * Compared with the JDK class it makes two simplifying assumptions: (1) delimiters
 * are never returned as tokens, and (2) the delimiter is a single character.
 * Runs of consecutive delimiters are skipped, so empty tokens are never produced.
 * A single instance can be reused for many input strings via {@link #reset(String)}.
 */
public class FastStringTokenizer implements Serializable
{
	private static final long serialVersionUID = 4051436015710778611L;

	/** String currently being tokenized; {@code null} until {@link #reset(String)} supplies one. */
	private String _string = null;
	/** The single delimiter character separating tokens. */
	private char   _del    = 0;
	/** Index in {@link #_string} at which the search for the next token starts. */
	private int    _pos    = -1;

	/**
	 * Constructs a tokenizer that splits on the given delimiter character.
	 * No input is attached yet; call {@link #reset(String)} before reading tokens.
	 *
	 * @param delimiter
	 *            the single character that separates tokens
	 */
	public FastStringTokenizer(char delimiter)
	{
		_del = delimiter;
		reset( null );
	}

	/**
	 * Attaches a new input string and rewinds the tokenizer to its beginning.
	 *
	 * @param string
	 *            the string to be tokenized next
	 */
	public void reset( String string )
	{
		_string = string;
		_pos = 0;
	}

	/**
	 * Returns the next token in the string as a String.
	 *
	 * @return next token in the string as a String
	 * @exception NoSuchElementException
	 *                if no tokens remain, or if no input string has been set
	 */
	public String nextToken()
	{
		//no input attached (never reset, or reset to null): nothing to tokenize
		if( _string == null )
			throw new NoSuchElementException();

		final int len = _string.length();
		int start = _pos;

		//find start (skip over leading delimiters)
		while( start < len && _string.charAt(start) == _del )
			start++;

		//only delimiters (or nothing) left: no next token
		if( start >= len ) {
			_pos = len; //stay exhausted on repeated calls
			throw new NoSuchElementException();
		}

		//find end (next delimiter); 'start' points at a non-delimiter,
		//so a hit is always strictly greater than 'start'
		final int end = _string.indexOf(_del, start);
		if( end < 0 ) {
			//last token: park the cursor at the end of the string instead of
			//storing -1, which would make the next call index out of bounds
			_pos = len;
			return _string.substring(start);
		}

		//leave the cursor on the delimiter; the next call skips it
		_pos = end;
		return _string.substring(start, end);
	}

	////////////////////////////////////////
	// Custom parsing methods for textcell
	////////////////////////////////////////

	/**
	 * Reads the next token and parses it as an int.
	 *
	 * @return the next token as an int
	 * @exception NoSuchElementException
	 *                if no tokens remain
	 * @exception NumberFormatException
	 *                if the token is not a parsable int
	 */
	public int nextInt()
	{
		return Integer.parseInt( nextToken() );
	}

	/**
	 * Reads the next token and parses it as a long.
	 *
	 * @return the next token as a long
	 * @exception NoSuchElementException
	 *                if no tokens remain
	 * @exception NumberFormatException
	 *                if the token is not a parsable long
	 */
	public long nextLong()
	{
		return Long.parseLong( nextToken() );
	}

	/**
	 * Reads the next token and parses it as a double.
	 * <p>
	 * Uses the same parsing as {@link #nextDoubleForParallel()}: sequential and
	 * parallel readers share one implementation because it is faster (~10%) and
	 * keeps their results consistent.
	 *
	 * @return the next token as a double
	 * @exception NoSuchElementException
	 *                if no tokens remain
	 * @exception NumberFormatException
	 *                if the token is not a parsable double
	 */
	public double nextDouble()
	{
		return Double.parseDouble( nextToken() );
	}

	/**
	 * Reads the next token and parses it as a double; intended for multi-threaded readers.
	 * <p>
	 * String-to-double conversion used to be the main bottleneck when reading text
	 * data, with severe contention under multi-threaded parsing on JDK 7
	 * (a synchronized static cache; see JDK-7032154). This was fixed in JDK 8, so
	 * the plain JDK parser is used here. An earlier hand-rolled workaround was
	 * dropped because it could not guarantee identical results due to round-off.
	 *
	 * @return the next token as a double
	 * @exception NoSuchElementException
	 *                if no tokens remain
	 * @exception NumberFormatException
	 *                if the token is not a parsable double
	 */
	public double nextDoubleForParallel()
	{
		return Double.parseDouble( nextToken() );
	}
}