All Downloads are FREE. Search and download functionalities are using the official Maven repository.

uk.ac.starlink.table.formats.RowEvaluator Maven / Gradle / Ivy

package uk.ac.starlink.table.formats;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import uk.ac.starlink.table.ColumnInfo;
import uk.ac.starlink.table.DomainMapper;
import uk.ac.starlink.table.TableFormatException;
import uk.ac.starlink.table.TimeMapper;

/**
 * Examines unknown rows (arrays of strings) to work out what they contain.
 * By repeatedly calling {@link #submitRow} the evaluator can refine its
 * idea of what kind of data is represented by each column.
 *
 * @author   Mark Taylor (Starlink)
 * @since    21 Sep 2004
 */
public class RowEvaluator {

    /* Per-column candidate flags, one array per candidate type, indexed by
     * column.  init() allocates each array with every element true;
     * submitRow() clears an element as soon as a non-blank cell in that
     * column fails the validity test of the corresponding decoder.
     * All are null until init() has been called. */
    private boolean[] maybeBlank_;
    private boolean[] maybeBoolean_;
    private boolean[] maybeShort_;
    private boolean[] maybeInteger_;
    private boolean[] maybeLong_;
    private boolean[] maybeFloat_;
    private boolean[] maybeDouble_;
    private boolean[] maybeDate_;
    private boolean[] maybeHms_;
    private boolean[] maybeDms_;

    /* Longest untrimmed cell length seen so far in each column. */
    private int[] stringLength_;

    /* Number of calls made to submitRow() so far. */
    private long nrow_;

    /* Number of columns per row; negative until init() has been called. */
    private int ncol_ = -1;

    /**
     * Regular expression for ISO 8601 dates.
     * Matches <code>year-month-day</code>, optionally followed by a
     * 'T' or space separator and an hours field, which may in turn be
     * followed by <code>:minutes</code> and then <code>:seconds</code>
     * (the seconds may have a fractional part).
     * A trailing 'Z' is only permitted when a time part is present.
     * Capturing groups 1 to 6 hold, in order, the year, month, day,
     * hour, minute and second fields; the time groups are null where
     * the corresponding part is absent.
     */
    public static final Pattern ISO8601_REGEX = Pattern.compile(
        "([0-9]+)-([0-9]{1,2})-([0-9]{1,2})" +
        "(?:[" + 'T' + " ]([0-9]{1,2})" +
            "(?::([0-9]{1,2})" +
                "(?::([0-9]{1,2}(?:\\.[0-9]*)?))?" +
            ")?" +
        "Z?)?"
    );

    /**
     * Pattern for sexagesimal hours-minutes-seconds strings.
     * The hours field is separated from the minutes by ':', 'h' or a space,
     * and the minutes from the seconds by ':', 'm' or a space;
     * the seconds may have a fractional part.  No sign is permitted.
     */
    private static final Pattern HMS_REGEX = Pattern.compile(
        "[ 012]?[0-9][:h ][ 0-6][0-9][:m ][0-6][0-9](\\.[0-9]*)?"
    );

    /**
     * Pattern for sexagesimal degrees-minutes-seconds strings.
     * An explicit leading '+' or '-' sign is required.
     * The degrees field is separated from the minutes by ':', 'd' or a
     * space, and the minutes from the seconds by ':', 'm' or a space;
     * the seconds may have a fractional part.
     */
    private static final Pattern DMS_REGEX = Pattern.compile(
        "[-+][ 0-9]?[0-9][:d ][ 0-6][0-9][:m ][0-6][0-9](\\.[0-9]*)?"
    );

    /** Pattern matching the token "NaN" in any mixture of upper/lower case. */
    private static final Pattern NAN_REGEX = Pattern.compile(
        "NaN", Pattern.CASE_INSENSITIVE
    );

    /**
     * Pattern matching "Infinity" or "inf" in any mixture of upper/lower
     * case, with an optional leading sign.
     * Group 1 is the sign character, or the empty string if there is none.
     */
    private static final Pattern INFINITY_REGEX = Pattern.compile(
        "([+-]?)(Infinity|inf)", Pattern.CASE_INSENSITIVE
    );

    /**
     * Decoder for columns whose values are all blank.
     * A value is accepted if it is null or consists of nothing but
     * space characters (including the empty string);
     * every value decodes to null.
     */
    private static Decoder BLANK_DECODER = new StringDecoder() {
        @Override
        public boolean isValid( String value ) {
            if ( value != null ) {
                int nchar = value.length();
                for ( int ic = 0; ic < nchar; ic++ ) {
                    if ( value.charAt( ic ) != ' ' ) {
                        return false;
                    }
                }
            }
            return true;
        }
        @Override
        public Object decode( String value ) {
            return null;
        }
    };

    /**
     * Decoder for booleans.
     * The accepted representations are "true", "false", "t" and "f",
     * compared without regard to case.  Decoding looks only at the first
     * character of the trimmed value: 't' or 'T' gives true, anything
     * else gives false.
     */
    private static Decoder BOOLEAN_DECODER = new Decoder( Boolean.class ) {
        private final String[] tokens_ = { "false", "true", "f", "t" };
        @Override
        public boolean isValid( String value ) {
            for ( int it = 0; it < tokens_.length; it++ ) {
                if ( value.equalsIgnoreCase( tokens_[ it ] ) ) {
                    return true;
                }
            }
            return false;
        }
        @Override
        public Object decode( String value ) {
            switch ( value.trim().charAt( 0 ) ) {
                case 't':
                case 'T':
                    return Boolean.TRUE;
                default:
                    return Boolean.FALSE;
            }
        }
    };

    /* We are careful to check for "-0" type cells in the integer type
     * decoders - it is essential that they are coded as floating types
     * (which can represent negative zero) rather than integer types
     * (which can't), since a negative zero is most likely the
     * hours/degrees part of a sexagesimal angle, in which case the
     * difference is very important
     * (see uk.ac.starlink.topcat.func.Angles.dmsToRadians). */

    /**
     * Decoder for shorts.
     * A value is valid if it parses as a decimal short, except that a
     * zero value written with a leading minus sign is rejected, so that
     * negative zero can be handled by a floating point decoder instead.
     */
    private static Decoder SHORT_DECODER = new Decoder( Short.class ) {
        @Override
        public Object decode( String value ) {

            /* Use the factory method rather than the deprecated
             * boxed-primitive constructor. */
            return Short.valueOf( value.trim() );
        }
        @Override
        public boolean isValid( String value ) {
            try {

                /* A successful parse guarantees a non-empty string,
                 * so the charAt call is safe. */
                return Short.parseShort( value ) != 0
                    || value.charAt( 0 ) != '-';
            }
            catch ( NumberFormatException e ) {
                return false;
            }
        }
    };

    /**
     * Decoder for integers.
     * A value is valid if it parses as a decimal int, except that a
     * zero value written with a leading minus sign is rejected, so that
     * negative zero can be handled by a floating point decoder instead.
     */
    private static Decoder INTEGER_DECODER = new Decoder( Integer.class ) {
        @Override
        public Object decode( String value ) {

            /* Use the factory method rather than the deprecated
             * boxed-primitive constructor. */
            return Integer.valueOf( value.trim() );
        }
        @Override
        public boolean isValid( String value ) {
            try {

                /* A successful parse guarantees a non-empty string,
                 * so the charAt call is safe. */
                return Integer.parseInt( value ) != 0
                    || value.charAt( 0 ) != '-';
            }
            catch ( NumberFormatException e ) {
                return false;
            }
        }
    };

    /**
     * Decoder for longs.
     * A value is valid if it parses as a decimal long, except that a
     * zero value written with a leading minus sign is rejected, so that
     * negative zero can be handled by a floating point decoder instead.
     */
    private static Decoder LONG_DECODER = new Decoder( Long.class ) {
        @Override
        public Object decode( String value ) {

            /* Use the factory method rather than the deprecated
             * boxed-primitive constructor. */
            return Long.valueOf( value.trim() );
        }
        @Override
        public boolean isValid( String value ) {
            try {

                /* A successful parse guarantees a non-empty string,
                 * so the charAt call is safe. */
                return Long.parseLong( value ) != 0L
                    || value.charAt( 0 ) != '-';
            }
            catch ( NumberFormatException e ) {
                return false;
            }
        }
    };

    /**
     * Decoder for floats.
     * A value is valid if it can be parsed as a floating point number and
     * is either zero, NaN, infinite, or is written with no more than
     * six significant figures and has a magnitude strictly between
     * <code>Float.MIN_NORMAL</code> and <code>Float.MAX_VALUE</code>.
     */
    private static Decoder FLOAT_DECODER = new Decoder( Float.class ) {
        @Override
        public Object decode( String value ) {

            /* Use the factory method rather than the deprecated
             * boxed-primitive constructor. */
            double dval = parseFloating( value.trim() ).dValue;
            return Float.valueOf( (float) dval );
        }
        @Override
        public boolean isValid( String value ) {
            try {
                ParsedFloat pf = parseFloating( value );
                double dval = pf.dValue;
                return dval == 0
                    || Double.isNaN( dval )
                    || Double.isInfinite( dval )
                    || ( pf.sigFig <= 6 && isSinglePrecision( dval ) );
            }
            catch ( NumberFormatException e ) {
                return false;
            }
        }

        /**
         * Indicates whether the magnitude of a value lies strictly between
         * the smallest normal and the largest finite float values.
         */
        private boolean isSinglePrecision( double dval ) {
            double absVal = Math.abs( dval );
            return absVal > Float.MIN_NORMAL && absVal < Float.MAX_VALUE;
        }
    };

    /**
     * Decoder for doubles.
     * A value is valid if <code>parseFloating</code> can parse it
     * without throwing a NumberFormatException.
     */
    private static Decoder DOUBLE_DECODER = new Decoder( Double.class ) {
        @Override
        public Object decode( String value ) {

            /* Use the factory method rather than the deprecated
             * boxed-primitive constructor. */
            return Double.valueOf( parseFloating( value.trim() ).dValue );
        }
        @Override
        public boolean isValid( String value ) {
            try {
                parseFloating( value );
                return true;
            }
            catch ( NumberFormatException e ) {
                return false;
            }
        }
    };

    /**
     * Decoder for ISO-8601 date strings.
     * Values are kept as strings; the column metadata is marked up with
     * an "iso-8601" unit string, a "TIME" UCD and the ISO-8601 time
     * domain mapper.
     */
    private static Decoder DATE_DECODER = new StringDecoder() {
        @Override
        public boolean isValid( String value ) {
            Matcher dateMatcher = ISO8601_REGEX.matcher( value );
            return dateMatcher.matches();
        }
        @Override
        public ColumnInfo createColumnInfo( String name ) {
            DomainMapper[] mappers = new DomainMapper[] { TimeMapper.ISO_8601 };
            ColumnInfo dateInfo = super.createColumnInfo( name );
            dateInfo.setUnitString( "iso-8601" );
            dateInfo.setUCD( "TIME" );
            dateInfo.setDomainMappers( mappers );
            return dateInfo;
        }
    };

    /**
     * Decoder for HMS sexagesimal strings.
     * Values are kept as strings; the column metadata is given
     * the unit string "hms".
     */
    private static Decoder HMS_DECODER = new StringDecoder() {
        @Override
        public boolean isValid( String value ) {
            Matcher hmsMatcher = HMS_REGEX.matcher( value );
            return hmsMatcher.matches();
        }
        @Override
        public ColumnInfo createColumnInfo( String name ) {
            ColumnInfo hmsInfo = super.createColumnInfo( name );
            hmsInfo.setUnitString( "hms" );
            return hmsInfo;
        }
    };

    /**
     * Decoder for DMS sexagesimal strings.
     * Values are kept as strings; the column metadata is given
     * the unit string "dms".
     */
    private static Decoder DMS_DECODER = new StringDecoder() {
        @Override
        public boolean isValid( String value ) {
            Matcher dmsMatcher = DMS_REGEX.matcher( value );
            return dmsMatcher.matches();
        }
        @Override
        public ColumnInfo createColumnInfo( String name ) {
            ColumnInfo dmsInfo = super.createColumnInfo( name );
            dmsInfo.setUnitString( "dms" );
            return dmsInfo;
        }
    };

    /**
     * Fallback decoder which accepts any value whatsoever and, like all
     * string decoders, hands it back unchanged.
     */
    private static Decoder STRING_DECODER = new StringDecoder() {
        @Override
        public boolean isValid( String value ) {

            /* Anything can be treated as a string. */
            return true;
        }
    };

    /**
     * Constructs a new RowEvaluator which will work out the number of
     * columns from the data.
     * The column count is taken from the size of the first row passed
     * to {@link #submitRow}.
     */
    public RowEvaluator() {
    }

    /**
     * Constructs a new RowEvaluator which will examine rows with a
     * fixed number of columns.
     * Rows with a different number of elements are rejected by
     * {@link #submitRow}.
     *
     * @param  ncol  column count
     */
    public RowEvaluator( int ncol ) {
        init( ncol );
    }

    /**
     * Prepares this evaluator for rows with a given number of elements.
     * Every column starts out as a candidate for every type, with a
     * maximum string length of zero.
     *
     * @param  ncol  number of columns per row
     */
    private void init( int ncol ) {

        /* The column count has to be recorded before any flag array is
         * created, since makeFlagArray takes its size from ncol_. */
        this.ncol_ = ncol;

        /* One named flag array per candidate type, all initially true.
         * This could be done more compactly by indexing on the
         * type-specific decoders instead. */
        this.maybeBlank_ = makeFlagArray( true );
        this.maybeBoolean_ = makeFlagArray( true );
        this.maybeShort_ = makeFlagArray( true );
        this.maybeInteger_ = makeFlagArray( true );
        this.maybeLong_ = makeFlagArray( true );
        this.maybeFloat_ = makeFlagArray( true );
        this.maybeDouble_ = makeFlagArray( true );
        this.maybeDate_ = makeFlagArray( true );
        this.maybeHms_ = makeFlagArray( true );
        this.maybeDms_ = makeFlagArray( true );

        /* Longest cell seen per column; nothing seen yet. */
        this.stringLength_ = new int[ ncol ];
    }

    /**
     * Looks at a given row (list of strings) and records information about
     * what sort of things it looks like it contains.
     * If the column count is not yet known it is taken from this row.
     * Null cells are treated as empty, and cells which are empty after
     * trimming do not rule out any candidate type for their column.
     *
     * @param   row  ncol-element list of strings
     * @throws  TableFormatException  if the number of elements in
     *          row is not the same as on the first call
     */
    public void submitRow( List<String> row ) throws TableFormatException {

        /* The row counter is advanced first so that the error message
         * below reports a 1-based row index. */
        nrow_++;
        if ( ncol_ < 0 ) {
            init( row.size() );
        }
        if ( row.size() != ncol_ ) {
            throw new TableFormatException(
                "Wrong number of columns at row " + nrow_ +
                " (expecting " + ncol_ + ", found " + row.size() +  ")" );
        }
        for ( int icol = 0; icol < ncol_; icol++ ) {
            String cell0 = row.get( icol );

            /* The recorded string length is that of the untrimmed cell,
             * but type tests are performed on the trimmed text. */
            int leng0 = cell0 == null ? 0 : cell0.length();
            String cell = cell0 == null ? "" : cell0.trim();
            if ( leng0 > stringLength_[ icol ] ) {
                stringLength_[ icol ] = leng0;
            }

            /* Only non-blank cells can rule a candidate type out. */
            if ( cell.length() > 0 ) {
                updateColFlag( icol, cell, maybeBlank_, BLANK_DECODER );
                updateColFlag( icol, cell, maybeBoolean_, BOOLEAN_DECODER );
                updateColFlag( icol, cell, maybeShort_, SHORT_DECODER );
                updateColFlag( icol, cell, maybeInteger_, INTEGER_DECODER );
                updateColFlag( icol, cell, maybeLong_, LONG_DECODER );
                updateColFlag( icol, cell, maybeFloat_, FLOAT_DECODER );
                updateColFlag( icol, cell, maybeDouble_, DOUBLE_DECODER );
                updateColFlag( icol, cell, maybeDate_, DATE_DECODER );
                updateColFlag( icol, cell, maybeHms_, HMS_DECODER );
                updateColFlag( icol, cell, maybeDms_, DMS_DECODER );
            }
        }
    }

    /**
     * Clears one element of a flags array if a cell value turns out to be
     * incompatible with a given decoder.  A flag which is already false
     * is left alone and the decoder is not consulted.
     *
     * @param  icol  index into colFlags array
     * @param  cell  test cell value
     * @param  colFlags   flags array
     * @param  decoder   if cell is marked as invalid by decoder,
     *                   then colFlags[icol] will be set false
     */
    private static void updateColFlag( int icol, String cell,
                                       boolean[] colFlags, Decoder decoder ) {

        /* A column which has already been ruled out needs no further
         * testing.  Bailing out here matters for efficiency: a failed
         * validity test can throw an exception and so be expensive,
         * and must not be repeated for every row of a column. */
        if ( ! colFlags[ icol ] ) {
            return;
        }
        boolean compatible = decoder.isValid( cell );
        if ( ! compatible ) {
            colFlags[ icol ] = false;
        }
    }

    /**
     * Returns information gleaned from previous submitRow
     * calls about the kind of data that appears to be in the columns.
     * Columns are named "col1", "col2", ...; columns represented as
     * strings have their element size set to the longest cell seen.
     * If the column count has not yet been established (no-argument
     * constructor and no rows submitted), metadata with zero columns
     * is returned.
     *
     * @return  metadata
     */
    public Metadata getMetadata() {

        /* ncol_ is negative until init has been called; treat that as
         * "no columns" rather than failing on a negative array size. */
        int ncol = Math.max( ncol_, 0 );
        ColumnInfo[] colInfos = new ColumnInfo[ ncol ];
        Decoder[] decoders = new Decoder[ ncol ];
        for ( int icol = 0; icol < ncol; icol++ ) {
            Decoder decoder = chooseDecoder( icol );
            ColumnInfo info =
                decoder.createColumnInfo( "col" + ( icol + 1 ) );
            if ( decoder instanceof StringDecoder ) {
                info.setElementSize( stringLength_[ icol ] );
            }
            decoders[ icol ] = decoder;
            colInfos[ icol ] = info;
        }
        return new Metadata( colInfos, decoders, nrow_ );
    }

    /**
     * Selects the decoder for a column: the first type, in order of
     * decreasing specificity, which has not been ruled out by the rows
     * seen so far, falling back to plain strings.
     *
     * @param   icol  column index
     * @return  decoder to use for column icol
     */
    private Decoder chooseDecoder( int icol ) {
        if ( maybeBlank_[ icol ] ) {
            return BLANK_DECODER;
        }
        if ( maybeBoolean_[ icol ] ) {
            return BOOLEAN_DECODER;
        }
        if ( maybeShort_[ icol ] ) {
            return SHORT_DECODER;
        }
        if ( maybeInteger_[ icol ] ) {
            return INTEGER_DECODER;
        }
        if ( maybeLong_[ icol ] ) {
            return LONG_DECODER;
        }
        if ( maybeFloat_[ icol ] ) {
            return FLOAT_DECODER;
        }
        if ( maybeDouble_[ icol ] ) {
            return DOUBLE_DECODER;
        }
        if ( maybeDate_[ icol ] ) {
            return DATE_DECODER;
        }
        if ( maybeHms_[ icol ] ) {
            return HMS_DECODER;
        }
        if ( maybeDms_[ icol ] ) {
            return DMS_DECODER;
        }
        return STRING_DECODER;
    }

    /**
     * Creates a boolean array with one element per column, every element
     * holding the same value.
     *
     * @param   val  initial value of all flags
     * @return  new flag array initialized to val
     */
    private boolean[] makeFlagArray( boolean val ) {
        boolean[] flagArray = new boolean[ ncol_ ];
        for ( int ic = 0; ic < flagArray.length; ic++ ) {
            flagArray[ ic ] = val;
        }
        return flagArray;
    }

    /**
     * Parses a floating point value.  This differs from Double.parseDouble
     * in the following ways:
     * <ul>
     * <li>it understands "NaN", "Infinity" and "inf" (the latter two
     *     optionally signed) regardless of case</li>
     * <li>it understands 'd' or 'D' as the exponent signifier as well as
     *     'e' or 'E'</li>
     * <li>it counts the number of significant figures, that is the
     *     number of digits appearing before any exponent signifier</li>
     * <li>it rejects the Java-literal forms that Double.parseDouble
     *     would accept, namely a trailing type suffix
     *     ('f', 'F', 'd' or 'D') and hexadecimal floating point
     *     notation, since in table data these indicate something other
     *     than a plain number</li>
     * </ul>
     *
     * @param   item  string representing a floating point number
     * @return  object encapsulating information about the floating point
     *          value extracted from item; shared constant instances
     *          are returned for NaN and the infinities
     * @throws  NumberFormatException  if item can't be understood
     *          as a float or double
     */
    private static ParsedFloat parseFloating( String item ) {

        /* Check for special values.  Although parseDouble picks up 
         * some of these, it only works with java-friendly forms like
         * "NaN" and not (e.g.) python-friendly ones like "nan". */
        if ( NAN_REGEX.matcher( item ).matches() ) {
            return ParsedFloat.NaN;
        }
        Matcher infMatcher = INFINITY_REGEX.matcher( item );
        if ( infMatcher.matches() ) {
            return "-".equals( infMatcher.group( 1 ) )
                 ? ParsedFloat.NEGATIVE_INFINITY
                 : ParsedFloat.POSITIVE_INFINITY;
        }

        /* Do a couple of jobs by looking at the string directly:
         * Substitute 'd' or 'D' which may indicate an exponent in
         * FORTRAN77-style output for an 'e', and count the number of
         * significant figures.  With some more work it would be possible
         * to do the actual parse here, but since this probably isn't
         * a huge bottleneck we leave it to Double.parseDouble. */
        int nc = item.length();
        boolean foundExp = false;
        int sigFig = 0;
        for ( int i = 0; i < nc; i++ ) {
            char c = item.charAt( i );
            switch ( c ) {
                case 'd':
                case 'D':
                    if ( ! foundExp ) {

                        /* Same-length replacement, so the loop bounds
                         * are unaffected. */
                        item = item.substring( 0, i ) + 'e'
                             + item.substring( i + 1 );
                    }
                    foundExp = true;
                    break;
                case 'e':
                case 'E':
                    foundExp = true;
                    break;
                case 'x':
                case 'X':

                    /* An 'x' can only be accepted by parseDouble as part
                     * of a hexadecimal floating point literal. */
                    throw new NumberFormatException(
                        "Hexadecimal floating point not supported: "
                      + item );
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    if ( ! foundExp ) {
                        sigFig++;
                    }
                    break;
                default:
            }
        }

        /* Double.parseDouble tolerates a trailing Java type suffix
         * (so that e.g. "10F" would parse as 10.0), silently discarding
         * part of the cell text.  Refuse such strings.  Surrounding
         * whitespace is ignored in the same way that parseDouble
         * ignores it. */
        String trimmed = item.trim();
        int nt = trimmed.length();
        if ( nt > 0 ) {
            char last = trimmed.charAt( nt - 1 );
            if ( last == 'f' || last == 'F' || last == 'd' || last == 'D' ) {
                throw new NumberFormatException(
                    "Type suffix not permitted: " + item );
            }
        }

        /* Parse the number. */
        double dvalue = Double.parseDouble( item );
        return new ParsedFloat( sigFig, dvalue );
    }

    /**
     * Simple aggregate describing the data types found in the columns
     * of a table, along with the table dimensions.
     */
    public static class Metadata {

        /** Column metadata, one element per column. */
        public final ColumnInfo[] colInfos_;

        /** Cell decoders, one element per column. */
        public final Decoder[] decoders_;

        /** Row count. */
        public final long nrow_;

        /** Column count, equal to the length of both arrays. */
        public final int ncol_;

        /**
         * Constructor.
         *
         * @param  colInfos  per-column metadata
         * @param  decoders  per-column decoders
         * @param  nrow   row count
         * @throws  IllegalArgumentException  if the two arrays differ
         *          in length
         */
        public Metadata( ColumnInfo[] colInfos, Decoder[] decoders,
                         long nrow ) {
            int ninfo = colInfos.length;
            int ndecoder = decoders.length;
            if ( ninfo != ndecoder ) {
                throw new IllegalArgumentException();
            }
            this.colInfos_ = colInfos;
            this.decoders_ = decoders;
            this.nrow_ = nrow;
            this.ncol_ = ninfo;
        }
    }

    /**
     * Abstract base class for an object that can turn a string into
     * a cell content object.
     */
    public static abstract class Decoder {
        private final Class<?> clazz_;

        /**
         * Constructor.
         *
         * @param   clazz  class of object to be returned by decode method
         */
        public Decoder( Class<?> clazz ) {
            clazz_ = clazz;
        }

        /**
         * Returns a new ColumnInfo suitable for the decoded values.
         * The content class is the one supplied at construction time.
         *
         * @param  name  column name
         * @return  new metadata object
         */
        public ColumnInfo createColumnInfo( String name ) {
            return new ColumnInfo( name, clazz_, null );
        }

        /**
         * Decodes a value.
         * Will complete without exception if {@link #isValid} returns true
         * for the presented value; otherwise may throw an
         * unchecked exception.
         *
         * @param  value  string to decode
         * @return   typed object corresponding to value
         */
        public abstract Object decode( String value );

        /**
         * Indicates whether this decoder is capable of decoding a 
         * given string.
         *
         * @param  value  string to decode
         * @return  true iff this decoder can make sense of the string
         */
        public abstract boolean isValid( String value );
    }

    /**
     * Partial Decoder implementation for strings..
     */
    private static abstract class StringDecoder extends Decoder {
        StringDecoder() {
            super( String.class );
        }

        /**
         * Returns the value unchanged.
         */
        public Object decode( String value ) {
            return value;
        }
    }

    /**
     * Immutable holder for the outcome of parsing a floating point
     * number: the numeric value together with a count of the significant
     * figures with which it was written.
     */
    private static class ParsedFloat {

        /** Shared instance representing not-a-number. */
        static final ParsedFloat NaN =
            new ParsedFloat( 0, Double.NaN );

        /** Shared instance representing positive infinity. */
        static final ParsedFloat POSITIVE_INFINITY =
            new ParsedFloat( 0, Double.POSITIVE_INFINITY );

        /** Shared instance representing negative infinity. */
        static final ParsedFloat NEGATIVE_INFINITY =
            new ParsedFloat( 0, Double.NEGATIVE_INFINITY );

        /** Number of significant figures. */
        final int sigFig;

        /** Value of the number. */
        final double dValue;

        /**
         * Constructor.
         *
         * @param  nSigFig  number of significant figures
         * @param  value  floating point value
         */
        ParsedFloat( int nSigFig, double value ) {
            sigFig = nSigFig;
            dValue = value;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy