All downloads are free. Search and download functionality uses the official Maven repository.

com.google.refine.browsing.util.NumericBinIndex Maven / Gradle / Ivy

Go to download

OpenRefine is a free, open source power tool for working with messy data and improving it

There is a newer version: 3.8.7
Show newest version
/*

Copyright 2010, Google Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
    * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,           
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY           
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

package com.google.refine.browsing.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Properties;

import com.google.refine.expr.ExpressionUtils;
import com.google.refine.model.Project;
import com.google.refine.model.Row;

/**
 * A utility class for computing the base bins that form the base histograms of numeric range facets. It evaluates an
 * expression on all the rows of a project to get numeric values, determines how many bins to distribute those values
 * in, and bins the rows accordingly.
 * 
 * This class processes all rows rather than just the filtered rows because it needs to compute the base bins of a
 * numeric range facet, which remain unchanged as the user interacts with the facet.
 * 
 * Note that the constructor calls the abstract {@link #iterate} method, so subclass implementations of
 * {@code iterate} run before any subclass field initializers or constructor bodies have executed.
 */
public abstract class NumericBinIndex {

    /** Number of individual values (scalars or array/collection elements) seen by {@link #processRow}. */
    protected int _totalValueCount;
    /** Number of finite numeric values collected; set once by the constructor after iteration. */
    protected int _numbericValueCount;
    /** Lower edge of the first bin (smallest collected value until the constructor aligns it to the step). */
    protected double _min;
    /** Upper edge of the last bin (largest collected value until the constructor aligns it to the step). */
    protected double _max;
    /** Width of one bin. */
    protected double _step;
    /** Per-bin counts of the collected numeric values. */
    protected int[] _bins;

    // Row-level tallies, incremented by postprocessing() from the per-row flags below.
    protected int _numericRowCount;
    protected int _nonNumericRowCount;
    protected int _blankRowCount;
    protected int _errorRowCount;

    // Per-row flags: cleared by preprocessing(), set by processRow(), consumed by postprocessing().
    protected boolean _hasError = false;
    protected boolean _hasNonNumeric = false;
    protected boolean _hasNumeric = false;
    protected boolean _hasBlank = false;

    /**
     * Walks the project and fills {@code allValues} with the numeric values to be binned. Called exactly once, from
     * the constructor, with an empty list; the constructor reads {@code _min}, {@code _max} and {@code allValues}
     * afterwards.
     * 
     * NOTE(review): implementations are presumably expected to call {@link #preprocessing()},
     * {@link #processRow} and {@link #postprocessing()} for each row or record; confirm against the subclasses.
     * 
     * @param project
     *            the project whose rows are evaluated
     * @param rowEvaluable
     *            the expression evaluated against each row
     * @param allValues
     *            output list receiving every finite numeric value found
     */
    protected abstract void iterate(Project project, RowEvaluable rowEvaluable, List<Double> allValues);

    /**
     * Collects all numeric values via {@link #iterate} and derives the bin layout ({@code _min}, {@code _max},
     * {@code _step}, {@code _bins}) from them.
     * 
     * If no values were collected, or all collected values are equal, a single bin of width 1 is created and left
     * empty (the values are not counted into it).
     * 
     * @param project
     *            the project whose rows are evaluated
     * @param rowEvaluable
     *            the expression evaluated against each row
     */
    public NumericBinIndex(Project project, RowEvaluable rowEvaluable) {
        _min = Double.POSITIVE_INFINITY;
        _max = Double.NEGATIVE_INFINITY;

        // TODO: An array of doubles would be more memory efficient - double[] allValues
        List<Double> allValues = new ArrayList<Double>();

        iterate(project, rowEvaluable, allValues);

        _numbericValueCount = allValues.size();

        // Degenerate range: nothing collected (min is still +Inf, max still -Inf) or a single distinct value.
        if (_min >= _max) {
            _step = 1;
            _min = Math.min(_min, _max);
            _max = _min + _step;
            _bins = new int[1];

            return;
        }

        double diff = _max - _min;

        // Pick a power-of-ten step relative to the spread of the data.
        _step = 1;
        if (diff > 10) {
            while (_step * 100 < diff) {
                _step *= 10;
            }
        } else {
            while (_step * 100 > diff) {
                _step /= 10;
            }
        }

        // Snap the range outwards to multiples of the step.
        double originalMax = _max;
        _min = (Math.floor(_min / _step) * _step);
        _max = (Math.ceil(_max / _step) * _step);

        double binCount = (_max - _min) / _step;
        if (binCount > 100) {
            // Too many bins: double the bin width and (rounding up) halve the count.
            _step *= 2;
            binCount = (binCount + 1) / 2;
        }

        // Bins are half-open on the right, so the largest value needs one more bin when it sits on the upper edge.
        if (_max <= originalMax) {
            _max += _step;
            binCount++;
        }

        _bins = new int[(int) Math.round(binCount)];
        int lastBin = _bins.length - 1;
        for (double d : allValues) {
            int bin = (int) Math.floor((d - _min) / _step);
            // Clamp on both sides: the lower clamp was already present; the upper clamp guards against
            // floating-point rounding placing the top value one past the last bin.
            bin = Math.min(Math.max(bin, 0), lastBin);
            _bins[bin]++;
        }
    }

    /**
     * @return {@code true} when more than half of the values counted in {@code _totalValueCount} were finite
     *         numbers
     */
    public boolean isNumeric() {
        return _numbericValueCount > _totalValueCount / 2;
    }

    /** @return the lower edge of the first bin */
    public double getMin() {
        return _min;
    }

    /** @return the upper edge of the last bin */
    public double getMax() {
        return _max;
    }

    /** @return the width of one bin */
    public double getStep() {
        return _step;
    }

    /** @return the internal bin-count array (not a copy) */
    public int[] getBins() {
        return _bins;
    }

    /** @return the number of rows tallied as containing at least one numeric value */
    public int getNumericRowCount() {
        return _numericRowCount;
    }

    /** @return the number of rows tallied as containing at least one non-numeric value */
    public int getNonNumericRowCount() {
        return _nonNumericRowCount;
    }

    /** @return the number of rows tallied as containing at least one blank value */
    public int getBlankRowCount() {
        return _blankRowCount;
    }

    /** @return the number of rows tallied as containing at least one error value */
    public int getErrorRowCount() {
        return _errorRowCount;
    }

    /**
     * Evaluates the expression on one row and classifies the result, updating the per-row flags
     * ({@code _hasError}, {@code _hasBlank}, {@code _hasNumeric}, {@code _hasNonNumeric}),
     * {@code _totalValueCount}, and, through {@link #processValue}, {@code _min}, {@code _max} and
     * {@code allValues}.
     * 
     * Arrays and collections are classified element by element. A top-level error or blank result sets the
     * corresponding flag without incrementing {@code _totalValueCount}.
     * 
     * @param project
     *            the project the row belongs to
     * @param rowEvaluable
     *            the expression to evaluate
     * @param allValues
     *            list receiving every finite numeric value found
     * @param rowIndex
     *            index of the row within the project
     * @param row
     *            the row to evaluate
     * @param bindings
     *            evaluation bindings passed through to {@code rowEvaluable}
     */
    protected void processRow(
            Project project,
            RowEvaluable rowEvaluable,
            List<Double> allValues,
            int rowIndex,
            Row row,
            Properties bindings) {
        Object value = rowEvaluable.eval(project, rowIndex, row, bindings);

        if (ExpressionUtils.isError(value)) {
            _hasError = true;
        } else if (ExpressionUtils.isNonBlankData(value)) {
            if (value.getClass().isArray()) {
                // NOTE(review): assumes the array is an Object[]; a primitive array would fail this cast.
                Object[] a = (Object[]) value;
                for (Object v : a) {
                    classifyElement(v, allValues);
                }
            } else if (value instanceof Collection) {
                for (Object v : ExpressionUtils.toObjectCollection(value)) {
                    classifyElement(v, allValues);
                }
            } else {
                // Scalar already known to be neither an error nor blank.
                _totalValueCount++;
                classifyNonBlank(value, allValues);
            }
        } else {
            _hasBlank = true;
        }
    }

    /** Counts one array/collection element and classifies it as error, blank, numeric or non-numeric. */
    private void classifyElement(Object v, List<Double> allValues) {
        _totalValueCount++;

        if (ExpressionUtils.isError(v)) {
            _hasError = true;
        } else if (ExpressionUtils.isNonBlankData(v)) {
            classifyNonBlank(v, allValues);
        } else {
            _hasBlank = true;
        }
    }

    /** Classifies a non-error, non-blank value: finite numbers are collected, NaN/infinite ones count as errors. */
    private void classifyNonBlank(Object v, List<Double> allValues) {
        if (v instanceof Number) {
            if (processValue(((Number) v).doubleValue(), allValues)) {
                _hasNumeric = true;
            } else {
                _hasError = true;
            }
        } else {
            _hasNonNumeric = true;
        }
    }

    /** Clears the per-row flags; meant to be called before a row (or group of rows) is processed. */
    protected void preprocessing() {
        _hasBlank = false;
        _hasError = false;
        _hasNonNumeric = false;
        _hasNumeric = false;
    }

    /** Folds the per-row flags into the row counters; each flag that is set increments its counter by one. */
    protected void postprocessing() {
        if (_hasError) {
            _errorRowCount++;
        }
        if (_hasBlank) {
            _blankRowCount++;
        }
        if (_hasNumeric) {
            _numericRowCount++;
        }
        if (_hasNonNumeric) {
            _nonNumericRowCount++;
        }
    }

    /**
     * Records one numeric value if it is finite, widening {@code _min}/{@code _max} to include it.
     * 
     * @param v
     *            the candidate value
     * @param allValues
     *            list the value is appended to when accepted
     * @return {@code true} if the value was finite and recorded, {@code false} if it was NaN or infinite
     */
    protected boolean processValue(double v, List<Double> allValues) {
        if (!Double.isInfinite(v) && !Double.isNaN(v)) {
            _min = Math.min(_min, v);
            _max = Math.max(_max, v);
            allValues.add(v);
            return true;
        } else {
            return false;
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy