// manifold.csv.rt.parser.DataStats Maven / Gradle / Ivy
/*
* Copyright (c) 2020 - Manifold Systems LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package manifold.csv.rt.parser;
/**
 * Determines the likelihood that a header exists by comparing the mix of characters used in a potential header field
 * vs. a data field. A field is profiled by the percentage of alphabetic, digit, whitespace, and other characters it
 * contains, plus the length of its text; all five measures carry equal weight. When the profile of a data field is
 * significantly different from the profile of the corresponding header field, it is likely the header exists.
 * <p>
 * Characters are classified and counted per Unicode code point, so a supplementary character encoded as a surrogate
 * pair counts as one character of its real category instead of two "other" characters.
 * <p>
 * DISCLAIMER:
 * Note the algorithm using this analysis amounts to a best guess based on statistical makeup of raw data. Thus, there
 * is no guarantee the algorithm is suitable for a given use-case.
 */
class DataStats
{
  /**
   * The lower bound percentage to which a data value must be similar to a header value. Data values determined to be
   * "similar" (via {@link #isSimilar(CsvToken)}) in raw makeup to the header imply it is likely there is no header.
   */
  private static final int THRESHOLD_PERCENTAGE = 75;

  /** Percentage (0-100, truncated) of alphabetic characters in the profiled text. */
  private final int _alpha;
  /** Percentage (0-100, truncated) of digit characters in the profiled text. */
  private final int _digit;
  /** Percentage (0-100, truncated) of whitespace characters in the profiled text. */
  private final int _white;
  /** Percentage (0-100, truncated) of characters that are neither alphabetic, digit, nor whitespace. */
  private final int _other;
  /** Length of the profiled text, in Unicode code points. */
  private final int _total;

  /**
   * Builds the character profile of the text returned by {@code token.getData()}.
   * Empty text produces a profile where every measure is zero.
   *
   * @param token the token whose data is profiled
   */
  DataStats( CsvToken token )
  {
    String data = token.getData();
    int length = data.length();

    int alpha = 0;
    int digit = 0;
    int white = 0;
    int other = 0;
    int total = 0;

    // Advance by code point rather than by char, so surrogate pairs are classified as the
    // single character they encode.
    for( int i = 0; i < length; )
    {
      int codePoint = data.codePointAt( i );
      i += Character.charCount( codePoint );
      total++;

      if( Character.isAlphabetic( codePoint ) )
      {
        alpha++;
      }
      else if( Character.isDigit( codePoint ) )
      {
        digit++;
      }
      else if( Character.isWhitespace( codePoint ) )
      {
        white++;
      }
      else
      {
        other++;
      }
    }

    _total = total;
    _alpha = percentOf( alpha, total );
    _digit = percentOf( digit, total );
    _white = percentOf( white, total );
    _other = percentOf( other, total );
  }

  /**
   * Profiles {@code token} and compares it with this profile.
   *
   * @param token the token to compare against this profile
   * @return {@code true} only if all five measures (alpha, digit, whitespace, other percentages and the length) are
   * each pairwise similar; {@code false} as soon as one of them is not
   */
  boolean isSimilar( CsvToken token )
  {
    DataStats data = new DataStats( token );
    return isSimilar( _alpha, data._alpha ) &&
           isSimilar( _digit, data._digit ) &&
           isSimilar( _white, data._white ) &&
           isSimilar( _other, data._other ) &&
           isSimilar( _total, data._total );
  }

  /**
   * Compares two non-negative measures.
   *
   * @return {@code true} if the values are equal, or if both are non-zero and the smaller one is within
   * {@link #THRESHOLD_PERCENTAGE} percent of the larger one; {@code false} if exactly one of them is zero
   */
  private static boolean isSimilar( int d1, int d2 )
  {
    if( d1 == d2 )
    {
      return true;
    }
    if( d1 == 0 || d2 == 0 )
    {
      // A measure present on one side and entirely absent on the other is never similar.
      return false;
    }
    int max = Math.max( d1, d2 );
    int min = Math.min( d1, d2 );
    // Multiply in long so a very large length difference cannot overflow int.
    return 100 - (max - min) * 100L / max >= THRESHOLD_PERCENTAGE;
  }

  /**
   * Returns {@code count} as a truncated integer percentage of {@code total}, or 0 when {@code total} is 0.
   */
  private static int percentOf( int count, int total )
  {
    if( total == 0 )
    {
      return 0;
    }
    // Multiply in long so count * 100 cannot overflow int for very long text.
    return (int)(count * 100L / total);
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy