/**
* Copyright (C) 2013-2016 Vasilis Vryniotis
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datumbox.framework.common.dataobjects;
import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.concurrency.ForkJoinStream;
import com.datumbox.framework.common.concurrency.StreamMethods;
import com.datumbox.framework.common.concurrency.ThreadMethods;
import com.datumbox.framework.common.interfaces.Copyable;
import com.datumbox.framework.common.interfaces.Extractable;
import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector;
import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector.MapType;
import com.datumbox.framework.common.persistentstorage.interfaces.DatabaseConnector.StorageHint;
import com.datumbox.framework.common.utilities.StringCleaner;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.URI;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
/**
* The Dataframe class stores a list of Record objects along with the meta-data
* of the dataset. All Machine Learning algorithms receive Dataframe objects as
* arguments. The class has an internal static Builder class which can be used
* to generate Dataframe objects from Text or CSV files.
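*
* A minimal usage sketch ({@code conf} is assumed to be an already constructed
* Configuration instance; the column names and values are illustrative):
* <pre>{@code
* Dataframe df = new Dataframe(conf);
*
* AssociativeArray xData = new AssociativeArray();
* xData.put("age", 34.0);
* xData.put("city", "Athens");
* df.add(new Record(xData, "positive")); //y = "positive"
*
* System.out.println(df.size()); //1
* System.out.println(df.getXDataTypes()); //column names mapped to their inferred DataTypes
* }</pre>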
*
* @author Vasilis Vryniotis
*/
public class Dataframe implements Collection<Record>, Copyable<Dataframe> {
/**
* Internal name of the response variable.
*/
public static final String COLUMN_NAME_Y = "~Y";
/**
* Internal name of the constant.
*/
public static final String COLUMN_NAME_CONSTANT = "~CONSTANT";
/**
* The Builder is a utility class which can help you build a Dataframe from
* Text files and CSV files.
*/
public static class Builder {
/**
* It builds a Dataframe object from a provided list of text files. The data
* map should have as keys the names of each class and as values the URIs
* of the training files. The files should contain one training example
* per row. If we want to parse a Text File of unknown category then
* pass a single URI with null as key.
*
* The method requires as arguments a Map with the category names and the locations
* of the training files, an instance of a TextExtractor which is used
* to extract the keywords from the documents and the Database Configuration
* Object.
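*
* A minimal usage sketch (the file locations and class names are illustrative;
* {@code extractor} is any Extractable implementation and {@code conf} an
* existing Configuration instance):
* <pre>{@code
* Map<Object, URI> textFilesMap = new HashMap<>();
* textFilesMap.put("spam", new File("/datasets/spam.txt").toURI());
* textFilesMap.put("ham", new File("/datasets/ham.txt").toURI());
* Dataframe trainingData = Dataframe.Builder.parseTextFiles(textFilesMap, extractor, conf);
* }</pre>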
*
* @param textFilesMap
* @param textExtractor
* @param conf
* @return
*/
public static Dataframe parseTextFiles(Map<Object, URI> textFilesMap, Extractable textExtractor, Configuration conf) {
Dataframe dataset = new Dataframe(conf);
Logger logger = LoggerFactory.getLogger(Dataframe.Builder.class);
for (Map.Entry<Object, URI> entry : textFilesMap.entrySet()) {
Object theClass = entry.getKey();
URI datasetURI = entry.getValue();
logger.info("Dataset Parsing {} class", theClass);
try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File(datasetURI)), "UTF8"))) {
final int baseCounter = dataset.size(); //because we read multiple files we need to keep track of all records added earlier
ThreadMethods.throttledExecution(StreamMethods.enumerate(br.lines()), e -> {
Integer rId = baseCounter + e.getKey();
String line = e.getValue();
AssociativeArray xData = new AssociativeArray(
textExtractor.extract(StringCleaner.clear(line))
);
Record r = new Record(xData, theClass);
//dataset.set() updates the meta-data of the Dataframe for every added record
dataset.set(rId, r);
}, conf.getConcurrencyConfig());
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
}
return dataset;
}
/**
* It builds a Dataframe object from a CSV file; the first line of the provided
* CSV file must have a header with the column names.
*
* The method accepts the following arguments: A Reader object from where
* we will read the contents of the csv file. The column name of the
* response variable y. A map with the column names and their respective
* DataTypes. The char delimiter for the columns, the char for quotes and
* the string of the record/row separator. The Database Configuration
* object.
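*
* A minimal usage sketch (the file name and column names are illustrative;
* NUMERICAL and CATEGORICAL are assumed TypeInference.DataType constants;
* {@code conf} is assumed to exist and exception handling is omitted):
* <pre>{@code
* LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
* headerDataTypes.put("age", TypeInference.DataType.NUMERICAL);
* headerDataTypes.put("city", TypeInference.DataType.CATEGORICAL);
* headerDataTypes.put("clicked", TypeInference.DataType.CATEGORICAL);
*
* Reader reader = new FileReader("/datasets/data.csv");
* Dataframe df = Dataframe.Builder.parseCSVFile(
*     reader, "clicked", headerDataTypes, ',', '"', "\r\n", null, null, conf);
* }</pre>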
*
* @param reader
* @param yVariable
* @param headerDataTypes
* @param delimiter
* @param quote
* @param recordSeparator
* @param skip
* @param limit
* @param conf
* @return
*/
public static Dataframe parseCSVFile(Reader reader, String yVariable, LinkedHashMap<String, TypeInference.DataType> headerDataTypes,
char delimiter, char quote, String recordSeparator, Long skip, Long limit, Configuration conf) {
Logger logger = LoggerFactory.getLogger(Dataframe.Builder.class);
if(skip == null) {
skip = 0L;
}
if(limit == null) {
limit = Long.MAX_VALUE;
}
logger.info("Parsing CSV file");
if (!headerDataTypes.containsKey(yVariable)) {
logger.warn("WARNING: The file is missing the response variable column {}.", yVariable);
}
TypeInference.DataType yDataType = headerDataTypes.get(yVariable);
Map<Object, TypeInference.DataType> xDataTypes = new HashMap<>(headerDataTypes); //copy header types
xDataTypes.remove(yVariable); //remove the response variable from xDataTypes
Dataframe dataset = new Dataframe(conf, yDataType, xDataTypes); //use the private constructor to pass DataTypes directly and avoid updating them on the fly
CSVFormat format = CSVFormat
.RFC4180
.withHeader()
.withDelimiter(delimiter)
.withQuote(quote)
.withRecordSeparator(recordSeparator);
try (final CSVParser parser = new CSVParser(reader, format)) {
ThreadMethods.throttledExecution(StreamMethods.enumerate(StreamMethods.stream(parser.spliterator(), false)).skip(skip).limit(limit), e -> {
Integer rId = e.getKey();
CSVRecord row = e.getValue();
if (!row.isConsistent()) {
logger.warn("WARNING: Skipping row {} because its size does not match the header size.", row.getRecordNumber());
}
else {
Object y = null;
AssociativeArray xData = new AssociativeArray();
for (Map.Entry<String, TypeInference.DataType> entry : headerDataTypes.entrySet()) {
String column = entry.getKey();
TypeInference.DataType dataType = entry.getValue();
Object value = TypeInference.DataType.parse(row.get(column), dataType); //parse the string value according to the DataType
if (yVariable != null && yVariable.equals(column)) {
y = value;
}
else {
xData.put(column, value);
}
}
Record r = new Record(xData, y);
//use the internal unsafe methods to avoid the update of the Metas.
//The Metas are already set in the construction of the Dataframe.
dataset._unsafe_set(rId, r);
}
}, conf.getConcurrencyConfig());
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
return dataset;
}
}
private TypeInference.DataType yDataType;
private Map<Object, TypeInference.DataType> xDataTypes;
private Map<Integer, Record> records;
private final AtomicInteger atomicNextAvailableRecordId = new AtomicInteger();
private final DatabaseConnector dbc;
private final Configuration conf;
/**
* This executor is used for the parallel processing of streams with custom
* Thread pool.
*/
protected final ForkJoinStream streamExecutor;
/**
* Public constructor of Dataframe.
*
* @param conf
*/
public Dataframe(Configuration conf) {
this.conf = conf;
//we don't need to have a unique name, because it is not used by the connector on the current implementations
//String dbName = "dts_"+new BigInteger(130, RandomGenerator.getThreadLocalRandom()).toString(32);
String dbName = "dts";
dbc = this.conf.getDbConfig().getConnector(dbName);
records = dbc.getBigMap("tmp_records", MapType.TREEMAP, StorageHint.IN_DISK, true, true);
yDataType = null;
xDataTypes = dbc.getBigMap("tmp_xDataTypes", MapType.HASHMAP, StorageHint.IN_MEMORY, true, true);
streamExecutor = new ForkJoinStream(this.conf.getConcurrencyConfig());
}
/**
* Private constructor used by the Builder inner static class.
*
* @param conf
* @param yDataType
* @param xDataTypes
*/
private Dataframe(Configuration conf, TypeInference.DataType yDataType, Map<Object, TypeInference.DataType> xDataTypes) {
this(conf);
this.yDataType = yDataType;
this.xDataTypes.putAll(xDataTypes);
}
//Mandatory Collection Methods
/**
* Returns the total number of Records of the Dataframe.
*
* @return
*/
@Override
public int size() {
return records.size();
}
/**
* Checks if the Dataframe is empty.
*
* @return
*/
@Override
public boolean isEmpty() {
return records.isEmpty();
}
/**
* Clears all the internal Records of the Dataframe. The Dataframe can be used
* after you clear it.
*/
@Override
public void clear() {
yDataType = null;
xDataTypes.clear();
records.clear();
}
/**
* Adds a record in the Dataframe and updates the Meta data.
*
* @param r
* @return
*/
@Override
public boolean add(Record r) {
addRecord(r);
return true;
}
/**
* Checks if the Record exists in the Dataframe. Note that the Record is checked only
* for its x and y components.
*
* @param o
* @return
*/
@Override
public boolean contains(Object o) {
return records.containsValue((Record)o);
}
/** {@inheritDoc} */
@Override
public boolean addAll(Collection<? extends Record> c) {
c.stream().forEach(r -> {
add(r);
});
return true;
}
/** {@inheritDoc} */
@Override
public boolean containsAll(Collection<?> c) {
return records.values().containsAll(c);
}
/** {@inheritDoc} */
@Override
public Object[] toArray() {
Object[] array = new Object[size()];
int i = 0;
for(Record r : values()) {
array[i++] = r;
}
return array;
}
/** {@inheritDoc} */
@Override
@SuppressWarnings("unchecked")
public <T> T[] toArray(T[] a) {
int size = size();
if (a.length < size) {
a = (T[])java.lang.reflect.Array.newInstance(a.getClass().getComponentType(), size);
}
int i = 0;
for (Record r : values()) {
a[i++] = (T) r;
}
return a;
}
/**
* Returns a read-only iterator on the values of the Dataframe.
*
* @return
*/
@Override
public Iterator<Record> iterator() {
return values().iterator();
}
/** {@inheritDoc} */
@Override
public Stream<Record> stream() {
return StreamMethods.stream(values(), false);
}
//Optional Collection Methods
/**
* Removes the first occurrence of the specified element from this Dataframe,
* if it is present. The meta-data are not updated.
*
* @param o
* @return
*/
@Override
public boolean remove(Object o) {
Integer id = indexOf((Record) o);
if(id == null) {
return false;
}
remove(id);
return true;
}
/**
* Removes all of this collection's elements that are also contained in the
* specified collection and updates the metadata.
*
* @param c
* @return
*/
@Override
public boolean removeAll(Collection<?> c) {
boolean modified = false;
for(Object o : c) {
modified |= remove((Record)o);
}
if(modified) {
recalculateMeta();
}
return modified;
}
/**
* Retains only the elements in this collection that are contained in the
* specified collection and updates the meta data.
*
* @param c
* @return
*/
@Override
public boolean retainAll(Collection> c) {
boolean modified = false;
for(Map.Entry<Integer, Record> e : entries()) {
Integer rId = e.getKey();
Record r = e.getValue();
if(!c.contains(r)) {
remove(rId);
modified = true;
}
}
if(modified) {
recalculateMeta();
}
return modified;
}
//Other methods
/**
* Removes a record with a particular id from the Dataframe but does not update
* the metadata.
*
* @param id
* @return
*/
public Record remove(Integer id) {
return records.remove(id);
}
/**
* Returns the index of the first occurrence of the specified element in this
* Dataframe, or null if this Dataframe does not contain the element.
* WARNING: The Records are checked only for their X and Y values, not for
* the yPredicted and yPredictedProbabilities values.
*
* @param o
* @return
*/
public Integer indexOf(Record o) {
if(o!=null) {
for(Map.Entry<Integer, Record> e : entries()) {
Integer rId = e.getKey();
Record r = e.getValue();
if(o.equals(r)) {
return rId;
}
}
}
return null;
}
/**
* Returns a particular Record using its id.
*
* @param id
* @return
*/
public Record get(Integer id) {
return records.get(id);
}
/**
* Adds a Record in the Dataframe and returns its id.
*
* @param r
* @return
*/
public Integer addRecord(Record r) {
Integer rId = _unsafe_add(r);
updateMeta(r);
return rId;
}
/**
* Sets the record of a particular id in the dataset. If the record does not
* exist it will be added with the specific id and the next added record will
* have as id the next integer.
*
* Note that the meta-data are partially updated. This means that if the replaced
* Record contained a column which is now no longer available in the dataset,
* then the meta-data will not reflect this update (the column will continue to exist
* in the meta data). If this is a problem, you should call the recalculateMeta()
* method to force them being recalculated.
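*
* For example ({@code df}, the id and the replacement Record are illustrative):
* <pre>{@code
* df.set(rId, replacementRecord);
* df.recalculateMeta(); //only needed if a column might have disappeared entirely
* }</pre>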
*
* @param rId
* @param r
* @return
*/
public Integer set(Integer rId, Record r) {
_unsafe_set(rId, r);
updateMeta(r);
return rId;
}
/**
* Returns the total number of X columns in the Dataframe.
*
* @return
*/
public int xColumnSize() {
return xDataTypes.size();
}
/**
* Returns the type of the response variable y.
*
* @return
*/
public TypeInference.DataType getYDataType() {
return yDataType;
}
/**
* Returns a Map with the column names as keys and their DataTypes as values.
*
* @return
*/
public Map<Object, TypeInference.DataType> getXDataTypes() {
return Collections.unmodifiableMap(xDataTypes);
}
/**
* It extracts the values of a particular column from all records and
* stores them into a FlatDataList.
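*
* For example ({@code df} and the column name are illustrative):
* <pre>{@code
* FlatDataList ages = df.getXColumn("age"); //one value per Record, in iteration order
* }</pre>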
*
* @param column
* @return
*/
public FlatDataList getXColumn(Object column) {
FlatDataList flatDataList = new FlatDataList();
for(Record r : values()) {
flatDataList.add(r.getX().get(column));
}
return flatDataList;
}
/**
* It extracts the values of the response variable from all observations and
* stores them into a FlatDataList.
*
* @return
*/
public FlatDataList getYColumn() {
FlatDataList flatDataList = new FlatDataList();
for(Record r : values()) {
flatDataList.add(r.getY());
}
return flatDataList;
}
/**
* Removes completely a list of columns from the dataset. The meta-data of
* the Dataframe are updated. The method internally uses threads.
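*
* For example ({@code df} and the column names are illustrative):
* <pre>{@code
* df.dropXColumns(new HashSet<>(Arrays.asList("city", "zipCode")));
* }</pre>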
*
* @param columnSet
*/
public void dropXColumns(Set<Object> columnSet) {
columnSet.retainAll(xDataTypes.keySet()); //keep only those columns that are already known to the Meta data of the Dataframe
if(columnSet.isEmpty()) {
return;
}
//remove all the columns from the Meta data
xDataTypes.keySet().removeAll(columnSet);
streamExecutor.forEach(StreamMethods.stream(entries(), true), e -> {
Integer rId = e.getKey();
Record r = e.getValue();
AssociativeArray xData = r.getX().copy();
boolean modified = xData.keySet().removeAll(columnSet);
if(modified) {
Record newR = new Record(xData, r.getY(), r.getYPredicted(), r.getYPredictedProbabilities());
//safe to call in this context. we already updated the meta when we modified the xDataTypes
_unsafe_set(rId, newR);
}
});
}
/**
* It generates and returns a new Dataframe which contains a subset of this Dataframe.
* All the Records of the returned Dataframe are copies of the original Records.
* The method is used for k-fold cross validation and sampling. Note that the
* Records in the new Dataframe have DIFFERENT ids from the original ones.
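*
* For example ({@code df} and the record ids are illustrative):
* <pre>{@code
* FlatDataList ids = new FlatDataList();
* ids.add(0);
* ids.add(3);
* Dataframe sample = df.getSubset(ids);
* }</pre>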
*
* @param idsCollection
* @return
*/
public Dataframe getSubset(FlatDataList idsCollection) {
Dataframe d = new Dataframe(conf);
for(Object id : idsCollection) {
d.add(get((Integer)id));
}
return d;
}
/**
* It forces the recalculation of Meta data using the Records of the dataset.
*/
public void recalculateMeta() {
yDataType = null;
xDataTypes.clear();
for(Record r : values()) {
updateMeta(r);
}
}
/** {@inheritDoc} */
@Override
public Dataframe copy() {
Dataframe d = new Dataframe(conf);
for(Map.Entry<Integer, Record> e : entries()) {
Integer rId = e.getKey();
Record r = e.getValue();
d.set(rId, r);
}
return d;
}
/**
* Deletes the Dataframe and removes all internal variables. Once you delete a
* dataset, the instance can no longer be used.
*/
public void delete() {
dbc.dropBigMap("tmp_records", records);
dbc.dropBigMap("tmp_xDataTypes", xDataTypes);
dbc.clear();
try {
dbc.close();
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
//Ensures that the Dataframe can't be used after delete() is called.
yDataType = null;
xDataTypes = null;
records = null;
}
/**
* Returns a read-only Iterable on the keys and Records of the Dataframe.
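*
* For example ({@code df} is an existing Dataframe):
* <pre>{@code
* for (Map.Entry<Integer, Record> e : df.entries()) {
*     Integer rId = e.getKey();
*     Record r = e.getValue();
* }
* }</pre>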
*
* @return
*/
public Iterable<Map.Entry<Integer, Record>> entries() {
return () -> new Iterator<Map.Entry<Integer, Record>>() {
private final Iterator<Map.Entry<Integer, Record>> it = records.entrySet().iterator();
/** {@inheritDoc} */
@Override
public boolean hasNext() {
return it.hasNext();
}
/** {@inheritDoc} */
@Override
public Map.Entry<Integer, Record> next() {
return it.next();
}
/** {@inheritDoc} */
@Override
public void remove() {
throw new UnsupportedOperationException("This is a read-only iterator, remove operation is not supported.");
}
};
}
/**
* Returns a read-only Iterable on the keys of the Dataframe.
*
* @return
*/
public Iterable<Integer> index() {
return () -> new Iterator<Integer>() {
private final Iterator<Integer> it = records.keySet().iterator();
/** {@inheritDoc} */
@Override
public boolean hasNext() {
return it.hasNext();
}
/** {@inheritDoc} */
@Override
public Integer next() {
return it.next();
}
/** {@inheritDoc} */
@Override
public void remove() {
throw new UnsupportedOperationException("This is a read-only iterator, remove operation is not supported.");
}
};
}
/**
* Returns a read-only Iterable on the values of the Dataframe.
*
* @return
*/
public Iterable<Record> values() {
return () -> new Iterator<Record>() {
private final Iterator<Record> it = records.values().iterator();
/** {@inheritDoc} */
@Override
public boolean hasNext() {
return it.hasNext();
}
/** {@inheritDoc} */
@Override
public Record next() {
return it.next();
}
/** {@inheritDoc} */
@Override
public void remove() {
throw new UnsupportedOperationException("This is a read-only iterator, remove operation is not supported.");
}
};
}
/**
* Sets the record in a particular position in the dataset, WITHOUT updating
* the internal meta-info and returns the previous value (null if not existed).
* This method is similar to set() and it allows quick updates
* on the dataset. Nevertheless it is not advised to use this method because
* unless you explicitly call the recalculateMeta() method, the meta data
* will be corrupted. If you do use this method, MAKE sure you perform the
* recalculation after you are done with the updates.
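*
* For example ({@code df} and the map of updated records are illustrative):
* <pre>{@code
* for (Map.Entry<Integer, Record> e : updatedRecords.entrySet()) {
*     df._unsafe_set(e.getKey(), e.getValue());
* }
* df.recalculateMeta(); //mandatory after a batch of _unsafe_set() calls
* }</pre>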
*
* @param rId
* @param r
* @return
*/
public Record _unsafe_set(Integer rId, Record r) {
//move ahead the next id
atomicNextAvailableRecordId.updateAndGet(x -> (x<rId)?rId:x);
return records.put(rId, r);
}
/**
* Adds the Record in the Dataframe WITHOUT updating the Meta and returns the id of the new record.
*
* @param r
* @return
*/
private Integer _unsafe_add(Record r) {
Integer newId = atomicNextAvailableRecordId.getAndIncrement();
records.put(newId, r);
return newId;
}
/**
* Updates the meta-data of the Dataframe using the provided Record. The meta-data include the supported columns and their DataTypes.
*
* @param r
*/
private void updateMeta(Record r) {
for(Map.Entry<Object, Object> entry : r.getX().entrySet()) {
Object column = entry.getKey();
Object value = entry.getValue();
if(value!=null) {
xDataTypes.putIfAbsent(column, TypeInference.getDataType(value));
}
}
if(yDataType == null) {
Object value = r.getY();
if(value!=null) {
yDataType = TypeInference.getDataType(r.getY());
}
}
}
}