All downloads are free. Search and download functionality uses the official Maven repository.

uk.ac.standrews.cs.utilities.dataset.derived.DerivedDataSet Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2020 Systems Research Group, University of St Andrews:
 * 
 *
 * This file is part of the module ciesvium.
 *
 * ciesvium is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * ciesvium is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with ciesvium. If not, see
 * <https://www.gnu.org/licenses/>.
 */
package uk.ac.standrews.cs.utilities.dataset.derived;

import uk.ac.standrews.cs.utilities.archive.QuickSort;
import uk.ac.standrews.cs.utilities.dataset.DataSet;

import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Abstract superclass for datasets derived from existing datasets via a sequence of relational-style transformations.
 *
 * @author Graham Kirby ([email protected])
 */
@SuppressWarnings("unused")
public abstract class DerivedDataSet extends DataSet {

    @SuppressWarnings("WeakerAccess")
    protected static final String ID_COLUMN_LABEL = "ID";

    protected DerivedDataSet() throws IOException {

        init(getDerivedDataSet(getSourceDataSet()));
    }

    /**
     * Gets the source dataset.
     *
     * @return the source dataset
     * @throws IOException if the source dataset cannot be obtained
     */
    @SuppressWarnings("WeakerAccess")
    public abstract DataSet getSourceDataSet() throws IOException;

    /**
     * Gets the derived dataset.
     *
     * @param source_data_set the source dataset
     * @return the derived dataset
     * @throws IOException if the derived dataset cannot be obtained
     */
    protected abstract DataSet getDerivedDataSet(DataSet source_data_set) throws IOException;

    @SuppressWarnings("WeakerAccess")
    public static Extender addIdColumn() {

        return new Extender() {

            int record_count = 1;

            @Override
            public List getAdditionalValues(final List record, final DataSet data_set) {

                final List result = new ArrayList<>();
                result.add(String.valueOf(record_count++));
                return result;
            }

            @Override
            public List getColumnLabels() {

                return Collections.singletonList(ID_COLUMN_LABEL);
            }
        };
    }

    @SuppressWarnings("WeakerAccess")
    public static Projector moveIdColumnToFirst(final List source_column_labels) {

        return () -> {
            final List result = new ArrayList<>();

            result.add(ID_COLUMN_LABEL);
            result.addAll(source_column_labels.stream().filter(s -> !s.equals(ID_COLUMN_LABEL)).collect(Collectors.toList()));

            return result;
        };
    }

    public static DataSet renumber(final DataSet data_set) {

        final List source_labels = data_set.getColumnLabels();
        return data_set.project(removeFirstColumn(source_labels)).extend(addIdColumn()).project(moveIdColumnToFirst(source_labels));
    }

    public static DataSet removeDuplicates(final DataSet data_set) {

        final DataSet result = new DataSet(data_set.getColumnLabels());

        final Set processed_rows = new HashSet<>();

        for (final List record : data_set.getRecords()) {

            final String flattened = flatten(record);
            if (!processed_rows.contains(flattened)) {
                result.addRow(record);
                processed_rows.add(flattened);
            }
        }

        return result;
    }

    public static DataSet sort(final DataSet data_set) {

        final DataSet result = new DataSet(data_set.getColumnLabels());

        // Make a map from flattened records to records.
        final Map>> map = new HashMap<>();

        for (final List record : data_set.getRecords()) {

            final String flattened = flatten(record);
            if (!map.containsKey(flattened)) {
                map.put(flattened, new Pair<>(0, record));
            }
            map.get(flattened).x++;
        }

        // Sort the flattened records.
        final List sorted = new ArrayList<>(map.keySet());
        new QuickSort<>(sorted, String::compareTo).sort();

        // Retrieve the structured records.
        for (final String flattened : sorted) {
            final Pair> pair = map.get(flattened);
            for (int i = 0; i < pair.x; i++) {
                result.addRow(pair.y);
            }
        }

        return result;
    }

    static class Pair {

        X x;
        final Y y;

        Pair(final X x, final Y y) {
            this.x = x;
            this.y = y;
        }
    }

    private static String flatten(final List record) {

        return String.join("",record);
    }

    private static Projector removeFirstColumn(final List labels) {

        return () -> labels.subList(1, labels.size());
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy