org.apache.metamodel.arff.ArffDataContext Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of MetaModel-arff Show documentation
The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.metamodel.arff;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.metamodel.MetaModelException;
import org.apache.metamodel.QueryPostprocessDataContext;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.data.MaxRowsDataSet;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.ColumnType;
import org.apache.metamodel.schema.MutableColumn;
import org.apache.metamodel.schema.MutableSchema;
import org.apache.metamodel.schema.MutableTable;
import org.apache.metamodel.schema.Schema;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.schema.TableType;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;

/**
 * DataContext implementation for ARFF (.arff) files, which can be loosely described as a specialized type of CSV
 * file with schema information, comments and data types in the header section of the file.
 */
public class ArffDataContext extends QueryPostprocessDataContext {

    private static final Logger logger = LoggerFactory.getLogger(ArffDataContext.class);

    private static final String SECTION_ANNOTATION_RELATION = "@relation";
    private static final String SECTION_ANNOTATION_ATTRIBUTE = "@attribute";
    private static final String SECTION_ANNOTATION_DATA = "@data";
    private static final Charset CHARSET = FileHelper.UTF_8_CHARSET;
    private static final Pattern ATTRIBUTE_DEF_W_DATATYPE_PARAM =
            Pattern.compile("\\'?(.+)\\'?\\s+([a-zA-Z]+)\\s+\\'(.+)\\'");

    private final Splitter whitespaceSplitter = Splitter.on(CharMatcher.whitespace()).trimResults().omitEmptyStrings();

    private final Resource resource;

    public ArffDataContext(Resource resource) {
        this.resource = resource;
    }

    @Override
    protected Schema getMainSchema() throws MetaModelException {
        final MutableSchema schema = new MutableSchema(getMainSchemaName());
        final MutableTable table = new MutableTable(getMainSchemaName(), TableType.TABLE, schema);
        schema.addTable(table);

        try (BufferedReader reader = createReader()) {
            boolean inHeader = true;
            for (String line = reader.readLine(); inHeader && line != null; line = reader.readLine()) {
                if (isIgnoreLine(line)) {
                    continue;
                }
                final List split = whitespaceSplitter.limit(2).splitToList(line);
                switch (split.get(0).toLowerCase()) {
                case SECTION_ANNOTATION_RELATION:
                    // table name
                    final String tableName = trimString(split.get(1));
                    table.setName(tableName);
                    break;
                case SECTION_ANNOTATION_ATTRIBUTE:
                    // column(s)
                    final String attributeDef = split.get(1).trim();

                    final String attributeName;
                    final String attributeType;
                    final String attributeParam;
                    final ColumnType columnType;

                    final int indexOfCurly = attributeDef.indexOf('{');
                    if (indexOfCurly != -1) {
                        attributeName = trimString(attributeDef.substring(0, indexOfCurly).trim());
                        attributeType = attributeDef.substring(indexOfCurly);
                        attributeParam = null;
                    } else {
                        final Matcher matcher = ATTRIBUTE_DEF_W_DATATYPE_PARAM.matcher(attributeDef);
                        if (matcher.find()) {
                            attributeName = matcher.group(1);
                            attributeType = matcher.group(2);
                            attributeParam = matcher.group(3);
                        } else {
                            // simple attribute definition "[name] [type]"
                            final List attributeDefSplit = whitespaceSplitter.splitToList(attributeDef);
                            if (attributeDefSplit.size() != 2) {
                                throw new IllegalStateException(
                                        resource.getName() + ": Unable to parse attribute line: " + line);
                            }
                            attributeName = trimString(attributeDefSplit.get(0));
                            attributeType = attributeDefSplit.get(1);
                            attributeParam = null;
                        }
                    }
                    switch (attributeType.toLowerCase()) {
                    case "numeric":
                        columnType = ColumnType.NUMBER;
                        break;
                    case "int":
                    case "integer":
                    case "short":
                        columnType = ColumnType.INTEGER;
                        break;
                    case "real":
                    case "double":
                    case "float":
                        columnType = ColumnType.DOUBLE;
                        break;
                    case "string":
                        columnType = ColumnType.STRING;
                        break;
                    case "date":
                        columnType = ColumnType.DATE;
                        break;
                    default:
                        if (indexOfCurly == -1) {
                            logger.info(
                                    "{}: Unrecognized data-type for attribute '{}': {}. Mapping to STRING column type.",
                                    resource.getName(), attributeName, attributeType);
                        }
                        columnType = ColumnType.STRING;
                        break;
                    }

                    final MutableColumn column = new MutableColumn(attributeName, columnType, table);
                    column.setRemarks(attributeParam == null ? attributeType : attributeType + " " + attributeParam);
                    column.setColumnNumber(table.getColumnCount());
                    table.addColumn(column);
                    break;
                case SECTION_ANNOTATION_DATA:
                    // the header part of the file is done, no more schema to build up
                    inHeader = false;
                    break;
                }
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        return schema;
    }

    private String trimString(String string) {
        if (string.startsWith("'") && string.endsWith("'")) {
            string = string.substring(1, string.length() - 1);
        }
        return string;
    }

    private BufferedReader createReader() {
        return FileHelper.getBufferedReader(resource.read(), CHARSET);
    }

    @Override
    protected String getMainSchemaName() throws MetaModelException {
        return resource.getName();
    }

    @Override
    protected DataSet materializeMainSchemaTable(Table table, List columns, int maxRows) {
        BufferedReader reader = createReader();
        try {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                line = line.trim();
                if (isIgnoreLine(line)) {
                    continue;
                }
                if (line.toLowerCase().equals(SECTION_ANNOTATION_DATA)) {
                    // start of the data
                    break;
                }
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        final ArffDataSet dataSet = new ArffDataSet(resource, columns, reader);
        if (maxRows > -1) {
            return new MaxRowsDataSet(dataSet, maxRows);
        } else {
            return dataSet;
        }
    }

    protected static boolean isIgnoreLine(String line) {
        if (line.trim().isEmpty()) {
            return true;
        }
        if (line.startsWith("%")) {
            return true; // comment
        }
        return false;
    }

}