All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.data.parser.SparseDatasetParser Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

package smile.data.parser;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.text.ParseException;
import smile.data.SparseDataset;

/**
 * Parser for spare dataset in coordinate triple tuple list format.
 * Coordinate file stores a list of (row, column, value) tuples:
 * 
 * instanceID attributeID value
 * instanceID attributeID value
 * instanceID attributeID value
 * instanceID attributeID value
 * ...
 * instanceID attributeID value
 * instanceID attributeID value
 * instanceID attributeID value
 * 
* Ideally, the entries are sorted (by row index, then column index) to * improve random access times. This format is good for incremental matrix * construction. *

* Optionally, there may be 2 header lines *

 * D    // The number of instances
 * W    // The number of attributes
 * 
* or 3 header lines *
 * D    // The number of instances
 * W    // The number of attributes
 * N    // The total number of nonzero items in the dataset.
 * 
* These header lines will be ignored. * * @author Haifeng Li */ public class SparseDatasetParser { /** * The starting index of array in the data. */ private int arrayStartingIndex = 0; /** * Constructor. */ public SparseDatasetParser() { } /** * Constructor. * @param arrayStartingIndex the starting index of array. By default, it is * 0 as in C/C++ and Java. But it could be 1 to parse data produced * by other programming language such as Fortran. */ public SparseDatasetParser(int arrayStartingIndex) { this.arrayStartingIndex = arrayStartingIndex; } /** * Parse a sparse dataset from given URI. * @throws java.io.FileNotFoundException */ public SparseDataset parse(URI uri) throws FileNotFoundException, IOException, ParseException { return parse(new File(uri)); } /** * Parse a sparse dataset from given URI. * @param uri the URI of data source. * @throws java.io.FileNotFoundException */ public SparseDataset parse(String name, URI uri) throws FileNotFoundException, IOException, ParseException { return parse(name, new File(uri)); } /** * Parse a sparse dataset from given file. * @throws java.io.FileNotFoundException */ public SparseDataset parse(String path) throws FileNotFoundException, IOException, ParseException { return parse(new File(path)); } /** * Parse a sparse dataset from given file. * @param path the file path of data source. * @throws java.io.FileNotFoundException */ public SparseDataset parse(String name, String path) throws FileNotFoundException, IOException, ParseException { return parse(name, new File(path)); } /** * Parse a sparse dataset from given file. * @throws java.io.FileNotFoundException */ public SparseDataset parse(File file) throws FileNotFoundException, IOException, ParseException { String name = file.getPath(); return parse(name, new FileInputStream(file)); } /** * Parse a sparse dataset from given file. * @param file the file of data source. * @throws java.io.FileNotFoundException */ public SparseDataset parse(String name, File file) throws FileNotFoundException, IOException, ParseException { return parse(name, new FileInputStream(file)); } /** * Parse a sparse dataset from an input stream. * @param stream the input stream of data. * @throws java.io.FileNotFoundException */ public SparseDataset parse(InputStream stream) throws IOException, ParseException { return parse("Sparse Dataset", stream); } /** * Parse a sparse dataset from an input stream. * @param name the name of dataset. * @param stream the input stream of data. * @throws java.io.FileNotFoundException */ public SparseDataset parse(String name, InputStream stream) throws IOException, ParseException { BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); try { // process header int nrow = 1; String line = reader.readLine(); for (; nrow <= 3 && line != null; nrow++) { String[] tokens = line.trim().split(" "); if (tokens.length >= 3) { break; } line = reader.readLine(); } if (line == null) { throw new IOException("Empty data source."); } SparseDataset sparse = new SparseDataset(name); do { String[] tokens = line.trim().split(" "); if (tokens.length != 3) { throw new ParseException("Invalid number of tokens.", nrow); } int d = Integer.valueOf(tokens[0]) - arrayStartingIndex; int w = Integer.valueOf(tokens[1]) - arrayStartingIndex; double c = Double.valueOf(tokens[2]); sparse.set(d, w, c); line = reader.readLine(); nrow++; } while (line != null); return sparse; } finally { reader.close(); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy