All Downloads are FREE. Search and download functionalities are using the official Maven repository.

de.citec.tcs.alignment.csv.CSVImporter Maven / Gradle / Ivy

Go to download

This module permits exporting and importing of Sequence objects to CSV files. The underlying NodeSpecification is stored via a JSON file. The module aims at human-readable and compatible storage. For storage efficiency, further compression is advised.

There is a newer version: 3.1.1
Show newest version
/* 
 * TCS Alignment Toolbox Version 3
 * 
 * Copyright (C) 2016
 * Bassam Mokbel, Benjamin Paaßen
 * AG Theoretical Computer Science
 * Centre of Excellence Cognitive Interaction Technology (CITEC)
 * University of Bielefeld
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.tcs.alignment.csv;

import de.citec.tcs.alignment.sequence.Alphabet;
import de.citec.tcs.alignment.sequence.KeywordSpecification;
import de.citec.tcs.alignment.sequence.Node;
import de.citec.tcs.alignment.sequence.NodeSpecification;
import de.citec.tcs.alignment.sequence.Sequence;
import de.citec.tcs.alignment.sequence.StringKeywordSpecification;
import de.citec.tcs.alignment.sequence.StringValue;
import de.citec.tcs.alignment.sequence.SymbolicKeywordSpecification;
import de.citec.tcs.alignment.sequence.SymbolicValue;
import de.citec.tcs.alignment.sequence.Value;
import de.citec.tcs.alignment.sequence.VectorialKeywordSpecification;
import de.citec.tcs.alignment.sequence.VectorialValue;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.json.JSONArray;
import org.json.JSONObject;

/**
 * This class holds methods to import a Sequence from a given CSV file, and an
 * according NodeSpecification from a given JSON file.
 *
 * @author Bassam Mokbel - bmokbel(at)techfak.uni-bielefeld.de
 */
public final class CSVImporter {

	/**
	 * The class provides methods only, so no instantiation is necessary.
	 */
	private CSVImporter() {
	}

	/**
	 * This imports a NodeSpecification from a JSON file. The input JSON
	 * structure is expected to be a root object with only one property
	 * ("de.citec.tcs.sequence.NodeSpecification"). The value for that property
	 * should be an array of JSON objects, each of which describes a
	 * KeywordSpecification. These objects should have the properties "Keyword",
	 * which contains the Keyword string and
	 * "de.citec.tcs.alignment.sequence.ValueType"
	 * which contains the ValueType Enum. Further, JSON objects describing a
	 * VectorialKeywordSpecification should have the property "Length", which
	 * contains an integer stating the length and JSON objects describing a
	 * SymbolicKeywordSpecification should have the property "Alphabet", which
	 * contains a |-separated String of symbols for the Alphabet.
	 *
	 * @param jsonFileName a file with JSON content describing a
	 * NodeSpecification.
	 *
	 * @return the NodeSpecification contained in the JSON file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static NodeSpecification importNodeSpecification(String jsonFileName) throws IOException {
		return importNodeSpecification(new File(jsonFileName));
	}

	/**
	 * This imports a NodeSpecification from a JSON file. The input JSON
	 * structure is expected to be a root object with only one property
	 * ("de.citec.tcs.sequence.NodeSpecification"). The value for that property
	 * should be an array of JSON objects, each of which describes a
	 * KeywordSpecification. These objects should have the properties "Keyword",
	 * which contains the Keyword string and
	 * "de.citec.tcs.alignment.sequence.ValueType"
	 * which contains the ValueType Enum. Further, JSON objects describing a
	 * VectorialKeywordSpecification should have the property "Length", which
	 * contains an integer stating the length and JSON objects describing a
	 * SymbolicKeywordSpecification should have the property "Alphabet", which
	 * contains a |-separated String of symbols for the Alphabet.
	 *
	 * @param jsonFile a file with JSON content describing a NodeSpecification.
	 *
	 * @return the NodeSpecification contained in the JSON file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static NodeSpecification importNodeSpecification(File jsonFile) throws IOException {
		try {
			return importNodeSpecification(new FileReader(jsonFile));
		} catch (IOException ex) {
			throw new IOException("Could not read NodeSpecification from File "
					+ jsonFile.getAbsolutePath(), ex);
		}
	}

	/**
	 * This imports a NodeSpecification from a Reader. The input JSON
	 * structure is expected to be a root object with only one property
	 * ("de.citec.tcs.sequence.NodeSpecification"). The value for that property
	 * should be an array of JSON objects, each of which describes a
	 * KeywordSpecification. These objects should have the properties "Keyword",
	 * which contains the Keyword string and
	 * "de.citec.tcs.alignment.sequence.ValueType"
	 * which contains the ValueType Enum. Further, JSON objects describing a
	 * VectorialKeywordSpecification should have the property "Length", which
	 * contains an integer stating the length and JSON objects describing a
	 * SymbolicKeywordSpecification should have the property "Alphabet", which
	 * contains a |-separated String of symbols for the Alphabet.
	 *
	 * @param reader a Reader containing JSON data describing a
	 * NodeSpecification.
	 *
	 * @return the NodeSpecification contained in the JSON file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static NodeSpecification importNodeSpecification(Reader reader) throws IOException {

		final JSONObject root = new JSONObject(readStringData(reader));
		final JSONArray jsonArray = root.getJSONArray("de.citec.tcs.alignment.sequence.NodeSpecification");

		// read all keyword specifications
		KeywordSpecification[] keySpecs = new KeywordSpecification[jsonArray.length()];
		for (int i = 0; i < jsonArray.length(); i++) {
			JSONObject jsonKeySpec = jsonArray.getJSONObject(i);
			String keywordTitle = jsonKeySpec.getString("Keyword");
			String keywordValueType = jsonKeySpec.getString("de.citec.tcs.alignment.sequence.ValueType");

			switch (keywordValueType) {
				case "STRING": {
					StringKeywordSpecification keySpec = new StringKeywordSpecification(keywordTitle);
					keySpecs[i] = keySpec;
					break;
				}
				case "VECTOR": {
					int vectorLength = jsonKeySpec.getInt("Length");
					VectorialKeywordSpecification keySpec = new VectorialKeywordSpecification(vectorLength, keywordTitle);
					keySpecs[i] = keySpec;
					break;
				}
				case "SYMBOLIC": {
					String alphabetStr = jsonKeySpec.getString("Alphabet");
					Alphabet alphabet = new Alphabet(alphabetStr);
					SymbolicKeywordSpecification keySpec = new SymbolicKeywordSpecification(alphabet, keywordTitle);
					keySpecs[i] = keySpec;
					break;
				}
			}
		}

		return new NodeSpecification(keySpecs);
	}

	private static String readStringData(Reader reader) throws IOException {
		final BufferedReader buf;
		if (reader instanceof BufferedReader) {
			buf = (BufferedReader) reader;
		} else {
			buf = new BufferedReader(reader);
		}
		final StringBuilder content = new StringBuilder();
		String line;
		while (((line = buf.readLine())) != null) {
			content.append(line);
		}
		return content.toString();
	}

	/**
	 * This imports a sequence from a CSV file using the default delimiter "\t".
	 *
	 * The function requires the NodeSpecification of the sequence, which can be
	 * imported from a JSON file via the function importNodeSpecification above.
	 * In the CSV, a header line is expected as the first line, in which the
	 * name and ValueType for every Keyword in the sequence nodes are described.
	 * This should be in accordance with the given NodeSpecification. Then, one
	 * node in the sequence is created from every line of the CSV file, in which
	 * the values of each keyword are separated by the default delimiter string.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param csvFileName The CSV File to be imported.
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, String csvFileName) throws IOException {
		// call the import function with a default delimiter character "tabstop"
		return importSequence(nodeSpec, csvFileName, CSVExporter.DEFAULT_DELIMITER);
	}

	/**
	 * This imports a sequence from a CSV file using the default delimiter "\t".
	 *
	 * The function requires the NodeSpecification of the sequence, which can be
	 * imported from a JSON file via the function importNodeSpecification above.
	 * In the CSV, a header line is expected as the first line, in which the
	 * name and ValueType for every Keyword in the sequence nodes are described.
	 * This should be in accordance with the given NodeSpecification. Then, one
	 * node in the sequence is created from every line of the CSV file, in which
	 * the values of each keyword are separated by the default delimiter string.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param csvFile The CSV File to be imported.
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, File csvFile) throws IOException {
		// call the import function with a default delimiter character "tabstop"
		return importSequence(nodeSpec, csvFile, CSVExporter.DEFAULT_DELIMITER);
	}

	/**
	 * This imports a sequence from a Reader using the default delimiter "\t".
	 *
	 * The function requires the NodeSpecification of the sequence, which can be
	 * imported from a JSON file via the function importNodeSpecification above.
	 * In the CSV, a header line is expected as the first line, in which the
	 * name and ValueType for every Keyword in the sequence nodes are described.
	 * This should be in accordance with the given NodeSpecification. Then, one
	 * node in the sequence is created from every line of the CSV file, in which
	 * the values of each keyword are separated by the default delimiter string.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param in the Reader from which the CSV data should be read.
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, Reader in) throws IOException {
		// call the import function with a default delimiter character "tabstop"
		return importSequence(nodeSpec, in, CSVExporter.DEFAULT_DELIMITER);
	}

	/**
	 * This imports a sequence from a CSV file using the given delimiter string.
	 *
	 * The function requires the NodeSpecification of the sequence, which can be
	 * imported from a JSON file via the function importNodeSpecification above.
	 * In the CSV, a header line is expected as the first line, in which the
	 * name and ValueType for every Keyword in the sequence nodes are described.
	 * This should be in accordance with the given NodeSpecification. Then, one
	 * node in the sequence is created from every line of the CSV file, in which
	 * the values of each keyword are separated by the given delimiter string.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param csvFileName The CSV File to be imported.
	 * @param valueDelimiterString The delimiter String between values in CSV.
	 * (Default is "\t", i.e. a tabstop.)
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, String csvFileName, String valueDelimiterString) throws IOException {
		return importSequence(nodeSpec, new FileReader(csvFileName), valueDelimiterString);
	}

	/**
	 * This imports a sequence from a CSV file using the given delimiter string.
	 *
	 * The function requires the NodeSpecification of the sequence, which can be
	 * imported from a JSON file via the function importNodeSpecification above.
	 * In the CSV, a header line is expected as the first line, in which the
	 * name and ValueType for every Keyword in the sequence nodes are described.
	 * This should be in accordance with the given NodeSpecification. Then, one
	 * node in the sequence is created from every line of the CSV file, in which
	 * the values of each keyword are separated by the given delimiter string.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param csvFile The CSV File to be imported.
	 * @param valueDelimiterString The delimiter String between values in CSV.
	 * (Default is "\t", i.e. a tabstop.)
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, File csvFile, String valueDelimiterString) throws IOException {
		return importSequence(nodeSpec, new FileReader(csvFile), valueDelimiterString);
	}

	/**
	 * This imports a sequence from a Reader containing CSV data using the
	 * given delimiter string.
	 *
	 * The function requires the NodeSpecification of the sequence, which can be
	 * imported from a JSON file via the function importNodeSpecification above.
	 * In the CSV, a header line is expected as the first line, in which the
	 * name and ValueType for every Keyword in the sequence nodes are described.
	 * This should be in accordance with the given NodeSpecification. Then, one
	 * node in the sequence is created from every line of the CSV file, in which
	 * the values of each keyword are separated by the given delimiter string.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param in The Reader from which to read CSV data.
	 * @param valueDelimiterString The delimiter String between values in CSV.
	 * (Default is "\t", i.e. a tabstop.)
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, Reader in, String valueDelimiterString) throws IOException {
		final BufferedReader reader;
		if (in instanceof BufferedReader) {
			reader = (BufferedReader) in;
		} else {
			reader = new BufferedReader(in);
		}
		Sequence seq = new Sequence(nodeSpec);
		KeywordSpecification[] keySpecs = nodeSpec.getKeywordSpecifications();

		String line;
		int lineNumber = 0;

		// first read the header line with all keyword titles and types
		line = reader.readLine();
		String[] headerStrings = line.split(valueDelimiterString);
		if (headerStrings.length != keySpecs.length) {
			throw new IOException("In the first line (i.e. the table header line) "
					+ " the number of given keywords (separated by "
					+ valueDelimiterString
					+ ") is "
					+ headerStrings.length
					+ ", which is not the same as expected by the given NodeSpecification, which specifies "
					+ keySpecs.length
					+ "keywords."
			);
		} else {
			for (int i = 0; i < headerStrings.length; i++) {
				if (!(headerStrings[i].regionMatches(0,
						nodeSpec.getKeyword(i),
						0,
						nodeSpec.getKeyword(i).length()
				))
						|| !(headerStrings[i].regionMatches(nodeSpec.getKeyword(i).length() + 2,
								nodeSpec.getKeywordSpecification(i).getType().toString(),
								0,
								nodeSpec.getKeywordSpecification(i).getType().toString().length()
						))) {
					throw new IOException("In the first line (i.e. the table header line) "
							+ " the keyword number "
							+ i
							+ " is supposed to be "
							+ nodeSpec.getKeyword(i)
							+ " ("
							+ nodeSpec.getKeywordSpecification(i).getType().toString()
							+ "), "
							+ ", as defined by the loaded NodeSpecification, but the header line of the csv file specifies "
							+ headerStrings[i]
							+ "."
					);
				}
			}
		}
		lineNumber++;

		// read all remaining lines of the CSV file
		while ((line = reader.readLine()) != null) {

			String[] valueStrings = line.split(valueDelimiterString);
			if (valueStrings.length != keySpecs.length) {
				throw new IOException("In line number "
						+ (lineNumber + 1)
						+ " the number of given keyword values (separated by "
						+ valueDelimiterString
						+ ") is "
						+ valueStrings.length
						+ ", which is not the same as expected by the given NodeSpecification, which specifies "
						+ keySpecs.length
						+ "keywords."
				);
			}

			Node node = new Node(seq);

			// for each keyword, parse the string of the CSV stating its value
			for (int k = 0; k < valueStrings.length; k++) {
				final Value val;
				if (!valueStrings[k].equals("null")) {
					valueStrings[k] = valueStrings[k].substring(1, valueStrings[k].length() - 1);
					// switch the parsing behavior, acc. to the expected ValueType
					switch (keySpecs[k].getType()) {

						// for type VECTOR, read comma-separated values in brackets
						case VECTOR: {
							try {
								VectorialKeywordSpecification keySpec = (VectorialKeywordSpecification) keySpecs[k];
								int vectorLength = keySpec.getLength();
								assert (valueStrings[k].startsWith("[")
										&& valueStrings[k].endsWith("]"));
								valueStrings[k] = valueStrings[k].replace("[", "");
								valueStrings[k] = valueStrings[k].replace("]", "");
								String[] vectorSubStrings = valueStrings[k].split(",");
								assert (vectorSubStrings.length == vectorLength);
								double[] vector = new double[vectorSubStrings.length];
								// read each element of the vector
								for (int j = 0; j < vectorSubStrings.length; j++) {
									vector[j] = Double.parseDouble(vectorSubStrings[j]);
								}
								val = new VectorialValue(vector);
							} catch (RuntimeException ex) {
								throw new IOException("In line number "
										+ (lineNumber + 1)
										+ " the NodeSpecification specifies a VECTOR type,"
										+ " but the length of the vector given in the string "
										+ valueStrings[k]
										+ " is not the same as defined by the VectorialKeywordSpecification.",
										ex);
							}
							break;
						}

						// for type SYMBOLIC, create symbol from the given alphabet
						case SYMBOLIC: {
							try {
								SymbolicKeywordSpecification keySpec = (SymbolicKeywordSpecification) keySpecs[k];
								Alphabet alphabet = keySpec.getAlphabet();
								val = new SymbolicValue(alphabet, valueStrings[k]);
							} catch (RuntimeException ex) {
								throw new IOException("In line number "
										+ (lineNumber + 1)
										+ " the NodeSpecification specifies a SYMBOLIC type,"
										+ " but the symbol "
										+ valueStrings[k]
										+ " could not be found in the alphabet defined by the SymbolicKeywordSpecification.",
										ex);
							}
							break;
						}

						// for type STRING, simply read and create the StringValue
						case STRING: {
							val = new StringValue(valueStrings[k]);
							break;
						}
						default:
							throw new RuntimeException("Unknown ValueType: "
									+ keySpecs[k].getType());
					}
				} else {
					val = null;
				}
				node.setValue(k, val);

			}
			seq.add(node);
			lineNumber++;
		}

		try {
			reader.close();
		} catch (IOException ex) {
			throw new IOException("Reading is finished but file could not be closed.", ex);
		}

		return seq;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy