de.citec.tcs.alignment.csv.CSVImporter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of csv Show documentation
This module permits exporting and importing of Sequence objects to CSV files. The underlying NodeSpecification is stored via a JSON file. The module aims at human-readable and compatible storage. For storage efficiency, further compression is advised.
There is a newer version: 3.1.1
Show newest version
/* 
 * TCS Alignment Toolbox Version 3
 * 
 * Copyright (C) 2016
 * Bassam Mokbel, Benjamin Paaßen
 * AG Theoretical Computer Science
 * Centre of Excellence Cognitive Interaction Technology (CITEC)
 * University of Bielefeld
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.citec.tcs.alignment.csv;

import de.citec.tcs.alignment.sequence.Alphabet;
import de.citec.tcs.alignment.sequence.KeywordSpecification;
import de.citec.tcs.alignment.sequence.Node;
import de.citec.tcs.alignment.sequence.NodeSpecification;
import de.citec.tcs.alignment.sequence.Sequence;
import de.citec.tcs.alignment.sequence.StringKeywordSpecification;
import de.citec.tcs.alignment.sequence.StringValue;
import de.citec.tcs.alignment.sequence.SymbolicKeywordSpecification;
import de.citec.tcs.alignment.sequence.SymbolicValue;
import de.citec.tcs.alignment.sequence.Value;
import de.citec.tcs.alignment.sequence.VectorialKeywordSpecification;
import de.citec.tcs.alignment.sequence.VectorialValue;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;
import org.json.JSONArray;
import org.json.JSONObject;

/**
 * This class holds methods to import a Sequence from a given CSV file, and an
 * according NodeSpecification from a given JSON file.
 *
 * All methods are static; the class cannot be instantiated.
 *
 * @author Bassam Mokbel - bmokbel(at)techfak.uni-bielefeld.de
 */
public final class CSVImporter {

	/**
	 * Name of the single property of the JSON root object; its value is the
	 * array of keyword specification objects.
	 */
	private static final String NODE_SPECIFICATION_KEY
			= "de.citec.tcs.alignment.sequence.NodeSpecification";

	/**
	 * Name of the property of a keyword specification object that holds the
	 * name of its ValueType ("STRING", "VECTOR" or "SYMBOLIC").
	 */
	private static final String VALUE_TYPE_KEY
			= "de.citec.tcs.alignment.sequence.ValueType";

	/**
	 * The class provides methods only, so no instantiation is necessary.
	 */
	private CSVImporter() {
	}

	/**
	 * This imports a NodeSpecification from a JSON file. See
	 * {@link #importNodeSpecification(java.io.Reader)} for a description of
	 * the expected JSON structure.
	 *
	 * @param jsonFileName path of a file with JSON content describing a
	 * NodeSpecification.
	 *
	 * @return the NodeSpecification contained in the JSON file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static NodeSpecification importNodeSpecification(String jsonFileName) throws IOException {
		return importNodeSpecification(new File(jsonFileName));
	}

	/**
	 * This imports a NodeSpecification from a JSON file. See
	 * {@link #importNodeSpecification(java.io.Reader)} for a description of
	 * the expected JSON structure. The file is closed again before this method
	 * returns, no matter whether the import succeeded.
	 *
	 * @param jsonFile a file with JSON content describing a NodeSpecification.
	 *
	 * @return the NodeSpecification contained in the JSON file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct. The thrown exception names the
	 * absolute path of the file and carries the original failure as its cause.
	 */
	public static NodeSpecification importNodeSpecification(File jsonFile) throws IOException {
		// try-with-resources guarantees that the file handle is released even
		// if parsing fails.
		try (Reader fileReader = new FileReader(jsonFile)) {
			return importNodeSpecification(fileReader);
		} catch (IOException ex) {
			throw new IOException("Could not read NodeSpecification from File "
					+ jsonFile.getAbsolutePath(), ex);
		}
	}

	/**
	 * This imports a NodeSpecification from a Reader. The input JSON
	 * structure is expected to be a root object with the property
	 * "de.citec.tcs.alignment.sequence.NodeSpecification". The value for that
	 * property should be an array of JSON objects, each of which describes a
	 * KeywordSpecification. These objects should have the properties "Keyword",
	 * which contains the Keyword string and
	 * "de.citec.tcs.alignment.sequence.ValueType"
	 * which contains the name of the ValueType ("STRING", "VECTOR" or
	 * "SYMBOLIC"). Further, JSON objects describing a
	 * VectorialKeywordSpecification should have the property "Length", which
	 * contains an integer stating the length and JSON objects describing a
	 * SymbolicKeywordSpecification should have the property "Alphabet", which
	 * contains a |-separated String of symbols for the Alphabet.
	 *
	 * The given Reader is read until its end but is not closed by this method.
	 * Malformed JSON and missing properties are reported by the JSON library
	 * through its own unchecked exception, which this method does not catch.
	 *
	 * @param reader a Reader containing JSON data describing a
	 * NodeSpecification.
	 *
	 * @return the NodeSpecification described by the JSON data.
	 * @throws IOException is thrown if the data can't be read or if a keyword
	 * specification names a ValueType other than STRING, VECTOR or SYMBOLIC.
	 */
	public static NodeSpecification importNodeSpecification(Reader reader) throws IOException {
		final JSONObject root = new JSONObject(readStringData(reader));
		final JSONArray jsonKeySpecs = root.getJSONArray(NODE_SPECIFICATION_KEY);

		// translate every JSON object into the matching KeywordSpecification
		final KeywordSpecification[] keySpecs = new KeywordSpecification[jsonKeySpecs.length()];
		for (int i = 0; i < keySpecs.length; i++) {
			final JSONObject jsonKeySpec = jsonKeySpecs.getJSONObject(i);
			final String keyword = jsonKeySpec.getString("Keyword");
			final String valueType = jsonKeySpec.getString(VALUE_TYPE_KEY);

			switch (valueType) {
				case "STRING":
					keySpecs[i] = new StringKeywordSpecification(keyword);
					break;
				case "VECTOR":
					keySpecs[i] = new VectorialKeywordSpecification(
							jsonKeySpec.getInt("Length"), keyword);
					break;
				case "SYMBOLIC":
					keySpecs[i] = new SymbolicKeywordSpecification(
							new Alphabet(jsonKeySpec.getString("Alphabet")), keyword);
					break;
				default:
					// without this branch an unknown type would silently leave
					// a null entry in the specification array
					throw new IOException("The keyword specification number "
							+ i
							+ " (keyword "
							+ keyword
							+ ") has the unknown ValueType "
							+ valueType
							+ ".");
			}
		}

		return new NodeSpecification(keySpecs);
	}

	/**
	 * Reads all remaining lines from the given Reader and concatenates them.
	 * Line terminators are dropped, i.e. the lines are joined without any
	 * separator. The Reader is not closed.
	 *
	 * @param reader the Reader to drain.
	 *
	 * @return the concatenation of all lines of the Reader.
	 * @throws IOException is thrown if reading fails.
	 */
	private static String readStringData(Reader reader) throws IOException {
		final BufferedReader buffered = (reader instanceof BufferedReader)
				? (BufferedReader) reader
				: new BufferedReader(reader);
		final StringBuilder content = new StringBuilder();
		for (String line = buffered.readLine(); line != null; line = buffered.readLine()) {
			content.append(line);
		}
		return content.toString();
	}

	/**
	 * This imports a sequence from a CSV file using the default delimiter
	 * (CSVExporter.DEFAULT_DELIMITER). See
	 * {@link #importSequence(NodeSpecification, java.io.Reader, String)} for a
	 * description of the expected CSV format.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param csvFileName The path of the CSV File to be imported.
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, String csvFileName) throws IOException {
		return importSequence(nodeSpec, csvFileName, CSVExporter.DEFAULT_DELIMITER);
	}

	/**
	 * This imports a sequence from a CSV file using the default delimiter
	 * (CSVExporter.DEFAULT_DELIMITER). See
	 * {@link #importSequence(NodeSpecification, java.io.Reader, String)} for a
	 * description of the expected CSV format.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param csvFile The CSV File to be imported.
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, File csvFile) throws IOException {
		return importSequence(nodeSpec, csvFile, CSVExporter.DEFAULT_DELIMITER);
	}

	/**
	 * This imports a sequence from a Reader using the default delimiter
	 * (CSVExporter.DEFAULT_DELIMITER). See
	 * {@link #importSequence(NodeSpecification, java.io.Reader, String)} for a
	 * description of the expected CSV format. The Reader is closed by this
	 * method.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param in the Reader from which the CSV data should be read.
	 *
	 * @return The Sequence imported from the given CSV data.
	 * @throws IOException is thrown if the data can't be read or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, Reader in) throws IOException {
		return importSequence(nodeSpec, in, CSVExporter.DEFAULT_DELIMITER);
	}

	/**
	 * This imports a sequence from a CSV file using the given delimiter string.
	 * See {@link #importSequence(NodeSpecification, java.io.Reader, String)}
	 * for a description of the expected CSV format.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param csvFileName The path of the CSV File to be imported.
	 * @param valueDelimiterString The delimiter String between values in CSV.
	 * (Default is "\t", i.e. a tabstop.)
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, String csvFileName, String valueDelimiterString) throws IOException {
		return importSequence(nodeSpec, new File(csvFileName), valueDelimiterString);
	}

	/**
	 * This imports a sequence from a CSV file using the given delimiter string.
	 * See {@link #importSequence(NodeSpecification, java.io.Reader, String)}
	 * for a description of the expected CSV format. The file is closed again
	 * before this method returns, no matter whether the import succeeded.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param csvFile The CSV File to be imported.
	 * @param valueDelimiterString The delimiter String between values in CSV.
	 * (Default is "\t", i.e. a tabstop.)
	 *
	 * @return The Sequence imported from the given CSV file.
	 * @throws IOException is thrown if the File can't be opened or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, File csvFile, String valueDelimiterString) throws IOException {
		// try-with-resources releases the file handle even if the import
		// fails; closing an already closed reader a second time is harmless.
		try (Reader fileReader = new FileReader(csvFile)) {
			return importSequence(nodeSpec, fileReader, valueDelimiterString);
		}
	}

	/**
	 * This imports a sequence from a Reader containing CSV data using the
	 * given delimiter string.
	 *
	 * The function requires the NodeSpecification of the sequence, which can be
	 * imported from a JSON file via the function importNodeSpecification above.
	 * In the CSV, a header line is expected as the first line, in which the
	 * name and ValueType for every Keyword in the sequence nodes are described.
	 * This has to be in accordance with the given NodeSpecification. Then, one
	 * node in the sequence is created from every further line of the CSV data,
	 * in which the values of each keyword are separated by the given delimiter
	 * string. A value is either the literal text null or some content enclosed
	 * by one leading and one trailing character, which are stripped before the
	 * content is interpreted according to the ValueType of the keyword.
	 *
	 * The delimiter is interpreted literally, not as a regular expression.
	 * The Reader is closed by this method, on success as well as on failure.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param in The Reader from which to read CSV data.
	 * @param valueDelimiterString The delimiter String between values in CSV.
	 * (Default is "\t", i.e. a tabstop.)
	 *
	 * @return The Sequence imported from the given CSV data.
	 * @throws IOException is thrown if the data can't be read or if something
	 * with the data format is not correct.
	 */
	public static Sequence importSequence(NodeSpecification nodeSpec, Reader in, String valueDelimiterString) throws IOException {
		final BufferedReader reader = (in instanceof BufferedReader)
				? (BufferedReader) in
				: new BufferedReader(in);

		final Sequence seq;
		try {
			seq = readSequence(nodeSpec, reader, valueDelimiterString);
		} catch (IOException | RuntimeException ex) {
			// release the reader on failure as well, but keep reporting the
			// original problem
			try {
				reader.close();
			} catch (IOException closeEx) {
				ex.addSuppressed(closeEx);
			}
			throw ex;
		}

		try {
			reader.close();
		} catch (IOException ex) {
			throw new IOException("Reading is finished but file could not be closed.", ex);
		}
		return seq;
	}

	/**
	 * Reads the header line and all node lines from the given reader and
	 * builds the Sequence. The reader is not closed.
	 *
	 * @param nodeSpec NodeSpecification for the sequence defined in the CSV.
	 * @param reader the reader providing the CSV lines.
	 * @param valueDelimiterString the literal delimiter between values.
	 *
	 * @return the Sequence containing one node per line after the header.
	 * @throws IOException is thrown if reading fails, if the data is empty or
	 * if a line does not match the NodeSpecification.
	 */
	private static Sequence readSequence(NodeSpecification nodeSpec, BufferedReader reader, String valueDelimiterString) throws IOException {
		final Sequence seq = new Sequence(nodeSpec);
		final KeywordSpecification[] keySpecs = nodeSpec.getKeywordSpecifications();
		// LITERAL: delimiters such as "|" or "." must not be treated as regex
		final Pattern delimiter = Pattern.compile(valueDelimiterString, Pattern.LITERAL);

		// first read the header line with all keyword titles and types
		final String header = reader.readLine();
		if (header == null) {
			throw new IOException("The CSV data is empty, "
					+ "i.e. the table header line is missing.");
		}
		checkHeader(nodeSpec, keySpecs.length, delimiter.split(header), valueDelimiterString);

		// 1-based number of the line that was read last; the header is line 1
		int lineNumber = 1;
		String line;
		while ((line = reader.readLine()) != null) {
			lineNumber++;

			final String[] valueStrings = delimiter.split(line);
			if (valueStrings.length != keySpecs.length) {
				throw new IOException("In line number "
						+ lineNumber
						+ " the number of given keyword values (separated by "
						+ valueDelimiterString
						+ ") is "
						+ valueStrings.length
						+ ", which is not the same as expected by the given NodeSpecification, which specifies "
						+ keySpecs.length
						+ " keywords."
				);
			}

			final Node node = new Node(seq);
			for (int k = 0; k < valueStrings.length; k++) {
				node.setValue(k, parseValue(keySpecs[k], valueStrings[k], lineNumber));
			}
			seq.add(node);
		}
		return seq;
	}

	/**
	 * Verifies that the header line names exactly the keywords and ValueTypes
	 * of the given NodeSpecification. Each header entry has to start with the
	 * keyword, and the name of the ValueType has to start two characters
	 * after the end of the keyword.
	 *
	 * @param nodeSpec the NodeSpecification the header has to agree with.
	 * @param expectedCount the number of keywords of the NodeSpecification.
	 * @param headerStrings the entries of the header line.
	 * @param valueDelimiterString the delimiter, used for error messages only.
	 *
	 * @throws IOException is thrown if the header does not match.
	 */
	private static void checkHeader(NodeSpecification nodeSpec, int expectedCount, String[] headerStrings, String valueDelimiterString) throws IOException {
		if (headerStrings.length != expectedCount) {
			throw new IOException("In the first line (i.e. the table header line) "
					+ " the number of given keywords (separated by "
					+ valueDelimiterString
					+ ") is "
					+ headerStrings.length
					+ ", which is not the same as expected by the given NodeSpecification, which specifies "
					+ expectedCount
					+ " keywords."
			);
		}
		for (int i = 0; i < headerStrings.length; i++) {
			final String keyword = nodeSpec.getKeyword(i);
			final String typeName = nodeSpec.getKeywordSpecification(i).getType().toString();
			final boolean keywordMatches = headerStrings[i].startsWith(keyword);
			final boolean typeMatches = headerStrings[i].startsWith(typeName, keyword.length() + 2);
			if (!keywordMatches || !typeMatches) {
				throw new IOException("In the first line (i.e. the table header line) "
						+ " the keyword number "
						+ i
						+ " is supposed to be "
						+ keyword
						+ " ("
						+ typeName
						+ ")"
						+ ", as defined by the loaded NodeSpecification, but the header line of the csv file specifies "
						+ headerStrings[i]
						+ "."
				);
			}
		}
	}

	/**
	 * Parses a single cell of the CSV data according to the ValueType of the
	 * given KeywordSpecification.
	 *
	 * @param keySpec the KeywordSpecification of the column of the cell.
	 * @param rawValue the cell content exactly as found between delimiters.
	 * @param lineNumber the 1-based line number, used for error messages.
	 *
	 * @return the parsed Value, or null if the cell contains the text null.
	 * @throws IOException is thrown if the cell content is malformed.
	 */
	private static Value parseValue(KeywordSpecification keySpec, String rawValue, int lineNumber) throws IOException {
		if (rawValue.equals("null")) {
			return null;
		}
		// every other value carries one enclosing character on each side
		// (presumably quotation marks written by the exporter)
		if (rawValue.length() < 2) {
			throw new IOException("In line number "
					+ lineNumber
					+ " the value "
					+ rawValue
					+ " is neither null nor enclosed by a leading and a trailing character.");
		}
		final String content = rawValue.substring(1, rawValue.length() - 1);

		// switch the parsing behavior, acc. to the expected ValueType
		switch (keySpec.getType()) {
			case VECTOR:
				return parseVector(keySpec, content, lineNumber);
			case SYMBOLIC:
				return parseSymbol(keySpec, content, lineNumber);
			case STRING:
				return new StringValue(content);
			default:
				throw new RuntimeException("Unknown ValueType: "
						+ keySpec.getType());
		}
	}

	/**
	 * Parses comma-separated numbers, optionally surrounded by square
	 * brackets, into a VectorialValue. The number of entries has to equal the
	 * length defined by the VectorialKeywordSpecification; this is checked
	 * explicitly because assertions are usually disabled at runtime.
	 *
	 * @param keySpec the KeywordSpecification of the column; expected to be a
	 * VectorialKeywordSpecification.
	 * @param content the cell content without its enclosing characters.
	 * @param lineNumber the 1-based line number, used for error messages.
	 *
	 * @return the parsed VectorialValue.
	 * @throws IOException is thrown if the vector has the wrong length or
	 * can't be parsed; a parsing problem is attached as cause.
	 */
	private static Value parseVector(KeywordSpecification keySpec, String content, int lineNumber) throws IOException {
		final String message = "In line number "
				+ lineNumber
				+ " the NodeSpecification specifies a VECTOR type,"
				+ " but the length of the vector given in the string "
				+ content
				+ " is not the same as defined by the VectorialKeywordSpecification.";
		try {
			final int vectorLength = ((VectorialKeywordSpecification) keySpec).getLength();
			final String entries = content.replace("[", "").replace("]", "");
			// "[]" denotes a vector without entries, not one empty entry
			final String[] entryStrings = entries.trim().isEmpty()
					? new String[0]
					: entries.split(",");
			if (entryStrings.length != vectorLength) {
				throw new IOException(message);
			}
			final double[] vector = new double[entryStrings.length];
			for (int j = 0; j < entryStrings.length; j++) {
				vector[j] = Double.parseDouble(entryStrings[j]);
			}
			return new VectorialValue(vector);
		} catch (RuntimeException ex) {
			throw new IOException(message, ex);
		}
	}

	/**
	 * Creates a SymbolicValue for the given symbol from the Alphabet of the
	 * SymbolicKeywordSpecification.
	 *
	 * @param keySpec the KeywordSpecification of the column; expected to be a
	 * SymbolicKeywordSpecification.
	 * @param content the cell content without its enclosing characters.
	 * @param lineNumber the 1-based line number, used for error messages.
	 *
	 * @return the SymbolicValue for the symbol.
	 * @throws IOException is thrown if the SymbolicValue can't be created;
	 * the original problem is attached as cause.
	 */
	private static Value parseSymbol(KeywordSpecification keySpec, String content, int lineNumber) throws IOException {
		try {
			final Alphabet alphabet = ((SymbolicKeywordSpecification) keySpec).getAlphabet();
			return new SymbolicValue(alphabet, content);
		} catch (RuntimeException ex) {
			throw new IOException("In line number "
					+ lineNumber
					+ " the NodeSpecification specifies a SYMBOLIC type,"
					+ " but the symbol "
					+ content
					+ " could not be found in the alphabet defined by the SymbolicKeywordSpecification.",
					ex);
		}
	}

}