// All downloads are free. The search and download functionality uses the official Maven repository.
//
// org.apache.ctakes.ytex.kernel.SparseDataExporterImpl (Maven / Gradle / Ivy)
//
// The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.ytex.kernel;

import org.apache.commons.cli.*;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.RowCallbackHandler;
import org.springframework.jdbc.core.namedparam.NamedParameterJdbcTemplate;
import org.springframework.transaction.TransactionStatus;
import org.springframework.transaction.support.TransactionCallback;
import org.springframework.transaction.support.TransactionTemplate;

import javax.sql.DataSource;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;

public class SparseDataExporterImpl implements SparseDataExporter {

	// Logger registered under the simple name "SparseDataExporterImpl" (a
	// string literal, not the fully-qualified class name), so logging
	// configuration has to reference that exact name.
	private static final Logger LOGGER = LoggerFactory.getLogger( "SparseDataExporterImpl" );

	/**
	 * Command-line entry point. Expects two required options:
	 * <ul>
	 * <li><code>-prop</code>: property file with queries and other parameters</li>
	 * <li><code>-type</code>: export format name</li>
	 * </ul>
	 * Without arguments the usage text is printed. When the arguments cannot
	 * be parsed, the parser's message is written to standard error followed by
	 * the usage text.
	 * 
	 * @param args
	 *            command-line arguments
	 * @throws IOException
	 *             if the export fails with an I/O error
	 */
	@SuppressWarnings("static-access")
	public static void main(String args[]) throws IOException {
		Options options = new Options();
		options.addOption(OptionBuilder
				.withArgName("prop")
				.hasArg()
				.isRequired()
				.withDescription(
						"property file with queries and other parameters.")
				.create("prop"));
		options.addOption(OptionBuilder.withArgName("type").hasArg()
				.isRequired()
				.withDescription("export format; valid values: weka, libsvm")
				.create("type"));
		if (args.length == 0) {
			printHelp(options);
			return;
		}
		try {
			CommandLineParser parser = new GnuParser();
			CommandLine line = parser.parse(options, args);
			String propFile = line.getOptionValue("prop");
			String format = line.getOptionValue("type");
			SparseDataExporter exporter = KernelContextHolder
					.getApplicationContext().getBean(
							SparseDataExporter.class);
			exporter.exportData(propFile, format);
		} catch (ParseException pe) {
			// say what was wrong with the arguments before showing the usage
			System.err.println(pe.getMessage());
			printHelp(options);
		}
	}

	/**
	 * Print the command-line usage of this class through a
	 * {@link HelpFormatter}.
	 * 
	 * @param options
	 *            the options to describe
	 */
	private static void printHelp(Options options) {
		String cmdLineSyntax = "java " + SparseDataExporterImpl.class.getName()
				+ " export sparse data";
		new HelpFormatter().printHelp(cmdLineSyntax, options);
	}

	// Plain JDBC template; created by setDataSource.
	protected JdbcTemplate jdbcTemplate;
	// Helper used to load instances, generate folds and read property files.
	protected KernelUtil kernelUtil;

	// Named-parameter JDBC template used to run the prepare script and the
	// word queries; created by setDataSource.
	protected NamedParameterJdbcTemplate namedJdbcTemplate;

	// Looked up with the lower-cased format name; the value's getFormatter()
	// supplies the formatter used for the export.
	// NOTE(review): raw Map - the type arguments appear to be missing here;
	// confirm the intended key/value types before parameterizing.
	protected Map nameToFormatterMap = new HashMap();	

	// Transaction template in which the prepare script and word queries run.
	protected TransactionTemplate txTemplateNew;

	public SparseDataExporterImpl() {
		super();
	}

	/**
	 * Record a nominal attribute value for an instance: registers the
	 * instance id, stores the word's value for the instance and adds the
	 * value to the set of values seen for that word.
	 * <p>
	 * NOTE(review): the String type arguments below are inferred from the
	 * values stored in this method - confirm against the SparseData accessors.
	 * 
	 * @param sparseData
	 *            container to update
	 * @param instanceId
	 *            id of the instance the value belongs to
	 * @param word
	 *            attribute name
	 * @param wordValue
	 *            attribute value for this instance
	 */
	protected void addNominalWordToInstance(SparseData sparseData,
			long instanceId, String word, String wordValue) {
		// register the instance id the first time it is seen
		if (!sparseData.getInstanceIds().contains(instanceId)) {
			sparseData.getInstanceIds().add(instanceId);
		}
		// word -> value map of this instance, created on first use
		SortedMap<String, String> instanceWords = sparseData
				.getInstanceNominalWords().get(instanceId);
		if (instanceWords == null) {
			instanceWords = new TreeMap<String, String>();
			sparseData.getInstanceNominalWords().put(instanceId, instanceWords);
		}
		instanceWords.put(word, wordValue);
		// values observed for this word, created on first use
		SortedSet<String> wordValueSet = sparseData.getNominalWordValueMap()
				.get(word);
		if (wordValueSet == null) {
			wordValueSet = new TreeSet<String>();
			sparseData.getNominalWordValueMap().put(word, wordValueSet);
		}
		wordValueSet.add(wordValue);
	}

	/**
	 * Record a numeric attribute value for an instance: registers the
	 * instance id, stores the word's value for the instance and adds the word
	 * to the set of numeric words.
	 * <p>
	 * NOTE(review): the String/Double type arguments below are inferred from
	 * the values stored in this method - confirm against the SparseData
	 * accessors.
	 * 
	 * @param sparseData
	 *            container to update
	 * @param instanceId
	 *            id of the instance the value belongs to
	 * @param word
	 *            attribute name
	 * @param wordValue
	 *            attribute value for this instance
	 */
	protected void addNumericWordToInstance(SparseData sparseData,
			long instanceId, String word, double wordValue) {
		// register the instance id the first time it is seen
		if (!sparseData.getInstanceIds().contains(instanceId)) {
			sparseData.getInstanceIds().add(instanceId);
		}
		// word -> value map of this instance, created on first use
		SortedMap<String, Double> words = sparseData.getInstanceNumericWords()
				.get(instanceId);
		if (words == null) {
			words = new TreeMap<String, Double>();
			sparseData.getInstanceNumericWords().put(instanceId, words);
		}
		words.put(word, wordValue);
		sparseData.getNumericWords().add(word);
	}

	/**
	 * Export the sparse data for every label, run and fold of the given
	 * instances through the supplied formatter.
	 * <p>
	 * The "scope" property decides when the word queries are executed: when
	 * the property is absent the data is loaded once before the export
	 * starts; "label" reloads it for each label; "fold" reloads it for each
	 * label/run/fold combination. Any other value loads nothing - a warning
	 * is logged and the formatter receives a null SparseData.
	 * 
	 * @param instanceLabel
	 *            instances organized by label, run, fold and train/test flag
	 * @param formatter
	 *            receives the initialize / export / clear callbacks
	 * @param properties
	 *            supplies "scope", "numericWordQuery", "nominalWordQuery",
	 *            "prepareScript" and "prepareScriptDelimiter"
	 * @param bDecorator
	 *            decorator handed on to loadData; may be null
	 * @throws IOException
	 *             if the export fails with an I/O error
	 */
	public void exportData(InstanceData instanceLabel,
			SparseDataFormatter formatter, Properties properties,
			BagOfWordsDecorator bDecorator) throws IOException {
		String scope = properties.getProperty("scope", null);
		SparseData sparseData = null;
		if (scope == null) {
			sparseData = this.loadDataForScope(instanceLabel, properties,
					bDecorator, null, null, null);
		} else if (!"label".equals(scope) && !"fold".equals(scope)) {
			LOGGER.warn("unrecognized scope '" + scope
					+ "', expected 'label' or 'fold'; no sparse data will be loaded");
		}
		formatter.initializeExport(instanceLabel, properties, sparseData);
		for (String label : instanceLabel.getLabelToInstanceMap().keySet()) {
			if ("label".equals(scope)) {
				sparseData = this.loadDataForScope(instanceLabel, properties,
						bDecorator, label, null, null);
			}
			formatter
					.initializeLabel(label, instanceLabel
							.getLabelToInstanceMap().get(label), properties,
							sparseData);
			for (int run : instanceLabel.getLabelToInstanceMap().get(label)
					.keySet()) {
				for (int fold : instanceLabel.getLabelToInstanceMap()
						.get(label).get(run).keySet()) {
					// stay quiet for the default (empty label, run 0, fold 0)
					if ( LOGGER.isInfoEnabled()
							&& (label.length() > 0 || run > 0 || fold > 0)) {
						LOGGER.info("exporting, label " + label + " run " + run
								+ " fold " + fold);
					}
					if ("fold".equals(scope)) {
						sparseData = this.loadDataForScope(instanceLabel,
								properties, bDecorator, label, fold, run);
					}
					formatter.initializeFold(sparseData, label, run, fold,
							instanceLabel.getLabelToInstanceMap().get(label)
									.get(run).get(fold));
					for (boolean train : instanceLabel.getLabelToInstanceMap()
							.get(label).get(run).get(fold).keySet()) {
						// run / fold 0 are passed to the formatter as null
						formatter.exportFold(sparseData, instanceLabel
								.getLabelToInstanceMap().get(label).get(run)
								.get(fold).get(train), train, label,
								0 == run ? null : run, 0 == fold ? null : fold);
					}
					formatter.clearFold();
				}
			}
			formatter.clearLabel();
		}
	}

	/**
	 * Call loadData with the queries and prepare script taken from the
	 * export properties.
	 */
	private SparseData loadDataForScope(InstanceData instanceLabel,
			Properties properties, BagOfWordsDecorator bDecorator,
			String label, Integer fold, Integer run) {
		return this.loadData(instanceLabel,
				properties.getProperty("numericWordQuery"),
				properties.getProperty("nominalWordQuery"),
				properties.getProperty("prepareScript"),
				properties.getProperty("prepareScriptDelimiter", ";"),
				bDecorator, label, fold, run);
	}

	/**
	 * Load the instances selected by the "instanceClassQuery" property,
	 * generate folds when the "folds" property is present, and export them
	 * with the given formatter.
	 * 
	 * @param props
	 *            export properties
	 * @param formatter
	 *            formatter that writes the data
	 * @param bDecorator
	 *            decorator passed through to the export
	 * @throws IOException
	 *             if the export fails with an I/O error
	 */
	@Override
	public void exportData(Properties props, SparseDataFormatter formatter,
			BagOfWordsDecorator bDecorator) throws IOException {
		String instanceClassQuery = props.getProperty("instanceClassQuery");
		InstanceData instances = this.getKernelUtil().loadInstances(
				instanceClassQuery);
		boolean foldsRequested = props.containsKey("folds");
		if (foldsRequested) {
			this.getKernelUtil().generateFolds(instances, props);
		}
		this.exportData(instances, formatter, props, bDecorator);
	}

	/**
	 * Load the export properties from a file and export with the formatter
	 * registered under the given format name.
	 * <p>
	 * The format name is lower-cased with {@link Locale#ROOT} before the
	 * lookup, so the result does not depend on the JVM's default locale.
	 * 
	 * @param propertiesFile
	 *            properties file handed to the kernel utility for loading
	 * @param format
	 *            name of the export format; matched case-insensitively
	 *            against the formatter map
	 * @throws IllegalArgumentException
	 *             if format is null or no formatter is registered for it
	 * @throws IOException
	 *             if loading the properties or the export fails
	 */
	@Override
	public void exportData(String propertiesFile, String format)
			throws IOException, InvalidPropertiesFormatException {
		if (format == null) {
			throw new IllegalArgumentException("export format must not be null");
		}
		Properties props = new Properties();
		this.getKernelUtil().loadProperties(propertiesFile, props);
		String formatKey = format.toLowerCase(Locale.ROOT);
		// fail with a readable message instead of a bare NullPointerException
		if (nameToFormatterMap.get(formatKey) == null) {
			throw new IllegalArgumentException("unknown export format '"
					+ format + "'; configured formats: "
					+ nameToFormatterMap.keySet());
		}
		this.exportData(props, nameToFormatterMap.get(formatKey)
				.getFormatter(), null);
	}

	/**
	 * @return the data source behind the JDBC template, or null when
	 *         setDataSource has not been called yet
	 */
	public DataSource getDataSource() {
		return this.jdbcTemplate == null ? null : this.jdbcTemplate
				.getDataSource();
	}

	/**
	 * Kept for backward compatibility with existing callers.
	 * 
	 * @param ds
	 *            ignored
	 * @return the data source behind the JDBC template, or null when
	 *         setDataSource has not been called yet
	 * @deprecated the argument was never used; call {@link #getDataSource()}
	 */
	@Deprecated
	public DataSource getDataSource(DataSource ds) {
		return this.getDataSource();
	}

	/**
	 * @return the kernel utility this exporter delegates to
	 */
	public KernelUtil getKernelUtil() {
		return this.kernelUtil;
	}

	/**
	 * @return the map consulted to find the formatter for a format name
	 */
	public Map getNameToFormatterMap() {
		return this.nameToFormatterMap;
	}

	/**
	 * Run the prepare script if one is defined. The script is cut into
	 * statements at every occurrence of the delimiter; blank statements are
	 * skipped and the rest are executed one by one as named-parameter
	 * updates.
	 * <p>
	 * The delimiter is matched literally, so delimiters containing regular
	 * expression metacharacters (for example "|" or "$$") work as written.
	 * 
	 * @param prepareScript
	 *            sequence of sql statements to be executed with named params;
	 *            null or empty means there is nothing to do.
	 * @param prepareScriptDelimiter
	 *            delimiter separating the sql statements.
	 * @param params
	 *            for named parameters in sql statements.
	 */
	protected void prepare(final String prepareScript,
			final String prepareScriptDelimiter,
			final Map params) {
		if (prepareScript == null || prepareScript.length() == 0) {
			return;
		}
		// String.split takes a regex; quote the delimiter to split on the
		// literal text
		String[] statements = prepareScript.split(java.util.regex.Pattern
				.quote(prepareScriptDelimiter));
		for (String sql : statements) {
			// throw out empty lines
			if (sql.trim().length() > 0) {
				this.namedJdbcTemplate.update(sql, params);
			}
		}
	}

	/**
	 * Run the prepare script and the nominal word query inside one
	 * transaction of txTemplateNew, adding every returned row to the sparse
	 * data.
	 * 
	 * @param sql
	 *            result set has 3 columns. 1st column - instance id (read as
	 *            long). 2nd column - word. 3rd column - word value (read as
	 *            string).
	 * @param prepareScript
	 *            statements executed before the query; may be null or empty
	 * @param prepareScriptDelimiter
	 *            delimiter separating the statements of the prepare script
	 * @param sparseData
	 *            container populated with the query results
	 * @param params
	 *            named parameters for the prepare script and the query
	 */
	protected void getNominalInstanceWords(final String sql,
			final String prepareScript, final String prepareScriptDelimiter,
			final SparseData sparseData, final Map params) {
		final RowCallbackHandler rowHandler = new RowCallbackHandler() {

			@Override
			public void processRow(ResultSet rs) throws SQLException {
				addNominalWordToInstance(sparseData, rs.getLong(1),
						rs.getString(2), rs.getString(3));
			}
		};
		txTemplateNew.execute(new TransactionCallback<Object>() {

			@Override
			public Object doInTransaction(TransactionStatus txStatus) {
				prepare(prepareScript, prepareScriptDelimiter, params);
				namedJdbcTemplate.query(sql, params, rowHandler);
				// nothing to hand back; the results are in sparseData
				return null;
			}
		});
	}

	/**
	 * Run the prepare script and the numeric word query inside one
	 * transaction of txTemplateNew, adding every returned row to the sparse
	 * data.
	 * 
	 * @param sql
	 *            result 1st column: instance id (read as long), 2nd column:
	 *            word, 3rd column: numeric word value (read as double)
	 * @param prepareScript
	 *            statements executed before the query; may be null or empty
	 * @param prepareScriptDelimiter
	 *            delimiter separating the statements of the prepare script
	 * @param sparseData
	 *            container populated with the query results
	 * @param params
	 *            named parameters for the prepare script and the query
	 */
	protected void getNumericInstanceWords(final String sql,
			final String prepareScript, final String prepareScriptDelimiter,
			final SparseData sparseData, final Map params) {
		final RowCallbackHandler rowHandler = new RowCallbackHandler() {

			@Override
			public void processRow(ResultSet rs) throws SQLException {
				addNumericWordToInstance(sparseData, rs.getLong(1),
						rs.getString(2), rs.getDouble(3));
			}
		};
		txTemplateNew.execute(new TransactionCallback<Object>() {

			@Override
			public Object doInTransaction(TransactionStatus txStatus) {
				prepare(prepareScript, prepareScriptDelimiter, params);
				namedJdbcTemplate.query(sql, params, rowHandler);
				// nothing to hand back; the results are in sparseData
				return null;
			}
		});
	}

	/**
	 * @return the transaction template used for the prepare script and word
	 *         queries
	 */
	public TransactionTemplate getTxTemplateNew() {
		return this.txTemplateNew;
	}

	/**
	 * Build a SparseData by running the numeric and nominal word queries and
	 * letting the decorator, if any, add to the results of each.
	 * 
	 * @param instanceLabel
	 *            instance data: label - fold - instance id - class map (not
	 *            read by this method)
	 * @param instanceNumericWordQuery
	 *            query to get numeric attributes; skipped when null or blank
	 * @param instanceNominalWordQuery
	 *            query to get nominal attributes; skipped when null or blank
	 * @param prepareScript
	 *            prepare script to be executed in same tx as instance attribute
	 *            queries
	 * @param prepareScriptDelimiter
	 *            delimiter for statements in prepare script
	 * @param bDecorator
	 *            decorator to add attributes; may be null
	 * @param label
	 *            bound as named parameter "label" unless null or empty
	 * @param fold
	 *            bound as named parameter "fold" unless null or 0
	 * @param run
	 *            bound as named parameter "run" unless null or 0
	 * @return the newly created and populated sparse data
	 */
	protected SparseData loadData(InstanceData instanceLabel,
			String instanceNumericWordQuery, String instanceNominalWordQuery,
			String prepareScript, String prepareScriptDelimiter,
			BagOfWordsDecorator bDecorator, String label, Integer fold,
			Integer run) {
		SparseData sparseData = new SparseData();
		Map<String, Object> params = new HashMap<String, Object>();
		if (label != null && label.length() > 0) {
			params.put("label", label);
		}
		if (fold != null && fold != 0) {
			params.put("fold", fold);
		}
		if (run != null && run != 0) {
			params.put("run", run);
		}
		// load numeric attributes
		if (instanceNumericWordQuery != null
				&& instanceNumericWordQuery.trim().length() > 0) {
			this.getNumericInstanceWords(instanceNumericWordQuery,
					prepareScript, prepareScriptDelimiter, sparseData, params);
		}
		// added to support adding gram matrix index in GramMatrixExporter
		if (bDecorator != null) {
			bDecorator.decorateNumericInstanceWords(
					sparseData.getInstanceNumericWords(),
					sparseData.getNumericWords());
		}
		// load nominal attributes
		if (instanceNominalWordQuery != null
				&& instanceNominalWordQuery.trim().length() > 0) {
			this.getNominalInstanceWords(instanceNominalWordQuery,
					prepareScript, prepareScriptDelimiter, sparseData, params);
		}
		if (bDecorator != null) {
			bDecorator.decorateNominalInstanceWords(
					sparseData.getInstanceNominalWords(),
					sparseData.getNominalWordValueMap());
		}
		return sparseData;
	}

	/**
	 * Create the plain and the named-parameter JDBC templates for the given
	 * data source.
	 * 
	 * @param ds
	 *            data source both templates are built on
	 */
	public void setDataSource(DataSource ds) {
		jdbcTemplate = new JdbcTemplate(ds);
		namedJdbcTemplate = new NamedParameterJdbcTemplate(ds);
	}

	/**
	 * @param kernelUtil
	 *            kernel utility this exporter should delegate to
	 */
	public void setKernelUtil(KernelUtil kernelUtil) {
		final KernelUtil util = kernelUtil;
		this.kernelUtil = util;
	}

	/**
	 * @param nameToFormatterMap
	 *            map consulted to find the formatter for a format name
	 */
	public void setNameToFormatterMap(Map nameToFormatterMap) {
		final Map formatters = nameToFormatterMap;
		this.nameToFormatterMap = formatters;
	}

	/**
	 * @param txTemplateNew
	 *            transaction template to use for the prepare script and word
	 *            queries
	 */
	public void setTxTemplateNew(TransactionTemplate txTemplateNew) {
		final TransactionTemplate template = txTemplateNew;
		this.txTemplateNew = template;
	}
}