All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.maltparserx.parser.SingleMalt Maven / Gradle / Ivy

package org.maltparserx.parser;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Formatter;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.log4j.FileAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.maltparserx.core.config.ConfigurationDir;
import org.maltparserx.core.config.ConfigurationException;
import org.maltparserx.core.config.ConfigurationRegistry;
import org.maltparserx.core.exception.MaltChainedException;
import org.maltparserx.core.helper.SystemLogger;
import org.maltparserx.core.helper.URLFinder;
import org.maltparserx.core.io.dataformat.DataFormatInstance;
import org.maltparserx.core.options.OptionManager;
import org.maltparserx.core.propagation.PropagationManager;
import org.maltparserx.core.symbol.SymbolTableHandler;
import org.maltparserx.core.syntaxgraph.DependencyStructure;
import org.maltparserx.parser.guide.ClassifierGuide;

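/**
 * Parser configuration that drives a single parsing algorithm: a {@code BatchTrainer}
 * when initialized in {@link #LEARN} mode, or a {@code DeterministicParser} when
 * initialized in {@link #PARSE} mode.
 *
 * @author Johan Hall
 */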
public class SingleMalt implements DependencyParserConfig {
	/** Mode value for learning; {@code initParsingAlgorithm()} creates a {@code BatchTrainer} for it. */
	public static final int LEARN = 0;
	/** Mode value for parsing; {@code initParsingAlgorithm()} creates a {@code DeterministicParser} for it. */
	public static final int PARSE = 1;
	/** Configuration directory, assigned through {@code setConfigurationDir}. */
	protected ConfigurationDir configDir;
	/** Logger selected by {@code initConfigLogger}: either a file-backed logger or {@code SystemLogger.logger()}. */
	protected Logger configLogger;
	/** Option container index handed to every {@code OptionManager} call made by this class. */
	protected int optionContainerIndex;
	/** The active algorithm; remains {@code null} until {@code initParsingAlgorithm()} runs with a known mode. */
	protected Algorithm parsingAlgorithm = null;
	/** Mode received by {@code initialize}; compared against {@link #LEARN} and {@link #PARSE}. */
	protected int mode;
	/** Registry created in {@code initialize}; holds the symbol table handler and the data format instance. */
	protected ConfigurationRegistry registry;
	/** Symbol tables obtained from the data format instance in {@code initialize}. */
	protected SymbolTableHandler symbolTableHandler;
	/** Data format instance received by {@code initialize}. */
	protected DataFormatInstance dataFormatInstance;
	/** Wall-clock time (ms) captured at the start of {@code initialize}. */
	protected long startTime;
	/** Wall-clock time (ms) captured in {@code terminate} when the mode is LEARN or PARSE. */
	protected long endTime;
	/** NOTE(review): not read or written by any method visible in this class besides this initializer. */
	protected int nIterations = 0;
	/** Created by {@code initPropagation()} only when the "singlemalt"/"propagation" option is non-empty. */
	protected PropagationManager propagationManager;
	/** Typed reference to the algorithm in PARSE mode, so parse calls need no cast. */
	private Parser parser;
	/** Typed reference to the algorithm in LEARN mode, used by {@code oracleParse}. */
	private Trainer trainer;
	
	/**
	 * Prepares this configuration for a run in the given mode.
	 * <p>
	 * The steps are, in order: store the container index, mode and configuration
	 * directory; record the start time; set up the configuration logger from the
	 * "config" options; create the registry and register the symbol tables and
	 * the data format instance; validate option dependencies (LEARN mode only);
	 * initialize propagation and the parsing algorithm; finally log which data
	 * format(s) are in use when info logging is enabled.
	 *
	 * @param containerIndex     option container index used for all option lookups
	 * @param dataFormatInstance data format instance that supplies the symbol tables
	 * @param configDir          configuration directory for this run
	 * @param mode               {@link #LEARN} or {@link #PARSE}
	 * @throws MaltChainedException if any initialization step fails
	 */
	public void initialize(int containerIndex, DataFormatInstance dataFormatInstance, ConfigurationDir configDir, int mode) throws MaltChainedException {
		this.optionContainerIndex = containerIndex;
		this.mode = mode;
		setConfigurationDir(configDir);
		this.startTime = System.currentTimeMillis();

		final String logFileName = getOptionValue("config", "logfile").toString();
		final String logLevelName = getOptionValue("config", "logging").toString();
		this.configLogger = initConfigLogger(logFileName, logLevelName);

		this.registry = new ConfigurationRegistry();
		this.dataFormatInstance = dataFormatInstance;
		this.symbolTableHandler = dataFormatInstance.getSymbolTables();

		if (mode == SingleMalt.LEARN) {
			checkOptionDependency();
		}
		this.registry.put(org.maltparserx.core.symbol.SymbolTableHandler.class, getSymbolTables());
		this.registry.put(org.maltparserx.core.io.dataformat.DataFormatInstance.class, dataFormatInstance);
		initPropagation();
		initParsingAlgorithm();

		if (this.configLogger.isInfoEnabled()) {
			final URL inputFormatURL = configDir.getInputFormatURL();
			final URL outputFormatURL = configDir.getOutputFormatURL();
			if (inputFormatURL != null) {
				final String inputFormat = inputFormatURL.toString();
				final boolean singleFormat = outputFormatURL == null || outputFormatURL.toString().equals(inputFormat);
				if (singleFormat) {
					this.configLogger.info("  Data Format          : " + dropArchivePrefix(inputFormat) + "\n");
				} else {
					this.configLogger.info("  Input Data Format    : " + dropArchivePrefix(inputFormat) + "\n");
					this.configLogger.info("  Output Data Format   : " + dropArchivePrefix(outputFormatURL.toString()) + "\n");
				}
			}
		}
	}

	/**
	 * Returns the part of {@code location} after its first {@code '!'}, or the
	 * whole string when it contains none (presumably the separator between an
	 * archive and the entry inside it - confirm against the URL producers).
	 */
	private static String dropArchivePrefix(String location) {
		final int separator = location.indexOf('!');
		return (separator == -1) ? location : location.substring(separator + 1);
	}
	
	/**
	 * Sets up the propagation manager when the "singlemalt"/"propagation" option
	 * names a specification file; does nothing when the option is empty.
	 * <p>
	 * In LEARN mode the specification is first copied into the configuration
	 * directory and the option is overloaded with the name returned by the copy.
	 *
	 * @throws MaltChainedException if an option lookup, the copy or the load fails
	 */
	private void initPropagation()  throws MaltChainedException {
		String specFileName = getOptionValue("singlemalt", "propagation").toString();
		final boolean specConfigured = specFileName != null && specFileName.length() != 0;
		if (specConfigured) {
			propagationManager = new PropagationManager(configDir);
			if (mode == SingleMalt.LEARN) {
				specFileName = configDir.copyToConfig(specFileName);
				OptionManager.instance().overloadOptionValue(optionContainerIndex, "singlemalt", "propagation", specFileName);
			}
			getConfigLogger().info("  Propagation          : " + specFileName+"\n");
			propagationManager.loadSpecification(specFileName);
		}
	}
	
	/**
	 * Creates the algorithm object that matches the current mode and stores it
	 * both in {@code parsingAlgorithm} and in the mode-specific typed field.
	 * Any mode other than LEARN or PARSE leaves all three fields untouched.
	 *
	 * @throws MaltChainedException if the algorithm cannot be constructed
	 */
	protected void initParsingAlgorithm() throws MaltChainedException {
		switch (mode) {
		case LEARN:
			trainer = new BatchTrainer(this);
			parsingAlgorithm = trainer;
			break;
		case PARSE:
			parser = new DeterministicParser(this);
			parsingAlgorithm = parser;
			break;
		default:
			// Unknown mode: no algorithm is created.
			break;
		}
	}
	
	/**
	 * Stores an object in the configuration registry under the given class key.
	 *
	 * @param clazz the class used as registry key
	 * @param o     the object to register
	 */
	public void addRegistry(Class clazz, Object o) {
		this.registry.put(clazz, o);
	}
	
	/**
	 * Processes one sentence according to the current mode.
	 * <ul>
	 * <li>LEARN: expects {@code arguments[0]} (system graph) and
	 *     {@code arguments[1]} (gold graph) to be dependency structures. When the
	 *     system graph has tokens and a guide exists, the trainer parses the pair
	 *     and the guide finalizes the sentence.</li>
	 * <li>PARSE: expects {@code arguments[0]} to be a dependency structure, which
	 *     is parsed in place when it has tokens.</li>
	 * </ul>
	 * Any other mode is a no-op.
	 *
	 * @param arguments the structures to process; a {@code null} array is rejected
	 *                  in LEARN and PARSE mode the same way as a too-short one
	 * @throws MaltChainedException if the arguments do not match the mode's
	 *                              requirements or processing fails
	 */
	public void process(Object[] arguments) throws MaltChainedException {
		if (mode == LEARN) {
			// The null check comes first so a missing array yields the descriptive error, not a NullPointerException.
			if (arguments == null || arguments.length < 2 || !(arguments[0] instanceof DependencyStructure) || !(arguments[1] instanceof DependencyStructure)) {
				throw new MaltChainedException("The single malt learn task must be supplied with at least two dependency structures. ");
			}
			DependencyStructure systemGraph = (DependencyStructure)arguments[0];
			DependencyStructure goldGraph = (DependencyStructure)arguments[1];
			if (systemGraph.hasTokens() && getGuide() != null) {
				getGuide().finalizeSentence(((Trainer)getAlgorithm()).parse(goldGraph, systemGraph));
			}
		} else if (mode == PARSE) {
			if (arguments == null || arguments.length < 1 || !(arguments[0] instanceof DependencyStructure)) {
				throw new MaltChainedException("The single malt parse task must be supplied with at least one input terminal structure and one output dependency structure. ");
			}
			DependencyStructure processGraph = (DependencyStructure)arguments[0];
			if (processGraph.hasTokens()) {
				parser.parse(processGraph);
			}
		}
	}
	
	/**
	 * Parses the given graph with the parser created for PARSE mode; a graph
	 * without tokens is left untouched.
	 *
	 * @param graph the dependency structure to parse
	 * @throws MaltChainedException if parsing fails
	 */
	public void parse(DependencyStructure graph) throws MaltChainedException {
		if (!graph.hasTokens()) {
			return;
		}
		parser.parse(graph);
	}
	
	/**
	 * Runs the trainer on a gold/oracle graph pair. When a guide is available the
	 * trainer's result is passed to the guide's {@code finalizeSentence};
	 * otherwise the result is discarded. Nothing happens for an oracle graph
	 * without tokens.
	 *
	 * @param goldGraph   the gold-standard dependency structure
	 * @param oracleGraph the structure the trainer works on
	 * @throws MaltChainedException if the trainer or the guide fails
	 */
	public void oracleParse(DependencyStructure goldGraph, DependencyStructure oracleGraph) throws MaltChainedException {
		if (!oracleGraph.hasTokens()) {
			return;
		}
		if (getGuide() == null) {
			trainer.parse(goldGraph, oracleGraph);
			return;
		}
		getGuide().finalizeSentence(trainer.parse(goldGraph, oracleGraph));
	}
	
	/**
	 * Invokes {@code train()} on the current algorithm (cast to {@code Trainer}),
	 * but only when no guide is available; with a guide present this is a no-op.
	 *
	 * @throws MaltChainedException if training fails
	 */
	public void train() throws MaltChainedException {
		if (getGuide() != null) {
			return;
		}
		final Trainer activeTrainer = (Trainer)getAlgorithm();
		activeTrainer.train();
	}
	
	/**
	 * Shuts the configuration down: terminates the algorithm and, if present,
	 * the guide; records the end time and logs the elapsed learning or parsing
	 * time (LEARN and PARSE modes only); finally detaches all appenders from the
	 * configuration logger unless it is the shared system logger.
	 *
	 * @param arguments not used by this implementation
	 * @throws MaltChainedException if the algorithm or the guide fails to terminate
	 */
	public void terminate(Object[] arguments) throws MaltChainedException {
		getAlgorithm().terminate();
		if (getGuide() != null) {
			getGuide().terminate();
		}
		if (mode == LEARN || mode == PARSE) {
			endTime = System.currentTimeMillis();
			final long elapsed = endTime - startTime;
			// Null-checked here as well: the logger is only guaranteed non-null after initialize().
			if (configLogger != null && configLogger.isInfoEnabled()) {
				final String label = (mode == LEARN) ? "Learning time: " : "Parsing time: ";
				// String.format produces the same text as a throw-away Formatter without leaving a Closeable unclosed.
				final String clock = String.format("%02d:%02d:%02d", elapsed/3600000, elapsed%3600000/60000, elapsed%60000/1000);
				configLogger.info(label + clock + " (" + elapsed + " ms)\n");
			}
		}
		// Never strip appenders from the shared system logger.
		if (SystemLogger.logger() != configLogger && configLogger != null) {
			configLogger.removeAllAppenders();
		}
	}
	
	/**
	 * Selects and configures the configuration logger.
	 * <p>
	 * The system logger is used when {@code logfile} is {@code null}, empty or
	 * "stdout" (case-insensitive), or when no configuration directory is set.
	 * Otherwise a logger named after {@code logfile} gets a file appender that
	 * appends to that file inside the configuration's working directory; both
	 * the appender threshold and the logger level are set from {@code level},
	 * falling back to INFO.
	 *
	 * @param logfile name of the log file, or "stdout"
	 * @param level   name of the log level
	 * @return the configuration logger, which is also stored in this object
	 * @throws MaltChainedException if the log file cannot be created
	 */
	public Logger initConfigLogger(String logfile, String level) throws MaltChainedException {
		final boolean useSystemLogger = logfile == null || logfile.length() == 0 || logfile.equalsIgnoreCase("stdout") || configDir == null;
		if (useSystemLogger) {
			configLogger = SystemLogger.logger();
			return configLogger;
		}

		configLogger = Logger.getLogger(logfile);
		final FileAppender appender;
		try {
			final String logPath = configDir.getWorkingDirectory().getPath()+File.separator+logfile;
			appender = new FileAppender(new PatternLayout("%m"), logPath, true);
		} catch(IOException e) {
			throw new ConfigurationException("It is not possible to create a configuration log file. ", e);
		}
		final Level threshold = Level.toLevel(level, Level.INFO);
		appender.setThreshold(threshold);
		configLogger.addAppender(appender);
		configLogger.setLevel(threshold);
		return configLogger;
	}
	
	/**
	 * Returns the configuration logger.
	 *
	 * @return the logger currently stored in this configuration
	 */
	public Logger getConfigLogger() {
		return this.configLogger;
	}

	/**
	 * Replaces the configuration logger.
	 *
	 * @param logger the logger to use from now on
	 */
	public void setConfigLogger(Logger logger) {
		this.configLogger = logger;
	}
	
	/**
	 * Returns the configuration directory.
	 *
	 * @return the configuration directory currently stored in this configuration
	 */
	public ConfigurationDir getConfigurationDir() {
		return this.configDir;
	}
	
	/**
	 * Sets the configuration directory; called by {@code initialize} before the
	 * configuration logger is set up.
	 *
	 * @param configDir the configuration directory to use
	 */
	public void setConfigurationDir(ConfigurationDir configDir) {
		this.configDir = configDir;
	}
	
	/**
	 * Returns the mode this configuration was initialized with.
	 *
	 * @return the stored mode value, e.g. {@link #LEARN} or {@link #PARSE}
	 */
	public int getMode() {
		return this.mode;
	}
	
	/**
	 * Returns the configuration registry.
	 *
	 * @return the registry currently stored in this configuration
	 */
	public ConfigurationRegistry getRegistry() {
		return this.registry;
	}

	/**
	 * Replaces the configuration registry.
	 *
	 * @param registry the registry to use from now on
	 */
	public void setRegistry(ConfigurationRegistry registry) {
		this.registry = registry;
	}

	/**
	 * Looks up an option value in this configuration's option container.
	 *
	 * @param optiongroup the option group name
	 * @param optionname  the option name within the group
	 * @return the value reported by the option manager
	 * @throws MaltChainedException if the lookup fails
	 */
	public Object getOptionValue(String optiongroup, String optionname) throws MaltChainedException {
		final OptionManager options = OptionManager.instance();
		return options.getOptionValue(optionContainerIndex, optiongroup, optionname);
	}
	
	/**
	 * Looks up the string form of an option value in this configuration's option
	 * container.
	 *
	 * @param optiongroup the option group name
	 * @param optionname  the option name within the group
	 * @return the string reported by the option manager
	 * @throws MaltChainedException if the lookup fails
	 */
	public String getOptionValueString(String optiongroup, String optionname) throws MaltChainedException {
		final OptionManager options = OptionManager.instance();
		return options.getOptionValueString(optionContainerIndex, optiongroup, optionname);
	}
	
	/**
	 * Returns the option manager instance obtained from {@code OptionManager.instance()}.
	 *
	 * @return the option manager
	 * @throws MaltChainedException declared by the signature; not thrown by the visible body
	 */
	public OptionManager getOptionManager() throws MaltChainedException {
		final OptionManager manager = OptionManager.instance();
		return manager;
	}
	/******************************** MaltParserConfiguration specific  ********************************/
	
	/**
	 * Returns the symbol table handler that {@code initialize} obtained from the
	 * data format instance.
	 *
	 * @return the symbol table handler
	 */
	public SymbolTableHandler getSymbolTables() {
		return this.symbolTableHandler;
	}
	
	/**
	 * Returns the propagation manager.
	 *
	 * @return the propagation manager, or {@code null} when none has been created
	 *         (propagation setup is skipped when the propagation option is empty)
	 */
	public PropagationManager getPropagationManager() {
		return this.propagationManager;
	}

	/**
	 * Returns the active parsing algorithm.
	 *
	 * @return the algorithm, or {@code null} when none has been created yet
	 */
	public Algorithm getAlgorithm() {
		return this.parsingAlgorithm;
	}
	/**
	 * Returns the classifier guide of the active parsing algorithm. The algorithm
	 * must already have been created; this method dereferences it directly.
	 *
	 * @return the guide reported by the algorithm (callers in this class check it for {@code null})
	 */
	public ClassifierGuide getGuide() {
		final Algorithm algorithm = this.parsingAlgorithm;
		return algorithm.getGuide();
	}
	
	/**
	 * Reconciles interdependent options before learning and records every
	 * adjustment in the configuration's info file (when an info writer exists).
	 * <ol>
	 * <li>Feature model: an empty "guide"/"features" option is replaced by the
	 *     default named after the parsing algorithm (with "{learner}" substituted
	 *     and the file located in the jars); in both cases the file is copied
	 *     into the configuration directory and the option is overloaded with the
	 *     copy's name.</li>
	 * <li>Data split: when only one of "data_split_column" and
	 *     "data_split_structure" is set, the one that is set is cleared and a
	 *     warning is logged, so a single model is induced.</li>
	 * <li>Decision settings: an empty or "default" value becomes
	 *     "T.TRANS+A.DEPREL", anything else is upper-cased; the pseudo-projective
	 *     decisions A.PPLIFTED, A.PPPATH and A.PPCOVERED are appended when the
	 *     "pproj" options require them and they are not already present.</li>
	 * </ol>
	 *
	 * @throws MaltChainedException if an option operation fails, or (as a
	 *                              ConfigurationException) if the info file cannot be written
	 */
	public void checkOptionDependency() throws MaltChainedException {
		try {
			if (configDir.getInfoFileWriter() != null) {
				configDir.getInfoFileWriter().write("\nDEPENDENCIES\n");
			}

			// Copy the feature model file into the configuration directory
			String featureModelFileName = getOptionValue("guide", "features").toString().trim();
			if (featureModelFileName.equals("")) {
				// use default feature model depending on the selected parser algorithm
				OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "features", getOptionValueString("singlemalt", "parsing_algorithm"));
				featureModelFileName = getOptionValue("guide", "features").toString().trim();
				/* START: Temp fix during development of new liblinear and libsvm interface */
				String learner = getOptionValueString("guide", "learner");
				if (!learner.startsWith("lib")) {
					learner = "lib"+learner;
				}
				/* END: Temp fix during development of new liblinear and libsvm interface */
				featureModelFileName = featureModelFileName.replace("{learner}", learner);
				final URLFinder f = new URLFinder();
				featureModelFileName = configDir.copyToConfig(f.findURLinJars(featureModelFileName));
			} else {
				featureModelFileName = configDir.copyToConfig(featureModelFileName);
			}
			OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "features", featureModelFileName);
			if (configDir.getInfoFileWriter() != null) {
				configDir.getInfoFileWriter().write("--guide-features (  -F)                 "+getOptionValue("guide", "features").toString()+"\n");
			}

			// A split column without a split structure (or vice versa) is meaningless: clear the one that is set.
			if (getOptionValue("guide", "data_split_column").toString().equals("") && !getOptionValue("guide", "data_split_structure").toString().equals("")) {
				configLogger.warn("Option --guide-data_split_column = '' and --guide-data_split_structure != ''. Option --guide-data_split_structure is overloaded with '', this will cause the parser to induce a single model.\n ");
				OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "data_split_structure", "");
				if (configDir.getInfoFileWriter() != null) {
					configDir.getInfoFileWriter().write("--guide-data_split_structure (  -s)\n");
				}
			}
			if (!getOptionValue("guide", "data_split_column").toString().equals("") && getOptionValue("guide", "data_split_structure").toString().equals("")) {
				configLogger.warn("Option --guide-data_split_column != '' and --guide-data_split_structure = ''. Option --guide-data_split_column is overloaded with '', this will cause the parser to induce a single model.\n");
				OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "data_split_column", "");
				if (configDir.getInfoFileWriter() != null) {
					configDir.getInfoFileWriter().write("--guide-data_split_column (  -d)\n");
				}
			}

			String decisionSettings = getOptionValue("guide", "decision_settings").toString().trim();
			final String markingStrategy = getOptionValue("pproj", "marking_strategy").toString().trim();
			final String coveredRoot = getOptionValue("pproj", "covered_root").toString().trim();
			final StringBuilder newDecisionSettings = new StringBuilder();

			// trim() never returns null, so only the empty and "default" cases need handling.
			if (decisionSettings.length() == 0 || decisionSettings.equals("default")) {
				decisionSettings = "T.TRANS+A.DEPREL";
			} else {
				// Fixed locale: the default-locale toUpperCase() maps 'i' to a dotted capital under
				// e.g. a Turkish locale, which would break the A.PPLIFTED checks below.
				decisionSettings = decisionSettings.toUpperCase(Locale.ENGLISH);
			}

			final boolean marksPath = markingStrategy.equalsIgnoreCase("path") || markingStrategy.equalsIgnoreCase("head+path");
			final boolean marksHead = markingStrategy.equalsIgnoreCase("head") || marksPath;
			// Plain substring tests; no regular expression is needed for a literal search.
			if (marksHead && !decisionSettings.contains("A.PPLIFTED")) {
				newDecisionSettings.append("+A.PPLIFTED");
			}
			if (marksPath && !decisionSettings.contains("A.PPPATH")) {
				newDecisionSettings.append("+A.PPPATH");
			}
			if (!coveredRoot.equalsIgnoreCase("none") && !decisionSettings.contains("A.PPCOVERED")) {
				newDecisionSettings.append("+A.PPCOVERED");
			}
			// Overload only when normalization or the appended decisions actually changed the value.
			if (!getOptionValue("guide", "decision_settings").toString().equals(decisionSettings) || newDecisionSettings.length() > 0) {
				OptionManager.instance().overloadOptionValue(optionContainerIndex, "guide", "decision_settings", decisionSettings+newDecisionSettings.toString());
				if (configDir.getInfoFileWriter() != null) {
					configDir.getInfoFileWriter().write("--guide-decision_settings (  -gds)                 "+getOptionValue("guide", "decision_settings").toString()+"\n");
				}
			}
			if (configDir.getInfoFileWriter() != null) {
				configDir.getInfoFileWriter().flush();
			}
		} catch (IOException e) {
			throw new ConfigurationException("Could not write to the configuration information file. ", e);
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy