org.daisy.pipeline.nlp.calabash.impl.BreakDetectStep Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of nlp-common Show documentation
Common API for NLP functionality and XProc steps
The newest version!
package org.daisy.pipeline.nlp.calabash.impl;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.function.Predicate;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import net.sf.saxon.s9api.QName;
import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmNodeKind;
import net.sf.saxon.sxpath.XPathExpression;
import net.sf.saxon.trans.XPathException;

import org.daisy.common.saxon.SaxonHelper;
import org.daisy.common.xproc.calabash.XProcStep;
import org.daisy.pipeline.nlp.DummyLangDetector;
import org.daisy.pipeline.nlp.LangDetector;
import org.daisy.pipeline.nlp.lexing.LexService.LexerInitException;
import org.daisy.pipeline.nlp.lexing.LexService.LexerToken;
import org.daisy.pipeline.nlp.lexing.LexServiceRegistry;

import com.xmlcalabash.core.XProcException;
import com.xmlcalabash.core.XProcRuntime;
import com.xmlcalabash.io.ReadablePipe;
import com.xmlcalabash.io.WritablePipe;
import com.xmlcalabash.library.DefaultStep;
import com.xmlcalabash.model.RuntimeValue;
import com.xmlcalabash.runtime.XAtomicStep;
import com.xmlcalabash.util.TreeWriter;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * XprocStep built on the top of a Lexer meant to be provided by an OSGI service
 * through BreakDetectProvider.
 */
public class BreakDetectStep extends DefaultStep implements TreeWriterFactory, InlineSectionProcessor, XProcStep {

	private Logger mLogger = LoggerFactory.getLogger(BreakDetectStep.class);
	private ReadablePipe mSource = null;
	private WritablePipe mResult = null;
	private XProcRuntime mRuntime = null;
	private LexServiceRegistry mLexerRegistry;
	private Set mLangs;
	private RuntimeValue inlineTagsOption;
	private RuntimeValue wordBeforeOption;
	private RuntimeValue wordAfterOption;
	private RuntimeValue sentenceBeforeOption;
	private RuntimeValue sentenceAfterOption;
	private QName wordTagOption;
	private QName sentenceTagOption;
	private LangDetector mLangDetector = null;

	public BreakDetectStep(XProcRuntime runtime, XAtomicStep step, LexServiceRegistry registry) {
		super(runtime, step);
		mRuntime = runtime;
		mLexerRegistry = registry;
	}

	public void setInput(String port, ReadablePipe pipe) {
		if ("source".equals(port)) {
			mSource = pipe;
		}
	}

	@Override
	public void setOption(QName name, RuntimeValue value) {
		super.setOption(name, value);
		if ("inline-tags".equalsIgnoreCase(name.getLocalName())) {
			inlineTagsOption = value;
		} else if ("ensure-word-before".equalsIgnoreCase(name.getLocalName())) {
			wordBeforeOption = value;
		} else if ("ensure-word-after".equalsIgnoreCase(name.getLocalName())) {
			wordAfterOption = value;
		} else if ("ensure-sentence-before".equalsIgnoreCase(name.getLocalName())) {
			sentenceBeforeOption = value;
		} else if ("ensure-sentence-after".equalsIgnoreCase(name.getLocalName())) {
			sentenceAfterOption = value;
		} else if ("output-word-tag".equalsIgnoreCase(name.getLocalName())) {
			wordTagOption = value.getQName();
		} else if ("output-sentence-tag".equalsIgnoreCase(name.getLocalName())) {
			sentenceTagOption = value.getQName();
		} else {
			runtime.error(new RuntimeException("unrecognized option " + name));
		}
	}

	public void setOutput(String port, WritablePipe pipe) {
		mResult = pipe;
	}

	public void reset() {
		mSource.resetReader();
		mResult.resetWriter();
	}

	private Predicate processXSLTMatchPatternOption(RuntimeValue option) throws XPathException {
		if (!option.getString().equals("")) {
			XPathExpression matcher = SaxonHelper.compileExpression(
				option.getString(),
				option.getNamespaceBindings(),
				runtime.getProcessor().getUnderlyingConfiguration());
			return n -> n.getNodeKind() == XdmNodeKind.ELEMENT && SaxonHelper.evaluateBoolean(matcher, n);
		} else
			return n -> false;
	}

	public void run() throws SaxonApiException {
		super.run();

		HashMap langToToken = new HashMap();

		//Retrieve a generic lexer that can handle unexpected languages.
		LexerToken generic;
		try {
			generic = mLexerRegistry.getFallbackToken(Collections.EMPTY_LIST);
		} catch (LexerInitException e1) {
			throw XProcStep.raiseError(e1, step);
		}
		langToToken.put(null, generic);

		FormatSpecifications formatSpecs;
		try {
			formatSpecs = new FormatSpecifications(
				sentenceTagOption, wordTagOption, "http://www.w3.org/XML/1998/namespace", "lang",
				processXSLTMatchPatternOption(inlineTagsOption),
				processXSLTMatchPatternOption(wordBeforeOption),
				processXSLTMatchPatternOption(wordAfterOption),
				processXSLTMatchPatternOption(sentenceBeforeOption),
				processXSLTMatchPatternOption(sentenceAfterOption));
		} catch (XPathException e) {
			throw XProcStep.raiseError(e, step);
		}

		XmlBreakRebuilder xmlRebuilder = new XmlBreakRebuilder();

		long before = System.currentTimeMillis();

		if (mLangDetector == null) {
			mLangDetector = new DummyLangDetector();
			mLangDetector.train();
		}

		while (mSource.moreDocuments()) {
			XdmNode doc = mSource.read();

			//init the lexers with the languages
			mLangs = new HashSet();
			try {
				new InlineSectionFinder().find(doc, 0, formatSpecs, this,
				        Collections.EMPTY_SET);
				for (Locale lang : mLangs) {
					if (!langToToken.containsKey(lang)) {
						LexerToken token = mLexerRegistry.getTokenForLang(lang, langToToken
						        .values());
						if (token == null) {
							throw new LexerInitException(
							        "cannot find a lexer for the language: " + lang);
						}
						langToToken.put(lang, token);
					}
				}
			} catch (LexerInitException e) {
				throw XProcStep.raiseError(e, step);
			}

			mLogger.debug("Total number of language(s): " + (langToToken.size() - 1));
			for (Map.Entry entry : langToToken.entrySet()) {
				mLogger.debug("LexService for language '"
				        + (entry.getKey() == null ? "" : entry.getKey()) + "': "
				        + entry.getValue().getLexService().getName());
			}

			//rebuild the XML tree and lex the content on-the-fly
			List parsingErrors = new ArrayList();
			XdmNode tree;
			try {
				tree = xmlRebuilder.rebuild(this, langToToken, doc, formatSpecs,
				        mLangDetector, false, parsingErrors);
				mResult.write(tree);
			} catch (LexerInitException e) {
				throw XProcStep.raiseError(e, step);
			}
			for (String error : parsingErrors) {
				mRuntime.info(null, null, doc.getBaseURI() + ": " + error);
			}
		}

		for (LexerToken token : langToToken.values()) {
			mLexerRegistry.releaseToken(token);
		}

		long after = System.currentTimeMillis();
		mLogger.debug("lexing time = " + (after - before) / 1000.0 + " s.");

		mLangs = null;
	}

	@Override
	public TreeWriter newInstance() {
		return new TreeWriter(mRuntime);
	}

	@Override
	public void onInlineSectionFound(List leaves, List text, Locale lang)
	        throws LexerInitException {

		//TODO: find a way to not doing this multiple times (it is done also in the rebuilder)
		//If this is really too CPU-intensive, one can skip this detection.
		//As a result, no lexer would be loaded for the true language if the language is not
		//also used somewhere else, and the XMLRebuilder will eventually use a generic lexer for
		//the detected language.

		lang = mLangDetector.findLang(lang, text);

		if (lang != null) {
			mLangs.add(lang);
		}
	}

	@Override
	public void onEmptySectionFound(List leaves) {

	}
}