simplenlg.orthography.english.OrthographyProcessor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of SimpleNLG Show documentation
Java API for Natural Language Generation
The newest version!
/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 2.0 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * https://www.mozilla.org/en-US/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is "Simplenlg".
 *
 * The Initial Developer of the Original Code is Ehud Reiter, Albert Gatt and Dave Westwater.
 * Portions created by Ehud Reiter, Albert Gatt and Dave Westwater are Copyright (C) 2010-11 The University of Aberdeen. All Rights Reserved.
 *
 * Contributor(s): Ehud Reiter, Albert Gatt, Dave Westwater, Roman Kutlak, Margaret Mitchell, and Saad Mahamood.
 */
package simplenlg.orthography.english;

import java.util.ArrayList;
import java.util.List;

import simplenlg.features.DiscourseFunction;
import simplenlg.features.Feature;
import simplenlg.features.InternalFeature;
import simplenlg.framework.*;

/**
 * <p>
 * This processing module deals with punctuation when applied to
 * <code>DocumentElement</code>s. The punctuation currently handled by this
 * processor includes the following (as of version 4.0):
 * </p>
 * <ul>
 * <li>Capitalisation of the first letter in sentences.</li>
 * <li>Termination of sentences with a period if they are neither
 * interrogative nor exclamatory.</li>
 * <li>Termination of sentences with a question mark if they are interrogative.</li>
 * <li>Termination of sentences with an exclamation mark if they are exclamatory.</li>
 * <li>Replacement of multiple conjunctions with a comma. For example,
 * <em>John and Peter and Simon</em> becomes <em>John, Peter and Simon</em>.</li>
 * </ul>
 *
 * @author D. Westwater, University of Aberdeen.
 * @version 4.0
 */
public class OrthographyProcessor extends NLGModule {

	/** Whether premodifiers are separated from one another using commas. */
	private boolean commaSepPremodifiers;

	/**
	 * Whether a comma is added after a cue phrase, i.e. an element whose
	 * discourse function is {@code CUE_PHRASE} or {@code FRONT_MODIFIER}.
	 */
	private boolean commaSepCuephrase;

	/**
	 * Resets both options to their defaults: premodifiers are
	 * comma-separated, cue phrases are not followed by a comma.
	 */
	@Override
	public void initialise() {
		this.commaSepPremodifiers = true;
		this.commaSepCuephrase = false;
	}

	/**
	 * Check whether this processor separates premodifiers using a comma.
	 *
	 * @return {@code true} if premodifiers in the noun phrase are
	 * 		comma-separated.
	 */
	public boolean isCommaSepPremodifiers() {
		return commaSepPremodifiers;
	}

	/**
	 * Set whether to separate premodifiers using a comma. If {@code true},
	 * premodifiers will be comma-separated, as in <i>the long, dark road</i>.
	 * If {@code false}, they won't.
	 *
	 * @param commaSepPremodifiers the commaSepPremodifiers to set
	 */
	public void setCommaSepPremodifiers(boolean commaSepPremodifiers) {
		this.commaSepPremodifiers = commaSepPremodifiers;
	}

	/**
	 * Check whether this processor separates cue phrases from a matrix phrase
	 * using a comma.
	 *
	 * @return {@code true} if this parameter is set.
	 */
	public boolean isCommaSepCuephrase() {
		return commaSepCuephrase;
	}

	/**
	 * If set to {@code true}, separates a cue phrase from the matrix
	 * phrase using a comma. Cue phrases are typically at the start of a
	 * sentence (e.g. <i>However, John left early</i>). This will only
	 * apply to phrases with the feature
	 * {@link simplenlg.features.DiscourseFunction#CUE_PHRASE} or
	 * {@link simplenlg.features.DiscourseFunction#FRONT_MODIFIER}.
	 *
	 * @param commaSepCuephrase whether to separate cue phrases using a comma
	 */
	public void setCommaSepCuephrase(boolean commaSepCuephrase) {
		this.commaSepCuephrase = commaSepCuephrase;
	}

	/**
	 * Applies orthographic processing to a single element.
	 * <ul>
	 * <li>Document elements are handled according to their document category
	 * (sentence, list item, or anything else).</li>
	 * <li>List elements are flattened into a {@code StringElement}, with comma
	 * placement depending on the discourse function of their first child.</li>
	 * <li>Coordinated phrases are flattened into a {@code StringElement}.</li>
	 * <li>Any other element is returned unchanged.</li>
	 * </ul>
	 * The result inherits the category of the input element.
	 *
	 * @param element the element to realise; may be {@code null}.
	 * @return the realised element, or {@code null} if {@code element} is
	 * 		{@code null} or is a sentence / list item without components.
	 */
	@Override
	public NLGElement realise(NLGElement element) {
		if(element == null) {
			return null;
		}

		Object function = discourseFunctionOf(element);
		ElementCategory category = element.getCategory();
		NLGElement realisedElement;

		if(category instanceof DocumentCategory && element instanceof DocumentElement) {
			realisedElement = realiseDocumentElement((DocumentElement) element, (DocumentCategory) category);
		} else if(element instanceof ListElement) {
			realisedElement = realiseListElement(element, function);
		} else if(element instanceof CoordinatedPhraseElement) {
			realisedElement = realiseCoordinatedPhrase(element.getChildren());
		} else {
			realisedElement = element;
		}

		if(realisedElement != null) {
			// make the realised element inherit the original category;
			// essential if list items are to be properly formatted later
			realisedElement.setCategory(category);

			// check if this is a cue phrase; if param is set, postfix a comma
			if(this.commaSepCuephrase && isCuePhraseOrFrontModifier(function)) {
				appendTrailingComma(realisedElement);
			}
		}

		// tidy up spacing and duplication around commas
		removePunctSpace(realisedElement);
		return realisedElement;
	}

	/**
	 * Determines the discourse function that governs how an element is
	 * punctuated: for a list element this is the function of its first child,
	 * otherwise the function of the element itself.
	 *
	 * @param element the (non-null) element to inspect.
	 * @return the value of the discourse-function feature, or {@code null}
	 * 		if the element is a list without children.
	 */
	private Object discourseFunctionOf(NLGElement element) {
		if(element instanceof ListElement) {
			List<NLGElement> children = element.getChildren();
			if(children != null && !children.isEmpty()) {
				return children.get(0).getFeature(InternalFeature.DISCOURSE_FUNCTION);
			}
			return null;
		}
		return element.getFeature(InternalFeature.DISCOURSE_FUNCTION);
	}

	/**
	 * Tells whether a discourse function marks a cue phrase or a front modifier.
	 */
	private static boolean isCuePhraseOrFrontModifier(Object function) {
		return DiscourseFunction.CUE_PHRASE.equals(function) || DiscourseFunction.FRONT_MODIFIER.equals(function);
	}

	/**
	 * Realises a document element according to its document category.
	 *
	 * @param element the document element to realise.
	 * @param category the document category of {@code element}.
	 * @return the realised element, or {@code null} for a sentence or list
	 * 		item that has no components.
	 */
	private NLGElement realiseDocumentElement(DocumentElement element, DocumentCategory category) {
		List<NLGElement> components = element.getComponents();
		NLGElement realisedElement = null;

		switch(category){

		case SENTENCE:
			realisedElement = realiseSentence(components, element);
			break;

		case LIST_ITEM:
			if(components != null && !components.isEmpty()) {
				// recursively realise whatever is in the list item
				// NB: this will realise embedded lists within list items
				realisedElement = new ListElement(realise(components));
				realisedElement.setParent(element.getParent());
			}
			break;

		default:
			element.setComponents(realise(components));
			realisedElement = element;
			break;
		}

		return realisedElement;
	}

	/**
	 * Flattens a list element into a {@code StringElement}.
	 * <ul>
	 * <li>Premodifiers are separated by commas when
	 * {@link #isCommaSepPremodifiers()} is set, and the whole list is wrapped
	 * in commas when every child is appositive.</li>
	 * <li>Postmodifiers are wrapped in commas individually when appositive.</li>
	 * <li>Cue phrases / front modifiers are comma-separated when
	 * {@link #isCommaSepCuephrase()} is set.</li>
	 * <li>Anything else is joined with single spaces.</li>
	 * </ul>
	 *
	 * @param element the list element to realise.
	 * @param function the discourse function of the list's first child.
	 * @return a new {@code StringElement} holding the flattened text.
	 */
	private NLGElement realiseListElement(NLGElement element, Object function) {
		StringBuilder buffer = new StringBuilder();
		List<NLGElement> children = element.getChildren();

		if(DiscourseFunction.PRE_MODIFIER.equals(function)) {
			boolean allAppositives = true;
			for(NLGElement child : children) {
				allAppositives = allAppositives && child.getFeatureAsBoolean(Feature.APPOSITIVE);
			}

			// TODO: unless this is the end of the sentence
			if(allAppositives) {
				buffer.append(", ");
			}
			realiseList(buffer, children, this.commaSepPremodifiers ? "," : "");
			if(allAppositives) {
				buffer.append(", ");
			}

		} else if(DiscourseFunction.POST_MODIFIER.equals(function)) {
			// The indexed loop over a length computed up front is deliberate
			// (bug fix due to Owen Bennett). NOTE(review): presumably it guards
			// against the list changing while children are realised - confirm
			// before converting to a for-each loop.
			int len = children.size();

			for(int i = 0; i < len; i++) {
				NLGElement postmod = children.get(i);

				// NOTE(review): the realised element is appended as an Object,
				// i.e. via its toString(), not via getRealisation().
				if(postmod.getFeatureAsBoolean(Feature.APPOSITIVE)) {
					// an appositive postmodifier is sandwiched in commas
					buffer.append(", ");
					buffer.append(realise(postmod));
					buffer.append(", ");
				} else {
					buffer.append(realise(postmod));
					if(postmod instanceof ListElement || (postmod.getRealisation() != null
					                                      && !postmod.getRealisation().equals(""))) {
						buffer.append(" ");
					}
				}
			}

		} else if(this.commaSepCuephrase && isCuePhraseOrFrontModifier(function)) {
			realiseList(buffer, children, ",");

		} else {
			realiseList(buffer, children, "");
		}

		return new StringElement(buffer.toString());
	}

	/**
	 * Ensures the realisation of an element ends with a comma. An element
	 * without a realisation is left untouched.
	 *
	 * @param realisedElement the (non-null) realised element.
	 */
	private void appendTrailingComma(NLGElement realisedElement) {
		String realisation = realisedElement.getRealisation();
		if(realisation == null) {
			return;
		}

		if(!realisation.endsWith(",")) {
			realisation = realisation + ",";
		}
		realisedElement.setRealisation(realisation);
	}

	/**
	 * Tidies commas in the realisation of an element: a space directly
	 * preceding a comma is removed and runs of commas are collapsed into one.
	 *
	 * @param realisedElement the element to tidy; may be {@code null}.
	 */
	private void removePunctSpace(NLGElement realisedElement) {
		if(realisedElement == null) {
			return;
		}

		String realisation = realisedElement.getRealisation();
		if(realisation != null) {
			realisation = realisation.replace(" ,", ",");
			realisation = realisation.replaceAll(",,+", ",");
			realisedElement.setRealisation(realisation);
		}
	}

	/**
	 * Performs the realisation on a sentence. This includes stripping leading
	 * commas, capitalising the first letter and adding the terminator. The
	 * components of the sentence are cleared and replaced by the realised
	 * text.
	 *
	 * @param components the {@code List} of {@code NLGElement}s representing
	 * 		the components that make up the sentence.
	 * @param element the document element representing the sentence.
	 * @return the sentence element carrying its realisation, or {@code null}
	 * 		if there are no components.
	 */
	private NLGElement realiseSentence(List<NLGElement> components, DocumentElement element) {
		if(components == null || components.isEmpty()) {
			return null;
		}

		StringBuilder realisation = new StringBuilder();
		realiseList(realisation, components, "");

		// each of these helpers is a no-op on an empty buffer, so a sentence
		// whose components all realise to nothing yields an empty realisation
		stripLeadingCommas(realisation);
		capitaliseFirstLetter(realisation);
		terminateSentence(realisation,
		                  element.getFeatureAsBoolean(InternalFeature.INTERROGATIVE).booleanValue(),
		                  element.getFeatureAsBoolean(Feature.EXCLAMATORY).booleanValue());

		element.clearComponents();
		element.setRealisation(realisation.toString());
		return element;
	}

	/**
	 * Adds the sentence terminator to the sentence unless it already ends in
	 * a period or question mark. The terminator is a question mark ('?') for
	 * interrogatives, an exclamation mark ('!') for exclamatory sentences and
	 * a period ('.') otherwise. An empty realisation is left empty.
	 *
	 * @param realisation the buffer containing the current realisation of
	 * 		the sentence.
	 * @param interrogative {@code true} if the sentence is an interrogative.
	 * @param exclamatory {@code true} if the sentence is exclamatory; only
	 * 		consulted when the sentence is not interrogative.
	 */
	private void terminateSentence(StringBuilder realisation, boolean interrogative, boolean exclamatory) {
		if(realisation.length() == 0) {
			return;
		}

		char lastCharacter = realisation.charAt(realisation.length() - 1);
		if(lastCharacter != '.' && lastCharacter != '?') {
			if(interrogative) {
				realisation.append('?');
			} else if(exclamatory) {
				realisation.append('!');
			} else {
				realisation.append('.');
			}
		}
	}

	/**
	 * Removes any leading spaces or commas at the start of a sentence.
	 *
	 * @param realisation the buffer containing the current realisation of
	 * 		the sentence; may be empty.
	 */
	private void stripLeadingCommas(StringBuilder realisation) {
		int start = 0;
		while(start < realisation.length()
		      && (realisation.charAt(start) == ' ' || realisation.charAt(start) == ',')) {
			start++;
		}
		realisation.delete(0, start);
	}

	/**
	 * Capitalises the first character of a sentence if it is a lower case
	 * letter in the range 'a' to 'z'.
	 *
	 * @param realisation the buffer containing the current realisation of
	 * 		the sentence; may be empty.
	 */
	private void capitaliseFirstLetter(StringBuilder realisation) {
		if(realisation.length() == 0) {
			return;
		}

		char character = realisation.charAt(0);
		if(character >= 'a' && character <= 'z') {
			realisation.setCharAt(0, (char) ('A' + (character - 'a')));
		}
	}

	/**
	 * Realises every document element in a list; elements of any other kind
	 * are copied to the result unchanged.
	 *
	 * @param elements the elements to process; may be {@code null}.
	 * @return a new list with one entry per input element (empty if
	 * 		{@code elements} is {@code null} or empty).
	 */
	@Override
	public List<NLGElement> realise(List<NLGElement> elements) {
		List<NLGElement> realisedList = new ArrayList<NLGElement>();

		if(elements != null) {
			for(NLGElement eachElement : elements) {
				if(eachElement instanceof DocumentElement) {
					realisedList.add(realise(eachElement));
				} else {
					realisedList.add(eachElement);
				}
			}
		}
		return realisedList;
	}

	/**
	 * Realises a list of elements appending the result to the on-going
	 * realisation. Children that cannot be realised, or whose realisation is
	 * empty or whitespace only, are skipped.
	 *
	 * @param realisation the buffer containing the current realisation.
	 * @param components the {@code List} of {@code NLGElement}s to realise.
	 * @param listSeparator the string to use to separate elements of the
	 * 		list, empty if no separator needed
	 */
	private void realiseList(StringBuilder realisation, List<NLGElement> components, String listSeparator) {
		boolean appended = false;

		for(int i = 0; i < components.size(); i++) {
			NLGElement realisedChild = realise(components.get(i));
			if(realisedChild == null) {
				continue;
			}
			String childRealisation = realisedChild.getRealisation();

			// check that the child realisation is non-empty
			if(childRealisation != null && childRealisation.length() > 0 && !childRealisation.matches("^[\\s\\n]+$")) {
				realisation.append(childRealisation);

				// no separator after the last component
				if(i < components.size() - 1) {
					realisation.append(listSeparator);
				}

				realisation.append(' ');
				appended = true;
			}
		}

		// drop the trailing space, but only if this call wrote one; never
		// trim text that was already in the buffer
		if(appended) {
			realisation.setLength(realisation.length() - 1);
		}
	}

	/**
	 * Realises coordinated phrases. Where there are more than two coordinates,
	 * then a comma replaces the conjunction word between all the coordinates
	 * save the last two. For example, <i>John and Peter and Simon</i> becomes
	 * <i>John, Peter and Simon</i>.
	 *
	 * @param components the {@code List} of {@code NLGElement}s representing
	 * 		the coordinates and conjunctions of the phrase.
	 * @return a new {@code StringElement} holding the realised phrase (empty
	 * 		text if there is nothing to realise).
	 */
	private NLGElement realiseCoordinatedPhrase(List<NLGElement> components) {
		StringBuilder realisation = new StringBuilder();
		int length = components.size();

		for(int index = 0; index < length; index++) {
			NLGElement child = components.get(index);
			if(child == null) {
				continue;
			}

			if(index < length - 2
			   && DiscourseFunction.CONJUNCTION.equals(child.getFeature(InternalFeature.DISCOURSE_FUNCTION))) {

				realisation.append(", "); //$NON-NLS-1$
			} else {
				NLGElement realisedChild = realise(child);
				if(realisedChild != null) {
					realisation.append(realisedChild.getRealisation()).append(' ');
				}
			}
		}

		// everything appended above ends in a space; drop the last one
		if(realisation.length() > 0) {
			realisation.setLength(realisation.length() - 1);
		}
		return new StringElement(realisation.toString().replace(" ,", ",")); //$NON-NLS-1$ //$NON-NLS-2$
	}
}