All Downloads are FREE. Search and download functionalities are using the official Maven repository.

simplenlg.orthography.english.OrthographyProcessor Maven / Gradle / Ivy

/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is "Simplenlg".
 *
 * The Initial Developer of the Original Code is Ehud Reiter, Albert Gatt and Dave Westwater.
 * Portions created by Ehud Reiter, Albert Gatt and Dave Westwater are Copyright (C) 2010-11 The University of Aberdeen. All Rights Reserved.
 *
 * Contributor(s): Ehud Reiter, Albert Gatt, Dave Westwater, Roman Kutlak, Margaret Mitchell.
 */
package simplenlg.orthography.english;

import java.util.ArrayList;
import java.util.List;

import simplenlg.features.DiscourseFunction;
import simplenlg.features.Feature;
import simplenlg.features.InternalFeature;
import simplenlg.framework.CoordinatedPhraseElement;
import simplenlg.framework.DocumentCategory;
import simplenlg.framework.DocumentElement;
import simplenlg.framework.ElementCategory;
import simplenlg.framework.ListElement;
import simplenlg.framework.NLGElement;
import simplenlg.framework.NLGModule;
import simplenlg.framework.StringElement;

/**
 * 

* This processing module deals with punctuation when applied to * DocumentElements. The punctuation currently handled by this * processor includes the following (as of version 4.0): *

    *
  • Capitalisation of the first letter in sentences.
  • *
  • Termination of sentences with a period if not interrogative.
  • *
  • Termination of sentences with a question mark if they are interrogative.
  • *
  • Replacement of multiple conjunctions with a comma. For example, * John and Peter and Simon becomes John, Peter and Simon.
  • *
*

* * * @author D. Westwater, University of Aberdeen. * @version 4.0 * */ public class OrthographyProcessor extends NLGModule { private boolean commaSepPremodifiers; // set whether to separate // premodifiers using commas private boolean commaSepCuephrase; // set whether to include a comma after a // cue phrase (if marked by the // CUE_PHRASE=true) feature. @Override public void initialise() { this.commaSepPremodifiers = true; this.commaSepCuephrase = false; } /** * Check whether this processor separates premodifiers using a comma. * * @return true if premodifiers in the noun phrase are * comma-separated. */ public boolean isCommaSepPremodifiers() { return commaSepPremodifiers; } /** * Set whether to separate premodifiers using a comma. If true, * premodifiers will be comma-separated, as in the long, dark road. * If false, they won't. * * @param commaSepPremodifiers * the commaSepPremodifiers to set */ public void setCommaSepPremodifiers(boolean commaSepPremodifiers) { this.commaSepPremodifiers = commaSepPremodifiers; } /** * Check whether this processor separates cue phrases from a matrix phrase * using a comma. * * @return true if this parameter is set. */ public boolean isCommaSepCuephrase() { return commaSepCuephrase; } /** * If set to true, separates a cue phrase from the matrix * phrase using a comma. Cue phrases are typically at the start of a * sentence (e.g. However, John left early). This will only * apply to phrases with the feature * {@link simplenlg.features.DiscourseFunction#CUE_PHRASE} or {@link simplenlg.features.DiscourseFunction#FRONT_MODIFIER}. 
* * @param commaSepCuephrase * whether to separate cue phrases using a comma */ public void setCommaSepCuephrase(boolean commaSepCuephrase) { this.commaSepCuephrase = commaSepCuephrase; } @Override public NLGElement realise(NLGElement element) { NLGElement realisedElement = null; Object function = null; //the element's discourse function //get the element's function first if(element instanceof ListElement) { List children = element.getChildren(); if(!children.isEmpty()) { NLGElement firstChild = children.get(0); function = firstChild.getFeature(InternalFeature.DISCOURSE_FUNCTION); } } else { if(element != null) { function = element.getFeature(InternalFeature.DISCOURSE_FUNCTION); } } if(element != null) { ElementCategory category = element.getCategory(); if(category instanceof DocumentCategory && element instanceof DocumentElement) { List components = ((DocumentElement) element).getComponents(); switch((DocumentCategory) category){ case SENTENCE : realisedElement = realiseSentence(components, element); break; case LIST_ITEM : if(components != null && components.size() > 0) { // recursively realise whatever is in the list item // NB: this will realise embedded lists within list // items realisedElement = new ListElement(realise(components)); realisedElement.setParent(element.getParent()); } break; default : ((DocumentElement) element).setComponents(realise(components)); realisedElement = element; } } else if(element instanceof ListElement) { // AG: changes here: if we have a premodifier, then we ask the // realiseList method to separate with a comma. 
// if it's a postmod, we need commas at the start and end only // if it's appositive StringBuffer buffer = new StringBuffer(); if(DiscourseFunction.PRE_MODIFIER.equals(function)) { boolean all_appositives = true; for(NLGElement child : element.getChildren()){ all_appositives = all_appositives && child.getFeatureAsBoolean(Feature.APPOSITIVE); } // TODO: unless this is the end of the sentence if(all_appositives){ buffer.append(", "); } realiseList(buffer, element.getChildren(), this.commaSepPremodifiers ? "," : ""); if(all_appositives){ buffer.append(", "); } } else if(DiscourseFunction.POST_MODIFIER.equals(function)) {// && // appositive) // { List postmods = element.getChildren(); // bug fix due to Owen Bennett int len = postmods.size(); for(int i = 0; i < len; i++ ) { // for(NLGElement postmod: element.getChildren()) { NLGElement postmod = postmods.get(i); // if the postmod is appositive, it's sandwiched in // commas if(postmod.getFeatureAsBoolean(Feature.APPOSITIVE)) { buffer.append(", "); buffer.append(realise(postmod)); if(i < len - 1) { buffer.append(", "); } } else { buffer.append(realise(postmod)); if(postmod instanceof ListElement || (postmod.getRealisation() != null && !postmod.getRealisation().equals(""))) { buffer.append(" "); } } } } else if((DiscourseFunction.CUE_PHRASE.equals(function) || DiscourseFunction.FRONT_MODIFIER.equals(function)) && this.commaSepCuephrase){ realiseList(buffer, element.getChildren(), this.commaSepCuephrase ? 
"," : ""); } else { realiseList(buffer, element.getChildren(), ""); } // realiseList(buffer, element.getChildren(), ""); realisedElement = new StringElement(buffer.toString()); } else if(element instanceof CoordinatedPhraseElement) { realisedElement = realiseCoordinatedPhrase(element.getChildren()); } else { realisedElement = element; } // make the realised element inherit the original category // essential if list items are to be properly formatted later if(realisedElement != null) { realisedElement.setCategory(category); } //check if this is a cue phrase; if param is set, postfix a comma if((DiscourseFunction.CUE_PHRASE.equals(function) || DiscourseFunction.FRONT_MODIFIER.equals(function)) && this.commaSepCuephrase) { String realisation = realisedElement.getRealisation(); if(!realisation.endsWith(",")) { realisation = realisation + ","; } realisedElement.setRealisation(realisation); } } //remove preceding and trailing whitespace from internal punctuation removePunctSpace(realisedElement); return realisedElement; } /** * removes extra spaces preceding punctuation from a realised element * * @param realisedElement */ private void removePunctSpace(NLGElement realisedElement) { if(realisedElement != null) { String realisation = realisedElement.getRealisation(); if(realisation != null) { realisation = realisation.replaceAll(" ,", ","); realisation = realisation.replaceAll(",,+", ","); realisedElement.setRealisation(realisation); } } } /** * Performs the realisation on a sentence. This includes adding the * terminator and capitalising the first letter. * * @param components * the List of NLGElements representing * the components that make up the sentence. * @param element * the NLGElement representing the sentence. * @return the realised element as an NLGElement. 
*/ private NLGElement realiseSentence(List components, NLGElement element) { NLGElement realisedElement = null; if(components != null && components.size() > 0) { StringBuffer realisation = new StringBuffer(); realiseList(realisation, components, ""); stripLeadingCommas(realisation); capitaliseFirstLetter(realisation); terminateSentence(realisation, element.getFeatureAsBoolean(InternalFeature.INTERROGATIVE).booleanValue()); ((DocumentElement) element).clearComponents(); // realisation.append(' '); element.setRealisation(realisation.toString()); realisedElement = element; } return realisedElement; } /** * Adds the sentence terminator to the sentence. This is a period ('.') for * normal sentences or a question mark ('?') for interrogatives. * * @param realisation * the StringBuffer containing the current * realisation of the sentence. * @param interrogative * a boolean flag showing true if the * sentence is an interrogative, false otherwise. */ private void terminateSentence(StringBuffer realisation, boolean interrogative) { char character = realisation.charAt(realisation.length() - 1); if(character != '.' && character != '?') { if(interrogative) { realisation.append('?'); } else { realisation.append('.'); } } } /** * Remove recursively any leading spaces or commas at the start * of a sentence. * * @param realisation * the StringBuffer containing the current * realisation of the sentence. */ private void stripLeadingCommas(StringBuffer realisation) { char character = realisation.charAt(0); if(character == ' ' || character == ',') { realisation.deleteCharAt(0); stripLeadingCommas(realisation); } } /** * Capitalises the first character of a sentence if it is a lower case * letter. * * @param realisation * the StringBuffer containing the current * realisation of the sentence. 
*/ private void capitaliseFirstLetter(StringBuffer realisation) { char character = realisation.charAt(0); if(character >= 'a' && character <= 'z') { character = (char) ('A' + (character - 'a')); realisation.setCharAt(0, character); } } @Override public List realise(List elements) { List realisedList = new ArrayList(); if(elements != null && elements.size() > 0) { for(NLGElement eachElement : elements) { if(eachElement instanceof DocumentElement) { realisedList.add(realise(eachElement)); } else { realisedList.add(eachElement); } } } return realisedList; } /** * Realises a list of elements appending the result to the on-going * realisation. * * @param realisation * the StringBuffer containing the current * realisation of the sentence. * @param components * the List of NLGElements representing * the components that make up the sentence. * @param listSeparator * the string to use to separate elements of the list, empty if * no separator needed */ private void realiseList(StringBuffer realisation, List components, String listSeparator) { NLGElement realisedChild = null; for(int i = 0; i < components.size(); i++ ) { NLGElement thisElement = components.get(i); realisedChild = realise(thisElement); String childRealisation = realisedChild.getRealisation(); // check that the child realisation is non-empty if(childRealisation != null && childRealisation.length() > 0 && !childRealisation.matches("^[\\s\\n]+$")) { realisation.append(realisedChild.getRealisation()); if(components.size() > 1 && i < components.size() - 1) { realisation.append(listSeparator); } realisation.append(' '); } } if(realisation.length() > 0) { realisation.setLength(realisation.length() - 1); } } /** * Realises coordinated phrases. Where there are more than two coordinates, * then a comma replaces the conjunction word between all the coordinates * save the last two. For example, John and Peter and Simon becomes * John, Peter and Simon. 
* * @param components * the List of NLGElements representing * the components that make up the sentence. * @return the realised element as an NLGElement. */ private NLGElement realiseCoordinatedPhrase(List components) { StringBuffer realisation = new StringBuffer(); NLGElement realisedChild = null; int length = components.size(); for(int index = 0; index < length; index++ ) { realisedChild = components.get(index); if(index < length - 2 && DiscourseFunction.CONJUNCTION.equals(realisedChild.getFeature(InternalFeature.DISCOURSE_FUNCTION))) { realisation.append(", "); //$NON-NLS-1$ } else { realisedChild = realise(realisedChild); realisation.append(realisedChild.getRealisation()).append(' '); } } realisation.setLength(realisation.length() - 1); return new StringElement(realisation.toString().replace(" ,", ",")); //$NON-NLS-1$ //$NON-NLS-2$ } }