All downloads are free. Search and download functionality uses the official Maven repository.

org.languagetool.synthesis.en.EnglishSynthesizer Maven / Gradle / Ivy

/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.synthesis.en;

import org.languagetool.*;
import org.languagetool.rules.en.AvsAnRule;
import org.languagetool.synthesis.BaseSynthesizer;
import org.languagetool.tools.StringTools;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * English word form synthesizer.
 * Based on part-of-speech lists in Public Domain. See readme.txt for details,
 * the POS tagset is described in tagset.txt.
 * 
 * There are to special additions:
 * 
    *
  1. +DT - tag that adds "a" or "an" (according to the way the word is * pronounced) and "the"
  2. *
  3. +INDT - a tag that adds only "a" or "an"
  4. *
* * @author Marcin Miłkowski */ public class EnglishSynthesizer extends BaseSynthesizer { private static final String RESOURCE_FILENAME = "/en/english_synth.dict"; private static final String TAGS_FILE_NAME = "/en/english_tags.txt"; private static final String SOR_FILE_NAME = "/en/en.sor"; private static final List exceptions = Arrays.asList("ne'er", "e'er", "o'er", "ol'", "ma'am", "n't", "informations"); // A special tag to add determiners. private static final String ADD_DETERMINER = "+DT"; // A special tag to add only indefinite articles. private static final String ADD_IND_DETERMINER = "+INDT"; private final AvsAnRule aVsAnRule = new AvsAnRule(JLanguageTool.getMessageBundle(Languages.getLanguageForShortCode("en"))); public static final EnglishSynthesizer INSTANCE = new EnglishSynthesizer(); /** @deprecated use {@link #INSTANCE} */ public EnglishSynthesizer(Language lang) { this(); } private EnglishSynthesizer() { super(SOR_FILE_NAME, RESOURCE_FILENAME, TAGS_FILE_NAME, "en"); } /** * Get a form of a given AnalyzedToken, where the form is defined by a * part-of-speech tag. * * @param token AnalyzedToken to be inflected. * @param posTag A desired part-of-speech tag. * @return String value - inflected word. */ @Override public String[] synthesize(AnalyzedToken token, String posTag) throws IOException { if (posTag.startsWith(SPELLNUMBER_TAG)) { return super.synthesize(token, posTag); } String aOrAn = aVsAnRule.suggestAorAn(token.getToken()); if (ADD_DETERMINER.equals(posTag)) { return new String[] { aOrAn, "the " + StringTools.lowercaseFirstCharIfCapitalized(token.getToken()) }; } else if (ADD_IND_DETERMINER.equals(posTag)) { return new String[] { aOrAn }; } return removeExceptions(super.synthesize(token, posTag)); } /** * Special English regexp based synthesizer that allows adding articles * when the regexp-based tag ends with a special signature {@code \\+INDT} or {@code \\+DT}. 
* * @since 2.5 */ @Override public String[] synthesize(AnalyzedToken token, String posTag, boolean posTagRegExp) throws IOException { if (posTag.startsWith(SPELLNUMBER_TAG)) { return synthesize(token, posTag); } if (posTagRegExp) { String myPosTag = posTag; String det = ""; if (posTag.endsWith(ADD_IND_DETERMINER)) { myPosTag = myPosTag.substring(0, myPosTag.indexOf(ADD_IND_DETERMINER) - "\\".length()); det = aVsAnRule.suggestAorAn(token.getLemma()); det = det.substring(0, det.indexOf(' ') + " ".length()); } else if (posTag.endsWith(ADD_DETERMINER)) { myPosTag = myPosTag.substring(0, myPosTag.indexOf(ADD_DETERMINER) - "\\".length()); det = "the "; } initPossibleTags(); Pattern p = Pattern.compile(myPosTag); List results = new ArrayList<>(); for (String tag : possibleTags) { Matcher m = p.matcher(tag); if (m.matches() && token.getLemma() != null) { lookup(token.getLemma(), tag, results, det); } } return removeExceptions(results.toArray(new String[0])); } return removeExceptions(synthesize(token, posTag)); } private void lookup(String lemma, String posTag, List results, String determiner) { List lookup = super.lookup(lemma, posTag); for (String result : lookup) { //results.add(determiner + StringTools.lowercaseFirstCharIfCapitalized(result)); //why lowercase? results.add(determiner + result); } } @Override protected boolean isException(String w) { // remove: 've, 's, 're... return w.startsWith("'") || exceptions.contains(w); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy