All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.formats.ContractionUtility Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * Utility class to handle Portuguese contractions.
 * <p>
 * Some corpora split contractions into their parts, for example {@code "da"}
 * becomes {@code "de"} + {@code "a"}. Depending on the phase of language
 * processing (NER, for instance) it cannot be decided up front whether a
 * contraction should be split or not, especially because contractions inside
 * names are not separated, while those outside names are. This class merges
 * the two parts back into the contracted form.
 * <p>
 * Note: Do not use this class, internal use only!
 */
public class ContractionUtility {

  /**
   * Immutable lookup table. Keys have the form {@code left + "+" + right}
   * (both lower case); values are the contracted form.
   */
  private static final Map<String, String> CONTRACTIONS;

  static {
    Map<String, String> elems = new HashMap<>();

    // a + ...
    elems.put("a+a", "\u00e0");
    elems.put("a+as", "\u00e0s");
    elems.put("a+aquele", "\u00e0quele");
    elems.put("a+aqueles", "\u00e0queles");
    elems.put("a+aquela", "\u00e0quela");
    elems.put("a+aquelas", "\u00e0quelas");
    elems.put("a+aquilo", "\u00e0quilo");
    elems.put("a+o", "ao");
    elems.put("a+os", "aos");

    // com + ...
    elems.put("com+mim", "comigo");
    // Historical keys spelled with a grave accent (U+00F2); kept so existing
    // callers keep working.
    elems.put("com+n\u00f2s", "conosco");
    elems.put("com+v\u00f2s", "convosco");
    // Correct Portuguese spelling uses an acute accent (U+00F3): "nós", "vós".
    elems.put("com+n\u00f3s", "conosco");
    elems.put("com+v\u00f3s", "convosco");
    elems.put("com+si", "consigo");
    elems.put("com+ti", "contigo");

    // de + ...
    elems.put("de+a\u00ed", "da\u00ed");
    elems.put("de+algu\u00e9m", "dalgu\u00e9m");
    elems.put("de+algum", "dalgum");
    elems.put("de+alguma", "dalguma");
    elems.put("de+alguns", "dalguns");
    elems.put("de+algumas", "dalgumas");
    elems.put("de+ali", "dali");
    elems.put("de+aqu\u00e9m", "daqu\u00e9m");
    elems.put("de+aquele", "daquele");
    elems.put("de+aquela", "daquela");
    elems.put("de+aqueles", "daqueles");
    elems.put("de+aquelas", "daquelas");
    elems.put("de+aqui", "daqui");
    elems.put("de+aquilo", "daquilo");
    elems.put("de+ele", "dele");
    elems.put("de+ela", "dela");
    elems.put("de+eles", "deles");
    elems.put("de+elas", "delas");
    elems.put("de+entre", "dentre");
    elems.put("de+esse", "desse");
    elems.put("de+essa", "dessa");
    elems.put("de+esses", "desses");
    elems.put("de+essas", "dessas");
    elems.put("de+este", "deste");
    elems.put("de+esta", "desta");
    elems.put("de+estes", "destes");
    elems.put("de+estas", "destas");
    elems.put("de+isso", "disso");
    elems.put("de+isto", "disto");
    elems.put("de+o", "do");
    elems.put("de+a", "da");
    elems.put("de+os", "dos");
    elems.put("de+as", "das");
    elems.put("de+outrem", "doutrem");
    elems.put("de+outro", "doutro");
    elems.put("de+outra", "doutra");
    elems.put("de+outros", "doutros");
    elems.put("de+outras", "doutras");
    elems.put("de+um", "dum");
    elems.put("de+uma", "duma");
    elems.put("de+uns", "duns");
    elems.put("de+umas", "dumas");

    // demonstrative + outro/outra
    elems.put("esse+outro", "essoutro");
    elems.put("essa+outra", "essoutra");
    elems.put("este+outro", "estoutro");
    // Historical key (masculine "este" with feminine "outra"); kept for
    // backward compatibility.
    elems.put("este+outra", "estoutra");
    // Grammatically matching feminine form.
    elems.put("esta+outra", "estoutra");

    // ele + ...
    elems.put("ele+o", "lho");
    elems.put("ele+a", "lha");
    elems.put("ele+os", "lhos");
    elems.put("ele+as", "lhas");

    // em + ...
    elems.put("em+algum", "nalgum");
    elems.put("em+alguma", "nalguma");
    elems.put("em+alguns", "nalguns");
    elems.put("em+algumas", "nalgumas");
    elems.put("em+aquele", "naquele");
    elems.put("em+aquela", "naquela");
    elems.put("em+aqueles", "naqueles");
    elems.put("em+aquelas", "naquelas");
    elems.put("em+aquilo", "naquilo");
    elems.put("em+ele", "nele");
    elems.put("em+ela", "nela");
    elems.put("em+eles", "neles");
    elems.put("em+elas", "nelas");
    elems.put("em+esse", "nesse");
    elems.put("em+essa", "nessa");
    elems.put("em+esses", "nesses");
    elems.put("em+essas", "nessas");
    elems.put("em+este", "neste");
    elems.put("em+esta", "nesta");
    elems.put("em+estes", "nestes");
    elems.put("em+estas", "nestas");
    elems.put("em+isso", "nisso");
    elems.put("em+isto", "nisto");
    elems.put("em+o", "no");
    elems.put("em+a", "na");
    elems.put("em+os", "nos");
    elems.put("em+as", "nas");
    elems.put("em+outro", "noutro");
    elems.put("em+outra", "noutra");
    elems.put("em+outros", "noutros");
    elems.put("em+outras", "noutras");
    elems.put("em+um", "num");
    elems.put("em+uma", "numa");
    elems.put("em+uns", "nuns");
    elems.put("em+umas", "numas");

    // por + ...
    elems.put("por+o", "pelo");
    elems.put("por+a", "pela");
    elems.put("por+os", "pelos");
    elems.put("por+as", "pelas");

    // para + ...
    elems.put("para+a", "pra");
    elems.put("para+o", "pro");
    elems.put("para+as", "pras");
    elems.put("para+os", "pros");

    CONTRACTIONS = Collections.unmodifiableMap(elems);
  }

  /**
   * Merges a contraction.
   * <p>
   * The lookup is attempted in this order; the first hit wins:
   * <ol>
   * <li>{@code left} and {@code right} exactly as given;</li>
   * <li>the last {@code '_'}-separated piece of {@code left} with
   * {@code right}; the preceding pieces are emitted as space-separated
   * words in front of the contraction;</li>
   * <li>if {@code right} contains {@code '_'}: the whole {@code left} with
   * the first piece of {@code right}; the contraction and every remaining
   * piece of {@code right} are each followed by a single space, so the
   * result of this case ends with a trailing space;</li>
   * <li>the lower-cased last piece of {@code left} with {@code right}; the
   * first character of the contraction is upper-cased in the result.</li>
   * </ol>
   *
   * @param left
   *          the left component
   * @param right
   *          the right component
   * @return the merged contraction, or {@code null} if either argument is
   *         {@code null} or no contraction is known for the pair
   */
  public static String toContraction(String left, String right) {
    if (left == null || right == null) {
      return null;
    }

    // 1. Exact match.
    String contraction = CONTRACTIONS.get(left + "+" + right);
    if (contraction != null) {
      return contraction;
    }

    // String.split drops trailing empty strings, so an input consisting only
    // of underscores yields an empty array; there is nothing to contract then.
    String[] leftParts = left.split("_");
    if (leftParts.length == 0) {
      return null;
    }
    String lastLeft = leftParts[leftParts.length - 1];

    // Leading words of a multi-word left component are passed through.
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < leftParts.length - 1; i++) {
      sb.append(leftParts[i]).append(' ');
    }

    // 2. Last word of left + right.
    contraction = CONTRACTIONS.get(lastLeft + "+" + right);
    if (contraction != null) {
      return sb.append(contraction).toString();
    }

    // 3. Left + first word of a multi-word right component.
    if (right.contains("_")) {
      String[] rightParts = right.split("_");
      if (rightParts.length > 0) {
        contraction = CONTRACTIONS.get(left + "+" + rightParts[0]);
        if (contraction != null) {
          sb.append(contraction).append(' ');
          for (int i = 1; i < rightParts.length; i++) {
            sb.append(rightParts[i]).append(' ');
          }
          return sb.toString();
        }
      }
    }

    // 4. Case-insensitive match on the last word of left (e.g. at the start
    // of a sentence); the contraction is capitalized in the result.
    contraction = CONTRACTIONS.get(lastLeft.toLowerCase() + "+" + right);
    if (contraction != null) {
      sb.append(contraction.substring(0, 1).toUpperCase());
      sb.append(contraction.substring(1));
      return sb.toString();
    }

    return null;
  }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy