All Downloads are FREE. The search and download functionality uses the official Maven repository.

opennlp.tools.lemmatizer.DictionaryLemmatizer Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.lemmatizer;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * A {@link Lemmatizer} implementation that works by simple dictionary lookup into
 * a {@link Map} built from a file containing, for each line:
 * 

* {@code word\tabpostag\tablemma}. */ public class DictionaryLemmatizer implements Lemmatizer { /* * The hashmap containing the dictionary. */ private final Map, List> dictMap = new HashMap<>(); /** * Initializes a {@link DictionaryLemmatizer} and related {@link HashMap} * from the input tab separated dictionary. *

* The input file should have, for each line, {@code word\tabpostag\tablemma}. * Alternatively, if multiple lemmas are possible for each word-postag pair, * then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}. * * @param dictionaryStream The dictionary referenced by an open {@link InputStream}. * @param charset The {@link Charset character encoding} of the dictionary. * * @throws IOException Thrown if IO errors occurred while reading in from * {@code dictionaryStream}. */ public DictionaryLemmatizer(final InputStream dictionaryStream, Charset charset) throws IOException { init(dictionaryStream, charset); } /** * Initializes a {@link DictionaryLemmatizer} and related {@link HashMap} * from the input tab separated dictionary. *

* The input file should have, for each line, {@code word\tabpostag\tablemma}. * Alternatively, if multiple lemmas are possible for each word-postag pair, * then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}. * * @param dictionaryStream The dictionary referenced by an open {@link InputStream}. * * @throws IOException Thrown if IO errors occurred while reading in from * {@code dictionaryStream}. */ public DictionaryLemmatizer(final InputStream dictionaryStream) throws IOException { this(dictionaryStream, StandardCharsets.UTF_8); } /** * Initializes a {@link DictionaryLemmatizer} and related {@link HashMap} * from the input tab separated dictionary. *

* The input file should have, for each line, {@code word\tabpostag\tablemma}. * Alternatively, if multiple lemmas are possible for each word-postag pair, * then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}. * * @param dictionaryFile The dictionary referenced by a valid, readable {@link File}. * * @throws IOException Thrown if IO errors occurred while reading in from * {@code dictionaryFile}. */ public DictionaryLemmatizer(File dictionaryFile) throws IOException { this(dictionaryFile, StandardCharsets.UTF_8); } /** * Initializes a {@link DictionaryLemmatizer} and related {@link HashMap} * from the input tab separated dictionary. *

* The input file should have, for each line, {@code word\tabpostag\tablemma}. * Alternatively, if multiple lemmas are possible for each word-postag pair, * then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}. * * @param dictionaryFile The dictionary referenced by a valid, readable {@link File}. * @param charset The {@link Charset character encoding} of the dictionary. * * @throws IOException Thrown if IO errors occurred while reading in from * {@code dictionaryFile}. */ public DictionaryLemmatizer(File dictionaryFile, Charset charset) throws IOException { try (InputStream in = new BufferedInputStream(new FileInputStream(dictionaryFile))) { init(in, charset); } } /** * Initializes a {@link DictionaryLemmatizer} and related {@link HashMap} * from the input tab separated dictionary. *

* The input file should have, for each line, {@code word\tabpostag\tablemma}. * Alternatively, if multiple lemmas are possible for each word-postag pair, * then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}. * * @param dictionaryPath The dictionary referenced via a valid, readable {@link Path}. * * @throws IOException Thrown if IO errors occurred while reading in from * {@code dictionaryPath}. */ public DictionaryLemmatizer(Path dictionaryPath) throws IOException { init(Files.newInputStream(dictionaryPath), StandardCharsets.UTF_8); } private void init(InputStream dictionary, Charset charset) throws IOException { final BufferedReader breader = new BufferedReader( new InputStreamReader(dictionary, charset)); String line; while ((line = breader.readLine()) != null) { final String[] elems = line.split("\t"); final String[] lemmas = elems[2].split("#"); this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(lemmas)); } } /** * @return Retrieves the {@link Map} containing the dictionary. */ public Map, List> getDictMap() { return this.dictMap; } /** * @param word The surface form word. * @param postag The assigned postag. * * @return Retrieves the dictionary keys (word and postag). */ private List getDictKeys(final String word, final String postag) { return new ArrayList<>(Arrays.asList(word.toLowerCase(), postag)); } @Override public String[] lemmatize(final String[] tokens, final String[] postags) { List lemmas = new ArrayList<>(); for (int i = 0; i < tokens.length; i++) { lemmas.add(this.lemmatize(tokens[i], postags[i])); } return lemmas.toArray(new String[0]); } @Override public List> lemmatize(final List tokens, final List posTags) { List> allLemmas = new ArrayList<>(); for (int i = 0; i < tokens.size(); i++) { allLemmas.add(this.getAllLemmas(tokens.get(i), posTags.get(i))); } return allLemmas; } /** * Lookup lemma in a dictionary. Outputs {@code "0"} if no lemma could be found * for the specified {@code word}. 
* * @param word The token to look up the lemma for. * @param postag The postag. * * @return The corresponding lemma, or {@code "0"} if no lemma for {@code word} * could be found. */ private String lemmatize(final String word, final String postag) { String lemma; final List keys = this.getDictKeys(word, postag); // lookup lemma as value of the map final List keyValues = this.dictMap.get(keys); if ( keyValues != null && !keyValues.isEmpty()) { lemma = keyValues.get(0); } else { lemma = "O"; } return lemma; } /** * Lookup every lemma for a word,pos tag in a dictionary. Outputs {@code "0"} if no * lemmas could be found for the specified {@code word}. * * @param word The token to look up the lemma for. * @param postag The postag. * * @return A list of relevant lemmas. */ private List getAllLemmas(final String word, final String postag) { List lemmasList = new ArrayList<>(); final List keys = this.getDictKeys(word, postag); // lookup lemma as value of the map final List keyValues = this.dictMap.get(keys); if (keyValues != null && !keyValues.isEmpty()) { lemmasList.addAll(keyValues); } else { lemmasList.add("O"); } return lemmasList; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy