// opennlp.tools.lemmatizer.DictionaryLemmatizer Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.lemmatizer;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
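/**
* A {@link Lemmatizer} implementation that works by simple dictionary lookup into
* a {@link Map} built from a file containing, for each line:
*
* {@code word<TAB>postag<TAB>lemma}, where several alternative lemmas for the same
* word-postag pair may be listed separated by {@code #}.
*/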
public class DictionaryLemmatizer implements Lemmatizer {
/*
* The hashmap containing the dictionary.
*/
private final Map, List> dictMap = new HashMap<>();
/**
* Initializes a {@link DictionaryLemmatizer} and related {@link HashMap}
* from the input tab separated dictionary.
*
* The input file should have, for each line, {@code word\tabpostag\tablemma}.
* Alternatively, if multiple lemmas are possible for each word-postag pair,
* then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}.
*
* @param dictionaryStream The dictionary referenced by an open {@link InputStream}.
* @param charset The {@link Charset character encoding} of the dictionary.
*
* @throws IOException Thrown if IO errors occurred while reading in from
* {@code dictionaryStream}.
*/
public DictionaryLemmatizer(final InputStream dictionaryStream, Charset charset)
throws IOException {
init(dictionaryStream, charset);
}
/**
* Initializes a {@link DictionaryLemmatizer} and related {@link HashMap}
* from the input tab separated dictionary.
*
* The input file should have, for each line, {@code word\tabpostag\tablemma}.
* Alternatively, if multiple lemmas are possible for each word-postag pair,
* then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}.
*
* @param dictionaryStream The dictionary referenced by an open {@link InputStream}.
*
* @throws IOException Thrown if IO errors occurred while reading in from
* {@code dictionaryStream}.
*/
public DictionaryLemmatizer(final InputStream dictionaryStream) throws IOException {
this(dictionaryStream, StandardCharsets.UTF_8);
}
/**
* Initializes a {@link DictionaryLemmatizer} and related {@link HashMap}
* from the input tab separated dictionary.
*
* The input file should have, for each line, {@code word\tabpostag\tablemma}.
* Alternatively, if multiple lemmas are possible for each word-postag pair,
* then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}.
*
* @param dictionaryFile The dictionary referenced by a valid, readable {@link File}.
*
* @throws IOException Thrown if IO errors occurred while reading in from
* {@code dictionaryFile}.
*/
public DictionaryLemmatizer(File dictionaryFile) throws IOException {
this(dictionaryFile, StandardCharsets.UTF_8);
}
/**
* Initializes a {@link DictionaryLemmatizer} and related {@link HashMap}
* from the input tab separated dictionary.
*
* The input file should have, for each line, {@code word\tabpostag\tablemma}.
* Alternatively, if multiple lemmas are possible for each word-postag pair,
* then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}.
*
* @param dictionaryFile The dictionary referenced by a valid, readable {@link File}.
* @param charset The {@link Charset character encoding} of the dictionary.
*
* @throws IOException Thrown if IO errors occurred while reading in from
* {@code dictionaryFile}.
*/
public DictionaryLemmatizer(File dictionaryFile, Charset charset) throws IOException {
try (InputStream in = new BufferedInputStream(new FileInputStream(dictionaryFile))) {
init(in, charset);
}
}
/**
* Initializes a {@link DictionaryLemmatizer} and related {@link HashMap}
* from the input tab separated dictionary.
*
* The input file should have, for each line, {@code word\tabpostag\tablemma}.
* Alternatively, if multiple lemmas are possible for each word-postag pair,
* then the format should be {@code word\tab\postag\tablemma01#lemma02#lemma03}.
*
* @param dictionaryPath The dictionary referenced via a valid, readable {@link Path}.
*
* @throws IOException Thrown if IO errors occurred while reading in from
* {@code dictionaryPath}.
*/
public DictionaryLemmatizer(Path dictionaryPath) throws IOException {
init(Files.newInputStream(dictionaryPath), StandardCharsets.UTF_8);
}
private void init(InputStream dictionary, Charset charset) throws IOException {
final BufferedReader breader = new BufferedReader(
new InputStreamReader(dictionary, charset));
String line;
while ((line = breader.readLine()) != null) {
final String[] elems = line.split("\t");
final String[] lemmas = elems[2].split("#");
this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(lemmas));
}
}
/**
* @return Retrieves the {@link Map} containing the dictionary.
*/
public Map, List> getDictMap() {
return this.dictMap;
}
/**
* @param word The surface form word.
* @param postag The assigned postag.
*
* @return Retrieves the dictionary keys (word and postag).
*/
private List getDictKeys(final String word, final String postag) {
return new ArrayList<>(Arrays.asList(word.toLowerCase(), postag));
}
@Override
public String[] lemmatize(final String[] tokens, final String[] postags) {
List lemmas = new ArrayList<>();
for (int i = 0; i < tokens.length; i++) {
lemmas.add(this.lemmatize(tokens[i], postags[i]));
}
return lemmas.toArray(new String[0]);
}
@Override
public List> lemmatize(final List tokens, final List posTags) {
List> allLemmas = new ArrayList<>();
for (int i = 0; i < tokens.size(); i++) {
allLemmas.add(this.getAllLemmas(tokens.get(i), posTags.get(i)));
}
return allLemmas;
}
/**
* Lookup lemma in a dictionary. Outputs {@code "0"} if no lemma could be found
* for the specified {@code word}.
*
* @param word The token to look up the lemma for.
* @param postag The postag.
*
* @return The corresponding lemma, or {@code "0"} if no lemma for {@code word}
* could be found.
*/
private String lemmatize(final String word, final String postag) {
String lemma;
final List keys = this.getDictKeys(word, postag);
// lookup lemma as value of the map
final List keyValues = this.dictMap.get(keys);
if ( keyValues != null && !keyValues.isEmpty()) {
lemma = keyValues.get(0);
} else {
lemma = "O";
}
return lemma;
}
/**
* Lookup every lemma for a word,pos tag in a dictionary. Outputs {@code "0"} if no
* lemmas could be found for the specified {@code word}.
*
* @param word The token to look up the lemma for.
* @param postag The postag.
*
* @return A list of relevant lemmas.
*/
private List getAllLemmas(final String word, final String postag) {
List lemmasList = new ArrayList<>();
final List keys = this.getDictKeys(word, postag);
// lookup lemma as value of the map
final List keyValues = this.dictMap.get(keys);
if (keyValues != null && !keyValues.isEmpty()) {
lemmasList.addAll(keyValues);
} else {
lemmasList.add("O");
}
return lemmasList;
}
}