smile.nlp.pos.EnglishPOSLexicon Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.nlp.pos;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* An English lexicon with part-of-speech tags.
*
* @author Haifeng Li
*/
public class EnglishPOSLexicon {
private static final Logger logger = LoggerFactory.getLogger(EnglishPOSLexicon.class);
/**
* A list of English words with POS tags.
*/
private static final HashMap dict = new HashMap<>();
/**
* The part-of-speech.txt file contains is a combination of
* "Moby (tm) Part-of-Speech II" and the WordNet database.
*
* The latest version can be found at http://aspell.sourceforge.net/wl/.
*
* The format of each entry is
*
*
* Where the POS tag is one or more of the following:
*
* N Noun
* P Plural
* h Noun Phrase
* V Verb (usu participle)
* t Verb (transitive)
* i Verb (intransitive)
* A Adjective
* v Adverb
* C Conjunction
* P Preposition
* ! Interjection
* r Pronoun
* D Definite Article
* I Indefinite Article
* o Nominative
*
* The parts of speech before any '|' (if at all) come from the original
* Moby database. Anything after the '|' comes from the WordNet
* database. The part of speech tags from the original Moby database are
* in priority order where the principle usage is listed first. The ones
* from the WordNet database are not. Entries from the moby database
* have had any accents removed, the "ae" character expanded, the
* pound, yen, and peseta sign converted to a '$' and the CP437
* character 0xBE replaced with a '~'.
*/
static {
try (BufferedReader input = new BufferedReader(new InputStreamReader(EnglishPOSLexicon.class.getResourceAsStream("/smile/nlp/pos/part-of-speech_en.txt")))) {
String line = null;
while ((line = input.readLine()) != null) {
String[] pos = line.trim().split("\t");
if (pos.length == 2) {
int len = pos[1].length();
if (pos[1].indexOf('|') != -1) {
len -= 1;
}
PennTreebankPOS[] tag = new PennTreebankPOS[len];
for (int i = 0, k = 0; i < pos[1].length(); i++) {
switch (pos[1].charAt(i)) {
case 'N':
tag[k++] = PennTreebankPOS.NN;
break;
case 'p':
tag[k++] = PennTreebankPOS.NNS;
break;
case 'h':
tag[k++] = PennTreebankPOS.NN;
break;
case 'V':
case 't':
case 'i':
tag[k++] = PennTreebankPOS.VB;
break;
case 'A':
tag[k++] = PennTreebankPOS.JJ;
break;
case 'v':
tag[k++] = PennTreebankPOS.RB;
break;
case 'C':
tag[k++] = PennTreebankPOS.CC;
break;
case 'P':
tag[k++] = PennTreebankPOS.IN;
break;
case '!':
tag[k++] = PennTreebankPOS.UH;
break;
case 'r':
tag[k++] = PennTreebankPOS.PRP;
break;
case 'D':
case 'I':
tag[k++] = PennTreebankPOS.DT;
break;
case 'o':
tag[k++] = PennTreebankPOS.NN;
break;
}
}
dict.put(pos[0], tag);
}
}
} catch (IOException ex) {
logger.error("Failed to load /smile/nlp/pos/part-of-speech_en.txt", ex);
}
}
/**
* Returns part-of-speech tags for given word, or null if the word does
* not exist in the dictionary.
*/
public static PennTreebankPOS[] get(String word) {
return dict.get(word);
}
}