// net.java.sen.dictionary.Morpheme (Maven / Gradle / Ivy)
/*
* Copyright (C) 2002-2007
* Takashi Okamoto
* Matt Francis
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
package net.java.sen.dictionary;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* A class representing part-of-speech data for a morpheme. When created during
* the analysis of a string, a Morpheme is built as a lazy proxy onto the
* Dictionary's part-of-speech file. Once created, a {@code Morpheme} can
* be altered without changing the underlying data (if any).
*
* <p><b>Thread Safety:</b> Objects of this class are <b>NOT</b> thread safe and
* should not be accessed simultaneously by multiple threads.
*
* <p><b>CAUTION:</b> {@link Morpheme}s are implemented as lazy proxies onto a
* {@link Dictionary}, and care should be taken not to access the same
* {@link Dictionary} from multiple threads. Once all of a {@link Morpheme}'s
* data has been loaded (for example by reading its readings or
* pronunciations), it no longer reads from the {@link Dictionary} and this
* restriction is relaxed.
*/
public class Morpheme {

    /**
     * How much of the part-of-speech data has been loaded so far. The
     * declaration order is significant: {@code load} compares states with
     * {@code compareTo}, so every constant must cover everything covered by
     * the constants declared before it.
     */
    private enum LoadState {
        /** Nothing has been read from the dictionary yet */
        NONE,
        /** Part-of-speech, conjugational type/form and basic form are loaded */
        BASIC,
        /** Readings and pronunciations are loaded as well */
        FULL
    }

    /**
     * The number of part-of-speech hierarchy levels printed by
     * {@link #toString()}
     */
    private static final int PART_OF_SPEECH_LEVELS = 4;

    /**
     * The {@link Dictionary} that contains this Morpheme
     */
    private Dictionary dictionary;

    /**
     * The index of this part-of-speech within the part-of-speech information
     * file
     */
    private int partOfSpeechIndex;

    /**
     * Indicates how much of the part-of-speech data has been loaded
     */
    private LoadState loaded;

    /**
     * The conjugation type of the morpheme
     */
    private String conjugationalType = null;

    /**
     * The conjugation form of the morpheme
     */
    private String conjugationalForm = null;

    /**
     * The unconjugated form of the morpheme
     */
    private String basicForm = null;

    /**
     * The readings of the morpheme
     */
    private List<String> readings = null;

    /**
     * The pronunciations of the morpheme
     */
    private List<String> pronunciations = null;

    /**
     * The part-of-speech in Chasen format
     */
    private String partOfSpeech = null;

    /**
     * Arbitrary additional information
     */
    private String additionalInformation = null;

    /**
     * Loads part-of-speech data from the {@link Dictionary}, up to the
     * requested level. Does nothing if at least that much data has already
     * been loaded.
     *
     * <p>This repositions the buffer returned by
     * {@code dictionary.getPartOfSpeechInfoBuffer()}.
     * NOTE(review): presumably that buffer is shared by all morphemes of the
     * same dictionary, which is why concurrent use is unsafe - confirm
     * against {@link Dictionary}.
     *
     * @param requested The minimum level of data that must be available when
     *                  this method returns
     */
    private void load(LoadState requested) {
        if (loaded.compareTo(requested) >= 0) {
            return; // we already loaded what we need
        }

        ByteBuffer buffer = dictionary.getPartOfSpeechInfoBuffer();
        buffer.position(partOfSpeechIndex);
        char[] temp = new char[512];
        int length;

        if (loaded == LoadState.NONE) {
            // The first three values are indexes into the dictionary's string tables
            partOfSpeech = dictionary.posIndex[DictionaryUtil.readVInt(buffer)];
            conjugationalType = dictionary.conjTypeIndex[DictionaryUtil.readVInt(buffer)];
            conjugationalForm = dictionary.conjFormIndex[DictionaryUtil.readVInt(buffer)];
            length = DictionaryUtil.readVInt(buffer);
            if (length == 0) {
                basicForm = "*"; // basic form is the same as surface form
            } else {
                DictionaryUtil.readString(buffer, temp, 0, length);
                basicForm = new String(temp, 0, length);
            }
        } else {
            // we've already loaded the basic data, but not the full, just skip over bytes
            DictionaryUtil.readVInt(buffer); // POS
            DictionaryUtil.readVInt(buffer); // conjType
            DictionaryUtil.readVInt(buffer); // conjForm
            length = DictionaryUtil.readVInt(buffer); // length
            if (length != 0) {
                DictionaryUtil.readString(buffer, temp, 0, length); // basicForm
            }
        }
        // Recorded before the readings are parsed, so a failure below still
        // leaves the basic data marked as available
        loaded = LoadState.BASIC;

        if (requested == LoadState.FULL) {
            // Low bit: 0 = entries are read with readKatakana, 1 = with readString.
            // Remaining bits: the number of readings.
            int readingData = DictionaryUtil.readVInt(buffer);
            int numReadings = readingData >>> 1;
            readings = new ArrayList<String>(numReadings);
            pronunciations = new ArrayList<String>(numReadings);
            for (int i = 0; i < numReadings; i++) {
                // Low bit: 1 = a separate pronunciation follows the reading.
                // Remaining bits: the length of the reading.
                length = DictionaryUtil.readVInt(buffer);
                final int readingLength = length >>> 1;
                if ((readingData & 1) == 0) {
                    DictionaryUtil.readKatakana(buffer, temp, 0, readingLength);
                } else {
                    DictionaryUtil.readString(buffer, temp, 0, readingLength);
                }
                String reading = new String(temp, 0, readingLength);
                readings.add(reading);
                if ((length & 1) != 0) {
                    final int pronunciationLength = DictionaryUtil.readVInt(buffer);
                    if ((readingData & 1) == 0) {
                        DictionaryUtil.readKatakana(buffer, temp, 0, pronunciationLength);
                    } else {
                        DictionaryUtil.readString(buffer, temp, 0, pronunciationLength);
                    }
                    pronunciations.add(new String(temp, 0, pronunciationLength));
                } else {
                    // No separate pronunciation stored: it is the same as the reading
                    pronunciations.add(reading);
                }
            }
            loaded = LoadState.FULL;
        }
    }

    /**
     * Gets the conjugation type of the morpheme
     *
     * @return The conjugation type
     */
    public String getConjugationalType() {
        load(LoadState.BASIC);
        return conjugationalType;
    }

    /**
     * Gets the conjugation form of the morpheme
     *
     * @return The conjugation form
     */
    public String getConjugationalForm() {
        load(LoadState.BASIC);
        return conjugationalForm;
    }

    /**
     * Gets the unconjugated form of the morpheme
     *
     * @return The unconjugated form
     */
    public String getBasicForm() {
        load(LoadState.BASIC);
        return basicForm;
    }

    /**
     * Gets the readings of the morpheme
     *
     * @return The readings
     */
    public List<String> getReadings() {
        load(LoadState.FULL);
        return readings;
    }

    /**
     * Gets the pronunciations of the morpheme
     *
     * @return The pronunciations
     */
    public List<String> getPronunciations() {
        load(LoadState.FULL);
        return pronunciations;
    }

    /**
     * Gets the part-of-speech in Chasen format
     *
     * @return The part-of-speech in Chasen format
     */
    public String getPartOfSpeech() {
        load(LoadState.BASIC);
        return partOfSpeech;
    }

    /**
     * Gets the additional information string
     *
     * @return The additional information string
     */
    public String getAdditionalInformation() {
        return additionalInformation;
    }

    /**
     * Sets an arbitrary string of additional information
     *
     * @param additionalInformation The additional information to set
     */
    public void setAdditionalInformation(String additionalInformation) {
        this.additionalInformation = additionalInformation;
    }

    /**
     * Compares two possibly-null values
     *
     * @param first The first value
     * @param second The second value
     * @return <code>true</code> if both values are <code>null</code> or the
     *         values are equal
     */
    private static boolean equalOrBothNull(Object first, Object second) {
        return (first == second) || (first != null && first.equals(second));
    }

    /**
     * Compare two lists of strings
     *
     * @param list1 The first list
     * @param list2 The second list
     * @return <code>true</code> if the lists' contents are exactly equal
     */
    private static boolean stringListsEqual(List<String> list1, List<String> list2) {
        // List.equals compares size and then elements in order, and copes
        // with null elements
        return equalOrBothNull(list1, list2);
    }

    /**
     * Compares this Morpheme with another object. Two Morphemes are equal
     * when their part-of-speech, conjugational type and form, basic form,
     * readings, pronunciations and additional information are all equal.
     * Comparing may load data from the {@link Dictionary} for either
     * Morpheme.
     *
     * @param object The object to compare with
     * @return <code>true</code> if the object is an equal Morpheme
     */
    @Override
    public boolean equals(Object object) {
        if (this == object) {
            return true;
        }
        if (!(object instanceof Morpheme)) {
            return false;
        }
        Morpheme other = (Morpheme) object;

        // Cheap comparisons first; the lists need a full load and so come last
        return equalOrBothNull(getBasicForm(), other.getBasicForm())
                && equalOrBothNull(getConjugationalType(), other.getConjugationalType())
                && equalOrBothNull(getConjugationalForm(), other.getConjugationalForm())
                && equalOrBothNull(getPartOfSpeech(), other.getPartOfSpeech())
                && equalOrBothNull(getAdditionalInformation(), other.getAdditionalInformation())
                && stringListsEqual(getPronunciations(), other.getPronunciations())
                && stringListsEqual(getReadings(), other.getReadings());
    }

    /**
     * Computes a hash code consistent with {@link #equals(Object)}: equal
     * Morphemes always produce equal hash codes. The additional information
     * is deliberately left out because it can be changed through
     * {@link #setAdditionalInformation(String)}; leaving it out keeps the
     * hash code stable across such changes. Computing the hash may load data
     * from the {@link Dictionary}.
     *
     * @return The hash code
     */
    @Override
    public int hashCode() {
        int result = 17;
        result = 31 * result + hashOrZero(getBasicForm());
        result = 31 * result + hashOrZero(getConjugationalType());
        result = 31 * result + hashOrZero(getConjugationalForm());
        result = 31 * result + hashOrZero(getPartOfSpeech());
        result = 31 * result + hashOrZero(getPronunciations());
        result = 31 * result + hashOrZero(getReadings());
        return result;
    }

    /**
     * Gets the hash code of a possibly-null value
     *
     * @param value The value to hash
     * @return The value's hash code, or 0 if the value is <code>null</code>
     */
    private static int hashOrZero(Object value) {
        return (value == null) ? 0 : value.hashCode();
    }

    /**
     * Builds a comma separated description of the Morpheme: four
     * part-of-speech levels (missing levels are written as "*"), the
     * conjugational type, the conjugational form, the basic form, the first
     * reading and the first pronunciation. A missing reading or
     * pronunciation is written as "null".
     *
     * @return The description
     */
    @Override
    public String toString() {
        load(LoadState.FULL);

        StringBuilder partOfSpeechData = new StringBuilder();

        // A blank Morpheme has no part-of-speech; treat every level as missing
        // instead of failing with a NullPointerException
        String[] parts = (partOfSpeech == null) ? new String[0] : partOfSpeech.split("-");
        for (int i = 0; i < PART_OF_SPEECH_LEVELS; i++) {
            if (i < parts.length) {
                partOfSpeechData.append(parts[i]);
            } else {
                partOfSpeechData.append("*");
            }
            partOfSpeechData.append(",");
        }

        partOfSpeechData.append(conjugationalType);
        partOfSpeechData.append(",");
        partOfSpeechData.append(conjugationalForm);
        partOfSpeechData.append(",");
        partOfSpeechData.append(basicForm);
        partOfSpeechData.append(",");
        partOfSpeechData.append((readings.size() > 0) ? readings.get(0) : "null");
        partOfSpeechData.append(",");
        partOfSpeechData.append((pronunciations.size() > 0) ? pronunciations.get(0) : "null");

        return partOfSpeechData.toString();
    }

    /**
     * Copies an array of strings into a new, modifiable list
     *
     * @param values The strings to copy; may be <code>null</code>
     * @return A new list holding the strings, empty if the array was
     *         <code>null</code>
     */
    private static List<String> toList(String[] values) {
        if (values == null) {
            return new ArrayList<String>();
        }
        return new ArrayList<String>(Arrays.asList(values));
    }

    /**
     * Builds a lazy proxy onto a part-of-speech stored in a Dictionary
     *
     * @param dictionary The dictionary to proxy upon
     * @param partOfSpeechIndex The index into the part-of-speech file
     */
    public Morpheme(Dictionary dictionary, int partOfSpeechIndex) {
        this.dictionary = dictionary;
        this.partOfSpeechIndex = partOfSpeechIndex;
        this.loaded = LoadState.NONE;
    }

    /**
     * Creates a literal <code>Morpheme</code> that does not link to any
     * <code>Dictionary</code>
     *
     * @param partOfSpeech The Chasen-format part-of-speech
     * @param conjugationalType The conjugational type
     * @param conjugationalForm The conjugational form
     * @param basicForm The unconjugated form
     * @param readings The readings; <code>null</code> is treated as no readings
     * @param pronunciations The pronunciations; <code>null</code> is treated
     *                       as no pronunciations
     * @param additionalInformation Arbitrary additional information
     */
    public Morpheme(String partOfSpeech, String conjugationalType, String conjugationalForm, String basicForm, String[] readings, String[] pronunciations, String additionalInformation) {
        this.basicForm = basicForm;
        this.conjugationalType = conjugationalType;
        this.conjugationalForm = conjugationalForm;
        this.readings = toList(readings);
        this.pronunciations = toList(pronunciations);
        this.partOfSpeech = partOfSpeech;
        this.additionalInformation = additionalInformation;
        // Everything is already in memory, so load() never touches a dictionary
        this.loaded = LoadState.FULL;
    }

    /**
     * Creates a blank <code>Morpheme</code> that does not link to any
     * <code>Dictionary</code>. Its readings and pronunciations are empty and
     * every other property is <code>null</code>.
     */
    public Morpheme() {
        this.readings = Collections.emptyList();
        this.pronunciations = Collections.emptyList();
        this.loaded = LoadState.FULL;
    }
}