All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.java.sen.dictionary.Morpheme Maven / Gradle / Ivy

/*
 * Copyright (C) 2002-2007
 * Takashi Okamoto 
 * Matt Francis 
 * 
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or any later version.
 * 
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 * 
 */

package net.java.sen.dictionary;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * A class representing part-of-speech data for a morpheme. When created during
 * the analysis of a string, a Morpheme is built as a lazy proxy onto the
 * Dictionary's part-of-speech file. Once created, a Morpheme can
 * be altered without changing the underlying data (if any)
 * 
 *  

Thread Safety: Objects of this class are NOT thread safe and * should not be accessed simultaneously by multiple threads. * *

CAUTION: {@link Morpheme}s are implemented as lazy proxies onto a * {@link Dictionary}, and care should be taken not to access the same * {@link Dictionary} from multiple threads. Once any member of a * {@link Morpheme} has been read, its link to the {@link Dictionary} is broken * and this restriction is relaxed */ public class Morpheme { /** * The {@link Dictionary} that contains this Morpheme */ private Dictionary dictionary; /** * The index of this part-of-speech within the part-of-speech information * file */ private int partOfSpeechIndex; /** * Indicates if the part-of-speech data has been loaded */ private LoadState loaded; private static enum LoadState { NONE, BASIC, FULL } /** * The conjugation type of the morpheme */ private String conjugationalType = null; /** * The conjugation form of the morpheme */ private String conjugationalForm = null; /** * The unconjugated form of the morpheme */ private String basicForm = null; /** * The readings of the morpheme */ private List readings = null; /** * The pronunciations of the morpheme */ private List pronunciations = null; /** * The part-of-speech in Chasen format */ private String partOfSpeech = null; /** * Arbitrary additional information */ private String additionalInformation = null; /** * Loads the part-of-speech data from the {@link Dictionary} * */ private void load(LoadState requested) { if (loaded.compareTo(requested) >= 0) { return; // we already loaded what we need } ByteBuffer buffer = dictionary.getPartOfSpeechInfoBuffer(); buffer.position(partOfSpeechIndex); char[] temp = new char[512]; int length; if (loaded == LoadState.NONE) { partOfSpeech = dictionary.posIndex[DictionaryUtil.readVInt(buffer)]; conjugationalType = dictionary.conjTypeIndex[DictionaryUtil.readVInt(buffer)]; conjugationalForm = dictionary.conjFormIndex[DictionaryUtil.readVInt(buffer)]; length = DictionaryUtil.readVInt(buffer); if (length == 0) { basicForm = "*"; // basic form is the same as surface form } else { DictionaryUtil.readString(buffer, temp, 0, length); basicForm = new String(temp, 0, length); } } else { // we've already loaded the basic data, but not the full, just skip over bytes DictionaryUtil.readVInt(buffer); // POS DictionaryUtil.readVInt(buffer); // conjType DictionaryUtil.readVInt(buffer); // conjForm length = DictionaryUtil.readVInt(buffer); // length if (length != 0) DictionaryUtil.readString(buffer, temp, 0, length); // basicForm } loaded = LoadState.BASIC; if (requested == LoadState.FULL) { int readingData = DictionaryUtil.readVInt(buffer); int numReadings = readingData >>> 1; readings = new ArrayList(numReadings); pronunciations = new ArrayList(numReadings); for (int i = 0; i < numReadings; i++) { length = DictionaryUtil.readVInt(buffer); final int readingLength = length >>> 1; if ((readingData & 1) == 0) { DictionaryUtil.readKatakana(buffer, temp, 0, readingLength); } else { DictionaryUtil.readString(buffer, temp, 0, readingLength); } String reading = new String(temp, 0, readingLength); readings.add(reading); if ((length & 1) != 0) { final int pronunciationLength = DictionaryUtil.readVInt(buffer); if ((readingData & 1) == 0) { DictionaryUtil.readKatakana(buffer, temp, 0, pronunciationLength); } else { DictionaryUtil.readString(buffer, temp, 0, pronunciationLength); } pronunciations.add(new String(temp, 0, pronunciationLength)); } else { pronunciations.add(reading); } } loaded = LoadState.FULL; } } /** * Gets the conjugation type of the morpheme * * @return The conjugation type */ public String getConjugationalType() { load(LoadState.BASIC); return conjugationalType; } /** * Gets the conjugation form of the morpheme * * @return The conjugation form */ public String getConjugationalForm() { load(LoadState.BASIC); return conjugationalForm; } /** * Gets the unconjugated form of the morpheme * * @return The unconjugated form */ public String getBasicForm() { load(LoadState.BASIC); return basicForm; } /** * Gets the readings of the morpheme * * @return The readings */ public List getReadings() { load(LoadState.FULL); return readings; } /** * Gets the pronunciations of the morpheme * * @return The pronunciations */ public List getPronunciations() { load(LoadState.FULL); return pronunciations; } /** * Gets the part-of-speech in Chasen format * * @return The part-of-speech in Chasen format */ public String getPartOfSpeech() { load(LoadState.BASIC); return partOfSpeech; } /** * Gets the additional information string * * @return The additional information string */ public String getAdditionalInformation() { return additionalInformation; } /** * Sets an arbitrary string of additional information * * @param additionalInformation The additional information to set */ public void setAdditionalInformation(String additionalInformation) { this.additionalInformation = additionalInformation; } /** * Compare two lists of strings * * @param list1 The first list * @param list2 The second list * @return true if the lists' contents are exactly equal */ private boolean stringListsEqual(List list1, List list2) { if (list1 == list2) { return true; } if ((list1 == null) || (list2 == null)) { return false; } if (list1.size() != list2.size()) { return false; } for (int i = 0; i < list1.size(); i++) { if (!list1.get(i).equals(list2.get(i))) { return false; } } return true; } @Override public boolean equals(Object object) { if (object instanceof Morpheme) { Morpheme morpheme = (Morpheme) object; String partOfSpeech = this.getPartOfSpeech(); String conjugationalType = this.getConjugationalType(); String conjugationalForm = this.getConjugationalForm(); String basicForm = this.getBasicForm(); List pronunciations = this.getPronunciations(); List readings = this.getReadings(); String additionalInformation = this.getAdditionalInformation(); String otherPartOfSpeech = morpheme.getPartOfSpeech(); String otherConjugationalType = morpheme.getConjugationalType(); String otherConjugationalForm = morpheme.getConjugationalForm(); String otherBasicForm = morpheme.getBasicForm(); List otherPronunciations = morpheme.getPronunciations(); List otherReadings = morpheme.getReadings(); String otherAdditionalInformation = morpheme.getAdditionalInformation(); if ( ((basicForm == otherBasicForm) || (basicForm != null && basicForm.equals(otherBasicForm))) && ((conjugationalType == otherConjugationalType) || (conjugationalType != null && conjugationalType.equals(otherConjugationalType))) && ((conjugationalForm == otherConjugationalForm) || (conjugationalForm != null && conjugationalForm.equals(otherConjugationalForm))) && ((partOfSpeech == otherPartOfSpeech) || (partOfSpeech != null && partOfSpeech.equals(otherPartOfSpeech))) && (stringListsEqual(pronunciations, otherPronunciations)) && (stringListsEqual(readings, otherReadings)) && ((additionalInformation == otherAdditionalInformation) || (additionalInformation != null && additionalInformation.equals(otherAdditionalInformation))) ) { return true; } } return false; } @Override public String toString() { load(LoadState.FULL); StringBuilder partOfSpeechData = new StringBuilder(); String[] parts = partOfSpeech.split("-"); for (int i = 0; i < 4; i++) { if (i < parts.length) { partOfSpeechData.append(parts[i]); } else { partOfSpeechData.append("*"); } partOfSpeechData.append(","); } partOfSpeechData.append(conjugationalType); partOfSpeechData.append(","); partOfSpeechData.append(conjugationalForm); partOfSpeechData.append(","); partOfSpeechData.append(basicForm); partOfSpeechData.append(","); partOfSpeechData.append((readings.size() > 0) ? readings.get(0) : "null"); partOfSpeechData.append(","); partOfSpeechData.append((pronunciations.size() > 0) ? pronunciations.get(0) : "null"); return partOfSpeechData.toString(); } /** * Builds a lazy proxy onto a part-of-speech stored in a Dictionary * * @param dictionary The dicationary to proxy upon * @param partOfSpeechIndex The index into the part-of-speech file */ public Morpheme(Dictionary dictionary, int partOfSpeechIndex) { this.dictionary = dictionary; this.partOfSpeechIndex = partOfSpeechIndex; this.loaded = LoadState.NONE; } /** * Creates a literal Morpheme that does not link to any * Dictionary * @param partOfSpeech The Chasen-format part-of-speech * @param conjugationalType The conjugational type * @param conjugationalForm The conjugational form * @param basicForm The unconjugated form * @param readings The readings * @param pronunciations The pronunciations * @param additionalInformation Arbitrary additional information */ public Morpheme(String partOfSpeech, String conjugationalType, String conjugationalForm, String basicForm, String[] readings, String[] pronunciations, String additionalInformation) { this.basicForm = basicForm; this.conjugationalType = conjugationalType; this.conjugationalForm = conjugationalForm; this.readings = new ArrayList(Arrays.asList(readings)); this.pronunciations = new ArrayList(Arrays.asList(pronunciations)); this.partOfSpeech = partOfSpeech; this.additionalInformation = additionalInformation; this.loaded = LoadState.FULL; } /** * Creates a blank, modifiable Morpheme that does not link * to any Dictionary */ public Morpheme() { this.readings = Collections.emptyList(); this.pronunciations = Collections.emptyList(); this.loaded = LoadState.FULL; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy