// com.github.steveash.jg2p.align.Alignment Maven / Gradle / Ivy
// The newest version!
/*
* Copyright 2014 Steve Ash
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.steveash.jg2p.align;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.github.steveash.jg2p.Word;
import com.github.steveash.jg2p.syll.SWord;
import com.github.steveash.jg2p.syll.SyllTagTrainer;
import com.github.steveash.jg2p.syllchain.SyllChainTrainer;
import com.github.steveash.jg2p.util.Funcs;
import com.github.steveash.jg2p.util.Zipper;
import org.apache.commons.lang3.tuple.Pair;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import javax.annotation.Nullable;
import static com.google.common.collect.Iterables.filter;
import static com.google.common.collect.Iterables.transform;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
/**
* Represents one alignment from X to Y. In the case of running alignment between X to Y the x represents the grapheme
* and the y represents the phoneme from the training example. In the inference case, only the X side will be populated
* and the Y side will be null
*
* @author Steve Ash
*/
public class Alignment implements Iterable>, Comparable {
private static final Joiner pipeJoiner = Joiner.on('|');
private static final Splitter spaceSplit = Splitter.on(' ');
private static final Function, String> SELECT_LEFT = Funcs.selectLeft();
private static final Function, String> SELECT_RIGHT = Funcs.selectRight();
private final List> graphones; // the pair of grapheme + phoneme
private final double score;
private final Word input;
// these are only used when we're dealing with syllable words
private final List graphoneSyllableGrams;
private final SWord syllWord;
private final Set graphemeSyllStarts;
public Alignment(Word input, double score) {
this(input, Lists.>newArrayList(), score);
}
public Alignment(Word input, List> finalList, double score) {
this(input, finalList, score, null, null);
}
public Alignment(Word input, List> finalList, double score, List graphoneSyllableGrams,
SWord syllWord) {
this(input, finalList, score, graphoneSyllableGrams, syllWord, null);
}
private Alignment(Word input,
List> finalList,
double score,
List graphoneSyllableGrams,
SWord syllWord,
Set graphemeSyllStarts) {
this.input = input;
this.graphones = finalList;
this.score = score;
this.graphoneSyllableGrams = graphoneSyllableGrams;
this.syllWord = syllWord;
this.graphemeSyllStarts = graphemeSyllStarts;
}
public List> getGraphones() {
return graphones;
}
public Iterable, List>> getGraphonesSplit() {
return Iterables.transform(graphones, splitBoth);
}
// one entry per X graphone with grams for Onset, Nucleus, and Coda (O,N,C). Thus graphSyllGrams.size
// == graphones.size and for each entry in the graphone and graphoneSyllGrams -- the gram count is equal
@Nullable
public List getGraphoneSyllableGrams() {
if (graphoneSyllableGrams != null) {
return graphoneSyllableGrams;
}
if (syllWord != null) {
return SyllTagTrainer.makeOncGramsFromTraining(this);
}
return null;
}
public SWord getSyllWord() {
return syllWord;
}
public Word getInputWord() {
return input;
}
public Set getGraphemeSyllStarts() {
if (this.graphemeSyllStarts != null) {
return this.graphemeSyllStarts;
}
if (syllWord != null) {
return SyllChainTrainer.splitGraphsByPhoneSylls(this);
}
return null;
}
void append(String xGram, String yGram) {
graphones.add(Pair.of(xGram, yGram));
}
Alignment finish() {
return new Alignment(input, Lists.reverse(this.graphones), score);
}
public Alignment withReplacedYs(Iterable newYs) {
return new Alignment(input, Zipper.replaceRight(this.graphones, newYs), score);
}
@Override
public Iterator> iterator() {
return graphones.iterator();
}
public double getScore() {
return score;
}
public Iterable getXTokens() {
return makeGrams(SELECT_LEFT);
}
public List getAllXTokensAsList() {
return Lists.newArrayList(transform(graphones, SELECT_LEFT));
}
public Iterable getYTokens() {
return makeGrams(SELECT_RIGHT);
}
public List getAllYTokensAsList() {
return Lists.newArrayList(transform(graphones, SELECT_RIGHT));
}
private Iterable makeGrams(Function, String> selector) {
return filter(transform(graphones, selector), Funcs.onlyNonBlank());
}
@Override
public String toString() {
return getXAsPipeString() + " -> " +
getYAsPipeString() +
(graphoneSyllableGrams == null ? "" : " /" + pipeJoiner.join(graphoneSyllableGrams) + "/") +
String.format(" (score %.4f)", score);
}
public String getYAsPipeString() {
return pipeJoiner.join(transform(graphones, SELECT_RIGHT));
}
public String getXAsPipeString() {
return pipeJoiner.join(transform(graphones, SELECT_LEFT));
}
public String getAsPipeString(Iterable symbols) {
return pipeJoiner.join(symbols);
}
public String getWordAsSpaceString() {
return input.getAsSpaceString();
}
public List getWordUnigrams() {
return input.getValue();
}
public Pair xyWordPair() {
return Pair.of(input, Word.fromGrams(getYTokens()));
}
/**
* @return a list of flags that indicate the _last_ letter in the grapheme group for the X word; this doesn't work if
* you allow epsilons on the X side
*/
public List getXBoundaryMarks() {
Preconditions.checkArgument(graphones.size() > 0);
Iterator> xIter = Iterators.filter(this.graphones.iterator(), nonEmptyXGraphones);
List xEntry = getNextX(xIter);
int xChar = 0;
List marks = Lists.newArrayListWithCapacity(input.unigramCount());
for (int i = 0; i < input.unigramCount(); i++) {
// have we exhasuted the graphone entry we're on
if (xChar >= xEntry.size()) {
xChar = 0;
xEntry = getNextX(xIter);
}
String wordGram = input.getValue().get(i);
String graphoneGram = xEntry.get(xChar);
Preconditions.checkState(wordGram.equals(graphoneGram), "Should be equal %s and %s", wordGram, graphoneGram);
boolean isLast = xChar == xEntry.size() - 1;
marks.add(isLast);
xChar += 1;
}
Preconditions.checkState(!xIter.hasNext());
return marks;
}
public List getXStartMarks() {
List marks = getXBoundaryMarks();
List starts = Lists.newArrayListWithCapacity(marks.size());
starts.add(true); // first spot is always a start
for (int i = 1; i < marks.size(); i++) {
starts.add(marks.get(i - 1));
}
return starts;
}
public String getXBoundaryMarksAsString() {
return getBoolsAsString(getXBoundaryMarks());
}
public String getXStartMarksAsString() {
return getBoolsAsString(getXStartMarks());
}
protected String getBoolsAsString(List marks) {
StringBuilder sb = new StringBuilder(marks.size());
for (Boolean mark : marks) {
sb.append(mark ? "1" : "0");
}
return sb.toString();
}
private List getNextX(Iterator> iter) {
Pair graphone = iter.next();
return spaceSplit.splitToList(graphone.getLeft());
}
public Alignment withSyllWord(SWord sword) {
return new Alignment(this.input, this.graphones, this.score, this.graphoneSyllableGrams, sword);
}
public Alignment withGraphoneSyllGrams(List graphoneSyllGrams) {
return new Alignment(this.input, this.graphones, this.score, graphoneSyllGrams, this.syllWord);
}
public Alignment withGraphemeSyllStarts(Set graphemeSyllStarts) {
return new Alignment(this.input, this.graphones, this.score, this.graphoneSyllableGrams, this.syllWord,
graphemeSyllStarts);
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
Alignment pairs = (Alignment) o;
if (!graphones.equals(pairs.graphones)) {
return false;
}
return true;
}
@Override
public int hashCode() {
return graphones.hashCode();
}
@Override
public int compareTo(Alignment that) {
return Double.compare(this.score, that.score);
}
private static final Predicate> nonEmptyXGraphones = new Predicate>() {
@Override
public boolean apply(Pair input) {
return isNotBlank(input.getLeft());
}
};
private static final Function, Pair, List>> splitBoth =
new Function, Pair, List>>() {
@Override
public Pair, List> apply(Pair input) {
return Pair.of(spaceSplit.splitToList(input.getLeft()), spaceSplit.splitToList(input.getRight()));
}
};
}