com.github.steveash.jg2p.align.AlignerInferencer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jg2p-core Show documentation
The newest version!
/*
 * Copyright 2014 Steve Ash
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.steveash.jg2p.align;

import com.google.common.collect.Lists;
import com.google.common.collect.Ordering;
import com.google.common.math.DoubleMath;

import com.github.steveash.jg2p.Word;

import java.util.Collections;
import java.util.List;

/**
 * Runs inference on a sequence X to determine the top-k probable alignment(s).
 *
 * <p>Only the X side is scored: every candidate x-gram is looked up in the marginals computed from
 * the supplied {@link ProbTable}, candidate paths are accumulated in a {@link PathXTable}, and the
 * surviving paths are decoded into {@link Alignment}s whose Y side is always the empty string.
 *
 * @author Steve Ash
 */
public class AlignerInferencer {

  private final GramOptions opts;
  private final ProbTable probs;
  private final ProbTable.Marginals margs;

  /**
   * Creates an inferencer over the given probability table.
   *
   * @param opts  gram options; this class reads only {@code getMaxXGram()} from it
   * @param probs probability table; its marginals are calculated once here and reused by every
   *              call to {@link #bestGraphemes(Word, int)}
   */
  public AlignerInferencer(GramOptions opts, ProbTable probs) {
    this.opts = opts;
    this.probs = probs;
    this.margs = probs.calculateMarginals();
  }

  /**
   * Finds the best segmentations of {@code x} into x-grams.
   *
   * @param x             the word to segment
   * @param bestPathCount passed to the {@link PathXTable} and used as the initial capacity of the
   *                      result list (presumably the number of paths kept per position -- confirm
   *                      against {@code PathXTable})
   * @return the decoded alignments, sorted by the reverse of {@link Alignment}'s natural ordering
   */
  public List<Alignment> bestGraphemes(Word x, int bestPathCount) {
    // Loop-invariant; one extra table slot because position 0 holds the seed entry.
    int unigramCount = x.unigramCount();
    PathXTable t = new PathXTable(unigramCount + 1, bestPathCount);
    t.offer(0, t.make(0, -1, -1));

    for (int xx = 1; xx <= unigramCount; xx++) {
      // Try every x-gram of length i that ends at position xx and still fits inside the word.
      for (int i = 1; (i <= opts.getMaxXGram()) && (xx - i >= 0); i++) {
        String xGram = x.gram(xx - i, i);
        double margX = margs.probX(xGram);

        // The log2 marginal is multiplied by the gram length before extending the path.
        double score = DoubleMath.log2(margX) * i;
        t.extendPath(xx, xx - i, PathXTable.Entry.sample(score, i));
      }
    }

    return createAlignments(x, t, bestPathCount);
  }

  /**
   * Decodes every entry stored at the final position of the table into an {@link Alignment}.
   *
   * @param x             the word that was segmented
   * @param t             the populated path table
   * @param bestPathCount initial capacity for the result list
   * @return the alignments whose score is not below {@code ProbTable.minLogProb}, sorted by the
   *         reverse of their natural ordering
   */
  private List<Alignment> createAlignments(Word x, PathXTable t, int bestPathCount) {
    List<Alignment> results = Lists.newArrayListWithCapacity(bestPathCount);

    // Iterating the table's result directly avoids declaring a raw Iterable local.
    for (PathXTable.Entry lastEntry : t.get(x.unigramCount())) {
      if (lastEntry.score < ProbTable.minLogProb) {
        // Paths scoring below the table's minimum log probability are dropped.
        continue;
      }

      results.add(decodePathFrom(x, t, lastEntry));
    }
    // NOTE(review): relies on Alignment being Comparable; presumably this puts the best score
    // first -- confirm against Alignment.compareTo.
    Collections.sort(results, Ordering.natural().reverse());
    return results;
  }

  /**
   * Walks the back references from {@code entry} to position 0, appending each x-gram on the way.
   *
   * @param x     the word that was segmented
   * @param t     the populated path table
   * @param entry the entry at the final position from which decoding starts
   * @return the result of {@code finish()} on an alignment created with {@code entry.score}
   */
  private Alignment decodePathFrom(Word x, PathXTable t, PathXTable.Entry entry) {
    int xx = x.unigramCount();
    Alignment a = new Alignment(x, entry.score);

    // Grams are appended from the end of the word towards the start.
    // NOTE(review): presumably finish() restores left-to-right order -- confirm in Alignment.
    PathXTable.Entry current = entry;
    while (xx > 0) {
      String xGram = x.gram(xx - current.xBackRef, current.xBackRef);
      a.append(xGram, "");

      xx -= current.xBackRef;
      current = t.get(xx, current.pathBackRef);
    }
    return a.finish();
  }

}