All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.joshua.decoder.hypergraph.WordAlignmentState Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.joshua.decoder.hypergraph;

import static java.lang.Integer.MAX_VALUE;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;

import org.apache.joshua.decoder.ff.tm.Rule;

/**
 * This class encodes a derivation state in terms of a list of alignment points.
 * Whenever a child instance is substituted into the parent instance, we need to
 * adjust source indexes of the alignments.
 * 
 * @author fhieber
 */
public class WordAlignmentState {

  /**
   * each element in this list corresponds to a token on the target side of the
   * rule. The values of the elements correspond to the aligned source token on
   * the source side of the rule.
   */
  private final List trgPoints;
  private final int srcStart;
  /** number of NTs we need to substitute. */
  private int numNT;
  /** grows with substitutions of child rules. Reaches original Rule span if substitutions are complete */
  private int srcLength;

  /**
   * construct AlignmentState object from a virgin Rule and its source span.
   * Determines if state is complete (if no NT present)
   * 
   * @param rule the input Rule
   * @param start the start index
   */
  public WordAlignmentState(final Rule rule, final int start) {
    trgPoints = new LinkedList<>();
    srcLength = rule.getFrench().length;
    numNT = rule.getArity();
    srcStart = start;
    final Map> alignmentMap = rule.getAlignmentMap();
    final int[] nonTerminalSourcePositions = rule.getNonTerminalSourcePositions();
    final int[] trg = rule.getEnglish();
    // for each target index, create a TargetAlignmentPoint
    for (int trgIndex = 0; trgIndex < trg.length; trgIndex++) {
      final AlignedSourceTokens trgPoint = new AlignedSourceTokens();

      if (trg[trgIndex] >= 0) { // this is a terminal symbol, check for alignment
        if (alignmentMap.containsKey(trgIndex)) {
          // add source indexes to TargetAlignmentPoint
          for (int srcIdx : alignmentMap.get(trgIndex)) {
            trgPoint.add(srcStart + srcIdx);
          }
        } else { // this target word is NULL-aligned
          trgPoint.setNull();
        }
      } else { // this is a nonterminal ([X]) [actually its the (negative) index of the NT in the source]
        trgPoint.setNonTerminal(); // mark as non-terminal
        final int absoluteNonTerminalSourcePosition = srcStart + nonTerminalSourcePositions[Math.abs(trg[trgIndex]) - 1];
        trgPoint.add(absoluteNonTerminalSourcePosition);
      }
      trgPoints.add(trgPoint);
    }
  }

  /**
   * if there are no more NonTerminals to substitute,
   * this state is said to be complete
   * @return true if complete
   */
  public boolean isComplete() {
    return numNT == 0;
  }

  /**
   * builds the final alignment string in the standard alignment format: src -
   * trg. Sorted by trg indexes. Disregards the sentence markers.
   * @return result string
   */
  public String toFinalString() {
    final StringBuilder sb = new StringBuilder();
    int t = 0;
    for (AlignedSourceTokens pt : trgPoints) {
      for (int s : pt) {
        sb.append(String.format(" %d-%d", s-1, t-1)); // disregard sentence markers
      }
      t++;
    }
    final String result = sb.toString();
    if (!result.isEmpty()) {
      return result.substring(1);
    }
    return result;
  }
  
  /**
   * builds the final alignment list.
   * each entry in the list corresponds to a list of aligned source tokens.
   * First and last item in trgPoints is skipped.
   * @return a final alignment list
   */
  public List> toFinalList() {
    final List> alignment = new ArrayList<>(trgPoints.size());
    if (trgPoints.isEmpty()) {
      return alignment;
    }
    final ListIterator it = trgPoints.listIterator();
    it.next(); // skip first item (sentence marker)
    while (it.hasNext()) {
      final AlignedSourceTokens alignedSourceTokens = it.next();
      if (it.hasNext()) { // if not last element in trgPoints
        final List newAlignedSourceTokens = new ArrayList<>();
        for (Integer sourceIndex : alignedSourceTokens) {
          newAlignedSourceTokens.add(sourceIndex - 1); // shift by one to disregard sentence marker
        }
        alignment.add(newAlignedSourceTokens);
      }
    }
    return alignment;
  }

  /**
   * String representation for debugging.
   */
  @Override
  public String toString() {
    return String.format("%s , len=%d start=%d, isComplete=%s",
        trgPoints.toString(), srcLength, srcStart, this.isComplete());
  }

  /**
   * Substitutes a child WorldAlignmentState into this instance at the next
   * nonterminal slot. Also shifts the indeces in this instance by the span/width of the
   * child that is to be substituted.
   * Substitution order is determined by the source-first traversal through the hypergraph.
   * 
   * @param child The child
   */
  public void substituteIn(WordAlignmentState child) {
    // find the index of the NonTerminal where we substitute the child targetPoints into.
    // The correct NT is the first one on the SOURCE side.
    // Also shift all trgPoints by the child length.
    int substitutionIndex = 0;
    int sourcePosition = MAX_VALUE;
    for (final ListIterator trgPointsIterator = trgPoints.listIterator(); trgPointsIterator.hasNext();) {
      final AlignedSourceTokens trgPoint = trgPointsIterator.next();
      trgPoint.shiftBy(child.srcStart, child.srcLength - 1);
      if (trgPoint.isNonTerminal() && trgPoint.get(0) < sourcePosition) {
        sourcePosition = trgPoint.get(0);
        substitutionIndex = trgPointsIterator.previousIndex();
      }
    }
    
    // point and remove NT element determined from above
    final ListIterator insertionIterator = trgPoints.listIterator(substitutionIndex);
    insertionIterator.next();
    insertionIterator.remove();
    
    // insert child target points and set them to final.
    for (AlignedSourceTokens childElement : child.trgPoints) {
      childElement.setFinal();
      insertionIterator.add(childElement);
    }
    
    // update length and number of non terminal slots
    this.srcLength += child.srcLength - 1; // -1 (NT)
    this.numNT--;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy