
// org.github.evenjn.align.graph.TupleAlignmentGraphFactory Maven / Gradle / Ivy
/**
*
* Copyright 2016 Marco Trevisan
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.github.evenjn.align.graph;
import java.util.function.BiFunction;
import org.github.evenjn.knit.KnittingTuple;
import org.github.evenjn.yarn.Tuple;
/*
* These are notes about how Markov models use the tuple alignment graph
* data structure.
*
* T is the number of symbols (above), N is the number of states.
*
* in the regular forward procedure (same applies to backward procedure)
* we partition the paths in 1+T sections, each section having N nodes,
* each node associated to a state, and each state is connected to all
* the nodes of the next section, each edge having a cost that is
* the cost of the node-to-node transition times the cost of the emission
* of the T-th symbol.
*
* Except the edges leaving the first sections, where
* we use the initial cost instead of the transition cost.
* In fact, section 0 has only one node, and there is an edge to each node
* of section 1, with cost set to the initial probability times the cost of the
* emission of the first symbol.
*
* We can sort of compress this representation into a chain of macro-states
* where each macro-state represents a section.
* In this chain, the first arc represents the choice of initial state.
* Each arc represents the emission of one symbol.
*
*
* In a one-to-n setting, under the assumption of one arc per input
* element, an arc represents 0..(n + 1) symbols. We can still compress the
* paths graph using macro-states, but the graph will be a DAG. The macro
* states can be arranged into a matrix NxM, where M is the number of output
* symbols.
*
* When only 1-to-1 emissions are allowed, there is one state at each cell
* in the main diagonal. Only pairs of the same length are supported.
*
* When there is no limit to the "many" size of the one-to-many emissions,
* there will be one macro-state at each cell.
*
*
*
* In the one-to-many system, we extend the alpha table to account for
* the situation where the system takes an alternate route and emits
* 0 or more symbols below. To this purpose, we extend the alpha table to
* track how many symbols above and below have been emitted.
*
* C/k A/o L/5 L/- E/- D/d
* B -> [s1] -> [s1] -> [s1] -> [s1] -> [s1] -> [s1]
*
* - k o l d
* - B * * * *
* C * * * * *
* A * * * * *
* L * * * * *
* L * * * * *
* E * * * * *
* D * * * * *
*
* In position [0 0 s1] we store nothing because we will never allow
* -/- emissions.
*
* In position [0 1 s1] we could store the probability of being in s1 after
* observing -/k if we were to allow 0-to-1 emissions.
*
* In position [1 0 s1] we store the probability of being in s1 after
* observing C/-.
*
* [1 0] can be reached only from the initial state, so the value to cache
* in [ 1 0 s1 ] is computed as
*
* initial( s1 ) * emissions( s1 , "C/-" )
*
*
* In position [1 1 s3] we store the probability of being in s3 after
* observing C/k.
*
*
* Because we consider only ONE-to-many, it's not possible to observe
* -/k C/-, so [1 1] cannot be reached from [0 1]. In fact, [1 1] can be
* reached only from [0 0], so the value to cache in [ 1 1 s3 ]
* is also computed as
*
* initial( s3 ) * emissions( s3 , "C/k" )
*
* In position [2 2 s3] we store the probability of being in s3 after
* observing CA/ko.
*
* [2 2] can be reached from [1 1] through the composition of C/k A/o
* In addition, it can be reached from [1 0], through the composition of
* C/- A/ko. Finally, it can be reached from [1 2], through the composition
* of C/ko A/-
*
* This algorithm computes, for each cell in the matrix, a set of incoming
* edges. Each edge is represented using the coordinates of the cell it
* comes from.
*
* For example: in position [0 0], no edges. in position [1 0] a single edge
* [0 0]. In position [1 1] again [0 0]. In position [2 2] we store
* [1 0] [1 1] [1 2].
*
* Exactly which edges exist depends on two parameters.
*
* If we force a ONE-to-many schema, in each cell there can be at most as
* many edges as there are elements in the string below.
*
*/
public class TupleAlignmentGraphFactory {
public static
TupleAlignmentGraph
graph(
BiFunction, Tuple, Integer> pair_encoder,
Tuple above,
Tuple below,
final int min_above,
final int max_above,
final int min_below,
final int max_below )
throws NotAlignableException {
KnittingTuple ka = KnittingTuple.wrap( above );
KnittingTuple kb = KnittingTuple.wrap( below );
final int labove = above.size( );
final int lbelow = below.size( );
final int max_number_of_edges = (1+max_below) * (1+max_above);
/* Indexing may be confusing.
* Cell in position [4 7] the matrix holds a node representing
* information about the prefix of length 4 above and length 7 below.
*/
TupleAlignmentNode[][] matrix =
new TupleAlignmentNode[1 + labove][1 + lbelow];
boolean[][] reachable = new boolean[1 + labove][1 + lbelow];
/*
* Cell in position [0 0] exists and has a purpose.
*/
TupleAlignmentNode root = new TupleAlignmentNode( );
matrix[0][0] = root;
// we fill in the structure
for ( int a = 0; a <= labove; a++ ) {
for ( int b = 0; b <= lbelow; b++ ) {
TupleAlignmentNode source_node = matrix[a][b];
if ( source_node == null ) {
continue;
}
for (int q = a + min_above; q <= a + max_above && q <= labove; q++) {
KnittingTuple key = ka.head( a, q - a );
for ( int z = b + min_below; z <= b + max_below && z <= lbelow; z++ ) {
if (q == 0 && z == 0) {
continue;
}
/*
* We are leaping forwared from [a, b] by inserting q symbol
* above and z symbols below.
*/
// when the above/below pair is not in the pair alphabet,
// the edge is not legal. skip it.
KnittingTuple sub = kb.head( b, z - b );
Integer enc = pair_encoder.apply( key, sub );
if ( enc == null ) {
continue;
}
TupleAlignmentNode target_node = matrix[q][z];
if ( target_node == null ) {
target_node = new TupleAlignmentNode( );
target_node.incoming_edges = new int[max_number_of_edges][3];
matrix[q][z] = target_node;
}
int edges = target_node.number_of_incoming_edges;
target_node.incoming_edges[edges][0] = a;
target_node.incoming_edges[edges][1] = b;
target_node.incoming_edges[edges][2] = enc;
target_node.number_of_incoming_edges = edges + 1;
}
}
}
}
if ( matrix[labove][lbelow] == null ) {
/* The pair is not representable */
throw NotAlignableException.neo;
}
reachable[labove][lbelow] = true;
for ( int a = labove; a >= 0; a-- ) {
for ( int b = lbelow; b >= 0; b-- ) {
TupleAlignmentNode node = matrix[a][b];
if ( node == null ) {
continue;
}
if ( !reachable[a][b] ) {
matrix[a][b] = null;
}
int edges = node.number_of_incoming_edges;
for ( int e = 0; e < edges; e++ ) {
int x = node.incoming_edges[e][0];
int y = node.incoming_edges[e][1];
reachable[x][y] = true;
}
}
}
TupleAlignmentGraph graph = new TupleAlignmentGraph( matrix, labove, lbelow );
return graph;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy