![JAR search and dependency download from the Maven repository](/logo.png)
org.github.evenjn.align.graph.TupleAlignmentGraphFactory Maven / Gradle / Ivy
/**
*
* Copyright 2016 Marco Trevisan
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.github.evenjn.align.graph;
import java.util.function.BiFunction;
import org.github.evenjn.knit.KnittingTuple;
import org.github.evenjn.yarn.Tuple;
/*
* These are notes about how Markov models use the tuple alignment graph
* data structure.
*
* T is the number of symbols (above), N is the number of states.
*
* in the regular forward procedure (same applies to backward procedure)
* we partition the paths in 1+T sections, each section having N nodes,
* each node associated to a state, and each state is connected to all
* the nodes of the next section, each edge having a cost that is
* the cost of the node-to-node transition times the cost of the emission
* of the T-th symbol.
*
* Section 0 has only one node, and there is an edge to each node
* of section 1, with cost set to the initial probability.
*
* We can sort of compress this representation into a chain of macro-states
* where each macro-state represents a section.
* In this chain, the first arc represent the choice of initial state.
* Each arc represents the emission of one symbol.
*
*
* In a one-to-n setting, under the assumption of one arc per input
* element, an arc represents 0..(n + 1) symbols. We can still compress the
* paths graph using macro-states, but the graph will be a DAG. The macro
* states can be arranged into a matrix NxM, where M is the number of output
* symbols.
*
* When only 1-to-1 emissions are allowed, there is one state at each cell
* in the main diagonal. Only pairs of the same length are supported.
*
* When there is no limit to the "many" size of the one-to-many emissions,
* there will be one macro-state at each cell.
*
*
*
* In the one-to-many system, we extends the alpha table to account for
* the situation where the system takes an alternate route and emits
* 0 or more symbols below. To this purpose, we extend the alpha table to
* track how many symbols above and below have been emitted.
*
* C/k A/o L/5 L/- E/- D/d
* B -> [s1] -> [s1] -> [s1] -> [s1] -> [s1] -> [s1] -> E
*
* - k o l d
* - B * * * *
* C * * * * *
* A * * * * *
* L * * * * *
* L * * * * *
* E * * * * *
* D * * * * *
*
* In position [0 0 s1] we store nothing because we will never allow
* -/- emissions.
*
* In position [0 1 s1] we could store the probability of being in s1 after
* observing -/k but this makes sense only if we allow 0-to-1 emissions.
*
* In position [1 0 s1] we store the probability of being in s1 after
* observing C/-.
*
* [1 0] can be reached only from the initial state, so the value to cache
* in [ 1 0 s1 ] is computed as
*
* initial( s1 ) * emissions( s1 , "C/-" )
*
*
* In position [1 1 s3] we store the probability of being in s3 after
* observing C/k.
*
*
* Because we consider only ONE-to-many, it's not possible to observe
* -/k C/-, so [1 1] cannot be reached from [0 1]. In fact, [1 0] can be
* reached only from [0 0], so the value to cache in [ 1 1 s3 ]
* is also computed as
*
* initial( s3 ) * emissions( s3 , "C/k" )
*
* In position [2 2 s3] we store the probability of being in s3 after
* observing CA/ko.
*
* [2 2] can be reached from [1 1] through the composition of C/k A/o
* In addition, it can be reached from [1 0], throught the composition of
* C/- A/ko. Finally, it can be reached from [1 2], through the composition
* of C/ko A/-
*
* This algorithm computes, for each cell in the matrix, a set of edges.
* Each edge is represented using the coordinate of the target cell.
*
* For example: in position [0 0], no edges. in position [1 0] a single edge
* [0 0]. In position [1 1] again [0 0]. In position [2 2] we store
* [1 0] [1 1] [1 2].
*
* What exact edges there are, it depends on two parameters.
*
* If we force a ONE-to-many schema, in each cell there can be at most as
* many edges as there are elements in the string below.
*
*/
public class TupleAlignmentGraphFactory {
public static
TupleAlignmentGraph
graph(
BiFunction, Integer> pair_encoder,
Tuple above,
Tuple below,
final int min_below,
final int max_below )
throws NotAlignableException {
KnittingTuple ka = KnittingTuple.wrap( above );
KnittingTuple kb = KnittingTuple.wrap( below );
final int labove = above.size( );
final int lbelow = below.size( );
/*
* If we force a ONE-to-many schema, in each cell there can be at most as
* many edges as there are elements in the string below.
*/
final int max_number_of_edges = lbelow + 1;
/*
* In a MANY-to-MANY schema, we would need two ints to identify the target
* of each edge. In a ONE-to-MANY, we just need one. In a MANY-to-MANY
* schema, each edge would be represented as a short array with length 2.
*/
TupleAlignmentNode[][] matrix =
new TupleAlignmentNode[1 + labove][1 + lbelow];
// we fill in the structure
for ( int a = 0; a <= labove; a++ ) {
for ( int b = 0; b <= lbelow; b++ ) {
if ( a == 0 && b == 0 ) {
TupleAlignmentNode root = new TupleAlignmentNode( );
matrix[0][0] = root;
continue;
}
if ( a == 0 && b != 0 ) {
matrix[a][b] = null;
continue;
}
TupleAlignmentNode source_node = matrix[a - 1][b];
if ( source_node == null ) {
// the macro state above us is not reachable.
continue;
}
/* Indexing may be confusing.
* Remember that in position [4 7] the graph holds a node representing
* information about the prefix of length 4 above and length 7 below.
*/
SymbolAbove key = ka.get( a - 1 );
/* We are leaping forwared from [a - 1, b] by inserting one symbol
* above. In case of min=0 and max=2, we will most likely insert new
* nodes in positions:
* [a b] [a b+1] [a b+2]
*/
for ( int z = b + min_below; z <= b + max_below && z <= lbelow; z++ ) {
// additional condition to prune tree: the end of the string
// must be in range from [a z].
int leftover_above = labove - a;
if ( z + leftover_above * max_below < lbelow
|| z + leftover_above * min_below > lbelow ) {
continue;
}
// when the above/below pair is not in the pair alphabet,
// the edge is not legal. skip it.
KnittingTuple sub = kb.head( b, z - b );
Integer enc = pair_encoder.apply( key, sub );
if (enc == null) {
continue;
}
TupleAlignmentNode target_node = matrix[a][z];
if ( target_node == null ) {
target_node = new TupleAlignmentNode( );
target_node.incoming_edges = new int[max_number_of_edges][3];
matrix[a][z] = target_node;
}
int edges = target_node.number_of_incoming_edges;
target_node.incoming_edges[edges][0] = a - 1;
target_node.incoming_edges[edges][1] = b;
target_node.incoming_edges[edges][2] = enc;
target_node.number_of_incoming_edges = edges + 1;
}
}
}
if ( matrix[labove][lbelow] == null ) {
/* The pair is not representable */
throw NotAlignableException.neo;
}
matrix[labove][lbelow].is_reachable_from_end = true;
for ( int a = labove; a >= 0; a-- ) {
for ( int b = lbelow; b >= 0; b-- ) {
TupleAlignmentNode node = matrix[a][b];
if ( node == null ) {
continue;
}
if ( !node.is_reachable_from_end ) {
matrix[a][b] = null;
}
int edges = node.number_of_incoming_edges;
for ( int e = 0; e < edges; e++ ) {
int x = node.incoming_edges[e][0];
int y = node.incoming_edges[e][1];
TupleAlignmentNode origin = matrix[x][y];
origin.is_reachable_from_end = true;
}
}
}
TupleAlignmentGraph graph = new TupleAlignmentGraph( matrix, labove, lbelow );
for ( int a = 0; a <= labove; a++ ) {
for ( int b = 0; b <= lbelow; b++ ) {
if ( matrix[a][b] == null || ( a == 0 && b == 0 ) ) {
continue;
}
int[][] ie = matrix[a][b].incoming_edges;
int no_ie = matrix[a][b].number_of_incoming_edges;
for ( int e_i = 0; e_i < no_ie; e_i++ ) {
int x = ie[e_i][0];
int y = ie[e_i][1];
SymbolAbove key = ka.get( x );
KnittingTuple sub = kb.head( y, b - y );
Integer enc = pair_encoder.apply( key, sub );
ie[e_i][2] = enc;
}
}
}
return graph;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy