
org.github.evenjn.align.TupleAlignmentGraphDataManager Maven / Gradle / Ivy
/**
*
* Copyright 2016 Marco Trevisan
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.github.evenjn.align;
import java.util.Iterator;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.github.evenjn.align.alphabet.TupleAlignmentAlphabet;
import org.github.evenjn.align.alphabet.TupleAlignmentAlphabetBuilder;
import org.github.evenjn.align.alphabet.TupleAlignmentAlphabetDeserializer;
import org.github.evenjn.align.alphabet.TupleAlignmentAlphabetSerializer;
import org.github.evenjn.align.alphabet.TupleAlignmentAlphabetPair;
import org.github.evenjn.align.graph.NotAlignableException;
import org.github.evenjn.align.graph.TupleAlignmentGraphFactory;
import org.github.evenjn.align.graph.TupleAlignmentGraph;
import org.github.evenjn.align.graph.TupleAlignmentGraphDeserializer;
import org.github.evenjn.align.graph.TupleAlignmentGraphSerializer;
import org.github.evenjn.align.graph.TupleAlignmentNode;
import org.github.evenjn.knit.BasicAutoHook;
import org.github.evenjn.knit.Bi;
import org.github.evenjn.knit.KnittingCursable;
import org.github.evenjn.knit.KnittingCursor;
import org.github.evenjn.knit.KnittingTuple;
import org.github.evenjn.knit.ProgressManager;
import org.github.evenjn.numeric.FrequencyDistribution;
import org.github.evenjn.yarn.AutoHook;
import org.github.evenjn.yarn.Cursable;
import org.github.evenjn.yarn.Di;
import org.github.evenjn.yarn.Hook;
import org.github.evenjn.yarn.Progress;
import org.github.evenjn.yarn.ProgressSpawner;
import org.github.evenjn.yarn.SkipException;
import org.github.evenjn.yarn.SkipMap;
import org.github.evenjn.yarn.Tuple;
/*
* This object acts as a preprocessor for systems that work on alignment graphs.
*
* Its function is to transform a dataset of tuple pairs into a dataset of
* alignment graphs.
*
* It also provides information about the dataset such as the length of the
* longest tuples and the maximum number of edges occurring in a graph.
*
*
*
*/
public class TupleAlignmentGraphDataManager {
// Largest "above" tuple length recorded so far. Starts at zero; written by
// createAlphabet and by prepareGraphs when it parses the cached header.
private int record_max_length_above = 0;
// Largest "below" tuple length recorded so far; same writers as above.
private int record_max_length_below = 0;
// Largest number of edges counted for a single datum; same writers as above.
private int record_max_number_of_edges = 0;
// True only when the object was built with the caching constructor.
private boolean enable_cache;
// When true (and the cache is enabled) the alphabet and the graphs are
// recomputed and pushed through the putters before being read back.
private boolean refresh_cache;
/**
 * Builds a manager that never touches a cache: the alphabet and the graphs
 * are always derived directly from the data handed to load.
 *
 * @param min_below
 *          lower bound forwarded to TupleAlignmentGraphFactory.
 * @param max_below
 *          upper bound forwarded to TupleAlignmentGraphFactory.
 */
public TupleAlignmentGraphDataManager(
        int min_below,
        int max_below ) {
    // Caching is switched off, so every cache-related collaborator is absent.
    this.enable_cache = false;
    this.refresh_cache = false;
    this.putter_coalignment_alphabet = null;
    this.putter_coalignment_graphs = null;
    this.reader_coalignment_alphabet = null;
    this.reader_coalignment_graphs = null;
    this.a_serializer = null;
    this.a_deserializer = null;
    this.b_serializer = null;
    this.b_deserializer = null;
    this.min_below = min_below;
    this.max_below = max_below;
}
/**
 * Builds a manager that caches the alphabet and the graphs: computed results
 * are pushed through the putters and later read back through the readers.
 *
 * NOTE(review): the type arguments on the parameters below appear to have
 * been lost in this copy of the file; they are kept exactly as found.
 *
 * @param min_below
 *          lower bound forwarded to TupleAlignmentGraphFactory.
 * @param max_below
 *          upper bound forwarded to TupleAlignmentGraphFactory.
 * @param putter_coalignment_alphabet
 *          sink for the serialized alphabet.
 * @param reader_coalignment_alphabet
 *          source of the serialized alphabet.
 * @param putter_coalignment_graphs
 *          sink for the header line and the serialized graphs.
 * @param reader_coalignment_graphs
 *          source of the header line and the serialized graphs.
 * @param a_serializer
 *          handed to the alphabet serializer together with b_serializer.
 * @param b_serializer
 *          handed to the alphabet serializer together with a_serializer.
 * @param a_deserializer
 *          handed to the alphabet deserializer together with b_deserializer.
 * @param b_deserializer
 *          handed to the alphabet deserializer together with a_deserializer.
 * @param refresh_cache
 *          when true, the cached content is recomputed and rewritten on load.
 * @throws IllegalArgumentException
 *           when any of the putters, readers, serializers or deserializers
 *           is null.
 */
public TupleAlignmentGraphDataManager(
int min_below,
int max_below,
Function> putter_coalignment_alphabet,
Cursable reader_coalignment_alphabet,
Function> putter_coalignment_graphs,
Cursable reader_coalignment_graphs,
Function a_serializer,
Function b_serializer,
Function a_deserializer,
Function b_deserializer,
boolean refresh_cache) {
    // Validate before touching any state, and say which argument is missing.
    if ( putter_coalignment_alphabet == null ) {
        throw new IllegalArgumentException(
                "putter_coalignment_alphabet must not be null" );
    }
    if ( reader_coalignment_alphabet == null ) {
        throw new IllegalArgumentException(
                "reader_coalignment_alphabet must not be null" );
    }
    if ( putter_coalignment_graphs == null ) {
        throw new IllegalArgumentException(
                "putter_coalignment_graphs must not be null" );
    }
    if ( reader_coalignment_graphs == null ) {
        throw new IllegalArgumentException(
                "reader_coalignment_graphs must not be null" );
    }
    if ( a_serializer == null ) {
        throw new IllegalArgumentException( "a_serializer must not be null" );
    }
    if ( b_serializer == null ) {
        throw new IllegalArgumentException( "b_serializer must not be null" );
    }
    if ( a_deserializer == null ) {
        throw new IllegalArgumentException( "a_deserializer must not be null" );
    }
    if ( b_deserializer == null ) {
        throw new IllegalArgumentException( "b_deserializer must not be null" );
    }
    this.min_below = min_below;
    this.max_below = max_below;
    this.putter_coalignment_alphabet = putter_coalignment_alphabet;
    this.reader_coalignment_alphabet = reader_coalignment_alphabet;
    this.putter_coalignment_graphs = putter_coalignment_graphs;
    this.reader_coalignment_graphs = reader_coalignment_graphs;
    this.a_serializer = a_serializer;
    this.b_serializer = b_serializer;
    this.a_deserializer = a_deserializer;
    this.b_deserializer = b_deserializer;
    this.enable_cache = true;
    this.refresh_cache = refresh_cache;
}
// Bounds passed unchanged to TupleAlignmentGraphFactory.graph and
// TupleAlignmentGraphFactory.pathMatrix. Presumably the minimum and maximum
// number of "below" symbols aligned with one "above" symbol -- TODO confirm
// against the factory.
private final int min_below;
private final int max_below;
// Cache collaborators. All of them are null when the object is built with
// the two-argument constructor, and all are non-null otherwise.
// NOTE(review): the type arguments on these declarations appear to have been
// lost in this copy of the file.
// Sink for the serialized alphabet.
private final Function> putter_coalignment_alphabet;
// Source the serialized alphabet is read back from.
private final Cursable reader_coalignment_alphabet;
// Sink for the header line followed by the serialized graphs.
private final Function> putter_coalignment_graphs;
// Source the header line and the serialized graphs are read back from.
private final Cursable reader_coalignment_graphs;
// Passed to TupleAlignmentAlphabetSerializer.
private final Function a_serializer;
private final Function b_serializer;
// Passed to TupleAlignmentAlphabetDeserializer.
private final Function a_deserializer;
private final Function b_deserializer;
// Results of the last call to load; both are null until load has run.
private KnittingCursable graphs;
private TupleAlignmentAlphabet alphabet;
/**
 * Returns the alphabet prepared by load.
 *
 * @return the alphabet assigned by the last call to load.
 * @throws IllegalStateException
 *           when no alphabet has been prepared yet.
 */
public TupleAlignmentAlphabet getAlphabet( ) {
    if ( alphabet == null ) {
        throw new IllegalStateException(
                "The alphabet is not available: load must be invoked first." );
    }
    return alphabet;
}
/**
 * Returns the alignment graphs prepared by load.
 *
 * @return the graphs assigned by the last call to load.
 * @throws IllegalStateException
 *           when no graphs have been prepared yet.
 */
public KnittingCursable getGraphs( ) {
    if ( graphs == null ) {
        throw new IllegalStateException(
                "The graphs are not available: load must be invoked first." );
    }
    return graphs;
}
/**
 * Reports the longest "above" (input) tuple recorded so far. The value is
 * zero until it is set, either while the alphabet is built or from the
 * header of the cached graphs.
 *
 * @return the maximum number of input symbols observed in a tuple.
 */
public int getMaxLenghtAbove( ) {
    return this.record_max_length_above;
}
/**
 * Reports the longest "below" (output) tuple recorded so far. The value is
 * zero until it is set, either while the alphabet is built or from the
 * header of the cached graphs.
 *
 * @return the maximum number of output symbols observed in a tuple.
 */
public int getMaxLenghtBelow( ) {
    return this.record_max_length_below;
}
/**
 * Reports the largest edge count recorded for a single tuple alignment
 * graph. The value is zero until it is set, either while the alphabet is
 * built or from the header of the cached graphs.
 *
 * @return the maximum number of edges observed in one graph.
 */
public int getMaxNumberOfEdges( ) {
    return this.record_max_number_of_edges;
}
/**
* Prepares the alphabet and the alignment graphs for a dataset and stores
* both in this object, where getAlphabet and getGraphs can retrieve them.
*
* NOTE(review): the type arguments in the signature appear to have been lost
* in this copy of the file.
*
* @param data
*          the dataset of tuple pairs.
* @param progress
*          passed on to prepareAlphabet and prepareGraphs for progress
*          reporting.
* @return this object.
*/
public TupleAlignmentGraphDataManager load(
Cursable, Tuple>> data,
ProgressSpawner progress ) {
KnittingCursable, Tuple>> kc = KnittingCursable.wrap( data );
// The alphabet must come first: the graph pipeline encodes symbols through
// the alphabet field, and the cached header is written from the maxima that
// createAlphabet records.
alphabet = prepareAlphabet( kc, progress );
graphs = prepareGraphs( kc, progress );
return this;
}
/**
 * Produces the coalignment alphabet for a dataset.
 *
 * When the cache is disabled, or a refresh was requested, the alphabet is
 * computed from the data by createAlphabet; with the cache enabled it is then
 * serialized into putter_coalignment_alphabet. Whenever the cache is enabled
 * the alphabet that is returned is the one de-serialized from
 * reader_coalignment_alphabet, even when it has just been computed.
 *
 * NOTE(review): the type arguments on several declarations below appear to
 * have been lost in this copy of the file; they are kept exactly as found.
 *
 * @param data
 *          the dataset of tuple pairs.
 * @param progress
 *          used to spawn the "prepareAlphabet" progress reporter.
 * @return the computed alphabet when the cache is disabled, the de-serialized
 *         one otherwise.
 */
private
TupleAlignmentAlphabet
prepareAlphabet(
KnittingCursable, Tuple>> data,
ProgressSpawner progress ) {
    TupleAlignmentAlphabet coalignment_alphabet = null;
    if ( !enable_cache || refresh_cache ) {
        /*
         * (re-)compute the coalignment alphabet.
         */
        KnittingCursable, Tuple>> map =
        data
        .map( x -> ( new Bi, Tuple>( )
        .set( x.front( ), x.back( ) ) ) );
        try ( AutoHook hook = new BasicAutoHook( ) ) {
            Progress spawn = ProgressManager.safeSpawn( hook, progress, "prepareAlphabet" );
            spawn.info( "Computing dataset size." );
            int size = data.size( );
            // One step per datum while building, plus the steps taken while
            // serializing when the cache is enabled.
            spawn.target( enable_cache ? 2 * size : size );
            spawn.info( "Working out alphabet" );
            coalignment_alphabet =
                    createAlphabet( map.pull( hook ), min_below, max_below, spawn );
            /*
             * serialize the coalignment alphabet, and pour it into the putter.
             */
            if ( enable_cache ) {
                spawn.info( "Serializing alignment alphabet" );
                TupleAlignmentAlphabetSerializer serializer =
                        new TupleAlignmentAlphabetSerializer<>(
                                coalignment_alphabet,
                                a_serializer,
                                b_serializer );
                KnittingCursable.wrap( serializer )
                        .tap( x -> spawn.step( 1 ) )
                        .consume(
                                putter_coalignment_alphabet );
            }
        }
    }
    if ( enable_cache ) {
        /*
         * With the cache enabled the alphabet is always read back from the
         * reader, replacing whatever was computed above.
         */
        try ( AutoHook hook = new BasicAutoHook( ) ) {
            /*
             * This is interesting, because the output of the serializer is not
             * volatile, but how can we communicate that?
             */
            coalignment_alphabet = KnittingCursable
                    .wrap( reader_coalignment_alphabet ).pull( hook )
                    .skipfold( new TupleAlignmentAlphabetDeserializer<>(
                            a_deserializer,
                            b_deserializer ) ).one( );
        }
    }
    return coalignment_alphabet;
}
/**
* Produces the alignment graphs for a dataset.
*
* When the cache is disabled, or a refresh was requested, a lazy pipeline is
* built that turns each tuple pair into a graph; with the cache enabled that
* pipeline is drained once to write a header line and the serialized graphs
* into putter_coalignment_graphs. Whenever the cache is enabled the header is
* then read back from reader_coalignment_graphs to set the recorded maxima,
* and the value returned de-serializes the graphs from the reader.
*
* NOTE(review): the type arguments on several declarations below appear to
* have been lost in this copy of the file.
*
* @param data
*          the dataset of tuple pairs.
* @param progress
*          used to spawn the "prepareGraphs" progress reporter.
* @return the lazy graph pipeline when the cache is disabled, the
*         de-serializing one otherwise.
*/
private
KnittingCursable
prepareGraphs(
KnittingCursable, Tuple>> data,
ProgressSpawner progress ) {
KnittingCursable graphs = null;
if ( !enable_cache || refresh_cache ) {
/*
* re-compute the coalignment graphs.
*
* This is a lazy iterator, so the graphs are computed on demand.
*/
SkipMap, Tuple>, TupleAlignmentGraph> skipMap =
new SkipMap, Tuple>, TupleAlignmentGraph>( ) {
@Override
public TupleAlignmentGraph get(
Di, Tuple> x )
throws SkipException {
try {
// Symbols are encoded through the alphabet field of the enclosing
// object, so it must be set before any graph is pulled.
return TupleAlignmentGraphFactory.graph(
( a, b ) -> alphabet.encode( a, b ),
x.front( ),
x.back( ),
min_below,
max_below );
}
catch ( NotAlignableException e ) {
// Pairs that cannot be aligned are dropped from the output.
throw SkipException.neo;
}
}
};
graphs = data.skipmap( skipMap );
if ( enable_cache ) {
try ( AutoHook hook = new BasicAutoHook( ) ) {
Progress spawn =
ProgressManager.safeSpawn( hook, progress, "prepareGraphs");
// NOTE(review): this initial value is never read; the variable is
// reassigned a few lines below before its first use.
KnittingCursable graphs_to_write = graphs;
spawn.info( "Computing dataset size." );
spawn.target( data.size( ) );
// Same pipeline as above, with one progress step per input pair.
graphs_to_write = data
.tap( x -> spawn.step(1 ) )
.skipmap( skipMap );
// Header line: "max above length,max below length,max edges", taken
// from the fields as they stand when this method runs.
StringBuilder header = new StringBuilder( );
header.append( record_max_length_above );
header.append( "," );
header.append( record_max_length_below );
header.append( "," );
header.append( record_max_number_of_edges );
// The header goes first, followed by whatever each graph serializes to.
KnittingCursor.on( header.toString( ) ).chain(
graphs_to_write
.pull( hook )
.unfoldCursable(
x -> new TupleAlignmentGraphSerializer( x ) ) )
.consume( putter_coalignment_graphs );
}
}
}
if ( enable_cache ) {
try ( AutoHook hook = new BasicAutoHook( ) ) {
// Parse the three comma-separated maxima. Presumably head( 0, 1 ) selects
// the first element of the reader, i.e. the header line -- TODO confirm.
Pattern splitter = Pattern.compile( "," );
String[] split = splitter.split(
KnittingCursable.wrap( reader_coalignment_graphs ).head( 0, 1 ).one( hook ) );
record_max_length_above = Integer.parseInt( split[0] );
record_max_length_below = Integer.parseInt( split[1] );
record_max_number_of_edges = Integer.parseInt( split[2] );
}
/*
* de-serialize them from the reader. Presumably headless( 1 ) skips the
* header line parsed above -- TODO confirm.
*/
graphs = KnittingCursable
.wrap( reader_coalignment_graphs )
.headless( 1 )
.skipfold( ( ) -> new TupleAlignmentGraphDeserializer(
record_max_length_above,
record_max_length_below ) );
}
return graphs;
}
// Statistics filled in by computeStats the first time one of the getters
// below is called; null until then.
// Counts of the "above" component of the pairs visited by computeStats.
private FrequencyDistribution fd_sa = null;
// Counts of the individual "below" symbols of the pairs visited by
// computeStats.
private FrequencyDistribution fd_sb = null;
// Counts of the alphabet pairs found on graph edges.
// NOTE(review): the type argument on this declaration appears to have been
// lost in this copy of the file.
private FrequencyDistribution> fd_pair =
null;
/**
 * Fills the three frequency distributions by going through all graphs twice.
 *
 * The first pass counts the alphabet pair attached to every incoming edge of
 * every node. The second pass starts from the node at (la, lb) of each graph
 * and repeatedly follows the first incoming edge until an edge coming from
 * (0, 0) is reached, counting the above symbol and each below symbol of the
 * pairs met on the way.
 */
private void computeStats( ) {
    fd_sa = new FrequencyDistribution<>( );
    fd_sb = new FrequencyDistribution<>( );
    fd_pair = new FrequencyDistribution<>( );
    try ( AutoHook hook = new BasicAutoHook( ) ) {
        // Pass 1: every edge of every graph contributes its pair.
        for ( TupleAlignmentGraph graph : graphs.pull( hook ).once( ) ) {
            Iterator nodes = graph.forward( );
            while ( nodes.hasNext( ) ) {
                TupleAlignmentNode current = nodes.next( );
                int edge = 0;
                while ( edge < current.number_of_incoming_edges ) {
                    int code = current.incoming_edges[edge][2];
                    TupleAlignmentAlphabetPair edgePair =
                            alphabet.get( code );
                    fd_pair.accept( edgePair );
                    edge++;
                }
            }
        }
        // Pass 2: walk backwards along the first incoming edge of each node.
        for ( TupleAlignmentGraph graph : graphs.pull( hook ).once( ) ) {
            TupleAlignmentNode cursor = graph.get( graph.la( ), graph.lb( ) );
            boolean reachedOrigin = false;
            while ( !reachedOrigin ) {
                int fromA = cursor.incoming_edges[0][0];
                int fromB = cursor.incoming_edges[0][1];
                int code = cursor.incoming_edges[0][2];
                TupleAlignmentAlphabetPair stepPair =
                        alphabet.get( code );
                fd_sa.accept( stepPair.above );
                for ( O symbol : stepPair.below.asIterable( ) ) {
                    fd_sb.accept( symbol );
                }
                reachedOrigin = ( fromA == 0 && fromB == 0 );
                if ( !reachedOrigin ) {
                    cursor = graph.get( fromA, fromB );
                }
            }
        }
    }
}
/**
 * Returns the frequency distribution of "above" symbols, running
 * computeStats first when it has not been filled in yet.
 *
 * @return the distribution held in fd_sa.
 */
public FrequencyDistribution getIFD( ) {
    if ( fd_sa != null ) {
        return fd_sa;
    }
    computeStats( );
    return fd_sa;
}
/**
 * Returns the frequency distribution of "below" symbols, running
 * computeStats first when it has not been filled in yet.
 *
 * @return the distribution held in fd_sb.
 */
public FrequencyDistribution getOFD( ) {
    if ( fd_sb != null ) {
        return fd_sb;
    }
    computeStats( );
    return fd_sb;
}
/**
* Returns the frequency distribution of the alphabet pairs found on graph
* edges, running computeStats first when it has not been filled in yet.
*
* NOTE(review): the type argument of the return type appears to have been
* lost in this copy of the file.
*
* @return the distribution held in fd_pair.
*/
public FrequencyDistribution>
getPairFD( ) {
if ( fd_pair == null ) {
computeStats( );
}
return fd_pair;
}
/**
* Builds the alphabet by recording, for every tuple pair in the data, the
* (above symbol, below sub-tuple) pair attached to each edge of the pair's
* path matrix. As a side effect it updates the recorded maximum lengths and
* the recorded maximum number of edges of this object.
*
* NOTE(review): the type arguments on several declarations below appear to
* have been lost in this copy of the file.
*
* @param data
*          the tuple pairs to scan; consumed once.
* @param min_below
*          lower bound forwarded to TupleAlignmentGraphFactory.pathMatrix.
* @param max_below
*          upper bound forwarded to TupleAlignmentGraphFactory.pathMatrix.
* @param progress
*          advanced by one step for every tuple pair.
* @return the alphabet produced by the builder.
*/
private TupleAlignmentAlphabet
createAlphabet(
KnittingCursor, Tuple>> data,
int min_below,
int max_below,
Progress progress ) {
// NOTE(review): the hook opened here is not used inside the block.
try ( AutoHook hook = new BasicAutoHook( ) ) {
// Local running maxima; they shadow the fields of the same name and are
// copied into them once the whole dataset has been scanned.
int record_max_length_above = 0;
int record_max_length_below = 0;
int record_max_number_of_edges = 0;
TupleAlignmentAlphabetBuilder builder =
new TupleAlignmentAlphabetBuilder<>( );
for ( Bi, Tuple> datum : data.once( ) ) {
progress.step( 1 );
KnittingTuple extends I> ka =
KnittingTuple.wrap( datum.first );
KnittingTuple kb = KnittingTuple.wrap( datum.second );
int la = ka.size( );
int lb = kb.size( );
// Lengths are recorded before alignability is known, so pairs that turn
// out not to be alignable still count towards the maximum lengths.
if ( record_max_length_above < la ) {
record_max_length_above = la;
}
if ( record_max_length_below < lb ) {
record_max_length_below = lb;
}
try {
TupleAlignmentNode[][] matrix =
TupleAlignmentGraphFactory.pathMatrix( ka.size( ), kb.size( ), min_below, max_below );
int current_number_of_edges = 0;
// Visit every cell of the matrix except empty cells and the origin.
for ( int a = 0; a <= la; a++ ) {
for ( int b = 0; b <= lb; b++ ) {
if ( matrix[a][b] == null || ( a == 0 && b == 0 ) ) {
continue;
}
int[][] ie = matrix[a][b].incoming_edges;
int no_ie = matrix[a][b].number_of_incoming_edges;
current_number_of_edges = current_number_of_edges + no_ie;
for ( int e_i = 0; e_i < no_ie; e_i++ ) {
// The edge runs from cell (x, y) to cell (a, b).
int x = ie[e_i][0];
int y = ie[e_i][1];
I suba = ka.get( x );
// Presumably the b - y below symbols starting at position y -- TODO
// confirm against KnittingTuple.head.
KnittingTuple subb = kb.head( y, b - y );
builder.record( suba, subb );
}
}
}
if ( record_max_number_of_edges < current_number_of_edges ) {
record_max_number_of_edges = current_number_of_edges;
}
}
catch ( NotAlignableException e ) {
// Pairs that cannot be aligned contribute nothing to the alphabet or to
// the edge count; they are deliberately skipped.
}
}
this.record_max_length_above = record_max_length_above;
this.record_max_length_below = record_max_length_below;
this.record_max_number_of_edges = record_max_number_of_edges;
return builder.build( );
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy