org.geneweaver.io.connector.VariantConnector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gweaver-stream-io Show documentation
Show all versions of gweaver-stream-io Show documentation
The IO bundle for Geneweaver.
/*-
*
* Copyright 2018, 2020 The Jackson Laboratory Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @author Matthew Gerring
*/
package org.geneweaver.io.connector;
import java.io.PrintStream;
import java.lang.ref.SoftReference;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.geneweaver.domain.Entity;
import org.geneweaver.domain.GeneticEntity;
import org.geneweaver.domain.Transcript;
import org.geneweaver.domain.Variant;
import org.geneweaver.domain.VariantEffect;
import org.neo4j.ogm.cypher.ComparisonOperator;
import org.neo4j.ogm.cypher.Filter;
import org.neo4j.ogm.cypher.Filters;
import org.neo4j.ogm.session.Session;
/**
* A connector function which makes sure that variant effects linking to variants are extracted as the file
* is parsed. This is desirable because it makes parsing a large variant file or files fast.
*
* @author gerrim
* @param type of entity in the file
* @param type of entity after mapping using the connector.
*/
public class VariantConnector implements Connector, Function> {
/**
* Soft reference cache to reduce memory leaks. Garbage collector will nullify them as needed.
*/
private Map> cache = new HashMap<>();
private boolean useSessions;
/**
* Creates a connector that links variants to their effects without looking anything up
* in an active Neo4j session. Equivalent to calling the boolean constructor with false.
*/
public VariantConnector() {
this(false);
}
/**
* Creates a connector that links variants to their effects, looking up transcripts in an
* active Neo4j session if useSessions is true.
* @param useSessions - when false, stream(...) delegates to apply(...) and never touches the
* session. When true, stream(...) resolves the transcript of each effect through the session
* it is given and only emits effects whose transcript was found. Use with caution as a session
* is required and there may be more than one session if the objects are being parsed in
* parallel. If you do not know how to use this param, leave it as false.
*/
public VariantConnector(boolean useSessions) {
// Stored once here; read by stream(...) to choose between the two code paths.
this.useSessions = useSessions;
}
/**
 * Expands a variant into a stream holding the variant itself followed by its usable effects.
 * <p>
 * An effect is usable when it is non-null and has a feature id that is neither null nor
 * blank after trimming. Each usable effect is pointed back at the variant before it is
 * emitted. Whatever is returned, the effects held by the variant are cleared afterwards,
 * because the relationships are never saved inside the Variant.
 *
 * @param ge the entity to expand; it is cast to {@link Variant}.
 * @return a stream of the variant alone when it has no effects, otherwise the variant
 *         followed by its usable effects (duplicates removed by collecting to a set).
 */
@SuppressWarnings("unchecked")
@Override
public Stream apply(GeneticEntity ge) {
    final Variant variant = (Variant) ge;
    try {
        final boolean withoutEffects =
                variant.getVariantEffect() == null || variant.getVariantEffect().isEmpty();
        if (withoutEffects) {
            return (Stream) Stream.of(variant);
        }

        // Collected eagerly, so the result no longer depends on the variant's own
        // collection once the finally block below has cleared it.
        final Set<VariantEffect> linked = variant.getVariantEffect()
                .stream()
                .filter(effect -> effect != null
                        && effect.getFeatureId() != null
                        && !effect.getFeatureId().trim().isBlank())
                .map(effect -> {
                    effect.setVariant(variant);
                    return effect;
                })
                .collect(Collectors.toSet());

        // The variant always comes first, then its effects.
        final Collection<Object> emitted = new LinkedList<>();
        emitted.add(variant);
        emitted.addAll(linked);
        return (Stream) emitted.stream();
    } finally {
        // We never actually save the relationships inside the Variant.
        variant.clearEffects();
    }
}
/**
 * Expands a variant into a stream holding the variant itself followed by the effects that
 * could be linked to it.
 * <p>
 * If this connector was created with useSessions false, the call is handed to
 * {@code apply(ge)} unchanged. Otherwise the feature ids of the variant's effects are
 * resolved to Transcripts (through the cache and, where needed, the session), and only
 * effects whose Transcript was found are emitted, each linked to both the Transcript and
 * the variant. Null effects are skipped. Whatever is returned, the effects held by the
 * variant are cleared afterwards.
 *
 * @param ge the entity to expand; it is cast to {@link Variant}.
 * @param session - not required if useSessions is false.
 * @param log not read by this method.
 * @return a stream of the variant alone when it has no effects or no usable feature ids,
 *         otherwise the variant followed by its linkable effects.
 */
@SuppressWarnings("unchecked")
@Override
public Stream stream(N ge, Session session, PrintStream log) {
    if (!useSessions) {
        return apply(ge);
    }

    final Variant v = (Variant) ge;
    try {
        if (v.getVariantEffect() == null || v.getVariantEffect().isEmpty()) {
            return (Stream) Stream.of(v);
        }

        // Ids worth looking up: the effect exists and its feature id is not null or blank.
        Set<String> transIds = v.getVariantEffect().stream()
                .filter(e -> e != null)
                .filter(e -> e.getFeatureId() != null)
                .filter(e -> !e.getFeatureId().trim().isBlank())
                .map(e -> e.getFeatureId())
                .collect(Collectors.toSet());
        if (transIds.isEmpty()) {
            return (Stream) Stream.of(v); // It gets cleared on finally.
        }

        Map<String, Transcript> allTranscripts = getCachedFilters(transIds, session);

        Collection<VariantEffect> ve = v.getVariantEffect()
                .stream()
                // Null effects were tolerated when the ids were gathered above, so they
                // must be skipped here too; registerTranscript dereferences its argument.
                .filter(e -> e != null)
                .map(e -> registerTranscript(v, e, allTranscripts))
                .filter(t -> t != null) // null means no Transcript was found for the effect
                .collect(Collectors.toSet());

        // The variant always comes first, then its linked effects.
        Collection<Object> ret = new LinkedList<>();
        ret.add(v);
        ret.addAll(ve);
        return (Stream) ret.stream();
    } finally {
        // We never actually save the relationships inside the Variant.
        v.clearEffects();
    }
}
/**
 * Resolves transcript ids to Transcripts, using the cache first and one combined session
 * query for whatever the cache cannot answer.
 * <p>
 * The logic of this is a little hard to understand. The following points help:
 * 1. We do not want to run more filters than we have to, they are slow.
 * 2. Cached Transcripts are held by soft reference so that caching cannot use all the memory.
 * 3. If a Transcript is not there, we record the id in the cache with a null value to
 *    save doing many negative filters.
 *
 * @param transIds the transcript ids to resolve; the set is not modified.
 * @param session the session queried for ids that are neither cached nor known to be
 *        missing; it is not used when every id is answered by the cache.
 * @return a map from transcript id to Transcript containing only the ids that could be
 *         resolved; ids without a Transcript are simply absent.
 */
private Map<String, Transcript> getCachedFilters(Set<String> transIds, Session session) {
    // This seems clumsy and non-functional but the Filter object is limited.
    Map<String, Transcript> allTranscripts = new HashMap<>();
    Filters filters = new Filters();

    // 1. Traverse the ids: take the cached Transcript, skip ids already known to be
    //    missing, and queue a filter for everything else.
    for (String tid : transIds) {
        if (cache.containsKey(tid)) {
            SoftReference<Transcript> ref = cache.get(tid);
            if (ref == null) {
                continue; // Explicitly cached as "no such Transcript": do not filter again.
            }
            Transcript cached = ref.get();
            if (cached != null) {
                allTranscripts.put(tid, cached);
                continue; // We cached it!
            }
            // The soft reference was cleared by the garbage collector; drop the stale
            // entry and fall through so the id is read through a filter again.
            cache.remove(tid);
        }
        filters = filters.or(new Filter("transcriptId", ComparisonOperator.EQUALS, tid));
    }

    // 2. Try not to do filtering unless we have to, it's slow.
    if (!filters.isEmpty()) {
        Collection<Transcript> transcripts = session.loadAll(Transcript.class, filters);
        // The merge function keeps the first Transcript seen for an id. Without it,
        // toMap throws IllegalStateException if two loaded Transcripts share an id.
        Map<String, Transcript> tmap = transcripts.stream()
                .collect(Collectors.toMap(Transcript::getTranscriptId,
                                          Function.identity(),
                                          (first, duplicate) -> first));
        // Add the filtered Transcripts to the cache with a soft reference.
        for (Map.Entry<String, Transcript> entry : tmap.entrySet()) {
            cache.put(entry.getKey(), new SoftReference<>(entry.getValue()));
        }
        allTranscripts.putAll(tmap);
    }

    // 3. Anything we still did not find is recorded as a permanent null, so that the
    //    next request for the same id skips the filter entirely.
    for (String tid : transIds) {
        if (!allTranscripts.containsKey(tid)) {
            cache.put(tid, null); // Set to null and stays null
        }
    }
    return allTranscripts;
}
/**
 * Links an effect to the Transcript named by its feature id and to the given variant.
 *
 * @param v the variant the effect belongs to.
 * @param e the effect to link; must not be null.
 * @param tmap resolved Transcripts keyed by transcript id.
 * @return the same effect with its transcript and variant set, or null when tmap holds
 *         no Transcript for the effect's feature id (the effect is left unchanged).
 */
private VariantEffect registerTranscript(Variant v, VariantEffect e, Map<String, Transcript> tmap) {
    Transcript t = tmap.get(e.getFeatureId());
    if (t == null) {
        return null; // We cannot link this one, no relationship will be made.
    }
    e.setTranscript(t);
    e.setVariant(v);
    return e;
}
}