All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.bridgedb.tools.qc.BridgeQC Maven / Gradle / Ivy

The newest version!
package org.bridgedb.tools.qc;
import java.io.File;
import java.io.OutputStream;
import java.io.PrintStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.bridgedb.DataSource;
import org.bridgedb.IDMapperException;
import org.bridgedb.Xref;
import org.bridgedb.bio.DataSourceTxt;
import org.bridgedb.bio.Organism;
import org.bridgedb.rdb.SimpleGdb;
import org.bridgedb.rdb.SimpleGdbFactory;

/**
 * Utility to do simple quality control on a BridgeDerby database.
 * Run with two parameters: [old database] and [new database]
 * Some basic comparisons will be done, which serves as a sanity check
 * that not suddenly a whole identifier system has gone missing.
 * 

* The script produces a report on STDOUT (configurable), lines starting with "INFO" * are strictly informative, whereas lines starting with "WARNING" are * problems worth investigating further. Ideally there are no "WARNING" lines * in the report. */ public class BridgeQC { private final File oldDb; private final File newDb; private SimpleGdb oldGdb; private SimpleGdb newGdb; private PrintStream out; /** * Compares two Derby databases and reports the output * to STDOUT. * * @param f1 the original Derby@prefix gpml: database * @param f2 the new Derby database */ public BridgeQC(File f1, File f2) throws IDMapperException { this(f1, f2, System.out); } /** * Compares two Derby databases and reports the output * to the given {@link java.io.OutputStream}. * * @param f1 the original Derby database * @param f2 the new Derby database */ public BridgeQC(File f1, File f2, OutputStream out) throws IDMapperException { if (out == null) throw new NullPointerException( "OutputStream is null" ); oldDb = f1; newDb = f2; this.out = new PrintStream(out); } Map oldSet = new HashMap(); Map newSet = new HashMap(); public void initDatabases() throws IDMapperException { String url1 = "jdbc:derby:jar:(" + oldDb + ")database"; oldGdb = SimpleGdbFactory.createInstance("old", url1); this.out.printf("INFO: old database is %s %s (build: %s)\n", oldGdb.getCapabilities().getProperty("DATASOURCENAME"), oldGdb.getCapabilities().getProperty("DATASOURCEVERSION"), oldGdb.getCapabilities().getProperty("BUILDDATE") ); String url2 = "jdbc:derby:jar:(" + newDb + ")database"; newGdb = SimpleGdbFactory.createInstance("new", url2); this.out.printf("INFO: new database is %s %s (build: %s)\n", newGdb.getCapabilities().getProperty("DATASOURCENAME"), newGdb.getCapabilities().getProperty("DATASOURCEVERSION"), newGdb.getCapabilities().getProperty("BUILDDATE") ); } public void compareDataSources() throws IDMapperException { for (DataSource ds : oldGdb.getCapabilities().getSupportedSrcDataSources()) { int oldGenes = oldGdb.getGeneCount(ds); oldSet.put (ds, oldGenes); } for (DataSource ds : newGdb.getCapabilities().getSupportedSrcDataSources()) { int newGenes = newGdb.getGeneCount(ds); newSet.put (ds, newGenes); } // not in new for (DataSource ds : oldSet.keySet()) { if (!newSet.containsKey(ds)) { this.out.printf("INFO: %s%s is only in old database\n", ds.getSystemCode(), (ds.getFullName() != null && ds.getFullName().length() > 0) ? " (" + ds.getFullName() + ")" : "" ); } } // not in old for (DataSource ds : newSet.keySet()) { int newGenes = newSet.get(ds); if (newGenes == 0) { this.out.println ("WARNING: " + ds.getSystemCode() + " has 0 ids"); } if (!oldSet.containsKey(ds)) { this.out.printf("INFO: %s%s is only in new database\n", ds.getSystemCode(), (ds.getFullName() != null && ds.getFullName().length() > 0) ? " (" + ds.getFullName() + ")" : "" ); this.out.printf ("INFO: Number of ids in %s%s: %d\n", ds.getSystemCode(), (ds.getFullName() != null && ds.getFullName().length() > 0) ? " (" + ds.getFullName() + ")" : "", newGenes ); } else { Set oldIDs = new HashSet(); for (Xref oldXref : oldGdb.getIterator(ds)) oldIDs.add(oldXref.getId()); Set newIDs = new HashSet(); for (Xref newXref : newGdb.getIterator(ds)) newIDs.add(newXref.getId()); // determine all new IDs Set newGenesAdded = new HashSet(); newGenesAdded.addAll(newIDs); newGenesAdded.removeAll(oldIDs); // determine all no longer existing (removed) IDs Set genesRemoved = new HashSet(); genesRemoved.addAll(oldIDs); genesRemoved.removeAll(newIDs); int oldGenes = oldSet.get(ds); double delta = (double)(newGenes - oldGenes) / (double)oldGenes; if (newGenesAdded.size() + genesRemoved.size() == 0) this.out.printf( "INFO: Number of ids in %s%s: %d (unchanged)\n", ds.getSystemCode(), (ds.getFullName() != null && ds.getFullName().length() > 0) ? " (" + ds.getFullName() + ")" : "", newGenes ); else this.out.printf( "INFO: Number of ids in %s%s: %d (%d added, %d removed -> overall changed %+3.1f%%)\n", ds.getSystemCode(), (ds.getFullName() != null && ds.getFullName().length() > 0) ? " (" + ds.getFullName() + ")" : "", newGenes, newGenesAdded.size(), genesRemoved.size(), (delta * 100) ); if (genesRemoved.size() > 0 && "true".equals(System.getProperty("showRemovedIDs", "false"))) this.out.printf( "INFO: The ids removed from %s%s: %s\n", ds.getSystemCode(), (ds.getFullName() != null && ds.getFullName().length() > 0) ? " (" + ds.getFullName() + ")" : "", "" + genesRemoved ); if (delta < -0.1) this.out.println ("WARNING: Number of ids in " + ds.getSystemCode() + " has shrunk by more than 10%"); } } } public void compareLinks() throws SQLException { Connection con = oldGdb.getConnection(); //TODO ... do something to compare cross-link consistency ... } public void checkDatabaseSanity() throws SQLException { Connection con = newGdb.getConnection(); Statement st = con.createStatement(); /** check for ids that occur in the link table but not in datanode table. We expect zero results */ String sql = "select coderight, idright from link left outer join datanode on link.idright = datanode.id and link.coderight = datanode.code where datanode.code IS NULL"; ResultSet rs = st.executeQuery(sql); if (rs.next()) { this.out.println ("ERROR: 'link' table contains ids that do not occur in 'datanode' table."); this.out.print ("ERROR: A few examples: "); String sep = ""; int i = 0; do { this.out.print (sep + rs.getString(1) + ":" + rs.getString(2)); sep = ", "; } while (rs.next() && ++i < 8); this.out.println(); this.out.println ("ERROR: These ids will not map properly."); } } public void compareFileSizes() throws SQLException { long oldSize = oldDb.length(); long newSize = newDb.length(); this.out.printf ("INFO: new size is %d Mb (changed %+3.1f%%)\n", newSize / 1000000, (double)(newSize - oldSize) / (double)oldSize * 100); } public void compareAttributes() throws IDMapperException { Set oldAttrSet = oldGdb.getAttributeSet(); Set newAttrSet = newGdb.getAttributeSet(); for (String oldAttr : oldAttrSet) { if (!newAttrSet.contains(oldAttr)) { this.out.println ("WARNING: Attribute " + oldAttr + " only in old database"); } } for (String newAttr : newAttrSet) { this.out.println ("INFO: Attribute provided: " + newAttr); if (!oldAttrSet.contains(newAttr)) { this.out.println ("INFO: Attribute " + newAttr + " only in new database"); } } } public static boolean safeEquals (Object a, Object b) { return a == null ? b == null : a.equals(b); } public interface PropertyChecker { abstract void check(String oldVal, String newVal, PrintStream out); } enum Props implements PropertyChecker { ORGANISM (true, false) { public void check(String oldVal, String newVal, PrintStream out) { if (newVal != null) { Organism o = Organism.fromLatinName(newVal); if (o == null) out.println ("WARNING: species '" + newVal + "' is not a recognized latin name"); } } }, DATASOURCENAME (true, true) { public void check(String oldVal, String newVal, PrintStream out) {} }, SERIES (true, true) { public void check(String oldVal, String newVal, PrintStream out) {} }, DATATYPE (true, true) { public void check(String oldVal, String newVal, PrintStream out) {} }, DATASOURCEVERSION (false, true) { public void check(String oldVal, String newVal, PrintStream out) {} }, BUILDDATE (false, true) { public void check(String oldVal, String newVal, PrintStream out) { SimpleDateFormat sft = new SimpleDateFormat("yyyyMMdd"); Date oldDate = null; Date newDate = null; try { if (oldVal != null) oldDate = sft.parse(oldVal); } catch (ParseException e) { out.println ("ERROR: " + oldVal + " does not match pattern yyyymmdd"); } try { if (newVal != null) newDate = sft.parse(newVal); } catch (ParseException e) { out.println ("ERROR: " + oldVal + " does not match pattern yyyymmdd"); } if (oldDate != null && newDate != null && oldDate.after(newDate)) { out.println ("ERROR: new date " + newVal + " is older than old date " + oldVal); } } }, SCHEMAVERSION (false, true) { public void check(String oldVal, String newVal, PrintStream out) {} }, ; private boolean mustBeSame; private boolean mustBeDefined; PrintStream out; Props(boolean mustBeSame, boolean mustBeDefined) { this.mustBeSame = mustBeSame; this.mustBeDefined = mustBeDefined; } public void checkWrap(String oldVal, String newVal, PrintStream out) { if (mustBeSame && !safeEquals (oldVal, newVal)) out.println ("WARNING: old " + name() + " '" + oldVal + "' doesn\'t match new " + name() + " '" + newVal + "'"); if (mustBeDefined && (newVal == null || newVal.equals(""))) out.println ("WARNING: property " + name() + " is undefined"); check(oldVal, newVal, out); } } public void compareInfo() { for (Props p : Props.values()) { p.checkWrap(oldGdb.getCapabilities().getProperty(p.name()), newGdb.getCapabilities().getProperty(p.name()), System.out); } } public void run() throws IDMapperException, SQLException { initDatabases(); checkDatabaseSanity(); compareInfo(); compareDataSources(); compareLinks(); compareAttributes(); compareFileSizes(); summarizeOverallStats(oldGdb, "OLD"); summarizeOverallStats(newGdb, "NEW"); } private void summarizeOverallStats(SimpleGdb gdb, String oldNew) throws IDMapperException, SQLException { this.out.println("INFO: " + oldNew + " database has a total number of identifiers of " + gdb.getGeneCount()); this.out.println("INFO: " + oldNew + " database has a total number of mappings of " + gdb.getLinkCount()); Connection con = gdb.getConnection(); con.setAutoCommit(false); Statement st = con.createStatement(); String sqlSchema = "SELECT schemaversion FROM info "; ResultSet schema = st.executeQuery(sqlSchema); boolean isSchemaUpdated = (schema.next() && schema.getInt("schemaversion") >= 4); if (isSchemaUpdated) { for (DataSource ds : gdb.getCapabilities().getSupportedSrcDataSources()) { this.out.println("INFO: " + oldNew + " database data source " + ds.getFullName() + " has " + gdb.getPrimaryIDCount(ds) + " primary ids"); this.out.println("INFO: " + oldNew + " database data source " + ds.getFullName() + " has " + (gdb.getGeneCount(ds) - gdb.getPrimaryIDCount(ds)) + " secondary ids"); } } else { this.out.println("INFO: " + oldNew + " database has Schema Version is less than 4, and we cannot calculate Primary and Secondary identifier counts"); } } public static void printUsage() { System.out.println ("Expected 2 arguments: "); } /** * @param args * @throws IDMapperException * @throws SQLException */ public static void main(String[] args) throws IDMapperException, SQLException { if (args.length != 2) { printUsage(); return; } BridgeQC main = new BridgeQC (new File(args[0]), new File(args[1])); DataSourceTxt.init(); main.run(); PatternChecker checker = new PatternChecker(); checker.run(new File(args[0])); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy