// org.bridgedb.tools.qc.BridgeQC (Maven / Gradle / Ivy)
package org.bridgedb.tools.qc;
import java.io.File;
import java.io.OutputStream;
import java.io.PrintStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.bridgedb.DataSource;
import org.bridgedb.IDMapperException;
import org.bridgedb.Xref;
import org.bridgedb.bio.DataSourceTxt;
import org.bridgedb.bio.Organism;
import org.bridgedb.rdb.SimpleGdb;
import org.bridgedb.rdb.SimpleGdbFactory;
/**
 * Utility to do simple quality control on a BridgeDerby database.
 * Run with two parameters: [old database] and [new database]
 * Some basic comparisons will be done, which serves as a sanity check
 * that not suddenly a whole identifier system has gone missing.
 *
 * The script produces a report on STDOUT (configurable), lines starting with "INFO"
 * are strictly informative, whereas lines starting with "WARNING" are
 * problems worth investigating further. Ideally there are no "WARNING" lines
 * in the report.
 */
public class BridgeQC
{
    private final File oldDb;
    private final File newDb;
    private SimpleGdb oldGdb;
    private SimpleGdb newGdb;

    /** Stream that every report line of this instance is written to. */
    private final PrintStream out;

    /** Id count per data source of the old database; filled by {@link #compareDataSources()}. */
    Map<DataSource, Integer> oldSet = new HashMap<DataSource, Integer>();

    /** Id count per data source of the new database; filled by {@link #compareDataSources()}. */
    Map<DataSource, Integer> newSet = new HashMap<DataSource, Integer>();

    /**
     * Compares two Derby databases and reports the output
     * to STDOUT.
     *
     * @param f1 the original Derby database
     * @param f2 the new Derby database
     * @throws IDMapperException declared for callers; nothing in this constructor throws it
     */
    public BridgeQC(File f1, File f2) throws IDMapperException
    {
        this(f1, f2, System.out);
    }

    /**
     * Compares two Derby databases and reports the output
     * to the given {@link java.io.OutputStream}.
     *
     * @param f1 the original Derby database
     * @param f2 the new Derby database
     * @param out the stream the report is written to; must not be null
     * @throws NullPointerException if {@code out} is null
     * @throws IDMapperException declared for callers; nothing in this constructor throws it
     */
    public BridgeQC(File f1, File f2, OutputStream out) throws IDMapperException
    {
        if (out == null)
        {
            throw new NullPointerException("OutputStream is null");
        }
        oldDb = f1;
        newDb = f2;
        this.out = new PrintStream(out);
    }

    /**
     * Opens the old and the new database (in that order) and prints one
     * INFO line with name, version and build date for each of them.
     *
     * @throws IDMapperException if one of the databases cannot be opened
     */
    public void initDatabases() throws IDMapperException
    {
        oldGdb = openDatabase("old", oldDb);
        newGdb = openDatabase("new", newDb);
    }

    /**
     * Opens one database file and reports its identifying properties.
     *
     * @param label "old" or "new"; used as the connection name and in the report line
     * @param db the database file
     * @return the opened database
     * @throws IDMapperException if the database cannot be opened
     */
    private SimpleGdb openDatabase(String label, File db) throws IDMapperException
    {
        String url = "jdbc:derby:jar:(" + db + ")database";
        SimpleGdb gdb = SimpleGdbFactory.createInstance(label, url);
        this.out.printf("INFO: %s database is %s %s (build: %s)\n",
            label,
            gdb.getCapabilities().getProperty("DATASOURCENAME"),
            gdb.getCapabilities().getProperty("DATASOURCEVERSION"),
            gdb.getCapabilities().getProperty("BUILDDATE")
        );
        return gdb;
    }

    /**
     * Returns the text appended after a system code in report lines:
     * the full name of the data source in parentheses with a leading
     * space, or the empty string when the full name is null or empty.
     */
    private static String fullNameSuffix(DataSource ds)
    {
        String fullName = ds.getFullName();
        if (fullName == null || fullName.length() == 0)
        {
            return "";
        }
        return " (" + fullName + ")";
    }

    /**
     * Compares the supported source data sources of both databases.
     * Reports data sources that exist in only one of the databases, warns
     * about data sources with 0 ids in the new database, and for data
     * sources present in both reports how many ids were added and removed.
     *
     * @throws IDMapperException if querying either database fails
     */
    public void compareDataSources() throws IDMapperException
    {
        for (DataSource ds : oldGdb.getCapabilities().getSupportedSrcDataSources())
        {
            oldSet.put(ds, oldGdb.getGeneCount(ds));
        }
        for (DataSource ds : newGdb.getCapabilities().getSupportedSrcDataSources())
        {
            newSet.put(ds, newGdb.getGeneCount(ds));
        }

        // not in new
        for (DataSource ds : oldSet.keySet())
        {
            if (!newSet.containsKey(ds))
            {
                this.out.printf("INFO: %s%s is only in old database\n",
                    ds.getSystemCode(), fullNameSuffix(ds));
            }
        }

        // not in old, or in both
        for (Map.Entry<DataSource, Integer> entry : newSet.entrySet())
        {
            DataSource ds = entry.getKey();
            int newGenes = entry.getValue();
            if (newGenes == 0)
            {
                this.out.println("WARNING: " + ds.getSystemCode() + " has 0 ids");
            }
            Integer oldGenes = oldSet.get(ds);
            if (oldGenes == null)
            {
                this.out.printf("INFO: %s%s is only in new database\n",
                    ds.getSystemCode(), fullNameSuffix(ds));
                this.out.printf("INFO: Number of ids in %s%s: %d\n",
                    ds.getSystemCode(), fullNameSuffix(ds), newGenes);
            }
            else
            {
                reportChangedIds(ds, oldGenes, newGenes);
            }
        }
    }

    /**
     * Collects the id strings of every Xref the database iterates for the given data source.
     */
    private static Set<String> collectIds(SimpleGdb gdb, DataSource ds) throws IDMapperException
    {
        Set<String> ids = new HashSet<String>();
        for (Xref xref : gdb.getIterator(ds))
        {
            ids.add(xref.getId());
        }
        return ids;
    }

    /**
     * Reports the id changes of a data source that is present in both databases.
     * Removed ids are listed only when the system property
     * {@code showRemovedIDs} is set to {@code true}.
     *
     * @param ds the data source to report on
     * @param oldGenes id count of {@code ds} in the old database
     * @param newGenes id count of {@code ds} in the new database
     * @throws IDMapperException if iterating the ids of either database fails
     */
    private void reportChangedIds(DataSource ds, int oldGenes, int newGenes) throws IDMapperException
    {
        Set<String> oldIDs = collectIds(oldGdb, ds);
        Set<String> newIDs = collectIds(newGdb, ds);

        // determine all new IDs
        Set<String> newGenesAdded = new HashSet<String>(newIDs);
        newGenesAdded.removeAll(oldIDs);

        // determine all no longer existing (removed) IDs
        Set<String> genesRemoved = new HashSet<String>(oldIDs);
        genesRemoved.removeAll(newIDs);

        // Floating-point division: oldGenes == 0 gives Infinity or NaN instead of
        // throwing, and neither value satisfies the "shrunk" test below.
        double delta = (double) (newGenes - oldGenes) / (double) oldGenes;

        if (newGenesAdded.isEmpty() && genesRemoved.isEmpty())
        {
            this.out.printf(
                "INFO: Number of ids in %s%s: %d (unchanged)\n",
                ds.getSystemCode(), fullNameSuffix(ds), newGenes);
        }
        else
        {
            this.out.printf(
                "INFO: Number of ids in %s%s: %d (%d added, %d removed -> overall changed %+3.1f%%)\n",
                ds.getSystemCode(), fullNameSuffix(ds),
                newGenes,
                newGenesAdded.size(),
                genesRemoved.size(),
                (delta * 100));
        }

        if (genesRemoved.size() > 0 && "true".equals(System.getProperty("showRemovedIDs", "false")))
        {
            this.out.printf(
                "INFO: The ids removed from %s%s: %s\n",
                ds.getSystemCode(), fullNameSuffix(ds), genesRemoved);
        }

        if (delta < -0.1)
        {
            this.out.println("WARNING: Number of ids in " + ds.getSystemCode() + " has shrunk by more than 10%");
        }
    }

    /**
     * Placeholder for a cross-link consistency comparison; currently it only
     * obtains the connection of the old database and reports nothing.
     *
     * @throws SQLException declared for callers of the future implementation
     */
    public void compareLinks() throws SQLException
    {
        Connection con = oldGdb.getConnection();
        //TODO ... do something to compare cross-link consistency ...
    }

    /**
     * Checks the new database for ids that occur in the 'link' table but not
     * in the 'datanode' table. Zero results are expected; otherwise ERROR
     * lines with at most 8 example ids are printed.
     *
     * @throws SQLException if the query fails
     */
    public void checkDatabaseSanity() throws SQLException
    {
        Connection con = newGdb.getConnection();
        Statement st = con.createStatement();
        try
        {
            // check for ids that occur in the link table but not in datanode table. We expect zero results
            String sql = "select coderight, idright from link left outer join datanode on link.idright = datanode.id and link.coderight = datanode.code where datanode.code IS NULL";
            ResultSet rs = st.executeQuery(sql);
            if (rs.next())
            {
                this.out.println("ERROR: 'link' table contains ids that do not occur in 'datanode' table.");
                this.out.print("ERROR: A few examples: ");
                String sep = "";
                int i = 0;
                do
                {
                    this.out.print(sep + rs.getString(1) + ":" + rs.getString(2));
                    sep = ", ";
                }
                while (rs.next() && ++i < 8);
                this.out.println();
                this.out.println("ERROR: These ids will not map properly.");
            }
        }
        finally
        {
            // Closing the statement also releases its result set; the connection
            // is obtained from the database object and is left open.
            st.close();
        }
    }

    /**
     * Reports the size of the new database file and its relative change
     * compared to the old database file.
     *
     * @throws SQLException declared for callers; nothing in this method throws it
     */
    public void compareFileSizes() throws SQLException
    {
        long oldSize = oldDb.length();
        long newSize = newDb.length();
        // Floating-point division: an old size of 0 prints Infinity/NaN rather than throwing.
        double changePercent = (double) (newSize - oldSize) / (double) oldSize * 100;
        this.out.printf("INFO: new size is %d Mb (changed %+3.1f%%)\n", newSize / 1000000, changePercent);
    }

    /**
     * Compares the attribute sets of both databases: warns about attributes
     * that are only in the old database, and lists every attribute of the new
     * database, marking those that the old database does not have.
     *
     * @throws IDMapperException if an attribute set cannot be retrieved
     */
    public void compareAttributes() throws IDMapperException
    {
        Set<String> oldAttrSet = oldGdb.getAttributeSet();
        Set<String> newAttrSet = newGdb.getAttributeSet();
        for (String oldAttr : oldAttrSet)
        {
            if (!newAttrSet.contains(oldAttr))
            {
                this.out.println("WARNING: Attribute " + oldAttr + " only in old database");
            }
        }
        for (String newAttr : newAttrSet)
        {
            this.out.println("INFO: Attribute provided: " + newAttr);
            if (!oldAttrSet.contains(newAttr))
            {
                this.out.println("INFO: Attribute " + newAttr + " only in new database");
            }
        }
    }

    /**
     * Null-safe equality test.
     *
     * @return true if both arguments are null, or if {@code a} is not null
     *         and {@code a.equals(b)} is true
     */
    public static boolean safeEquals(Object a, Object b)
    {
        return a == null ? b == null : a.equals(b);
    }

    /** A property-specific check that compares the old and the new value of one database property. */
    public interface PropertyChecker
    {
        /**
         * @param oldVal property value in the old database, may be null
         * @param newVal property value in the new database, may be null
         * @param out stream to report problems to
         */
        void check(String oldVal, String newVal, PrintStream out);
    }

    /**
     * The database properties inspected by {@link #compareInfo()}. Each constant
     * states whether its value must be identical in both databases, whether it
     * must be defined in the new database, and supplies its own extra check.
     */
    enum Props implements PropertyChecker
    {
        ORGANISM (true, false) {
            @Override
            public void check(String oldVal, String newVal, PrintStream out)
            {
                if (newVal != null)
                {
                    Organism o = Organism.fromLatinName(newVal);
                    if (o == null) out.println("WARNING: species '" + newVal + "' is not a recognized latin name");
                }
            }
        },
        DATASOURCENAME (true, true) {
            @Override
            public void check(String oldVal, String newVal, PrintStream out) {}
        },
        SERIES (true, true) {
            @Override
            public void check(String oldVal, String newVal, PrintStream out) {}
        },
        DATATYPE (true, true) {
            @Override
            public void check(String oldVal, String newVal, PrintStream out) {}
        },
        DATASOURCEVERSION (false, true) {
            @Override
            public void check(String oldVal, String newVal, PrintStream out) {}
        },
        BUILDDATE (false, true) {
            @Override
            public void check(String oldVal, String newVal, PrintStream out)
            {
                Date oldDate = parseBuildDate(oldVal, out);
                Date newDate = parseBuildDate(newVal, out);
                if (oldDate != null && newDate != null && oldDate.after(newDate))
                {
                    out.println("ERROR: new date " + newVal + " is older than old date " + oldVal);
                }
            }
        },
        SCHEMAVERSION (false, true) {
            @Override
            public void check(String oldVal, String newVal, PrintStream out) {}
        },
        ;

        private boolean mustBeSame;
        private boolean mustBeDefined;

        // Not read or written anywhere in this class; kept so the enum's
        // package-visible members stay unchanged.
        PrintStream out;

        Props(boolean mustBeSame, boolean mustBeDefined)
        {
            this.mustBeSame = mustBeSame;
            this.mustBeDefined = mustBeDefined;
        }

        /**
         * Parses a build date with the pattern yyyyMMdd.
         *
         * @param val the value to parse, may be null
         * @param out stream an ERROR line naming {@code val} is written to when parsing fails
         * @return the parsed date, or null if {@code val} is null or cannot be parsed
         */
        private static Date parseBuildDate(String val, PrintStream out)
        {
            if (val == null)
            {
                return null;
            }
            try
            {
                return new SimpleDateFormat("yyyyMMdd").parse(val);
            }
            catch (ParseException e)
            {
                out.println("ERROR: " + val + " does not match pattern yyyymmdd");
                return null;
            }
        }

        /**
         * Runs the generic checks (same value, defined value) that are enabled
         * for this property, followed by the property-specific check.
         *
         * @param oldVal property value in the old database, may be null
         * @param newVal property value in the new database, may be null
         * @param out stream to report problems to
         */
        public void checkWrap(String oldVal, String newVal, PrintStream out)
        {
            if (mustBeSame && !safeEquals(oldVal, newVal))
                out.println("WARNING: old " + name() + " '" + oldVal + "' doesn't match new " + name() + " '" + newVal + "'");
            if (mustBeDefined && (newVal == null || newVal.equals("")))
                out.println("WARNING: property " + name() + " is undefined");
            check(oldVal, newVal, out);
        }
    }

    /**
     * Checks every property listed in {@link Props} against both databases
     * and writes the findings to this instance's report stream.
     */
    public void compareInfo()
    {
        for (Props p : Props.values())
        {
            p.checkWrap(oldGdb.getCapabilities().getProperty(p.name()),
                newGdb.getCapabilities().getProperty(p.name()), this.out);
        }
    }

    /**
     * Opens both databases and runs all checks and comparisons in a fixed order.
     * The report stream is flushed when the run ends, also when it ends with
     * an exception.
     *
     * @throws IDMapperException if a database cannot be opened or queried
     * @throws SQLException if one of the SQL queries fails
     */
    public void run() throws IDMapperException, SQLException
    {
        try
        {
            initDatabases();
            checkDatabaseSanity();
            compareInfo();
            compareDataSources();
            compareLinks();
            compareAttributes();
            compareFileSizes();
            summarizeOverallStats(oldGdb, "OLD");
            summarizeOverallStats(newGdb, "NEW");
        }
        finally
        {
            // The report stream may wrap a buffered stream supplied by the caller.
            this.out.flush();
        }
    }

    /**
     * Prints the total number of identifiers and mappings of a database and,
     * when the schema version read from its info table is at least 4, the
     * number of primary and secondary ids per data source.
     *
     * @param gdb the database to summarize
     * @param oldNew label used in the report lines ("OLD" or "NEW")
     * @throws IDMapperException if querying the database fails
     * @throws SQLException if reading the schema version fails
     */
    private void summarizeOverallStats(SimpleGdb gdb, String oldNew) throws IDMapperException, SQLException
    {
        this.out.println("INFO: " + oldNew + " database has a total number of identifiers of " + gdb.getGeneCount());
        this.out.println("INFO: " + oldNew + " database has a total number of mappings of " + gdb.getLinkCount());

        Connection con = gdb.getConnection();
        con.setAutoCommit(false);

        boolean isSchemaUpdated;
        Statement st = con.createStatement();
        try
        {
            ResultSet schema = st.executeQuery("SELECT schemaversion FROM info ");
            isSchemaUpdated = (schema.next() && schema.getInt("schemaversion") >= 4);
        }
        finally
        {
            // Closing the statement also releases its result set.
            st.close();
        }

        if (isSchemaUpdated)
        {
            for (DataSource ds : gdb.getCapabilities().getSupportedSrcDataSources())
            {
                int primaryIds = gdb.getPrimaryIDCount(ds);
                this.out.println("INFO: " + oldNew + " database data source " + ds.getFullName() + " has " + primaryIds + " primary ids");
                this.out.println("INFO: " + oldNew + " database data source " + ds.getFullName() + " has " + (gdb.getGeneCount(ds) - primaryIds) + " secondary ids");
            }
        }
        else
        {
            this.out.println("INFO: " + oldNew + " database has Schema Version is less than 4, and we cannot calculate Primary and Secondary identifier counts");
        }
    }

    /** Prints the expected command line arguments to STDOUT. */
    public static void printUsage()
    {
        System.out.println("Expected 2 arguments: <old database> <new database>");
    }

    /**
     * Command line entry point: runs the quality control report for the two
     * given databases and then runs a PatternChecker on the first one.
     *
     * @param args the old database file and the new database file
     * @throws IDMapperException if a database cannot be opened or queried
     * @throws SQLException if one of the SQL queries fails
     */
    public static void main(String[] args) throws IDMapperException, SQLException
    {
        if (args.length != 2) { printUsage(); return; }
        BridgeQC main = new BridgeQC(new File(args[0]), new File(args[1]));
        DataSourceTxt.init();
        main.run();
        PatternChecker checker = new PatternChecker();
        checker.run(new File(args[0]));
    }
}