All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.ontology.obo.OboFileParser Maven / Gradle / Ivy

There is a newer version: 7.1.4
Show newest version
/*
 *                  BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on Jan 18, 2008
 *
 */

package org.biojava.nbio.ontology.obo;

import org.biojava.nbio.ontology.Synonym;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;


/** A class to parse the content of an OBO file. It delegates handling of the
 * content to the OBOFileEventListener implementation.
 *
 * This file contains parts of the OBO-Edit file OBOParseEngine, (particularly the encoding and decoding part)
 *
 * http://geneontology.cvs.sourceforge.net/geneontology/go-dev/java/oboedit/sources/org/geneontology/oboedit/dataadapter/OBOParseEngine.java?revision=1.10&view=markup
 * Thanks to the OboEdit developers for giving permission to release this in BioJava.
 *
 *
 * @author Andreas Prlic
 * @author John Day Richter
 * @since 1.6
 */
public class OboFileParser {

	private static final Logger logger = LoggerFactory.getLogger(OboFileParser.class);

	/** Listeners that are notified of stanzas, keys and synonyms while parsing. */
	List<OboFileEventListener> listeners;

	/** Line text that the parsing helpers append to their error messages. */
	protected String line;
	/** Line number that the parsing helpers append to their error messages. */
	protected int linenum = 0;
	// NOTE(review): totalSize and bytesRead are not read or written anywhere in
	// the visible part of this file; presumably kept for subclasses - confirm.
	protected int totalSize = 0;
	protected int bytesRead = 0;
	/** Scratch buffer that {@link #getTempBuffer()} empties and hands out. */
	protected StringBuffer tempBuffer = new StringBuffer();
	// NOTE(review): not used in the visible part of this file. SimpleDateFormat
	// is not thread-safe, so it must not be shared between threads.
	protected SimpleDateFormat dateFormat = new SimpleDateFormat("dd:MM:yyyy HH:mm", Locale.US);


	/**
	 * Escape table used by {@code unescape}: the key is the character that
	 * follows a backslash, the value is the character that sequence stands for
	 * (for example {@code n} maps to a newline and {@code W} to a space).
	 */
	protected static final Map<Character, Character> escapeChars =
		new HashMap<Character, Character>();

	/**
	 * Inverse of {@link #escapeChars}, used by {@code escape}: the key is a
	 * literal character, the value is the code to write after a backslash.
	 */
	protected static final Map<Character, Character> unescapeChars =
		new HashMap<Character, Character>();

	static {
		escapeChars.put('n', '\n');
		escapeChars.put('W', ' ');
		escapeChars.put('t', '\t');
		escapeChars.put(':', ':');
		escapeChars.put(',', ',');
		escapeChars.put('"', '"');
		escapeChars.put('\'', '\'');
		escapeChars.put('\\', '\\');
		escapeChars.put('{', '{');
		escapeChars.put('}', '}');
		escapeChars.put('(', '(');
		escapeChars.put(')', ')');
		escapeChars.put('[', '[');
		escapeChars.put(']', ']');
		escapeChars.put('!', '!');

		// Every value above is distinct, so swapping key and value loses nothing.
		for (Map.Entry<Character, Character> entry : escapeChars.entrySet()) {
			unescapeChars.put(entry.getValue(), entry.getKey());
		}
	}

	/**
	 * Simple mutable holder for a piece of text together with one or two
	 * positions in the string it was read from. What the positions mean is up
	 * to the method that creates the pair; {@code -1} is used for "not set".
	 */
	public static class SOPair {

		/** The extracted text. */
		public String str;

		/** First position, or -1 if none was recorded. */
		public int index;

		/** Second (end) position, or -1 if none was recorded. */
		public int endIndex;

		/**
		 * Creates a pair with both positions given explicitly.
		 *
		 * @param str the extracted text
		 * @param index the first position
		 * @param endIndex the end position
		 */
		public SOPair(String str, int index, int endIndex) {
			this.str = str;
			this.index = index;
			this.endIndex = endIndex;
		}

		/**
		 * Creates a pair whose end position is left at -1.
		 *
		 * @param str the extracted text
		 * @param index the first position
		 */
		public SOPair(String str, int index) {
			this(str, index, -1);
		}
	}




	/** Creates a parser that has no listeners registered yet. */
	public OboFileParser(){
		listeners = new ArrayList<OboFileEventListener>();
	}



	/**
	 * Registers a listener; it is appended to the list of listeners that are
	 * called for every stanza, key and synonym found while parsing.
	 *
	 * @param listener the listener to add
	 */
	public void addOboFileEventListener(OboFileEventListener listener){
		this.listeners.add(listener);
	}

	/**
	 * Returns the registered listeners.
	 *
	 * @return the parser's own listener list (not a copy), so changes made to
	 *         the returned list affect this parser
	 */
	public List<OboFileEventListener> getOboFileEventListener(){
		return listeners;
	}

	/**
	 * Parses OBO content line by line and reports it to the registered
	 * listeners.
	 * <p>
	 * A line of the form {@code [Name]} starts a new stanza. Every other
	 * non-empty line is read as {@code tag: value}: the text from the first
	 * unescaped, unquoted {@code !} onwards is dropped as a comment, and a
	 * trailing modifier in braces is parsed (and so validated) but not passed
	 * on. Synonym tags are reported as {@link Synonym} objects, all other tags
	 * as key/value pairs. Lines that hold nothing but whitespace and/or a
	 * comment are skipped.
	 *
	 * @param oboFile reader supplying the OBO content; it is read until
	 *        {@code readLine()} returns null and is not closed by this method
	 * @throws IOException if reading fails, or if a line is malformed
	 *         (unclosed or empty stanza header, missing {@code :}, unknown
	 *         escape, unterminated trailing modifier, tag without a value,
	 *         badly formed synonym)
	 */
	public void parseOBO(BufferedReader oboFile) throws IOException{

		// Count from the top again so line numbers are right if the parser is reused.
		linenum = 0;

		// Read into the line/linenum fields rather than a local variable:
		// parseSynonym, readQuotedString and getNestedValue quote both fields
		// in their error messages.
		while ((line = oboFile.readLine()) != null) {
			linenum++;

			if (line.length() == 0)
				continue;

			if ( line.charAt(0) == '[') {
				if (line.charAt(line.length() - 1) != ']')
					throw new IOException("Unclosed stanza: \"" + line + "\"" );
				String stanzaname = line.substring(1, line.length() - 1);
				if (stanzaname.length() < 1)
					throw new IOException("Empty stanza: \"" +line+"\"");

				triggerNewStanza(stanzaname);

			} else {
				// a content line

				// everything from the first unescaped, unquoted '!' on is a comment
				int lineEnd = findUnescaped(line, '!', 0, line.length(), true);
				if (lineEnd == -1)
					lineEnd = line.length();

				// nothing but whitespace in front of the comment: there is no tag to read
				if (line.substring(0, lineEnd).trim().length() == 0)
					continue;

				SOPair pair = unescape(line, ':', 0, true);
				String name = pair.str;

				// find nested values
				NestedValue nv = null;

				int trailingStartIndex = -1;
				int trailingEndIndex = -1;
				for (int i = lineEnd - 1; i >= 0; i--) {
					if (Character.isWhitespace(line.charAt(i))) {
						// keep going until we see non-whitespace
					} else if (line.charAt(i) == '}') {
						// if the first thing we see is a closing brace,
						// we have a trailing modifier
						if (i >= 1 && line.charAt(i - 1) == '\\')
							continue;
						trailingEndIndex = i;
						break;
					} else
						break;
				}

				if (trailingEndIndex != -1) {
					// No break here: the scan runs to the start of the line, so the
					// left-most unescaped '{' ends up as the start of the modifier.
					for (int i = trailingEndIndex - 1; i >= 0; i--) {
						if (line.charAt(i) == '{') {
							if (i >= 1 && line.charAt(i - 1) == '\\')
								continue;
							trailingStartIndex = i + 1;
						}
					}
				}

				int valueStopIndex;
				if (trailingStartIndex == -1 && trailingEndIndex != -1)
					throw new IOException("Unterminated trailing modifier. " + line);
				else if (trailingStartIndex != -1) {
					valueStopIndex = trailingStartIndex - 1;
					String trailing = line.substring(trailingStartIndex,
							trailingEndIndex).trim();
					// parsed so that a malformed modifier is rejected; the result
					// is not handed to the listeners
					nv = new NestedValue();
					getNestedValue(nv, trailing, 0);
				} else
					valueStopIndex = lineEnd;

				String value = line.substring(pair.index + 1, valueStopIndex).trim();
				if (value.length() == 0)
					throw new IOException("Tag found with no value "+ line);

				if ( isSynonym(name)){
					Synonym synonym = parseSynonym(name,value);
					triggerNewSynonym(synonym);
				} else {
					triggerNewKey(name,value);
				}
			}
		}
	}

	/**
	 * Tells whether a tag is handled as a synonym. Only the
	 * {@code OboFileHandler.SYNONYM} and {@code OboFileHandler.EXACT_SYNONYM}
	 * tags qualify.
	 *
	 * @param key the tag name
	 * @return true if the tag is one of the two synonym tags above
	 */
	private boolean isSynonym(String key){
		return key.equals(OboFileHandler.SYNONYM)
				|| key.equals(OboFileHandler.EXACT_SYNONYM);
	}

	/** parse the Synonym String from the Term.
	 * value can be:
	 * 
"ca_bind" RELATED [uniprot:curation]
* @param value * @return the synonym text */ private Synonym parseSynonym(String key, String value) throws IOException{ //logger.info("PARSE SYNONYM " + key + " " + value); int startIndex = findUnescaped(value, '"', 0, value.length()); if (startIndex == -1) throw new IOException("Expected \"" + line + " " + linenum); SOPair p = unescape(value, '"', startIndex + 1, value.length(), true); int defIndex = findUnescaped(value, '[', p.index, value.length()); if (defIndex == -1) { throw new IOException("Badly formatted synonym. " + "No dbxref list found." + line + " " + linenum ); } String leftovers = value.substring(p.index + 1, defIndex).trim(); StringTokenizer tokenizer = new StringTokenizer(leftovers, " \t"); int scope = Synonym.RELATED_SYNONYM; if ( key.equals(OboFileHandler.EXACT_SYNONYM)) scope = Synonym.EXACT_SYNONYM; else if ( key.equals(OboFileHandler.BROAD_SYNONYM)) scope = Synonym.BROAD_SYNONYM; else if ( key.equals(OboFileHandler.NARROW_SYNONYM)) scope = Synonym.NARROW_SYNONYM; String catID = null; for (int i = 0; tokenizer.hasMoreTokens(); i++) { String token = tokenizer.nextToken(); //logger.info("TOKEN:" +token); if (i == 0) { if (token.equals("RELATED")) scope = Synonym.RELATED_SYNONYM; else if (token.equals("UNSPECIFIED")) scope = Synonym.RELATED_SYNONYM; else if (token.equals("EXACT")) scope = Synonym.EXACT_SYNONYM; else if (token.equals("BROAD")) scope = Synonym.BROAD_SYNONYM; else if (token.equals("NARROW")) scope = Synonym.NARROW_SYNONYM; else throw new IOException("Found unexpected scope " + "identifier " + token + line); } else if (i == 1) { catID = token; } else throw new IOException("Expected dbxref list," + " instead found " + token + line ); } Synonym synonym = new Synonym(); synonym.setScope(scope); synonym.setCategory(catID); synonym.setName(p.str); //logger.info("SYNONYM: " + p.str +" " + synonym.getCategory() + " " + synonym.getScope()); Map[] refs = getDbxrefList(value,defIndex + 1, value.length()); // set the refs in the synonym for (Map 
ref : refs){ @SuppressWarnings("unused") String xref = (String) ref.get("xref"); @SuppressWarnings("unused") String desc = (String) ref.get("desc"); //logger.info(xref + " " + desc); @SuppressWarnings("unused") NestedValue nv = (NestedValue) ref.get("nv"); //TODO: add implementation for this... } return synonym; } protected Map[] getDbxrefList(String line, int startoffset, int endoffset) throws IOException { Vector> temp = new Vector>(); boolean stop = false; while (!stop) { int braceIndex = findUnescaped(line, '{', startoffset, endoffset); int endIndex = findUnescaped(line, ',', startoffset, endoffset, true); boolean trailing = false; if (endIndex == -1) { endIndex = findUnescaped(line, ']', startoffset, endoffset, true); if (endIndex == -1) { throw new IOException("Unterminated xref list " + line); } stop = true; } if (braceIndex != -1 && braceIndex < endIndex) { endIndex = braceIndex; trailing = true; } Map pair = parseXref(line, startoffset, endIndex); if (pair == null) { startoffset++; continue; } NestedValue nv = null; if (trailing) { nv = new NestedValue(); endIndex = getNestedValue(nv, line, endIndex + 1); if (endIndex == -1) { throw new IOException("Badly formatted " + "trailing properties " + line); } pair.put("nv",nv); } temp.add(pair); startoffset = endIndex + 1; } Map[] out = new HashMap[temp.size()]; for (int i = 0; i < temp.size(); i++) { Map pair = temp.get(i); out[i] = pair; } return out; } protected Map parseXref(String line, int startoffset, int endoffset) throws IOException { String xref_str = null; String desc_str = null; SOPair xref = unescape(line, '"', startoffset, endoffset, false); xref_str = xref.str.trim(); if (xref_str.length() == 0) return null; if (xref.index != -1) { SOPair desc = unescape(line, '"', xref.index + 1, endoffset, true); desc_str = desc.str.trim(); } Map m = new HashMap(); m.put("xref",xref_str); m.put("desc",desc_str); return m; } private void triggerNewStanza(String stanza){ Iterator iter = listeners.iterator(); while 
(iter.hasNext()){ OboFileEventListener li = iter.next(); li.newStanza(stanza); } } private void triggerNewKey(String key, String value){ Iterator iter = listeners.iterator(); while (iter.hasNext()){ OboFileEventListener li = iter.next(); li.newKey(key, value); } } private void triggerNewSynonym(Synonym synonym){ Iterator iter = listeners.iterator(); while (iter.hasNext()){ OboFileEventListener li = iter.next(); li.newSynonym(synonym); } } public static String escape(String str, boolean escapespaces) { StringBuffer out = new StringBuffer(); for (int i = 0; i < str.length(); i++) { char c = str.charAt(i); Object o = unescapeChars.get(new Character(c)); if (o == null) out.append(c); else { if (escapespaces || (!escapespaces && c != ' ' && c != '\t')) { out.append("\\" + o); } else out.append(c); } } return out.toString(); } public String unescape(String str) throws IOException { return unescape(str, '\0', 0, str.length(), false).str; } public SOPair unescape(String str, char toChar, int startindex, boolean mustFindChar) throws IOException { return unescape(str, toChar, startindex, str.length(), mustFindChar); } public SOPair unescape(String str, char toChar, int startindex, int endindex, boolean mustFindChar) throws IOException { StringBuffer out = new StringBuffer(); int endValue = -1; for (int i = startindex; i < endindex; i++) { char c = str.charAt(i); if (c == '\\') { i++; c = str.charAt(i); Character mapchar = escapeChars .get(new Character(c)); if (mapchar == null) throw new IOException("Unrecognized escape" + " character " + c + " found."); out.append(mapchar); } else if (c == toChar) { endValue = i; break; } else { out.append(c); } } if (endValue == -1 && mustFindChar) { throw new IOException("Expected " + toChar + "." 
+ str); } return new SOPair(out.toString(), endValue); } public static int findUnescaped(String str, char toChar) { return findUnescaped(str, toChar, 0, str.length()); } public static int findUnescaped(String str, char toChar, int startIndex, int endIndex) { return findUnescaped(str, toChar, startIndex, endIndex, false); } public static int findUnescaped(String str, char toChar, int startindex, int endindex, boolean honorQuotes) { boolean inQuotes = false; char quoteChar = '\0'; for (int i = startindex; i < endindex; i++) { char c = str.charAt(i); if (c == '\\') { i++; continue; } else if (inQuotes) { if (c == quoteChar) inQuotes = false; continue; } else if (c == toChar) { return i; } else if (honorQuotes && isQuote(c)) { inQuotes = true; quoteChar = c; } } return -1; } public static boolean isEscapeStarter(char c) { return c == '\\'; } public static boolean isQuote(char c) { return c == '"'; } protected StringBuffer getTempBuffer() { tempBuffer.delete(0, tempBuffer.length()); return tempBuffer; } protected SOPair readQuotedString(String value, int startIndex, int stopIndex, char terminatingChar, boolean requireQuotes, boolean legalEndOfLine) throws IOException { char quoteChar = '\0'; StringBuffer out = getTempBuffer(); int i = startIndex; boolean useQuotes = false; for (; i < stopIndex; i++) { // burn through any leading whitespace if (Character.isWhitespace(value.charAt(i))) continue; // if the first non-whitespace character is not a quote, // proceed in non-quoted mode else if (!isQuote(value.charAt(i))) { if (requireQuotes) throw new IOException( "Expected start of quoted string. " + line + " " + value+ " at linenr " + linenum); useQuotes = false; break; } else { useQuotes = true; quoteChar = value.charAt(i); i++; break; } } // look for a closing quote or final delimiter for (; i < stopIndex; i++) { if (isEscapeStarter(value.charAt(i))) { i++; if (i >= value.length()) throw new IOException("Incomplete escape sequence. 
" + line); out.append(value.charAt(i)); } else if ((useQuotes && value.charAt(i) == quoteChar) || (!useQuotes && value.charAt(i) == terminatingChar)) { if (!useQuotes) return new SOPair(out.toString().trim(), startIndex, i - 1); else return new SOPair(out.toString(), startIndex, i); } else { out.append(value.charAt(i)); } } if (!useQuotes && legalEndOfLine) return new SOPair(out.toString().trim(), startIndex, i); else throw new IOException("Unterminated quoted string. " +line); } protected int getNestedValue(NestedValue nv, String str, int startIndex) throws IOException { while (startIndex < str.length()) { int equalsIndex = findUnescaped(str, '=', startIndex, str.length()); if (equalsIndex == -1) throw new IOException("Expected = in trailing modifier " +line); String name = str.substring(startIndex, equalsIndex).trim(); SOPair value = readQuotedString(str, equalsIndex + 1, str.length(), ',', false, true); Properties pv = new Properties(); pv.setProperty(unescape(name),value.str); nv.addPropertyValue(pv); startIndex = value.endIndex + 1; for (; startIndex < str.length(); startIndex++) { if (Character.isWhitespace(str.charAt(startIndex))) continue; else if (str.charAt(startIndex) == ',') { startIndex++; break; } else { logger.error("found character |{}|", str.charAt(startIndex)); throw new IOException("Expected comma in trailing modifier. 
" + line + " linenr: " + linenum); } } } return str.length(); } } class NestedValue { protected Properties propertyValues = new Properties(); protected String name; protected String suggestedComment; public NestedValue() { } @Override public String toString(){ String txt = "NestedValue: " ; Set keys = propertyValues.keySet(); Iterator iter = keys.iterator(); while (iter.hasNext()){ String key = iter.next().toString(); String value = propertyValues.get(key).toString(); txt += " [" + key + ":" + value + "]"; } return txt; } public String getName() { return name; } public Properties getPropertyValues() { return propertyValues; } public void addPropertyValue(Properties pv) { Set keys = pv.keySet(); Iterator iter = keys.iterator(); while (iter.hasNext()){ String key = iter.next().toString(); String value = pv.get(key).toString(); propertyValues.setProperty(key, value); } } @Override public Object clone() { try { return super.clone(); } catch (CloneNotSupportedException ex) { // this will never happen return null; } } public String getSuggestedComment() { return suggestedComment; } public void setSuggestedComment(String suggestedComment) { this.suggestedComment = suggestedComment; } }