All Downloads are FREE. Search and download functionalities are using the official Maven repository.

javajs.util.CifDataParser Maven / Gradle / Ivy

There is a newer version: 14.31.10
Show newest version
package javajs.util;

import java.io.BufferedReader;

import java.util.Hashtable;

import java.util.Map;

import javajs.api.GenericCifDataParser;
import javajs.api.GenericLineReader;


// BH 11/21/16 -- adds support for array grouping [...] - used in 2016-format magCIF files

/**
*
* A CIF 1.0 tokenizer class for dealing with quoted strings in CIF files.
* 
* Subclassed by org.jmol.adapters.readers.cif.Cif2DataParser
* 
* Greek letters implemented in Jmol 13.3.9 and only for 
* titles and space groups. All other mark ups ignored.
* 
*

* regarding the treatment of single quotes vs. primes in * cif file, PMR wrote: *

*

* * There is a formal grammar for CIF * (see http://www.iucr.org/iucr-top/cif/index.html) * which confirms this. The textual explanation is *

*

* 14. Matching single or double quote characters (' or ") may * be used to bound a string representing a non-simple data value * provided the string does not extend over more than one line. *

*

* 15. Because data values are invariably separated from other * tokens in the file by white space, such a quote-delimited * character string may contain instances of the character used * to delimit the string provided they are not followed by white * space. For example, the data item * * _example 'a dog's life' * * is legal; the data value is a dog's life. *

*

* [PMR - the terminating character(s) are quote+whitespace. * That would mean that: * * _example 'Jones' life' * * would be an error *

*

* The CIF format was developed in that late 1980's under the aegis of the * International Union of Crystallography (I am a consultant to the COMCIFs * committee). It was ratified by the Union and there have been several * workshops. mmCIF is an extension of CIF which includes a relational * structure. The formal publications are: *

*

* Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333. * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic * Information File (CIF): A New Standard Archive File for Crystallography", * Acta Cryst., A47, 655-685. * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508. *

*/ public class CifDataParser implements GenericCifDataParser { protected int getVersion() { return 1; } /** * The maximum number of columns (data keys) passed to the parser or found in the file * for a given loop_ or category.subkey listing. * */ public static final int KEY_MAX = 100; private GenericLineReader reader; private BufferedReader br; /** * from buffered reader */ protected String line; /** * working string (buffer) * */ protected String str; /** * pointer to current character on str */ protected int ich; /** * length of str * */ protected int cch; /** * whether we are processing an unquoted value or key */ protected boolean wasUnquoted; /** * optional token terminator; in CIF 2.0 could be } or ] */ protected char cterm = '\0'; /** * string to return for CIF data value . and ? */ protected String nullString = "\0"; /** * A flag to create and return Java objects, not strings. * Used only by Jmol scripting x = getProperty("cifInfo", filename). */ protected boolean asObject; /** * debugging flag passed from reader; unused * */ protected boolean debugging; /** * private processing fields * */ private Object strPeeked; private int ichPeeked; private int columnCount; private String[] columnNames; private Object[] columnData = new Object[KEY_MAX]; private boolean isLoop; private boolean haveData; /** * comments at the top of a file, including #\#CIF_2.0, for example */ private SB fileHeader = new SB(); private boolean isHeader = true; /** * Set the string value of what is returned for "." and "?" * * @param nullString null here returns "." and "?"; default is "\0" * */ public void setNullValue(String nullString) { this.nullString = nullString; } /** * A global, static map that contains field information. The assumption is that * if we read a set of fields for, say, atom_site, once in a lifetime, then * that should be good forever. Those are static lists. Or should be.... */ private static Map htFields = new Hashtable(); //////////////////////////////////////////////////////////////// // special tokenizer class //////////////////////////////////////////////////////////////// public CifDataParser() { // for reflection } @Override public Object getColumnData(int i) { return columnData[i]; } @Override public int getColumnCount() { return columnCount; } @Override public String getColumnName(int i) { return columnNames[i]; } /** * A Chemical Information File data parser. * * set() should be called immediately upon construction. * * Two options; one of reader or br should be null, or reader will be * ignored. Just simpler this way... * * @param reader Anything that can deliver a line of text or null * @param br A standard BufferedReader. * @param debugging * */ @Override public CifDataParser set(GenericLineReader reader, BufferedReader br, boolean debugging) { this.reader = reader; this.br = br; this.debugging = debugging; return this; } /** * * @return commented-out section at the start of a CIF file. * */ @Override public String getFileHeader() { return fileHeader.toString(); } /** * Parses all CIF data for a reader defined in the constructor * into a standard Map structure and close the BufferedReader if * it exists. * * @return Hashtable of models Vector of Hashtable data */ @Override public Map getAllCifData() { line = ""; String key; Map data = null, data0 = null; Map allData = new Hashtable(); Lst> models = new Lst>(); allData.put("models", models); asObject = (getVersion() >= 2); nullString = null; Lst> saveFrames = new Lst>(); try { while ((key = getNextToken()) != null) { if (key.startsWith("global_") || key.startsWith("data_")) { models.addLast(data0 = data = new Hashtable()); data.put("name", key); continue; } if (key.startsWith("loop_")) { getAllCifLoopData(data); continue; } if (key.startsWith("save_")) { if (key.equals("save_")) { int n = saveFrames.size(); if (n == 0) { System.out.println("CIF ERROR ? save_ without corresponding save_xxxx"); data = data0; } else { data = saveFrames.removeItemAt(n - 1); } } else { saveFrames.addLast(data); Map d = data; data = new Hashtable(); d.put(key, data); } continue; } if (key.charAt(0) != '_') { System.out.println("CIF ERROR ? should be an underscore: " + key); } else { Object value = (asObject ? getNextTokenObject() : getNextToken()); if (value == null) { System.out.println("CIF ERROR ? end of file; data missing: " + key); } else { data.put(fixKey(key), value); } } } } catch (Exception e) { // ? } asObject = false; try { if (br != null) br.close(); } catch (Exception e) { // ? } nullString = "\0"; return allData; } /** * create our own list of keywords and for each one create a list * of data associated with that keyword. For example, a list of all * x coordinates, then a list of all y coordinates, etc. * * @param data * @throws Exception */ @SuppressWarnings("unchecked") private void getAllCifLoopData(Map data) throws Exception { String key; Lst keyWords = new Lst(); Object o; while ((o = peekToken()) != null && o instanceof String && ((String) o).charAt(0) == '_') { key = fixKey((String) getTokenPeeked()); keyWords.addLast(key); data.put(key, new Lst()); } columnCount = keyWords.size(); if (columnCount == 0) return; isLoop = true; while (getData()) for (int i = 0; i < columnCount; i++) ((Lst)data.get(keyWords.get(i))).addLast(columnData[i]); isLoop = false; } @Override public String readLine() { try { line = (reader == null ? br.readLine() : reader.readNextLine()); if (line == null) return null; if (isHeader) { if (line.startsWith("#")) fileHeader.append(line).appendC('\n'); else isHeader = false; } return line; } catch (Exception e) { return null; } } /** * The work horse; a general reader for loop data. Fills colunnData with * fieldCount fields. * * @return false if EOF * @throws Exception */ @Override public boolean getData() throws Exception { // line is already present, and we leave with the next line to parse if (isLoop) { for (int i = 0; i < columnCount; ++i) if ((columnData[i] = getNextDataToken()) == null) return false; } else if (haveData) { haveData = false; } else { return false; } return (columnCount > 0); } /** * * Skips all associated loop data. (Skips to next control word.) * * @throws Exception */ @Override public String skipLoop(boolean doReport) throws Exception { String str; SB ret = (doReport ? new SB() : null); int n = 0; while ((str = (String) peekToken()) != null && str.charAt(0) == '_') { if (ret != null) ret.append(str).append("\n"); getTokenPeeked(); n++; } if (n == 0) n = columnCount; // end-of-label-section skip int m = 0; while ((str = (String) getNextDataToken()) != null) { if (ret == null) continue; ret.append(str).append(" "); if ((++m % n) == 0) ret.append("\n"); } return (ret == null ? null : ret.toString()); } /** * Get a token as a String value (for the reader) * * @return the next token of any kind, or null * @throws Exception */ @Override public String getNextToken() throws Exception { wasUnquoted = true; return (String) getNextTokenProtected(); } /** * Get the token as a Java Object * * @return the next token of any kind, or null * @throws Exception */ public Object getNextTokenObject() throws Exception { wasUnquoted = true; return getNextTokenProtected(); } /** * Just makes sure * @return String from buffer. * @throws Exception */ protected Object getNextTokenProtected() throws Exception { return (getNextLine() ? nextStrToken() : null); } /** * * first checks to see if the next token is an unquoted * control code, and if so, returns null * * @return next data token or null * @throws Exception */ @Override public Object getNextDataToken() throws Exception { Object o = peekToken(); if (o == null) return null; if (wasUnquoted && o instanceof String) { String str = (String) o; if (str.charAt(0) == '_' || str.startsWith("loop_") || str.startsWith("data_") || str.startsWith("save_") || str.startsWith("stop_") || str.startsWith("global_")) return null; } return getTokenPeeked(); } /** * Just look at the next token. Saves it for retrieval * using getTokenPeeked() * * @return next token or null if EOF * @throws Exception */ @Override public Object peekToken() throws Exception { if (!getNextLine()) return null; int ich = this.ich; strPeeked = nextStrToken(); ichPeeked= this.ich; this.ich = ich; return strPeeked; } /** * grab a new line if necessary and prepare it * if it starts with ";" * * @return updated this.str * @throws Exception */ private boolean getNextLine() throws Exception { while (!strHasMoreTokens()) if (prepareNextLine() == null) return false; return true; } /** * * @return the token last acquired; may be null */ @Override public Object getTokenPeeked() { ich = ichPeeked; return strPeeked; } /** * Used especially for data that might be multi-line data that * might have unwanted white space at start or end. * * @param str * @return str without any leading/trailing white space, and no '\n' */ @Override public String fullTrim(String str) { int pt0 = -1; int pt1 = str.length(); while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) { } while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) { } return str.substring(pt0, pt1 + 1); } private final static String grABC = "ABX\u0394E\u03A6\u0393H" // ABCDEFGH + "I_K\u039BMNO\u03A0" // I_KLMNOP + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ private final static String grabc = "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0" // i_klmnop + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz /** * Only translating the basic Greek set here, not all the other stuff. See * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup * * @param data * @return cleaned string */ @Override public String toUnicode(String data) { int pt; try { while ((pt = data.indexOf('\\')) >= 0) { int c = data.charAt(pt + 1); String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64) : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_"); data = data.substring(0, pt) + ch + data.substring(pt + 2); } } catch (Exception e) { // ignore } return data; } /** * Process a data block, with or without a loop_. * * Passed an array of field names, this method fills two int[] arrays. The * first, key2col, maps desired key values to actual order of appearance * (column number) in the file; the second, col2key, is a reverse loop-up for * that, mapping column numbers to desired field indices. * * When called within a loop_ context, this.columnData will be created but not filled. * * Alternatively, if fields is null, then this.fieldNames is * filled, in order, with key data, and both key2col and col2key will be * simply 0,1,2,... This array is used in cases such as matrices for which * there are simply too many possibilities to list, and the key name itself * contains information that we need. * * When not a loop_ context, keys are expected to be in the mmCIF form * category.subkey and will be unique within a data block (see * http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html). * Keys and data will be read for all data in the same category, filling this.columnData. * * * In this way, the calling class does not need to enumerate all possible * category names, but instead can focus on just those of interest. * * * @param fields * list of normalized field names, such as * "_pdbx_struct_assembly_gen_assembly_id" (with "_" instead of ".") * @param key * null to indicate a loop_ construct, otherwise the initial category.subkey * found * @param data * when not loop_ the initial data read, otherwise ignored * @param key2col * map of desired keys to actual columns * @param col2key * map of actual columns to desired keys * @throws Exception */ @Override public void parseDataBlockParameters(String[] fields, String key, String data, int[] key2col, int[] col2key) throws Exception { isLoop = (key == null); Object o; String s; if (fields == null) { // for reading full list of keys, as for matrices columnNames = new String[KEY_MAX]; } else { if (!htFields.containsKey(fields[0])) for (int i = fields.length; --i >= 0;) htFields.put(fields[i], Integer.valueOf(i)); for (int i = fields.length; --i >= 0;) key2col[i] = NONE; } columnCount = 0; int pt, i; if (isLoop) { while (true) { o = peekToken(); if (o == null) { // we are PREMATURELY done; reset columnCount = 0; break; } // end of the loop is a new token not starting with underscore if (!(o instanceof String) || ((String) o).charAt(0) != '_') break; pt = columnCount++; s = fixKey((String) getTokenPeeked()); if (fields == null) { // just make a linear model, saving the list columnNames[col2key[pt] = key2col[pt] = pt] = s; continue; } Integer iField = htFields.get(s); i = (iField == null ? NONE : iField.intValue()); if ((col2key[pt] = i) != NONE) key2col[i] = pt; } } else { pt = key.indexOf("."); String str0 = (pt < 0 ? key : key.substring(0, pt + 1)); while (true) { // end of the loop is a new token not starting with underscore pt = columnCount++; if (key == null) { key = (String) getTokenPeeked(); data = getNextToken(); } Integer iField = htFields.get(fixKey(key)); i = (iField == null ? NONE : iField.intValue()); if ((col2key[pt] = i) != NONE) columnData[key2col[i] = pt] = data; if ((o = peekToken()) == null || !(o instanceof String) || !((String) o).startsWith(str0)) break; key = null; } haveData = (columnCount > 0); } } @Override public String fixKey(String key) { // PRELIMINARY -- BilBao _magnetic // PRELIMINARY -- Jana2006 return (PT.rep( key.startsWith("_magnetic") ? key.substring(9) : key.startsWith("_jana") ? key.substring(5) : key, ".", "_").toLowerCase()); } //////////////////// private methods //////////////////// /** * sets global str and line to be parsed from the beginning * * \1 .... \1 indicates an embedded fully escaped data object * * @param str new data string * @return str */ protected String setString(String str) { this.str = line = str; cch = (str == null ? 0 : str.length()); ich = 0; return str; } /* * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax * * 17. The special sequence of end-of-line followed * immediately by a semicolon in column one (denoted ";") * may also be used as a delimiter at the beginning and end * of a character string comprising a data value. The complete * bounded string is called a text field, and may be used to * convey multi-line values. The end-of-line associated with * the closing semicolon does not form part of the data value. * Within a multi-line text field, leading white space within * text lines must be retained as part of the data value; trailing * white space on a line may however be elided. * * 18. A text field delimited by the ; digraph may not * include a semicolon at the start of a line of text as * part of its value. * * 20. For example, the data value foo may be expressed * equivalently as an unquoted string foo, as a quoted * string 'foo' or as a text field * *;foo *; * * By contrast the value of the text field * *; foo * bar *; * * is foo bar (where represents an end-of-line); * the embedded space characters are significant. * * * I (BH) note, however, that we sometimes have: * * _some_name * ; * the name here * ; * * so this should actually be * * ;the name here * ; * * for this, we use fullTrim(); * */ /** * * sets the string for parsing to be from the next line * when the token buffer is empty, and if ';' is at the * beginning of that line, extends the string to include * that full multiline string. Uses \1 to indicate that * this is a special quotation. * * * * @return the next line or null if EOF * @throws Exception */ protected String prepareNextLine() throws Exception { setString(readLine()); if (line == null || line.length() == 0) return line; if (line.charAt(0) == ';') return preprocessString(); if (str.startsWith("###non-st#")) ich = 10; return line; } /** * Preprocess the string on a line starting with a semicolon * to produce a string with a \1 ... \1 segment * that will be picked up in the next round * * @return escaped part with attached extra data * @throws Exception */ protected String preprocessString() throws Exception { return setString(preprocessSemiString()); } /** * Encapsulate a multi-line ; .... ; string with \1 ... \1 * * CIF 1.0 and CIF 2.0 * * @return ecapsulated string * @throws Exception */ protected String preprocessSemiString() throws Exception { ich = 1; String str = '\1' + line.substring(1) + '\n'; while (readLine() != null) { if (line.startsWith(";")) { // remove trailing only, and attach rest of next line str = str.substring(0, str.length() - 1) + '\1' + line.substring(1); break; } str += line + '\n'; } return str; } /** * @return TRUE if there are more tokens in the line buffer * */ private boolean strHasMoreTokens() { if (str == null) return false; char ch = '#'; while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t')) ++ich; return (ich < cch && ch != '#'); } /** * assume that hasMoreTokens() has been called and that ich is pointing at a * non-white character. Also sets boolean wasUnQuoted, because we need to know * if we should be checking for a control keyword. 'loop_' is different from * just loop_ without the quotes. * * @return null if no more tokens, "\0" if '.' or '?', or next token */ private Object nextStrToken() { if (ich == cch) return null; char ch = str.charAt(ich); if (isQuote(ch)) { wasUnquoted = false; return getQuotedStringOrObject(ch); } int ichStart = ich; wasUnquoted = true; while (ich < cch && !isTerminator(ch = str.charAt(ich))) ++ich; if (ich == ichStart + 1) if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?')) return nullString; String s = str.substring(ichStart, ich); return unquoted(s); } /** * In CIF 2.0, this method turns a String into an Integer or Float * In CIF 1.0 (here) just return the unchanged value. * @param s unquoted string * @return unchanged value */ protected Object unquoted(String s) { return s; } /** * The token terminator is space or tab in CIF 1.0, * but it can be quoted strings in CIF 2.0. * * @param c * @return true if this character is a terminator */ protected boolean isTerminator(char c) { return c == ' ' || c == '\t' || c == cterm ; } /** * CIF 1.0 only; we handle various quote types here * @param ch * @return true if this character is a (starting) quote */ protected boolean isQuote(char ch) { switch (ch) { case '\'': case '\"': case '\1': return true; } return false; } /** * CIF 1.0 only. * * * @param ch current character being pointed to * @return a String data object */ protected Object getQuotedStringOrObject(char ch) { int ichStart = ich; char chClosingQuote = ch; boolean wasQuote = false; while (++ich < cch) { ch = str.charAt(ich); // CIF 1.0 rules require that the closing ' or "" be followed by space or tab or EOL if (wasQuote && (ch == ' ' || ch == '\t')) break; wasQuote = (ch == chClosingQuote); } int pt1 = ichStart + 1; int pt2 = ich - 1; if (ich == cch && !wasQuote) { // reached the end of the string without finding closing ' // so take the whole thing. Probably a bad CIF file. pt1--; pt2++; } else { // throw away the last white character ++ich; } return str.substring(pt1, pt2); } }