All downloads are free. The search and download features use the official Maven repository.
Please wait. This can take a few minutes...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download or modify it as often as you want.
org.rcsb.cif.binary.BinaryCifWriter Maven / Gradle / Ivy
package org.rcsb.cif.binary;
import org.rcsb.cif.CifOptions;
import org.rcsb.cif.EncodingStrategyHint;
import org.rcsb.cif.binary.codec.BinaryCifCodec;
import org.rcsb.cif.binary.codec.MessagePackCodec;
import org.rcsb.cif.binary.data.ByteArray;
import org.rcsb.cif.binary.data.Float64Array;
import org.rcsb.cif.binary.data.Int32Array;
import org.rcsb.cif.binary.data.StringArray;
import org.rcsb.cif.binary.data.Uint8Array;
import org.rcsb.cif.binary.encoding.ByteArrayEncoding;
import org.rcsb.cif.binary.encoding.Encoding;
import org.rcsb.cif.binary.encoding.FixedPointEncoding;
import org.rcsb.cif.binary.encoding.RunLengthEncoding;
import org.rcsb.cif.binary.encoding.StringArrayEncoding;
import org.rcsb.cif.model.Block;
import org.rcsb.cif.model.Category;
import org.rcsb.cif.model.CifFile;
import org.rcsb.cif.model.Column;
import org.rcsb.cif.model.FloatColumn;
import org.rcsb.cif.model.IntColumn;
import org.rcsb.cif.model.StrColumn;
import org.rcsb.cif.model.ValueKind;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;
public class BinaryCifWriter {
private final CifOptions options;
/**
 * Creates a writer that encodes files according to the given options.
 *
 * @param options the options consulted while encoding: encoder name, category and column filters,
 *                and per-column encoding hints
 */
public BinaryCifWriter(CifOptions options) {
this.options = options;
}
/**
 * Serializes a CIF file: the file is first turned into its nested map representation and that
 * map is then handed to {@code MessagePackCodec.encode}.
 *
 * @param cifFile the file to serialize
 * @return the bytes produced by the MessagePack codec
 */
public byte[] write(CifFile cifFile) {
    return MessagePackCodec.encode(encodeFile(cifFile));
}
/**
 * Builds the nested map/array structure that is later serialized by the MessagePack codec.
 * <p>
 * The resulting map holds the keys {@code encoder}, {@code version} and {@code dataBlocks}. Each
 * data block is a map with {@code header} and {@code categories}; each category is a map with
 * {@code name}, {@code columns} and {@code rowCount}.
 * <p>
 * Categories rejected by {@code options.filterCategory} and categories without rows are left out
 * entirely, so the {@code categories} array never contains {@code null} placeholders.
 *
 * @param cifFile the model to encode
 * @return the map representation of the file
 */
private Map encodeFile(CifFile cifFile) {
    // naming: uses cifEntity for original model and entity for the map representation ready for MessagePack
    Map<String, Object> file = new LinkedHashMap<>();
    file.put("encoder", options.getEncoder());
    file.put("version", BinaryCifCodec.VERSION);

    Object[] blocks = new Object[cifFile.getBlocks().size()];
    int blockCount = 0;
    file.put("dataBlocks", blocks);

    for (Block cifBlock : cifFile.getBlocks()) {
        Map<String, Object> block = new LinkedHashMap<>();
        String blockHeader = cifBlock.getBlockHeader();
        // Locale.ROOT keeps the upper-casing independent of the JVM's default locale
        String header = blockHeader != null
                ? blockHeader.replaceAll("[ \n\t]", "").toUpperCase(Locale.ROOT)
                : "UNKNOWN";
        block.put("header", header);

        // filter category names and drop empty categories *before* sizing the array, otherwise
        // every skipped category would leave a trailing null slot in the output
        List<Category> filteredCategories = cifBlock.categories()
                .filter(category -> options.filterCategory(category.getCategoryName()))
                .filter(category -> category.getRowCount() > 0)
                .collect(Collectors.toList());

        Object[] categories = new Object[filteredCategories.size()];
        int categoryCount = 0;
        block.put("categories", categories);
        blocks[blockCount++] = block;

        for (Category category : filteredCategories) {
            String categoryName = category.getCategoryName();

            Map<String, Object> categoryMap = new LinkedHashMap<>();
            categoryMap.put("name", "_" + categoryName);
            Object[] columns = category.columns()
                    .filter(column -> options.filterColumn(categoryName, column.getColumnName()))
                    .map(column -> encodeColumn(categoryName, column))
                    .toArray();
            categoryMap.put("columns", columns);
            categoryMap.put("rowCount", category.getRowCount());
            categories[categoryCount++] = categoryMap;
        }
    }

    return file;
}
/**
 * Encodes a float column, either directly or as fixed-point integers.
 * <p>
 * Encoding name and precision are taken from the hint when it provides them; whatever the hint
 * leaves unset is obtained from {@code Classifier.classify(column)}, which is invoked at most once.
 * The hint is only read and never written to: it may be shared between columns and files, and
 * storing data-derived values in it would leak the classification of one column into later
 * encodings.
 *
 * @param column   the float data to encode
 * @param optional user-provided encoding hint, may be {@code null} or only partially filled
 * @return the encoded column
 */
private ByteArray encodeFloatArray(Float64Array column, EncodingStrategyHint optional) {
    EncodingStrategyHint classified = null;

    String encoding = optional != null ? optional.getEncoding() : null;
    if (encoding == null) {
        classified = Classifier.classify(column);
        encoding = classified.getEncoding();
    }

    // the "byte" strategy skips the fixed-point conversion and uses the column's own encode()
    if ("byte".equals(encoding)) {
        return column.encode();
    }

    Integer precision = optional != null ? optional.getPrecision() : null;
    if (precision == null) {
        if (classified == null) {
            classified = Classifier.classify(column);
        }
        precision = classified.getPrecision();
    }

    int multiplier = getMultiplier(precision);
    Int32Array fixedPoint = column.encode(new FixedPointEncoding(multiplier));
    return Classifier.encode(fixedPoint, encoding);
}
/**
 * Computes {@code 10^mantissaDigits} with plain {@code int} arithmetic.
 * <p>
 * Zero or negative arguments yield 1. The multiplication is unchecked, so arguments above 9
 * exceed the {@code int} range and wrap around.
 *
 * @param mantissaDigits number of decimal digits
 * @return the power of ten used as fixed-point multiplier
 */
private static int getMultiplier(int mantissaDigits) {
    int multiplier = 1;
    int remaining = mantissaDigits;
    while (remaining-- > 0) {
        multiplier *= 10;
    }
    return multiplier;
}
/**
 * Encodes an int column with the encoding named by the hint, or with the encoding chosen by
 * {@code Classifier.classify(column)} when no hint (or a hint without encoding) is given.
 *
 * @param column   the int data to encode
 * @param optional user-provided encoding hint, may be {@code null}
 * @return the encoded column
 */
private ByteArray encodeIntArray(Int32Array column, EncodingStrategyHint optional) {
    String encoding = null;
    if (optional != null) {
        encoding = optional.getEncoding();
    }
    if (encoding == null) {
        // no usable hint: classify the column
        encoding = Classifier.classify(column).getEncoding();
    }
    return Classifier.encode(column, encoding);
}
/**
 * Encodes one column, dispatching on the column type detected by {@code ColumnType.of}.
 *
 * @param categoryName name of the category the column belongs to, used to look up encoding hints
 * @param cifColumn    the column to encode
 * @return the map representation of the encoded column
 * @throws UnsupportedOperationException if the detected type is none of Str, Float or Int
 */
private Map encodeColumn(String categoryName, Column cifColumn) {
    // TODO encoding provider support and/or make auto-classify configurable
    final String columnName = cifColumn.getColumnName();
    final EncodingStrategyHint hint = options.getEncodingStrategyHint(categoryName, columnName).orElse(null);
    final ColumnType columnType = ColumnType.of(cifColumn);

    if (columnType == ColumnType.Str) {
        return encodeStr(cifColumn);
    }
    if (columnType == ColumnType.Float) {
        return encodeFloat(cifColumn, hint);
    }
    if (columnType == ColumnType.Int) {
        return encodeInt(cifColumn, hint);
    }
    throw new UnsupportedOperationException(columnType + " not handled");
}
/**
 * Encodes a column as strings. A {@code StrColumn} contributes its backing array directly; any
 * other column is read through {@code stringData()}.
 *
 * @param cifColumn the column to encode
 * @return the map representation of the encoded column
 */
private Map encodeStr(Column cifColumn) {
    final String[] values;
    if (cifColumn instanceof StrColumn) {
        values = ((StrColumn) cifColumn).getArray();
    } else {
        values = cifColumn.stringData().toArray(String[]::new);
    }

    StringArray strings = new StringArray(values);
    ByteArray encoded = strings.encode(new StringArrayEncoding());
    return encodeColumnUsingByteArray(cifColumn, encoded);
}
/**
 * Encodes a column as floats. A {@code FloatColumn} contributes its backing array directly; any
 * other column has its string values parsed with {@code FloatColumn.parseFloat}.
 *
 * @param cifColumn the column to encode
 * @param optional  user-provided encoding hint, may be {@code null}
 * @return the map representation of the encoded column
 */
private Map encodeFloat(Column cifColumn, EncodingStrategyHint optional) {
    final double[] values;
    if (cifColumn instanceof FloatColumn) {
        values = ((FloatColumn) cifColumn).getArray();
    } else {
        values = cifColumn.stringData()
                .mapToDouble(FloatColumn::parseFloat)
                .toArray();
    }

    ByteArray encoded = encodeFloatArray(new Float64Array(values), optional);
    return encodeColumnUsingByteArray(cifColumn, encoded);
}
/**
 * Encodes a column as ints. An {@code IntColumn} contributes its backing array directly; any
 * other column has its string values parsed with {@code IntColumn.parseInt}.
 *
 * @param cifColumn the column to encode
 * @param optional  user-provided encoding hint, may be {@code null}
 * @return the map representation of the encoded column
 */
private Map encodeInt(Column cifColumn, EncodingStrategyHint optional) {
    final int[] values;
    if (cifColumn instanceof IntColumn) {
        values = ((IntColumn) cifColumn).getArray();
    } else {
        values = cifColumn.stringData()
                .mapToInt(IntColumn::parseInt)
                .toArray();
    }

    ByteArray encoded = encodeIntArray(new Int32Array(values), optional);
    return encodeColumnUsingByteArray(cifColumn, encoded);
}
/**
 * The value type a column is written as.
 */
enum ColumnType {
    Int,
    Float,
    Str;

    /**
     * Detects the type of a column by inspecting the string form of every present value.
     * <ul>
     * <li>{@code Str} as soon as one present value is neither a plain int nor a plain float
     * (this includes scientific notation), or when no row holds a present value</li>
     * <li>{@code Float} when at least one present value is a float and all others are ints</li>
     * <li>{@code Int} otherwise</li>
     * </ul>
     *
     * @param column the column to inspect
     * @return the detected type
     */
    static ColumnType of(Column column) {
        final int rows = column.getRowCount();
        int missing = 0;
        boolean sawFloat = false;

        for (int row = 0; row < rows; row++) {
            if (column.getValueKind(row) != ValueKind.PRESENT) {
                missing++;
                continue;
            }

            NumberType numberType = NumberType.of(column.getStringData(row));
            if (numberType == NumberType.Float) {
                sawFloat = true;
            } else if (numberType != NumberType.Int) {
                // a single non-numeric or scientific value forces string encoding
                return Str;
            }
        }

        if (missing == rows) {
            return Str;
        }
        return sawFloat ? Float : Int;
    }
}
/**
 * Classification of the textual form of a single value.
 */
enum NumberType {
    Int,
    Float,
    Scientific,
    NaN;

    /**
     * Classifies a string as integer, float, scientific notation or not-a-number.
     * <p>
     * An optional leading {@code '-'} is accepted. Digits only (also with a trailing dot such as
     * {@code "1."}) yield {@code Int}; digits after a dot yield {@code Float}; an {@code e}/{@code E}
     * preceded by at least one character of the number and followed by a valid exponent yields
     * {@code Scientific}. Everything else, including the empty string, a lone {@code "-"},
     * {@code "."} and {@code "-."}, yields {@code NaN}.
     *
     * @param v the string to classify, must not be {@code null}
     * @return the detected number type
     */
    static NumberType of(String v) {
        int start = 0;
        int end = v.length();
        if (start < end && v.charAt(start) == '-') {
            start++;
        }
        // empty string or a lone '-': nothing left to inspect
        if (start == end) {
            return NaN;
        }
        // string is '.' or '-.'
        if (v.charAt(start) == '.' && end - start == 1) {
            return NaN;
        }

        while (start < end) {
            int c = v.charAt(start);
            if (isDigit(c)) {
                start++;
            } else if (c == '.') {
                start++;
                boolean hasDigit = false;
                while (start < end) {
                    c = v.charAt(start);
                    if (isDigit(c)) {
                        hasDigit = true;
                        start++;
                    } else if (c == 'e' || c == 'E') {
                        return getNumberTypeScientific(v, start + 1, end);
                    } else {
                        return NaN;
                    }
                }
                return hasDigit ? Float : Int;
            } else if (c == 'e' || c == 'E') {
                // an exponent marker must be preceded by something other than the sign
                if (start == 0 || start == 1 && v.charAt(0) == '-') {
                    return NaN;
                }
                return getNumberTypeScientific(v, start + 1, end);
            } else {
                break;
            }
        }
        return start == end ? Int : NaN;
    }

    /**
     * Checks the exponent part of a number in scientific notation.
     *
     * @param v     the full string
     * @param start index of the first character after the {@code e}/{@code E}
     * @param end   exclusive end index
     * @return {@code Scientific} if the exponent is an integer with at least one digit,
     * {@code NaN} otherwise
     */
    static NumberType getNumberTypeScientific(String v, int start, int end) {
        // handle + in '1e+1' separately.
        if (start < end && v.charAt(start) == '+') {
            start++;
        }
        return isInt(v, start, end) ? NumberType.Scientific : NumberType.NaN;
    }

    /**
     * Tests whether {@code v[start, end)} is an optionally negative run of at least one digit.
     *
     * @param v     the full string
     * @param start inclusive start index
     * @param end   exclusive end index
     * @return {@code true} if the range holds an integer
     */
    static boolean isInt(String v, int start, int end) {
        if (start < end && v.charAt(start) == '-') {
            start++;
        }
        // no digits at all (e.g. the exponent of "1e" or "1e-") is not an integer
        if (start >= end) {
            return false;
        }
        for (; start < end; start++) {
            if (!isDigit(v.charAt(start))) {
                return false;
            }
        }
        return true;
    }

    /** Tests for an ASCII decimal digit. */
    private static boolean isDigit(int c) {
        return c >= '0' && c <= '9';
    }
}
/**
 * Wraps already encoded column data into the map representation of a column and, if at least
 * one value is not present, attaches a mask describing the {@code ValueKind} of every row.
 * <p>
 * The mask stores the ordinal of each row's {@code ValueKind}. It is written run-length encoded
 * when that form is shorter than the plain mask, and as plain UINT8 bytes otherwise. When every
 * value is present the {@code mask} entry is {@code null}.
 *
 * @param cifField  the column being written, provides name and value kinds
 * @param byteArray the encoded values of the column
 * @return a map with the keys {@code name}, {@code data} and {@code mask}
 */
private Map encodeColumnUsingByteArray(Column cifField, ByteArray byteArray) {
    // collect the value kind of every row; the Uint8Array wraps the array before it is filled,
    // exactly as in the column's original construction order
    int[] kinds = new int[cifField.getRowCount()];
    Uint8Array mask = new Uint8Array(kinds);
    boolean anyMissing = false;
    for (int row = 0; row < kinds.length; row++) {
        ValueKind kind = cifField.getValueKind(row);
        kinds[row] = (byte) kind.ordinal();
        if (kind != ValueKind.PRESENT) {
            anyMissing = true;
        }
    }

    // the payload together with the chain of encodings that produced it
    Map<String, Object> payload = new LinkedHashMap<>();
    payload.put("encoding", byteArray.getEncoding()
            .stream()
            .map(Encoding::getMapRepresentation)
            .toArray(Map[]::new));
    payload.put("data", byteArray.getData());

    // the mask is only emitted when some value is missing
    Map<String, Object> maskEntry = null;
    if (anyMissing) {
        maskEntry = new LinkedHashMap<>();
        ByteArray runLengthMask = mask.encode(new RunLengthEncoding()).encode();
        if (runLengthMask.getData().length >= mask.getData().length) {
            // run-length encoding does not pay off: store the plain mask
            ByteArray plainMask = mask.encode();
            maskEntry.put("encoding", new Object[] { ByteArrayEncoding.UINT8.getMapRepresentation() });
            maskEntry.put("data", plainMask.getData());
        } else {
            RunLengthEncoding runLength = (RunLengthEncoding) runLengthMask.getEncoding().getFirst();
            maskEntry.put("encoding", new Object[] { runLength.getMapRepresentation(), ByteArrayEncoding.INT32.getMapRepresentation() });
            maskEntry.put("data", runLengthMask.getData());
        }
    }

    Map<String, Object> column = new LinkedHashMap<>();
    column.put("name", cifField.getColumnName());
    column.put("data", payload);
    column.put("mask", maskEntry);
    return column;
}
}