All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.marmotta.kiwi.io.KiWiIO Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.marmotta.kiwi.io;

import org.apache.commons.lang3.StringUtils;
import org.apache.marmotta.commons.io.DataIO;
import org.apache.marmotta.commons.vocabulary.SCHEMA;
import org.apache.marmotta.commons.vocabulary.XSD;
import org.apache.marmotta.kiwi.model.rdf.*;
import org.openrdf.model.vocabulary.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

/**
 * Add file description here!
 *
 * @author Sebastian Schaffert ([email protected])
 */
public class KiWiIO {

    public static final String NS_DBPEDIA = "http://dbpedia.org/resource/";
    public static final String NS_FREEBASE = "http://rdf.freebase.com/ns/";
    private static Logger log = LoggerFactory.getLogger(KiWiIO.class);

    /**
     * Minimum length of content where we start using compression.
     */
    private static final int LITERAL_COMPRESS_LENGTH = 500;

    private static final int PREFIX_UNKNOWN = 0;
    private static final int PREFIX_XSD     = 1;
    private static final int PREFIX_RDF     = 2;
    private static final int PREFIX_RDFS    = 3;
    private static final int PREFIX_SKOS    = 4;
    private static final int PREFIX_DC      = 5;
    private static final int PREFIX_DCT     = 6;
    private static final int PREFIX_OWL     = 7;
    private static final int PREFIX_LOCAL   = 8;
    private static final int PREFIX_REDLINK = 9;
    private static final int PREFIX_SCHEMA  = 10;
    private static final int PREFIX_DBPEDIA = 11;
    private static final int PREFIX_FREEBASE= 12;


    private static final int TYPE_URI       = 1;
    private static final int TYPE_BNODE     = 2;
    private static final int TYPE_BOOLEAN   = 3;
    private static final int TYPE_DATE      = 4;
    private static final int TYPE_DOUBLE    = 5;
    private static final int TYPE_INT       = 6;
    private static final int TYPE_STRING    = 7;


    public static final int MODE_DEFAULT    = 1; // no compression
    public static final int MODE_PREFIX     = 2; // prefix compression for some known URI prefixes
    public static final int MODE_COMPRESSED = 3; // reserved: ZLIB string compression for long literals

    private static final int LANG_UNKNOWN = 0;
    private static final int LANG_EN = 1;
    private static final int LANG_DE = 2;
    private static final int LANG_FR = 3;
    private static final int LANG_ES = 4;
    private static final int LANG_NL = 5;
    private static final int LANG_PT = 6;
    private static final int LANG_RU = 7;
    private static final int LANG_SV = 8;
    private static final int LANG_NO = 9;
    private static final int LANG_FI = 10;
    private static final int LANG_DK = 11;
    private static final int LANG_IT = 12;
    private static final int LANG_PL = 13;


    public static final String HTTP_LOCALHOST = "http://localhost";
    public static final String NS_REDLINK = "http://data.redlink.io";


    private static Map, Integer> classTable = new HashMap<>();
    static {
        classTable.put(KiWiUriResource.class,    TYPE_URI);
        classTable.put(KiWiAnonResource.class,   TYPE_BNODE);
        classTable.put(KiWiBooleanLiteral.class, TYPE_BOOLEAN);
        classTable.put(KiWiDateLiteral.class,    TYPE_DATE);
        classTable.put(KiWiDoubleLiteral.class,  TYPE_DOUBLE);
        classTable.put(KiWiIntLiteral.class,     TYPE_INT);
        classTable.put(KiWiStringLiteral.class,  TYPE_STRING);
    }


    private static Map langTable = new HashMap<>();
    static {
        langTable.put("en", LANG_EN);
        langTable.put("de", LANG_DE);
        langTable.put("fr", LANG_FR);
        langTable.put("es", LANG_ES);
        langTable.put("nl", LANG_NL);
        langTable.put("pt", LANG_PT);
        langTable.put("ru", LANG_RU);
        langTable.put("sv", LANG_SV);
        langTable.put("no", LANG_NO);
        langTable.put("fi", LANG_FI);
        langTable.put("dk", LANG_DK);
        langTable.put("it", LANG_IT);
        langTable.put("pl", LANG_PL);
    }

    /**
     * Efficiently serialize a KiWiNode to a DataOutput destination. The type of node will be encoded with a single
     * byte usinbg the TYPE_* constants defined in this class
     *
     * @param output  DataOutput destination
     * @param node  KiWiNode to serialize
     * @throws IOException
     */
    public static void writeNode(DataOutput output, KiWiNode node) throws IOException {
        if(node == null) {
            output.writeByte(0);
        } else {
            int type = classTable.get(node.getClass());
            output.writeByte(type);
            switch (type) {
                case TYPE_URI:
                    writeURI(output, (KiWiUriResource) node);
                    break;
                case TYPE_BNODE:
                    writeBNode(output, (KiWiAnonResource) node);
                    break;
                case TYPE_BOOLEAN:
                    writeBooleanLiteral(output, (KiWiBooleanLiteral) node);
                    break;
                case TYPE_DATE:
                    writeDateLiteral(output, (KiWiDateLiteral) node);
                    break;
                case TYPE_DOUBLE:
                    writeDoubleLiteral(output, (KiWiDoubleLiteral) node);
                    break;
                case TYPE_INT:
                    writeIntLiteral(output, (KiWiIntLiteral) node);
                    break;
                case TYPE_STRING:
                    writeStringLiteral(output, (KiWiStringLiteral) node);
                    break;
                default:
                    throw new IllegalArgumentException("unknown KiWiNode type: "+node.getClass());
            }
        }
    }


    /**
     * Read a KiWiNode serialized with writeNode and return it. The type indicator is used to determine which type
     * of resource to instantiate.
     *
     * @param input DataInput source
     * @return an instance of a subclass of KiWiNode, depending on the type indicator read from the source
     * @throws IOException
     */
    public static KiWiNode readNode(DataInput input) throws IOException {
        int type = input.readByte();
        switch (type) {
            case 0:
                return null;
            case TYPE_URI:
                return readURI(input);
            case TYPE_BNODE:
                return readBNode(input);
            case TYPE_BOOLEAN:
                return readBooleanLiteral(input);
            case TYPE_DATE:
                return readDateLiteral(input);
            case TYPE_DOUBLE:
                return readDoubleLiteral(input);
            case TYPE_INT:
                return readIntLiteral(input);
            case TYPE_STRING:
                return readStringLiteral(input);
            default:
                throw new IllegalArgumentException("unknown KiWiNode type: "+type);
        }

    }

    /**
     * Efficiently serialize a KiWiUriResource to a DataOutput destination, using prefix compression for commonly used
     * prefixes.
     *
     * @param out  DataOutput destination
     * @param uri  KiWiUriResource to serialize
     * @throws IOException
     */
    public static void writeURI(DataOutput out, KiWiUriResource uri) throws IOException {
        if(uri == null) {
            out.writeLong(-1L);
        } else {
            out.writeLong(uri.getId());

            // compression for commonly used constant prefixes
            if(uri.stringValue().startsWith(XSD.NAMESPACE)) {
                out.writeByte(PREFIX_XSD);
                DataIO.writeString(out, uri.stringValue().substring(XSD.NAMESPACE.length()));
            } else if(uri.stringValue().startsWith(RDF.NAMESPACE)) {
                out.writeByte(PREFIX_RDF);
                DataIO.writeString(out, uri.stringValue().substring(RDF.NAMESPACE.length()));
            } else if(uri.stringValue().startsWith(RDFS.NAMESPACE)) {
                out.writeByte(PREFIX_RDFS);
                DataIO.writeString(out, uri.stringValue().substring(RDFS.NAMESPACE.length()));
            } else if(uri.stringValue().startsWith(SKOS.NAMESPACE)) {
                out.writeByte(PREFIX_SKOS);
                DataIO.writeString(out, uri.stringValue().substring(SKOS.NAMESPACE.length()));
            } else if(uri.stringValue().startsWith(DC.NAMESPACE)) {
                out.writeByte(PREFIX_DC);
                DataIO.writeString(out, uri.stringValue().substring(DC.NAMESPACE.length()));
            } else if(uri.stringValue().startsWith(DCTERMS.NAMESPACE)) {
                out.writeByte(PREFIX_DCT);
                DataIO.writeString(out, uri.stringValue().substring(DCTERMS.NAMESPACE.length()));
            } else if(uri.stringValue().startsWith(OWL.NAMESPACE)) {
                out.writeByte(PREFIX_OWL);
                DataIO.writeString(out, uri.stringValue().substring(OWL.NAMESPACE.length()));
            } else if(uri.stringValue().startsWith(SCHEMA.NAMESPACE)) {
                out.writeByte(PREFIX_SCHEMA);
                DataIO.writeString(out, uri.stringValue().substring(SCHEMA.NAMESPACE.length()));
            } else if(uri.stringValue().startsWith(NS_REDLINK)) {
                out.writeByte(PREFIX_REDLINK);
                DataIO.writeString(out, uri.stringValue().substring(NS_REDLINK.length()));
            } else if(uri.stringValue().startsWith(NS_DBPEDIA)) {
                out.writeByte(PREFIX_DBPEDIA);
                DataIO.writeString(out, uri.stringValue().substring(NS_DBPEDIA.length()));
            } else if(uri.stringValue().startsWith(NS_FREEBASE)) {
                out.writeByte(PREFIX_FREEBASE);
                DataIO.writeString(out, uri.stringValue().substring(NS_FREEBASE.length()));
            } else if(uri.stringValue().startsWith(HTTP_LOCALHOST)) {
                out.writeByte(PREFIX_LOCAL);
                DataIO.writeString(out, uri.stringValue().substring(HTTP_LOCALHOST.length()));
            } else {
                out.writeByte(PREFIX_UNKNOWN);
                DataIO.writeString(out, uri.stringValue());
            }

            out.writeLong(uri.getCreated().getTime());
        }
    }


    /**
     * Read a KiWiUriResource serialized with writeURI and return it.
     *
     * @param input DataInput source
     * @return a KiWiUriResource
     * @throws IOException
     */
    public static KiWiUriResource readURI(DataInput input) throws IOException {
        long id = input.readLong();

        if(id == -1) {
            return null;
        } else {


            int prefixMode = input.readByte();
            String uriPrefix = "";
            String uriSuffix = DataIO.readString(input);

            switch (prefixMode) {
                case PREFIX_XSD:
                    uriPrefix = XSD.NAMESPACE;
                    break;
                case PREFIX_RDF:
                    uriPrefix = RDF.NAMESPACE;
                    break;
                case PREFIX_RDFS:
                    uriPrefix = RDFS.NAMESPACE;
                    break;
                case PREFIX_SKOS:
                    uriPrefix = SKOS.NAMESPACE;
                    break;
                case PREFIX_DC:
                    uriPrefix = DC.NAMESPACE;
                    break;
                case PREFIX_DCT:
                    uriPrefix = DCTERMS.NAMESPACE;
                    break;
                case PREFIX_OWL:
                    uriPrefix = OWL.NAMESPACE;
                    break;
                case PREFIX_SCHEMA:
                    uriPrefix = SCHEMA.NAMESPACE;
                    break;
                case PREFIX_REDLINK:
                    uriPrefix = NS_REDLINK;
                    break;
                case PREFIX_DBPEDIA:
                    uriPrefix = NS_DBPEDIA;
                    break;
                case PREFIX_FREEBASE:
                    uriPrefix = NS_FREEBASE;
                    break;
                case PREFIX_LOCAL:
                    uriPrefix = HTTP_LOCALHOST;
                    break;
                default:
                    uriPrefix = "";
                    break;
            }

            Date created = new Date(input.readLong());

            KiWiUriResource r = new KiWiUriResource(uriPrefix + uriSuffix,created);
            r.setId(id);

            return r;
        }
    }


    /**
     * Efficiently serialize a KiWiAnonResource to a DataOutput destination.
     *
     * @param out the destination
     * @param bnode the KiWiAnonResource to serialize
     * @throws IOException
     */
    public static void writeBNode(DataOutput out, KiWiAnonResource bnode) throws IOException {
        if(bnode == null) {
            out.writeLong(-1L);
        } else {
            out.writeLong(bnode.getId());
            DataIO.writeString(out, bnode.stringValue());
            out.writeLong(bnode.getCreated().getTime());
        }
    }

    /**
     * Read a KiWiAnonResource serialized with writeBNode from a DataInput source
     *
     * @param input the source
     * @return the de-serialized KiWiAnonResource
     * @throws IOException
     */
    public static KiWiAnonResource readBNode(DataInput input) throws IOException {
        long id = input.readLong();

        if(id == -1) {
            return null;
        } else {
            String anonId = DataIO.readString(input);

            Date created = new Date(input.readLong());

            KiWiAnonResource r = new KiWiAnonResource(anonId,created);
            r.setId(id);

            return r;
        }
    }


    /**
     * Efficiently serialize a KiWiBooleanLiteral to a DataOutput destination.
     *
     * @param out the destination
     * @param literal the KiWiBooleanLiteral to serialize
     * @throws IOException
     */
    public static void writeBooleanLiteral(DataOutput out, KiWiBooleanLiteral literal) throws IOException {
        if(literal == null) {
            out.writeLong(-1L);
        } else {
            out.writeLong(literal.getId());
            out.writeBoolean(literal.booleanValue());
            writeURI(out, literal.getType());
            out.writeLong(literal.getCreated().getTime());
        }
    }


    /**
     * Read a KiWiBooleanLiteral serialized with writeBooleanLiteral from a DataInput source
     *
     * @param input the source
     * @return the de-serialized KiWiBooleanLiteral
     * @throws IOException
     */
    public static KiWiBooleanLiteral readBooleanLiteral(DataInput input) throws IOException {
        long id = input.readLong();

        if(id == -1) {
            return null;
        } else {
            boolean content = input.readBoolean();

            KiWiUriResource dtype = readURI(input);

            Date created = new Date(input.readLong());

            KiWiBooleanLiteral r = new KiWiBooleanLiteral(content, dtype, created);
            r.setId(id);

            return r;
        }
    }


    /**
     * Efficiently serialize a KiWiDateLiteral to a DataOutput destination.
     *
     * @param out the destination
     * @param literal the KiWiDateLiteral to serialize
     * @throws IOException
     */
    public static void writeDateLiteral(DataOutput out, KiWiDateLiteral literal) throws IOException {
        if(literal == null) {
            out.writeLong(-1L);
        } else {
            out.writeLong(literal.getId());
            out.writeLong(literal.getDateContent().getTime());
            writeURI(out, literal.getType());
            out.writeLong(literal.getCreated().getTime());
        }
    }


    /**
     * Read a KiWiDateLiteral serialized with writeDateLiteral from a DataInput source
     *
     * @param input the source
     * @return the de-serialized KiWiDateLiteral
     * @throws IOException
     */
    public static KiWiDateLiteral readDateLiteral(DataInput input) throws IOException {
        long id = input.readLong();

        if(id == -1) {
            return null;
        } else {
            Date content = new Date(input.readLong());

            KiWiUriResource dtype = readURI(input);

            Date created = new Date(input.readLong());

            KiWiDateLiteral r = new KiWiDateLiteral(content, dtype, created);
            r.setId(id);

            return r;
        }
    }


    /**
     * Efficiently serialize a KiWiDoubleLiteral to a DataOutput destination.
     *
     * @param out the destination
     * @param literal the KiWiDoubleLiteral to serialize
     * @throws IOException
     */
    public static void writeDoubleLiteral(DataOutput out, KiWiDoubleLiteral literal) throws IOException {
        if(literal == null) {
            out.writeLong(-1L);
        } else {
            out.writeLong(literal.getId());
            out.writeDouble(literal.getDoubleContent());
            writeURI(out, literal.getType());
            out.writeLong(literal.getCreated().getTime());
        }
    }


    /**
     * Read a KiWiDoubleLiteral serialized with writeDoubleLiteral from a DataInput source
     *
     * @param input the source
     * @return the de-serialized KiWiDoubleLiteral
     * @throws IOException
     */
    public static KiWiDoubleLiteral readDoubleLiteral(DataInput input) throws IOException {
        long id = input.readLong();

        if(id == -1) {
            return null;
        } else {
            double content = input.readDouble();

            KiWiUriResource dtype = readURI(input);

            Date created = new Date(input.readLong());

            KiWiDoubleLiteral r = new KiWiDoubleLiteral(content, dtype, created);
            r.setId(id);

            return r;
        }
    }


    /**
     * Efficiently serialize a KiWiIntLiteral to a DataOutput destination.
     *
     * @param out the destination
     * @param literal the KiWiIntLiteral to serialize
     * @throws IOException
     */
    public static void writeIntLiteral(DataOutput out, KiWiIntLiteral literal) throws IOException {
        if(literal == null) {
            out.writeLong(-1L);
        } else {
            out.writeLong(literal.getId());
            out.writeLong(literal.getIntContent());
            writeURI(out, literal.getType());
            out.writeLong(literal.getCreated().getTime());
        }
    }


    /**
     * Read a KiWiIntLiteral serialized with writeIntLiteral from a DataInput source
     *
     * @param input the source
     * @return the de-serialized KiWiIntLiteral
     * @throws IOException
     */
    public static KiWiIntLiteral readIntLiteral(DataInput input) throws IOException {
        long id = input.readLong();

        if(id == -1) {
            return null;
        } else {
            long content = input.readLong();

            KiWiUriResource dtype = readURI(input);

            Date created = new Date(input.readLong());

            KiWiIntLiteral r = new KiWiIntLiteral(content, dtype, created);
            r.setId(id);

            return r;
        }
    }



    /**
     * Efficiently serialize a KiWiStringLiteral to a DataOutput destination.
     *
     * @param out the destination
     * @param literal the KiWiStringLiteral to serialize
     * @throws IOException
     */
    public static void writeStringLiteral(DataOutput out, KiWiStringLiteral literal) throws IOException {
        if(literal == null) {
            out.writeLong(-1L);
        } else {
            out.writeLong(literal.getId());
            writeContent(out, literal.getContent());
            if(langTable.containsKey(literal.getLanguage())) {
                out.writeByte(langTable.get(literal.getLanguage()));
            } else {
                out.writeByte(LANG_UNKNOWN);
                DataIO.writeString(out, literal.getLanguage());
            }
            writeURI(out, literal.getType());
            out.writeLong(literal.getCreated().getTime());
        }
    }



    /**
     * Read a KiWiStringLiteral serialized with writeStringLiteral from a DataInput source
     *
     * @param input the source
     * @return the de-serialized KiWiStringLiteral
     * @throws IOException
     */
    public static KiWiStringLiteral readStringLiteral(DataInput input) throws IOException {
        long id = input.readLong();

        if(id == -1) {
            return null;
        } else {
            String content = readContent(input);
            byte   langB   = input.readByte();
            String lang;

            switch (langB) {
                case LANG_EN:
                    lang = "en";
                    break;
                case LANG_DE:
                    lang = "de";
                    break;
                case LANG_FR:
                    lang = "fr";
                    break;
                case LANG_ES:
                    lang = "es";
                    break;
                case LANG_IT:
                    lang = "it";
                    break;
                case LANG_PT:
                    lang = "pt";
                    break;
                case LANG_NL:
                    lang = "nl";
                    break;
                case LANG_SV:
                    lang = "sv";
                    break;
                case LANG_NO:
                    lang = "no";
                    break;
                case LANG_FI:
                    lang = "fi";
                    break;
                case LANG_RU:
                    lang = "ru";
                    break;
                case LANG_DK:
                    lang = "dk";
                    break;
                case LANG_PL:
                    lang = "pl";
                    break;
                default:
                    lang = DataIO.readString(input);
            }



            KiWiUriResource dtype = readURI(input);

            Date created = new Date(input.readLong());

            KiWiStringLiteral r = new KiWiStringLiteral(content, lang != null ? Locale.forLanguageTag(lang) : null, dtype, created);
            r.setId(id);

            return r;
        }
    }


    /**
     * Efficiently serialize a KiWiTriple to a DataOutput destination.
     *
     * @param output the destination
     * @param triple the KiWiTriple to serialize
     * @throws IOException
     */
    public static void writeTriple(DataOutput output, KiWiTriple triple) throws IOException {
        output.writeLong(triple.getId());

        // in case subject and object are both uris we use a special prefix-compressed mode
        if(triple.getSubject().isUriResource() && triple.getObject().isUriResource()) {
            String sUri = triple.getSubject().stringValue();
            String oUri = triple.getObject().stringValue();

            String prefix = StringUtils.getCommonPrefix(sUri, oUri);

            output.writeByte(MODE_PREFIX);
            DataIO.writeString(output,prefix);

            output.writeLong(triple.getSubject().getId());
            DataIO.writeString(output, sUri.substring(prefix.length()));
            output.writeLong(triple.getSubject().getCreated().getTime());

            writeURI(output,triple.getPredicate());

            output.writeLong(triple.getObject().getId());
            DataIO.writeString(output, oUri.substring(prefix.length()));
            output.writeLong(triple.getObject().getCreated().getTime());
        } else {
            output.writeByte(MODE_DEFAULT);

            writeNode(output,triple.getSubject());
            writeURI(output,triple.getPredicate());
            writeNode(output,triple.getObject());

        }

        writeNode(output,triple.getContext());
        writeNode(output,triple.getCreator());
        output.writeBoolean(triple.isDeleted());
        output.writeBoolean(triple.isInferred());
        output.writeBoolean(triple.isNewTriple());
        output.writeLong(triple.getCreated().getTime());
        if(triple.getDeletedAt() != null) {
            output.writeLong(triple.getDeletedAt().getTime());
        } else {
            output.writeLong(0);
        }
    }


    /**
     * Read a KiWiTriple serialized with writeTriple from a DataInput source
     *
     * @param input the source
     * @return the de-serialized KiWiTriple
     * @throws IOException
     */
    public static KiWiTriple readTriple(DataInput input) throws IOException {
        KiWiTriple result = new KiWiTriple();
        result.setId(input.readLong());

        int mode = input.readByte();
        if(mode == MODE_PREFIX) {
            String prefix = DataIO.readString(input);

            long sId = input.readLong();
            String sUri = prefix + DataIO.readString(input);
            long sTime = input.readLong();
            KiWiUriResource s = new KiWiUriResource(sUri);
            s.setId(sId);
            s.setCreated(new Date(sTime));
            result.setSubject(s);

            result.setPredicate(readURI(input));

            long oId = input.readLong();
            String oUri = prefix + DataIO.readString(input);
            long oTime = input.readLong();
            KiWiUriResource o = new KiWiUriResource(oUri);
            o.setId(oId);
            o.setCreated(new Date(oTime));
            result.setObject(o);

        } else {
            result.setSubject((KiWiResource) readNode(input));
            result.setPredicate(readURI(input));
            result.setObject(readNode(input));
        }
        result.setContext((KiWiResource) readNode(input));
        result.setCreator((KiWiResource) readNode(input));
        result.setDeleted(input.readBoolean());
        result.setInferred(input.readBoolean());
        result.setNewTriple(input.readBoolean());

        result.setCreated(new Date(input.readLong()));

        long deletedAt = input.readLong();
        if(deletedAt > 0) {
            result.setDeletedAt(new Date(deletedAt));
        }


        return result;

    }

    /**
     * Read a potentially compressed string from the data input.
     *
     * @param in
     * @return
     * @throws IOException
     */
    private static String readContent(DataInput in) throws IOException {
        int mode = in.readByte();

        if(mode == MODE_COMPRESSED) {
            try {
                int strlen = in.readInt();
                int buflen = in.readInt();

                byte[] buffer = new byte[buflen];
                in.readFully(buffer);

                Inflater decompressor = new Inflater(true);
                decompressor.setInput(buffer);

                byte[] data = new byte[strlen];
                decompressor.inflate(data);
                decompressor.end();

                return new String(data,"UTF-8");
            } catch(DataFormatException ex) {
                throw new IllegalStateException("input data is not valid",ex);
            }
        } else {
            return DataIO.readString(in);
        }
    }

    /**
     * Write a string to the data output. In case the string length exceeds LITERAL_COMPRESS_LENGTH, uses a LZW
     * compressed format, otherwise writes the plain bytes.
     *
     * @param out      output destination to write to
     * @param content  string to write
     * @throws IOException
     */
    private static void writeContent(DataOutput out, String content) throws IOException {
        if(content.length() > LITERAL_COMPRESS_LENGTH) {
            // temporary buffer of the size of bytes in the content string (assuming that the compressed data will fit into it)
            byte[] data   = content.getBytes("UTF-8");
            byte[] buffer = new byte[data.length];

            Deflater compressor = new Deflater(Deflater.BEST_COMPRESSION, true);
            compressor.setInput(data);
            compressor.finish();

            int length = compressor.deflate(buffer);

            // only use compressed version if it is smaller than the number of bytes used by the string
            if(length < buffer.length) {
                log.debug("compressed string with {} bytes; compression ratio {}", data.length, (double) length / data.length);

                out.writeByte(MODE_COMPRESSED);
                out.writeInt(data.length);
                out.writeInt(length);
                out.write(buffer,0,length);
            } else {
                log.warn("compressed length exceeds string buffer: {} > {}", length, buffer.length);

                out.writeByte(MODE_DEFAULT);
                DataIO.writeString(out,content);
            }

            compressor.end();
        } else {
            out.writeByte(MODE_DEFAULT);
            DataIO.writeString(out,content);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy