All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bigdata.io.compression.UnicodeHelper Maven / Gradle / Ivy

/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on May 26, 2011
 */

package com.bigdata.io.compression;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.DataInputBuffer;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.io.LongPacker;
import com.bigdata.io.SliceInputStream;

/**
 * Utility class for compressed unicode encode/decode.
 * <p>
 * Each string is framed as a packed-long byte length followed by the bytes
 * produced by the configured {@link IUnicodeCompressor}. The decode methods
 * read that same framing back.
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public class UnicodeHelper {

    /**
     * Used to compress Unicode strings.
     */
    private final IUnicodeCompressor uc;

    /**
     * @param uc
     *            The compressor used to code and decode the character data.
     * 
     * @throws IllegalArgumentException
     *             if the argument is <code>null</code>.
     */
    public UnicodeHelper(final IUnicodeCompressor uc) {

        if (uc == null) {
            throw new IllegalArgumentException("uc is null");
        }

        this.uc = uc;

    }

    /**
     * Encode the {@link String} onto the {@link OutputStream}. The temporary
     * buffer is used to perform the encoding. The byte length of the encoding
     * is written out using a packed long integer format followed by the coded
     * bytes. This method is capable of writing out very long strings.
     * 
     * @param s
     *            The character data.
     * @param out
     *            The output stream.
     * @param tmp
     *            The temporary buffer. It is reset by this method before the
     *            data are encoded onto it.
     * 
     * @return The value returned by the compressor for the coded data plus the
     *         value returned by {@link LongPacker#packLong} for the length
     *         prefix (presumably the total #of bytes written onto
     *         <i>out</i>).
     * 
     * @throws IOException
     */
    public int encode(final String s, final OutputStream out,
            final ByteArrayBuffer tmp) throws IOException {

        // reset the temporary buffer
        tmp.reset();

        // encode the data onto the temporary buffer.
        int nencoded = uc.encode(s, tmp);

        // the #of bytes written onto the temporary buffer.
        final int len = tmp.pos();

        // write out the packed byte length of the temporary buffer.
        nencoded += LongPacker.packLong(out, len);

        // write out the data in the temporary buffer.
        out.write(tmp.array(), 0/* off */, len);

        return nencoded;

    }

    /**
     * Encode a Unicode string into a newly allocated byte[] using the same
     * layout as {@link #encode(String, OutputStream, ByteArrayBuffer)}: the
     * packed byte length of the coded data followed by the coded data.
     * 
     * @param s
     *            The string.
     * 
     * @return The encoded byte[], including the packed length prefix.
     */
    public byte[] encode1(final String s) {

        // the string length is only used as the initial capacity.
        final ByteArrayBuffer tmp = new ByteArrayBuffer(s.length());

        uc.encode(s, tmp);

        /*
         * The #of bytes in the temporary buffer. This one value is used for
         * the length prefix, for sizing the result, and for the copy so those
         * three can never disagree (this also matches what encode() does).
         */
        final int len = tmp.pos();

        // the #of bytes required for the packed length prefix.
        final int npack = LongPacker.getByteLength(len);

        // exact fit: length prefix followed by the compressed data.
        final byte[] a = new byte[npack + len];

        final DataOutputBuffer dob = new DataOutputBuffer(0/* len */, a/* buf */);
        try {
            // write the byte length of the compressed data (the prefix).
            dob.packLong(len);

            // copy the compressed data.
            dob.append(tmp.array(), 0/* off */, len);

            // return the array, including the encode length prefix.
            return a;
        } finally {
            try {
                dob.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

    }

    /**
     * Decode a {@link String} from the input stream. The result is appended
     * into the caller's buffer. The caller is responsible for resetting the
     * buffer as necessary.
     * 
     * @param in
     *            The input stream.
     * @param out
     *            The output buffer.
     * 
     * @return The byte length of the packed length prefix plus the value
     *         returned by the compressor when decoding the coded data
     *         (presumably the total #of bytes consumed from <i>in</i>).
     * 
     * @throws IOException
     *             if the byte length read from the stream is greater than
     *             {@link Integer#MAX_VALUE}.
     */
    public int decode(final InputStream in, final StringBuilder out)
            throws IOException {

        // read in the byte length of the encoded character data.
        final long n = LongPacker.unpackLong(in);

        if (n > Integer.MAX_VALUE) {
            // Java does not support strings longer than int32 characters.
            throw new IOException("Encoded byte length exceeds int32: " + n);
        }

        // The byte length of the packed long value.
        int ndecoded = LongPacker.getByteLength(n);

        /*
         * Ensure sufficient capacity for (at least) that many chars. Note that
         * n is a byte count, so this is only a sizing hint. If the sum
         * overflows, the argument is negative and ensureCapacity() takes no
         * action.
         */
        out.ensureCapacity((int) n + out.length());

        // decode exactly n bytes from the stream into the caller's buffer.
        ndecoded += uc.decode(new SliceInputStream(in, (int) n), out);

        return ndecoded;

    }

    /**
     * Decode a {@link String} from the input stream.
     * 
     * @param in
     *            The input stream.
     * @param tmp
     *            The temporary buffer used to decode the data. The buffer will
     *            be reset automatically by this method.
     * @return The decoded data.
     * 
     * @throws IOException
     */
    public String decode1(final DataInputBuffer in, final StringBuilder tmp)
            throws IOException {

        // reset the temporary buffer
        tmp.setLength(0);

        // decode into the temporary buffer.
        decode(in, tmp);

        // extract and return the decoded String.
        return tmp.toString();

    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy