All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.big.util.FrontCodedStringBigList Maven / Gradle / Ivy

Go to download

The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Universita` degli Studi di Milano.

There is a newer version: 2.7.3
Show newest version
/*
 * DSI utilities
 *
 * Copyright (C) 2002-2020 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see .
 *
 */

package it.unimi.dsi.big.util;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Iterator;
import java.util.RandomAccess;
import java.util.zip.GZIPInputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Charsets;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;
import com.martiansoftware.jsap.stringparsers.IntSizeStringParser;

import it.unimi.dsi.fastutil.bytes.ByteArrayFrontCodedBigList;
import it.unimi.dsi.fastutil.chars.CharArrayFrontCodedBigList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.AbstractObjectBigList;
import it.unimi.dsi.fastutil.objects.ObjectBigListIterator;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.FrontCodedStringList;

/**
 * Compact storage of strings using front-coding compression (a.k.a. compression by
 * prefix omission).
 *
 * 

* This class is functionally identical to {@link FrontCodedStringList}, except for the larger size * allowed. * * @see FrontCodedStringList */ public class FrontCodedStringBigList extends AbstractObjectBigList implements RandomAccess, Serializable { public static final long serialVersionUID = 1; /** The underlying {@link ByteArrayFrontCodedBigList}, or {@code null}. */ protected final ByteArrayFrontCodedBigList byteFrontCodedBigList; /** The underlying {@link CharArrayFrontCodedBigList}, or {@code null}. */ protected final CharArrayFrontCodedBigList charFrontCodedBigList; /** Whether this front-coded list is UTF-8 encoded. */ protected final boolean utf8; /** * Creates a new front-coded string list containing the character sequences returned by the given * iterator. * * @param words an iterator returning {@linkplain CharSequence character sequences}. * @param ratio the desired ratio. * @param utf8 if true, the strings will be stored as UTF-8 byte arrays. */ public FrontCodedStringBigList(final Iterator words, final int ratio, final boolean utf8) { this.utf8 = utf8; if (utf8) { byteFrontCodedBigList = new ByteArrayFrontCodedBigList(new ObjectIterator() { @Override public boolean hasNext() { return words.hasNext(); } @SuppressWarnings("null") @Override public byte[] next() { return words.next().toString().getBytes(Charsets.UTF_8); } }, ratio); charFrontCodedBigList = null; } else { charFrontCodedBigList = new CharArrayFrontCodedBigList(new ObjectIterator() { @Override public boolean hasNext() { return words.hasNext(); } @Override public char[] next() { final CharSequence s = words.next(); int i = s.length(); final char[] a = new char[i]; while (i-- != 0) a[i] = s.charAt(i); return a; } }, ratio); byteFrontCodedBigList = null; } } /** * Creates a new front-coded string list containing the character sequences contained in the given * collection. * * @param c a collection containing {@linkplain CharSequence character sequences}. * @param ratio the desired ratio. * @param utf8 if true, the strings will be stored as UTF-8 byte arrays. */ public FrontCodedStringBigList(final Collection c, final int ratio, final boolean utf8) { this(c.iterator(), ratio, utf8); } /** * Returns whether this front-coded string list is storing its strings as UTF-8 encoded bytes. * * @return true if this front-coded string list is keeping its data as an array of UTF-8 encoded * bytes. */ public boolean utf8() { return utf8; } /** * Returns the ratio of the underlying front-coded list. * * @return the ratio of the underlying front-coded list. */ public int ratio() { return utf8 ? byteFrontCodedBigList.ratio() : charFrontCodedBigList.ratio(); } /** * Returns the element at the specified position in this front-coded as a mutable string. * * @param index an index in the list. * @return a {@link MutableString} that will contain the string at the specified position. The * string may be freely modified. */ @Override public MutableString get(final long index) { return MutableString.wrap(utf8 ? byte2Char(byteFrontCodedBigList.getArray(index), null) : charFrontCodedBigList.getArray(index)); } /** * Returns the element at the specified position in this front-coded list by storing it in a mutable * string. * * @param index an index in the list. * @param s a mutable string that will contain the string at the specified position. */ public void get(final long index, final MutableString s) { if (utf8) { final byte[] a = byteFrontCodedBigList.getArray(index); s.length(countUTF8Chars(a)); byte2Char(a, s.array()); } else { s.length(s.array().length); int res = charFrontCodedBigList.get(index, s.array()); if (res < 0) { s.length(s.array().length - res); res = charFrontCodedBigList.get(index, s.array()); } else s.length(res); } } /* * The following methods are highly optimized UTF-8 converters exploiting the fact that since it was * ourselves in the first place who created the coding, we can be sure it is correct. */ protected static int countUTF8Chars(final byte[] a) { final int length = a.length; int result = 0, b; for (int i = 0; i < length; i++) { b = (a[i] & 0xFF) >> 4; if (b < 8) result++; else if (b < 14) { result++; i++; } else if (b < 15) { result++; i += 2; } else { // Surrogate pair (yuck!) result += 2; i += 4; } } return result; } protected static char[] byte2Char(final byte[] a, char[] s) { final int length = a.length; if (s == null) s = new char[countUTF8Chars(a)]; int b, c, d, t; for (int i = 0, j = 0; i < length; i++) { b = a[i] & 0xFF; t = b >> 4; if (t < 8) s[j++] = (char)b; else if (t < 14) { c = a[++i] & 0xFF; if ((c & 0xC0) != 0x80) throw new IllegalStateException("Malformed internal UTF-8 encoding"); s[j++] = (char)(((b & 0x1F) << 6) | (c & 0x3F)); } else if (t < 15) { c = a[++i] & 0xFF; d = a[++i]; if ((c & 0xC0) != 0x80 || (d & 0xC0) != 0x80) throw new IllegalStateException("Malformed internal UTF-8 encoding"); s[j++] = (char)(((b & 0x0F) << 12) | ((c & 0x3F) << 6) | ((d & 0x3F) << 0)); } else { // Surrogate pair (yuck!) final String surrogatePair = new String(a, i, 4, Charsets.UTF_8); s[j++] = surrogatePair.charAt(0); s[j++] = surrogatePair.charAt(1); i += 3; } } return s; } @Override public ObjectBigListIterator listIterator(final long k) { return new ObjectBigListIterator() { ObjectBigListIterator i = utf8 ? byteFrontCodedBigList.listIterator(k) : charFrontCodedBigList.listIterator(k); @Override public boolean hasNext() { return i.hasNext(); } @Override public boolean hasPrevious() { return i.hasPrevious(); } @Override public MutableString next() { return MutableString.wrap(utf8 ? byte2Char((byte[])i.next(), null) : (char[])i.next()); } @Override public MutableString previous() { return MutableString.wrap(utf8 ? byte2Char((byte[])i.previous(), null) : (char[])i.previous()); } @Override public long nextIndex() { return i.nextIndex(); } @Override public long previousIndex() { return i.previousIndex(); } }; } @Override public long size64() { return utf8 ? byteFrontCodedBigList.size64() : charFrontCodedBigList.size64(); } public static void main(final String[] arg) throws IOException, JSAPException, NoSuchMethodException { final SimpleJSAP jsap = new SimpleJSAP(FrontCodedStringBigList.class.getName(), "Builds a front-coded string list reading from standard input a newline-separated ordered list of strings.", new Parameter[] { new FlaggedOption("bufferSize", IntSizeStringParser.getParser(), "64Ki", JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of the I/O buffer used to read strings."), new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The file encoding."), new FlaggedOption("ratio", IntSizeStringParser.getParser(), "4", JSAP.NOT_REQUIRED, 'r', "ratio", "The compression ratio."), new Switch("utf8", 'u', "utf8", "Store the strings as UTF-8 byte arrays."), new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."), new UnflaggedOption("frontCodedList", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised front-coded list.") }); final JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) return; final int bufferSize = jsapResult.getInt("bufferSize"); final int ratio = jsapResult.getInt("ratio"); final boolean utf8 = jsapResult.getBoolean("utf8"); final boolean zipped = jsapResult.getBoolean("zipped"); final String listName = jsapResult.getString("frontCodedList"); final Charset encoding = (Charset)jsapResult.getObject("encoding"); final Logger logger = LoggerFactory.getLogger(FrontCodedStringBigList.class); final ProgressLogger pl = new ProgressLogger(logger); pl.displayFreeMemory = true; pl.displayLocalSpeed = true; pl.itemsName = "strings"; pl.start("Reading strings..."); final FrontCodedStringBigList frontCodedStringBigList = new FrontCodedStringBigList(new LineIterator(new FastBufferedReader(new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding), bufferSize), pl), ratio, utf8); pl.done(); logger.info("Writing front-coded list to file..."); BinIO.storeObject(frontCodedStringBigList, listName); logger.info("Completed."); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy