All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.bits.TransformationStrategies Maven / Gradle / Ivy

Go to download

The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Università degli Studi di Milano.

There is a newer version: 2.7.3
Show newest version
/*
 * DSI utilities
 *
 * Copyright (C) 2007-2020 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

package it.unimi.dsi.bits;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;

import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.lang.MutableString;

/** A class providing static methods and objects that do useful things with transformation strategies.
 *
 * <p>This class provides several {@linkplain TransformationStrategy transformation strategies} that turn
 * strings or other objects into bit vectors. The transformations might optionally be:
 * <ul>
 * <li><em>Lexicographical</em>: for objects based on bytes or characters, such as strings
 * and byte arrays, this means that the first bit of the bit vector is the most significant
 * bit of the first byte or character, and so on. In other words, the lexicographical order between
 * bit vectors reflects the lexicographical byte-by-byte, char-by-char, etc. order. This property
 * is necessary for some kinds of static structures that depend on it, but it has some computational
 * cost, as after compacting bytes or chars into a long we need to reverse the bit order of each piece.
 * <li><em>Prefix-free</em>: no two bit vectors returned by the transformation on two
 * different objects will be comparable in prefix order. Again, this might require
 * linear (e.g., {@link #prefixFree()}) or constant (e.g., {@link #prefixFreeIso()}) additional space.
 * </ul>
 *

As a general rule, transformations without additional naming are lexicographical. * Transformation that generate prefix-free bit vectors are marked as such. * Plain transformations that do not provide any guarantee are called raw. They should be * used only when performance is the main issue and the two properties above are not relevant. * * @see TransformationStrategy */ public class TransformationStrategies { private final static TransformationStrategy IDENTITY = new TransformationStrategy() { private static final long serialVersionUID = 1L; @Override public BitVector toBitVector(final BitVector v) { return v; } @Override public long length(final BitVector v) { return v.length(); } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } public Object readResolve() { return IDENTITY; } }; /** Reverses the bit order in the bytes of the provided word. * * @param word a word. * @return {@code word}, with the bit order inside each byte reversed. */ private static final long reverseBytes(long word) { word = (word & 0x5555555555555555L) << 1 | (word >>> 1) & 0x5555555555555555L; word = (word & 0x3333333333333333L) << 2 | (word >>> 2) & 0x3333333333333333L; return (word & 0x0f0f0f0f0f0f0f0fL) << 4 | (word >>> 4) & 0x0f0f0f0f0f0f0f0fL; } /** Reverses the bit order in the characters of the provided word. * * @param word a word. * @return {@code word}, with the bit order inside each 16-bit character reversed. */ private static final long reverseChars(long word) { word = (word & 0x5555555555555555L) << 1 | (word >>> 1) & 0x5555555555555555L; word = (word & 0x3333333333333333L) << 2 | (word >>> 2) & 0x3333333333333333L; word = (word & 0x0f0f0f0f0f0f0f0fL) << 4 | (word >>> 4) & 0x0f0f0f0f0f0f0f0fL; return (word & 0x00ff00ff00ff00ffL) << 8 | (word >>> 8) & 0x00ff00ff00ff00ffL; } /** A trivial transformation for data already in {@link BitVector} form. 
*/ @SuppressWarnings("unchecked") public static TransformationStrategy identity() { return (TransformationStrategy)IDENTITY; } private static final TransformationStrategy RAW_UTF32 = new RawUtf32TransformationStrategy(); /**A trivial raw transformation from strings to bit vectors * that turns the UTF-16 representation into a UTF-32 representation, * decodes surrogate pairs and concatenates the bits of the UTF-32 representation. * *

Warning: this transformation is not lexicographic. */ @SuppressWarnings("unchecked") public static TransformationStrategy rawUtf32() { return (TransformationStrategy)RAW_UTF32; } private static class RawUtf32TransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; @Override public long length(final CharSequence cs) { return length(cs.toString()); } private long length(final String s) { return s.codePointCount(0, s.length()) * (long)Integer.SIZE; } @Override public BitVector toBitVector(final CharSequence cs) { final String s = cs.toString(); final int length = s.length(); final LongArrayBitVector bitVector = LongArrayBitVector.getInstance(length(s)); for (int i = 0, cp; i < length; i += Character.charCount(cp)) bitVector.append(cp = s.codePointAt(i), Integer.SIZE); return bitVector; } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return RAW_UTF32; } } private static final TransformationStrategy UTF32 = new Utf32TransformationStrategy(false); /** A transformation from strings to bit vectors that turns the UTF-16 representation into a UTF-32 representation, * decodes surrogate pairs and concatenates the bits of the UTF-32 representation. */ @SuppressWarnings("unchecked") public static TransformationStrategy utf32() { return (TransformationStrategy)UTF32; } private static final TransformationStrategy PREFIX_FREE_UTF32 = new Utf32TransformationStrategy(true); /** A transformation from strings to bit vectors that turns the UTF-16 representation into a UTF-32 representation, * decodes surrogate pairs, concatenates the bits of the UTF-32 representation and completes * the representation with an NUL to guarantee lexicographical ordering and prefix-freeness. * *

Note that strings provided to this strategy must not contain NULs. */ @SuppressWarnings("unchecked") public static TransformationStrategy prefixFreeUtf32() { return (TransformationStrategy)PREFIX_FREE_UTF32; } private static class Utf32TransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; /** Whether we should guarantee prefix-freeness by adding 0 to the end of each string. */ private final boolean prefixFree; /** Creates a UTF32 transformation strategy. The strategy will map a string to its UTF32 bit sequence by decoding surrogate pairs. * * @param prefixFree if true, the resulting set of binary words will be made prefix free by adding a NUL at the end of the string. */ protected Utf32TransformationStrategy(final boolean prefixFree) { this.prefixFree = prefixFree; } @Override public long length(final CharSequence cs) { return length(cs.toString()); } private long length(final String s) { return (s.codePointCount(0, s.length()) + (prefixFree ? 1 : 0)) * (long)Integer.SIZE; } @Override public BitVector toBitVector(final CharSequence cs) { final String s = cs.toString(); final int length = s.length(); final LongArrayBitVector bitVector = LongArrayBitVector.getInstance(length(s)); for (int i = 0, cp; i < length; i += Character.charCount(cp)) bitVector.append(Integer.reverse(cp = s.codePointAt(i)) & -1L >>> 32, Integer.SIZE); if (prefixFree) bitVector.append(0, Integer.SIZE); return bitVector; } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return prefixFree ? PREFIX_FREE_UTF32 : UTF32; } } private static final TransformationStrategy RAW_UTF16 = new RawUtf16TransformationStrategy(); /** A trivial, high-performance, raw transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation. * *

<p><strong>Warning</strong>: this transformation is not lexicographic.
     *

Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy rawUtf16() { return (TransformationStrategy)RAW_UTF16; } private static class RawUtf16TransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; @Override public long length(final CharSequence s) { return s.length() * (long)Character.SIZE; } private static class RawUtf16MutableStringBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final char[] a; private final long length; public RawUtf16MutableStringBitVector(final MutableString s) { this.a = s.array(); length = s.length() * (long)Character.SIZE; } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); final int charIndex = (int)(index / Character.SIZE); return (a[charIndex] & 1 << index % Character.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Character.SIZE); if (startBit == 0) { final int pos = (int)(from / Character.SIZE); if (to == from + Long.SIZE) return (long)a[pos + 3] << 48 | (long)a[pos + 2] << 32 | (long)a[pos + 1] << 16 | a[pos]; if (to % Character.SIZE == 0) { long word = 0; switch((int)((to - from) / Character.SIZE)) { case 3: word |= (long)a[pos + 2] << 32; case 2: word |= (long)a[pos + 1] << 16; case 1: word |= a[pos + 0]; } return word; } } final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << 
Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } private static class RawUtf16CharSequenceBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final CharSequence s; private final long length; public RawUtf16CharSequenceBitVector(final CharSequence s) { this.s = s; length = s.length() * (long)Character.SIZE; } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); final int charIndex = (int)(index / Character.SIZE); return (s.charAt(charIndex) & 1 << index % Character.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Character.SIZE); if (startBit == 0) { final int pos = (int)(from / Character.SIZE); if (to == from + Long.SIZE) return (long)s.charAt(pos + 3) << 48 | (long)s.charAt(pos + 2) << 32 | (long)s.charAt(pos + 1) << 16 | s.charAt(pos); if (to % Character.SIZE == 0) { long word = 0; switch((int)((to - from) / Character.SIZE)) { case 3: word |= (long)s.charAt(pos + 2) << 32; case 2: word |= (long)s.charAt(pos + 1) << 16; case 1: word |= s.charAt(pos + 0); } return word; } } final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } @Override public BitVector toBitVector(final CharSequence s) { return s instanceof MutableString ? 
new RawUtf16MutableStringBitVector((MutableString)s) : new RawUtf16CharSequenceBitVector(s); } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return RAW_UTF16; } } private static final TransformationStrategy UTF16 = new Utf16TransformationStrategy(false); /** A trivial transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation. * *

Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy utf16() { return (TransformationStrategy)UTF16; } private static final TransformationStrategy PREFIX_FREE_UTF16 = new Utf16TransformationStrategy(true); /** A trivial transformation from strings to bit vectors that concatenates the bits of the UTF-16 representation and completes * the representation with an NUL to guarantee lexicographical ordering and prefix-freeness. * *

<p>Note that strings provided to this strategy must not contain NULs.
     *

Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy prefixFreeUtf16() { return (TransformationStrategy)PREFIX_FREE_UTF16; } private static class Utf16TransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; /** Whether we should guarantee prefix-freeness by adding 0 to the end of each string. */ private final boolean prefixFree; /** Creates a UTF16 transformation strategy. The strategy will map a string to its natural UTF16 bit sequence. * * @param prefixFree if true, the resulting set of binary words will be made prefix free by adding a NUL at the end of the string. */ protected Utf16TransformationStrategy(final boolean prefixFree) { this.prefixFree = prefixFree; } @Override public long length(final CharSequence s) { return (s.length() + (prefixFree ? 1 : 0)) * (long)Character.SIZE; } private static class Utf16CharSequenceBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final CharSequence s; private final long length; private final long actualEnd; public Utf16CharSequenceBitVector(final CharSequence s, final boolean prefixFree) { this.s = s; actualEnd = s.length() * (long)Character.SIZE; length = actualEnd + (prefixFree ? 
Character.SIZE : 0); } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); if (index >= actualEnd) return false; final int charIndex = (int)(index / Character.SIZE); return (s.charAt(charIndex) & 0x8000 >>> index % Character.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Character.SIZE); if (startBit == 0) { final int pos = (int)(from / Character.SIZE); if (to == from + Long.SIZE) return reverseChars((to > actualEnd ? 0 : (long)s.charAt(pos + 3)) << 48 | (long)s.charAt(pos + 2) << 32 | (long)s.charAt(pos + 1) << 16 | s.charAt(pos)); if (to % Character.SIZE == 0) { long word = 0; switch((int)((Math.min(to, actualEnd) - Math.min(from, actualEnd)) / Character.SIZE)) { case 3: word |= (long)s.charAt(pos + 2) << 32; case 2: word |= (long)s.charAt(pos + 1) << 16; case 1: word |= s.charAt(pos); } return reverseChars(word); } } final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } private static class Utf16MutableStringBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final char[] a; private final long length; private final long actualEnd; public Utf16MutableStringBitVector(final MutableString s, final boolean prefixFree) { this.a = s.array(); actualEnd = s.length() * (long)Character.SIZE; length = actualEnd + (prefixFree ? 
Character.SIZE : 0); } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); if (index >= actualEnd) return false; final int charIndex = (int)(index / Character.SIZE); return (a[charIndex] & 0x8000 >>> index % Character.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Character.SIZE); if (startBit == 0) { final int pos = (int)(from / Character.SIZE); if (to == from + Long.SIZE) return reverseChars((to > actualEnd ? 0 : (long)a[pos + 3]) << 48 | (long)a[pos + 2] << 32 | (long)a[pos + 1] << 16 | a[pos]); if (to % Character.SIZE == 0) { long word = 0; switch((int)((Math.min(to, actualEnd) - Math.min(from, actualEnd)) / Character.SIZE)) { case 3: word |= (long)a[pos + 2] << 32; case 2: word |= (long)a[pos + 1] << 16; case 1: word |= a[pos]; } return reverseChars(word); } } final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } @Override public BitVector toBitVector(final CharSequence s) { return s instanceof MutableString ? new Utf16MutableStringBitVector((MutableString)s, prefixFree) : new Utf16CharSequenceBitVector(s, prefixFree); } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return prefixFree ? PREFIX_FREE_UTF16 : UTF16; } } private static final TransformationStrategy RAW_ISO = new RawISOTransformationStrategy(); /** A trivial, high-performance, raw transformation from strings to bit vectors that concatenates the lower eight bits bits of the UTF-16 representation. * *

<p><strong>Warning</strong>: this transformation is not lexicographic.
     *

<p>Note that this transformation is sensible only for strings that are known to contain just characters in the ISO-8859-1 charset.
     *

Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy rawIso() { return (TransformationStrategy)RAW_ISO; } private static class RawISOTransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; @Override public long length(final CharSequence s) { return s.length() * (long)Byte.SIZE; } private static class RawISOCharSequenceBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final CharSequence s; private final long length; public RawISOCharSequenceBitVector(final CharSequence s) { this.s = s; length = s.length() * (long)Byte.SIZE; } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); final int byteIndex = (int)(index / Byte.SIZE); return (s.charAt(byteIndex) & 1 << index % Byte.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Byte.SIZE); if (startBit == 0) { final int pos = (int)(from / Byte.SIZE); if (to == from + Long.SIZE) return (s.charAt(pos + 7) & 0xFFL) << 56 | (s.charAt(pos + 6) & 0xFFL) << 48 | (s.charAt(pos + 5) & 0xFFL) << 40 | (s.charAt(pos + 4) & 0xFFL) << 32 | (s.charAt(pos + 3) & 0xFFL) << 24 | (s.charAt(pos + 2) & 0xFF) << 16 | (s.charAt(pos + 1) & 0xFF) << 8 | (s.charAt(pos) & 0xFF); if (to % Byte.SIZE == 0) { long word = 0; switch((int)((to - from) / Byte.SIZE)) { case 7: word |= (s.charAt(pos + 6) & 0xFFL) << 48; case 6: word |= (s.charAt(pos + 5) & 0xFFL) << 40; case 5: word |= (s.charAt(pos + 4) & 0xFFL) << 32; case 4: word |= (s.charAt(pos + 3) & 0xFFL) << 24; case 3: word |= (s.charAt(pos + 2) & 0xFF) << 16; case 2: word |= (s.charAt(pos + 1) & 0xFF) << 8; case 1: word |= s.charAt(pos) & 0xFF; 
} return word; } } final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } private static class RawISOMutableStringBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final char[] a; private final long length; public RawISOMutableStringBitVector(final MutableString s) { this.a = s.array(); length = s.length() * (long)Byte.SIZE; } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); final int byteIndex = (int)(index / Byte.SIZE); return (a[byteIndex] & 1 << index % Byte.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Byte.SIZE); if (startBit == 0) { final int pos = (int)(from / Byte.SIZE); if (to == from + Long.SIZE) return (a[pos + 7] & 0xFFL) << 56 | (a[pos + 6] & 0xFFL) << 48 | (a[pos + 5] & 0xFFL) << 40 | (a[pos + 4] & 0xFFL) << 32 | (a[pos + 3] & 0xFFL) << 24 | (a[pos + 2] & 0xFF) << 16 | (a[pos + 1] & 0xFF) << 8 | (a[pos] & 0xFF); if (to % Byte.SIZE == 0) { long word = 0; switch((int)((to - from) / Byte.SIZE)) { case 7: word |= (a[pos + 6] & 0xFFL) << 48; case 6: word |= (a[pos + 5] & 0xFFL) << 40; case 5: word |= (a[pos + 4] & 0xFFL) << 32; case 4: word |= (a[pos + 3] & 0xFFL) << 24; case 3: word |= (a[pos + 2] & 0xFF) << 16; case 2: word |= (a[pos + 1] & 0xFF) << 8; case 1: word |= a[pos] & 0xFF; } return word; } } final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos 
+ Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } @Override public BitVector toBitVector(final CharSequence s) { return s instanceof MutableString ? new RawISOMutableStringBitVector((MutableString)s) : new RawISOCharSequenceBitVector(s); } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return RAW_ISO; } } private static final TransformationStrategy ISO = new ISOTransformationStrategy(false); /** A trivial transformation from strings to bit vectors that concatenates the lower eight bits of the UTF-16 representation. * *

<p>Note that this transformation is sensible only for strings that are known to contain just characters in the ISO-8859-1 charset.
     *

Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy iso() { return (TransformationStrategy)ISO; } private static final TransformationStrategy PREFIX_FREE_ISO = new ISOTransformationStrategy(true); /** A trivial transformation from strings to bit vectors that concatenates the lower eight bits bits of the UTF-16 representation and completes * the representation with an ASCII NUL to guarantee lexicographical ordering and prefix-freeness. * *

<p>Note that this transformation is sensible only for strings that are known to contain just characters in the ISO-8859-1 charset, and
     * that strings provided to this strategy must not contain ASCII NULs.
     *

Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy prefixFreeIso() { return (TransformationStrategy)PREFIX_FREE_ISO; } private static class ISOTransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; /** Whether we should guarantee prefix-freeness by adding 0 to the end of each string. */ private final boolean prefixFree; /** Creates an ISO transformation strategy. The strategy will map a string to the lowest eight bits of its natural UTF16 bit sequence. * * @param prefixFree if true, the resulting set of binary words will be made prefix free by adding a NUL at the end of the string. */ protected ISOTransformationStrategy(final boolean prefixFree) { this.prefixFree = prefixFree; } @Override public long length(final CharSequence s) { return (s.length() + (prefixFree ? 1 : 0)) * (long)Byte.SIZE; } private static class ISOCharSequenceBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final CharSequence s; private final long length; private final long actualEnd; public ISOCharSequenceBitVector(final CharSequence s, final boolean prefixFree) { this.s = s; actualEnd = s.length() * (long)Byte.SIZE; length = actualEnd + (prefixFree ? 
Byte.SIZE : 0); } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); if (index >= actualEnd) return false; final int byteIndex = (int)(index / Byte.SIZE); return (s.charAt(byteIndex) & 0x80 >>> index % Byte.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Byte.SIZE); if (startBit == 0) { final int pos = (int)(from / Byte.SIZE); if (to == from + Long.SIZE) return reverseBytes((to > actualEnd ? 0 : (s.charAt(pos + 7) & 0xFFL)) << 56 | (s.charAt(pos + 6) & 0xFFL) << 48 | (s.charAt(pos + 5) & 0xFFL) << 40 | (s.charAt(pos + 4) & 0xFFL) << 32 | (s.charAt(pos + 3) & 0xFFL) << 24 | (s.charAt(pos + 2) & 0xFF) << 16 | (s.charAt(pos + 1) & 0xFF) << 8 | (s.charAt(pos) & 0xFF)); if (to % Byte.SIZE == 0) { long word = 0; switch((int)((Math.min(to, actualEnd) - Math.min(from, actualEnd)) / Byte.SIZE)) { case 7: word |= (s.charAt(pos + 6) & 0xFFL) << 48; case 6: word |= (s.charAt(pos + 5) & 0xFFL) << 40; case 5: word |= (s.charAt(pos + 4) & 0xFFL) << 32; case 4: word |= (s.charAt(pos + 3) & 0xFFL) << 24; case 3: word |= (s.charAt(pos + 2) & 0xFF) << 16; case 2: word |= (s.charAt(pos + 1) & 0xFF) << 8; case 1: word |= s.charAt(pos) & 0xFF; } return reverseBytes(word); } } final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } private static class ISOMutableStringBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final char[] a; private final long length; private final long actualEnd; public 
ISOMutableStringBitVector(final MutableString s, final boolean prefixFree) { this.a = s.array(); actualEnd = s.length() * (long)Byte.SIZE; length = actualEnd + (prefixFree ? Byte.SIZE : 0); } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); if (index >= actualEnd) return false; final int byteIndex = (int)(index / Byte.SIZE); return (a[byteIndex] & 0x80 >>> index % Byte.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Byte.SIZE); if (startBit == 0) { final int pos = (int)(from / Byte.SIZE); if (to == from + Long.SIZE) return reverseBytes((to > actualEnd ? 0 : (a[pos + 7] & 0xFFL)) << 56 | (a[pos + 6] & 0xFFL) << 48 | (a[pos + 5] & 0xFFL) << 40 | (a[pos + 4] & 0xFFL) << 32 | (a[pos + 3] & 0xFFL) << 24 | (a[pos + 2] & 0xFF) << 16 | (a[pos + 1] & 0xFF) << 8 | (a[pos] & 0xFF)); if (to % Byte.SIZE == 0) { long word = 0; switch((int)((Math.min(actualEnd, to) - Math.min(actualEnd, from)) / Byte.SIZE)) { case 7: word |= (a[pos + 6] & 0xFFL) << 48; case 6: word |= (a[pos + 5] & 0xFFL) << 40; case 5: word |= (a[pos + 4] & 0xFFL) << 32; case 4: word |= (a[pos + 3] & 0xFFL) << 24; case 3: word |= (a[pos + 2] & 0xFF) << 16; case 2: word |= (a[pos + 1] & 0xFF) << 8; case 1: word |= a[pos] & 0xFF; } return reverseBytes(word); } } final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } @Override public BitVector toBitVector(final CharSequence s) { return s instanceof MutableString ? 
new ISOMutableStringBitVector((MutableString)s, prefixFree) : new ISOCharSequenceBitVector(s, prefixFree); } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return prefixFree ? PREFIX_FREE_ISO : ISO; } } private static final TransformationStrategy RAW_BYTE_ARRAY = new RawByteArrayTransformationStrategy(); /** A trivial, high-performance, raw transformation from byte arrays to bit * vectors that simply concatenates the bytes of the array. * *

<p><strong>Warning</strong>: this transformation is not lexicographic.
     *

Warning: bit vectors returned by this strategy are adaptors around the original array. If the array * changes while the bit vector is being accessed, the results will be unpredictable. * * @see TransformationStrategies */ public static TransformationStrategy rawByteArray() { return RAW_BYTE_ARRAY; } private static class RawByteArrayTransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; @Override public long length(final byte[] a) { return a.length * (long)Byte.SIZE; } private static class RawByteArrayBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 0L; private final byte[] a; private final long length; public RawByteArrayBitVector(final byte[] a) { this.a = a; length = a.length * (long)Byte.SIZE; } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); return (a[(int)(index / Byte.SIZE)] & 1 << index % Byte.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Byte.SIZE); if (startBit == 0) { if (to == from + Long.SIZE) { final int pos = (int)(from / Byte.SIZE); return (a[pos + 7] & 0xFFL) << 56 | (a[pos + 6] & 0xFFL) << 48 | (a[pos + 5] & 0xFFL) << 40 | (a[pos + 4] & 0xFFL) << 32 | (a[pos + 3] & 0xFFL) << 24 | (a[pos + 2] & 0xFF) << 16 | (a[pos + 1] & 0xFF) << 8 | (a[pos] & 0xFF); } if (to % Byte.SIZE == 0) { final int pos = (int)(from / Byte.SIZE); long word = 0; switch((int)((to - from) / Byte.SIZE)) { case 7: word |= (a[pos + 6] & 0xFFL) << 48; case 6: word |= (a[pos + 5] & 0xFFL) << 40; case 5: word |= (a[pos + 4] & 0xFFL) << 32; case 4: word |= (a[pos + 3] & 0xFFL) << 24; case 3: word |= (a[pos + 2] & 0xFF) << 16; case 2: word |= (a[pos + 1] & 0xFF) << 8; case 1: word |= (a[pos] & 0xFF); } return word; } } // Actually, we should never get here as the transformation is not lexicographical. 
final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } @Override public BitVector toBitVector(final byte[] s) { return new RawByteArrayBitVector(s); } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return RAW_BYTE_ARRAY; } } private static final TransformationStrategy BYTE_ARRAY = new ByteArrayTransformationStrategy(); /** A lexicographical transformation from byte arrays to bit vectors. * *

Warning: bit vectors returned by this strategy are adaptors around the original array. If the array * changes while the bit vector is being accessed, the results will be unpredictable. * * @see TransformationStrategies */ public static TransformationStrategy byteArray() { return BYTE_ARRAY; } private static class ByteArrayTransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; @Override public long length(final byte[] a) { return a.length * (long)Byte.SIZE; } private static class ByteArrayBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 0L; private final byte[] a; private final long length; public ByteArrayBitVector(final byte[] a) { this.a = a; length = a.length * (long)Byte.SIZE; } @Override public boolean getBoolean(final long index) { if (index > length) throw new IndexOutOfBoundsException(); return (a[(int)(index / Byte.SIZE)] & 0x80 >> index % Byte.SIZE) != 0; } @Override public long getLong(final long from, final long to) { final int startBit = (int)(from % Byte.SIZE); if (startBit == 0) { if (to == from + Long.SIZE) { final int pos = (int)(from / Byte.SIZE); return reverseBytes((a[pos + 7] & 0xFFL) << 56 | (a[pos + 6] & 0xFFL) << 48 | (a[pos + 5] & 0xFFL) << 40 | (a[pos + 4] & 0xFFL) << 32 | (a[pos + 3] & 0xFFL) << 24 | (a[pos + 2] & 0xFF) << 16 | (a[pos + 1] & 0xFF) << 8 | (a[pos] & 0xFF)); } if (to % Byte.SIZE == 0) { final int pos = (int)(from / Byte.SIZE); long word = 0; switch((int)((to - from) / Byte.SIZE)) { case 7: word |= (a[pos + 6] & 0xFFL) << 48; case 6: word |= (a[pos + 5] & 0xFFL) << 40; case 5: word |= (a[pos + 4] & 0xFFL) << 32; case 4: word |= (a[pos + 3] & 0xFFL) << 24; case 3: word |= (a[pos + 2] & 0xFF) << 16; case 2: word |= (a[pos + 1] & 0xFF) << 8; case 1: word |= (a[pos] & 0xFF); } return reverseBytes(word); } } // Actually, we should never get here as the transformation is not lexicographical. 
final long l = Long.SIZE - (to - from); final long startPos = from - startBit; if (l == Long.SIZE) return 0; if (startBit <= l) return getLong(startPos, Math.min(length, startPos + Long.SIZE)) << l - startBit >>> l; return getLong(startPos, startPos + Long.SIZE) >>> startBit | getLong(startPos + Long.SIZE, Math.min(length, startPos + 2 * Long.SIZE)) << Long.SIZE + l - startBit >>> l; } @Override public long length() { return length; } } @Override public BitVector toBitVector(final byte[] s) { return new ByteArrayBitVector(s); } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return BYTE_ARRAY; } } private final static class IteratorWrapper implements ObjectIterator { final Iterator iterator; final TransformationStrategy transformationStrategy; public IteratorWrapper(final Iterator iterator, final TransformationStrategy transformationStrategy) { this.iterator = iterator; this.transformationStrategy = transformationStrategy; } @Override public boolean hasNext() { return iterator.hasNext(); } @Override public BitVector next() { return transformationStrategy.toBitVector(iterator.next()); } } /** Wraps a given iterator, returning an iterator that emits {@linkplain BitVector bit vectors}. * * @param iterator an iterator. * @param transformationStrategy a strategy to transform the object returned by iterator. * @return an iterator that emits the content of iterator passed through transformationStrategy. */ @SuppressWarnings("unchecked") public static Iterator wrap(final Iterator iterator, final TransformationStrategy transformationStrategy) { return (Iterator)(transformationStrategy == IDENTITY ? 
iterator : new IteratorWrapper<>(iterator, transformationStrategy)); } private final static class IterableWrapper implements Iterable { private final TransformationStrategy transformationStrategy; private final Iterable collection; public IterableWrapper(final Iterable collection, final TransformationStrategy transformationStrategy) { this.collection = collection; this.transformationStrategy = transformationStrategy; } @Override public ObjectIterator iterator() { return new IteratorWrapper<>(collection.iterator(), transformationStrategy.copy()); } } /** Wraps a given iterable, returning an iterable that contains {@linkplain BitVector bit vectors}. * * @param iterable an iterable. * @param transformationStrategy a strategy to transform the object contained in iterable. * @return an iterable that has the content of iterable passed through transformationStrategy. */ @SuppressWarnings("unchecked") public static Iterable wrap(final Iterable iterable, final TransformationStrategy transformationStrategy) { return (Iterable)(transformationStrategy == IDENTITY ? iterable : new IterableWrapper<>(iterable, transformationStrategy)); } private final static class ListWrapper extends AbstractObjectList { private final TransformationStrategy transformationStrategy; private final List list; public ListWrapper(final List list, final TransformationStrategy transformationStrategy) { this.list = list; this.transformationStrategy = transformationStrategy; } @Override public BitVector get(final int index) { return transformationStrategy.toBitVector(list.get(index)); } @Override public int size() { return list.size(); } } /** Wraps a given list, returning a list that contains {@linkplain BitVector bit vectors}. * * @param list a list. * @param transformationStrategy a strategy to transform the object contained in list. * @return a list that has the content of list passed through transformationStrategy. 
*/
	@SuppressWarnings("unchecked")
	public static List wrap(final List list, final TransformationStrategy transformationStrategy) {
		// Nothing to transform: hand back the list itself rather than a view.
		if (transformationStrategy == IDENTITY) return (List)list;
		return (List)new ListWrapper<>(list, transformationStrategy);
	}

	private static final TransformationStrategy PREFIX_FREE = new PrefixFreeTransformationStrategy();

	/** A transformation from bit vectors to bit vectors that guarantees that its results are prefix free.
	 *
	 *

In more detail, we map 0 to 10, 1 to 11, and we add a 0 at the end of all strings. * *

Warning: bit vectors returned by this strategy are adaptors around the original string. If the string * changes while the bit vector is being accessed, the results will be unpredictable. */ @SuppressWarnings("unchecked") public static TransformationStrategy prefixFree() { return (TransformationStrategy)PREFIX_FREE; } private static class PrefixFreeTransformationStrategy implements TransformationStrategy, Serializable { private static final long serialVersionUID = 1L; private static class PrefixFreeBitVector extends AbstractBitVector implements Serializable { private static final long serialVersionUID = 1L; private final BitVector v; private final long length; public PrefixFreeBitVector(final BitVector v) { this.v = v; length = v.length() * 2 + 1; } @Override public boolean getBoolean(final long index) { if (index >= length) throw new IndexOutOfBoundsException(); if (index == length - 1) return false; if (index % 2 == 0) return true; return v.getBoolean(index / 2); } @Override public long getLong(final long from, final long to) { // The following code is optimized for word-by-word reading. if (from % Long.SIZE == 0) { if (to == from + Long.SIZE) { long word = v.getLong(from / 2, from / 2 + Long.SIZE / 2); word = (word | word << 16) & 0x0000FFFF0000FFFFL; word = (word | word << 8) & 0x00FF00FF00FF00FFL; word = (word | word << 4) & 0x0F0F0F0F0F0F0F0FL; word = (word | word << 2) & 0x3333333333333333L; return word << 1 | word << 2 | 0x5555555555555555L; } if (to == length) { assert from < to; // As from is even and to is odd. 
long word = v.getLong(from / 2, to / 2); word = (word | word << 16) & 0x0000FFFF0000FFFFL; word = (word | word << 8) & 0x00FF00FF00FF00FFL; word = (word | word << 4) & 0x0F0F0F0F0F0F0F0FL; word = (word | word << 2) & 0x3333333333333333L; return (word << 1 | word << 2 | 0x5555555555555555L) & (1L << to - from - 1) - 1; } } return super.getLong(from, to); } @Override public long length() { return length; } } @Override public BitVector toBitVector(final BitVector v) { return new PrefixFreeBitVector(v); } @Override public long length(final BitVector v) { return v.length() * 2 + 1; } @Override public long numBits() { return 0; } @Override public TransformationStrategy copy() { return this; } private Object readResolve() { return PREFIX_FREE; } } private static final FixedLongTransformationStrategy FIXED_LONG = new FixedLongTransformationStrategy(true); /** A transformation from longs to bit vectors that returns a fixed-size {@link Long#SIZE}-bit vector. Note that the * bit vectors have as first bit the most significant bit of the underlying long integer, so * lexicographical and numerical order do coincide for positive numbers. */ public static TransformationStrategy fixedLong() { return FIXED_LONG; } private static final FixedLongTransformationStrategy RAW_FIXED_LONG = new FixedLongTransformationStrategy(false); /** A trivial, high-performance, raw transformation from longs to bit vectors that returns a fixed-size * {@link Long#SIZE}-bit vector. */ public static TransformationStrategy rawFixedLong() { return RAW_FIXED_LONG; } /** A transformation from longs to bit vectors that returns a fixed-size {@link Long#SIZE}-bit vector, possibly reversed * to maintain lexicographical order. 
*/
	private static class FixedLongTransformationStrategy implements TransformationStrategy, Serializable {
		private static final long serialVersionUID = 0L;
		/** Whether the bits of each long are reversed before being exposed. */
		private final boolean lexicographical;

		/** Creates a new strategy.
		 *
		 * @param lexicographical if true, {@link #toBitVector(Long)} applies {@link Long#reverse(long)}
		 * to the value, so that the first bit of the vector is the most significant bit of the long.
		 */
		public FixedLongTransformationStrategy(final boolean lexicographical) {
			this.lexicographical = lexicographical;
		}

		/** An immutable bit vector of length {@link Long#SIZE} whose bit <var>i</var> is bit <var>i</var>
		 * (counting from the least significant one) of a given long. */
		private static class FixedLongBitVector extends AbstractBitVector implements Serializable {
			private static final long serialVersionUID = 1L;
			private final long v;

			public FixedLongBitVector(final long v) {
				this.v = v;
			}

			/** Returns the bit of given index.
			 *
			 * @param index a position in this bit vector.
			 * @return the bit at position {@code index}.
			 * @throws IndexOutOfBoundsException if {@code index} is negative, or not smaller than {@link Long#SIZE}.
			 */
			@Override
			public boolean getBoolean(final long index) {
				// Shift distances are taken modulo 64, so without the lower bound
				// a negative index would silently alias a valid position.
				if (index < 0 || index >= Long.SIZE) throw new IndexOutOfBoundsException(Long.toString(index));
				return (v & 1L << index) != 0;
			}

			/** Returns the bits in the range [{@code from}..{@code to}) packed in a long, with the
			 * bit at position {@code from} in the least significant position.
			 *
			 * @param from the first bit of the range (inclusive).
			 * @param to the last bit of the range (exclusive).
			 * @return the bits of the range, packed in a long.
			 */
			@Override
			public long getLong(final long from, final long to) {
				// The whole word is special-cased, as a shift by 64 would not produce a full mask.
				if (from == 0 && to == Long.SIZE) return v;
				return (v >> from) & ((1L << (to - from)) - 1);
			}

			@Override
			public long length() {
				return Long.SIZE;
			}
		}

		/** Turns a long into a {@link Long#SIZE}-bit vector, reversing its bits first if this strategy is lexicographical.
		 *
		 * @param v a long.
		 * @return a bit vector representing {@code v}.
		 */
		@Override
		public BitVector toBitVector(final Long v) {
			return new FixedLongBitVector(lexicographical ? Long.reverse(v.longValue()) : v.longValue());
		}

		@Override
		public long length(final Long v) {
			return Long.SIZE;
		}

		@Override
		public long numBits() {
			return 0;
		}

		@Override
		public TransformationStrategy copy() {
			return this;
		}

		/** Replaces a deserialized instance with the shared instance having the same
		 * {@link #lexicographical} flag, so that a serialized raw strategy is not turned
		 * into the lexicographical one. */
		private Object readResolve() {
			return lexicographical ? FIXED_LONG : RAW_FIXED_LONG;
		}
	}
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy