diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/data/BitVector.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/data/BitVector.java | 309 |
1 files changed, 226 insertions, 83 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/data/BitVector.java b/src/de/lmu/ifi/dbs/elki/data/BitVector.java index de0edc2d..0ec4a081 100644 --- a/src/de/lmu/ifi/dbs/elki/data/BitVector.java +++ b/src/de/lmu/ifi/dbs/elki/data/BitVector.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.data; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -23,25 +23,28 @@ package de.lmu.ifi.dbs.elki.data; along with this program. If not, see <http://www.gnu.org/licenses/>. */ +import gnu.trove.iterator.TIntDoubleIterator; +import gnu.trove.map.TIntDoubleMap; + import java.io.IOException; import java.nio.ByteBuffer; -import java.util.BitSet; import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; -import de.lmu.ifi.dbs.elki.persistent.ByteArrayUtil; -import de.lmu.ifi.dbs.elki.persistent.ByteBufferSerializer; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.io.ByteArrayUtil; +import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; /** - * Provides a BitVector wrapping a BitSet. + * Vector using a dense bit set encoding, based on {@code long[]} storage. * * @author Arthur Zimek * * @apiviz.composedOf Bit */ -public class BitVector extends AbstractNumberVector<Bit> { +public class BitVector extends AbstractNumberVector implements SparseNumberVector { /** * Static instance. */ @@ -55,71 +58,109 @@ public class BitVector extends AbstractNumberVector<Bit> { /** * Storing the bits. */ - private final BitSet bits; + private final long[] bits; /** * Dimensionality of this bit vector. */ - private final int dimensionality; + private int dimensionality; /** - * Provides a new BitVector corresponding to the specified bits and of the + * Create a new BitVector corresponding to the specified bits and of the * specified dimensionality. * - * @param bits the bits to be set in this BitVector + * @param bits the bits to be set in this BitVector. * @param dimensionality the dimensionality of this BitVector - * @throws IllegalArgumentException if the specified dimensionality is to - * small to match the given BitSet */ - public BitVector(BitSet bits, int dimensionality) throws IllegalArgumentException { - if (dimensionality < bits.length()) { - throw new IllegalArgumentException("Specified dimensionality " + dimensionality + " is to low for specified BitSet of length " + bits.length()); - } + public BitVector(long[] bits, int dimensionality) { this.bits = bits; this.dimensionality = dimensionality; } - /** - * Provides a new BitVector corresponding to the bits in the given array. - * - * @param bits an array of bits specifying the bits in this bit vector - */ - public BitVector(Bit[] bits) { - this.bits = new BitSet(bits.length); - for (int i = 0; i < bits.length; i++) { - this.bits.set(i, bits[i].bitValue()); - } - this.dimensionality = bits.length; - } - @Override public int getDimensionality() { return dimensionality; } @Override + public void setDimensionality(int dimensionality) { + this.dimensionality = dimensionality; + } + + /** + * Get the value of a single bit. + * + * @param dimension Bit number to get + * @return {@code true} when set + */ + public boolean booleanValue(int dimension) { + return BitsUtil.get(bits, dimension); + } + + @Override @Deprecated public Bit getValue(int dimension) { - if (dimension < 1 || dimension > dimensionality) { - throw new IllegalArgumentException("illegal dimension: " + dimension); - } - return new Bit(bits.get(dimension - 1)); + return new Bit(booleanValue(dimension)); } @Override public double doubleValue(int dimension) { - if (dimension < 0 || dimension >= dimensionality) { - throw new IllegalArgumentException("illegal dimension: " + dimension); - } - return bits.get(dimension) ? 1.0 : 0.0; + return BitsUtil.get(bits, dimension) ? 1. : 0.; } @Override public long longValue(int dimension) { - if (dimension < 0 || dimension >= dimensionality) { - throw new IllegalArgumentException("illegal dimension: " + dimension); - } - return bits.get(dimension) ? 1 : 0; + return BitsUtil.get(bits, dimension) ? 1L : 0L; + } + + @Override + public int iter() { + return BitsUtil.nextSetBit(bits, 0); + } + + @Override + public int iterAdvance(int iter) { + return BitsUtil.nextSetBit(bits, iter + 1); + } + + @Override + public boolean iterValid(int iter) { + return iter >= 0; + } + + @Override + public int iterDim(int iter) { + return iter; // Identity + } + + @Override + public double iterDoubleValue(int iter) { + return 1.; // When properly used: always true! + } + + @Override + public float iterFloatValue(int iter) { + return 1.f; // When properly used: always true! + } + + @Override + public int iterIntValue(int iter) { + return 1; // When properly used: always true! + } + + @Override + public short iterShortValue(int iter) { + return 1; // When properly used: always true! + } + + @Override + public long iterLongValue(int iter) { + return 1L; // When properly used: always true! + } + + @Override + public byte iterByteValue(int iter) { + return 1; // When properly used: always true! } /** @@ -136,8 +177,8 @@ public class BitVector extends AbstractNumberVector<Bit> { @Override public Vector getColumnVector() { double[] values = new double[dimensionality]; - for (int i = 0; i < dimensionality; i++) { - values[i] = bits.get(i) ? 1 : 0; + for(int i = 0; i < dimensionality; i++) { + values[i] = BitsUtil.get(bits, i) ? 1 : 0; } return new Vector(values); } @@ -150,13 +191,17 @@ public class BitVector extends AbstractNumberVector<Bit> { * @return true if this BitVector contains all bits that are set to true in * the specified BitSet, false otherwise */ - public boolean contains(BitSet bitset) { - boolean contains = true; - for (int i = bitset.nextSetBit(0); i >= 0 && contains; i = bitset.nextSetBit(i + 1)) { - // noinspection ConstantConditions - contains &= bits.get(i); + public boolean contains(long[] bitset) { + for(int i = 0; i < bitset.length; i++) { + final long b = bitset[i]; + if(i >= bits.length && b != 0L) { + return false; + } + if((b & bits[i]) != b) { + return false; + } } - return contains; + return true; } /** @@ -164,8 +209,94 @@ public class BitVector extends AbstractNumberVector<Bit> { * * @return a copy of the bits currently set in this BitVector */ - public BitSet getBits() { - return (BitSet) bits.clone(); + public long[] cloneBits() { + return bits.clone(); + } + + /** + * Compute the vector cardinality (uncached!) + * + * @return Vector cardinality + */ + public int cardinality() { + return BitsUtil.cardinality(bits); + } + + /** + * Compute the Jaccard similarity of two bit vectors. + * + * @param v2 Second bit vector + * @return Jaccard similarity (intersection / union) + */ + public double jaccardSimilarity(BitVector v2) { + return BitsUtil.intersectionSize(bits, v2.bits) / (double) BitsUtil.unionSize(bits, v2.bits); + } + + /** + * Compute the Hamming distance of two bit vectors. + * + * @param v2 Second bit vector + * @return Hamming distance (number of bits difference) + */ + public int hammingDistance(BitVector v2) { + return BitsUtil.hammingDistance(bits, v2.bits); + } + + /** + * Compute the vector intersection size. + * + * @param v2 Second bit vector + * @return Intersection size (number of bits in both) + */ + public int intersectionSize(BitVector v2) { + return BitsUtil.intersectionSize(bits, v2.bits); + } + + /** + * Compute the vector union size. + * + * @param v2 Second bit vector + * @return Intersection size (number of bits in both) + */ + public int unionSize(BitVector v2) { + return BitsUtil.unionSize(bits, v2.bits); + } + + /** + * Compute whether two vectors intersect. + * + * @param v2 Second bit vector + * @return {@code true} if they intersect in at least one bit. + */ + public boolean intersect(BitVector v2) { + return BitsUtil.intersect(bits, v2.bits); + } + + /** + * Combine onto v using the AND operation, i.e. {@code v &= this}. + * + * @param v Existing bit set of same length. + */ + public void andOnto(long[] v) { + BitsUtil.andI(v, bits); + } + + /** + * Combine onto v using the OR operation, i.e. {@code v |= this}. + * + * @param v Existing bit set of same length. + */ + public void orOnto(long[] v) { + BitsUtil.orI(v, bits); + } + + /** + * Combine onto v using the XOR operation, i.e. {@code v ^= this}. + * + * @param v Existing bit set of same length. + */ + public void xorOnto(long[] v) { + BitsUtil.xorI(v, bits); } /** @@ -178,16 +309,12 @@ public class BitVector extends AbstractNumberVector<Bit> { */ @Override public String toString() { - Bit[] bitArray = new Bit[dimensionality]; - for (int i = 0; i < dimensionality; i++) { - bitArray[i] = bits.get(i) ? Bit.TRUE : Bit.FALSE; - } StringBuilder representation = new StringBuilder(); - for (Bit bit : bitArray) { - if (representation.length() > 0) { + for(int i = 0; i < dimensionality; i++) { + if(i > 0) { representation.append(ATTRIBUTE_SEPARATOR); } - representation.append(bit.toString()); + representation.append(BitsUtil.get(bits, i) ? '1' : '0'); } return representation.toString(); } @@ -201,11 +328,12 @@ public class BitVector extends AbstractNumberVector<Bit> { */ @Override public boolean equals(Object obj) { - if (obj instanceof BitVector) { + if(obj instanceof BitVector) { BitVector bv = (BitVector) obj; return this.getDimensionality() == bv.getDimensionality() && this.bits.equals(bv.bits); - } else { + } + else { return false; } } @@ -217,14 +345,15 @@ public class BitVector extends AbstractNumberVector<Bit> { * * @apiviz.has BitVector */ - public static class Factory extends AbstractNumberVector.Factory<BitVector, Bit> { + public static class Factory extends AbstractNumberVector.Factory<BitVector> implements SparseNumberVector.Factory<BitVector> { @Override - public <A> BitVector newFeatureVector(A array, ArrayAdapter<Bit, A> adapter) { + public <A> BitVector newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter) { int dim = adapter.size(array); - BitSet bits = new BitSet(dim); - for (int i = 0; i < dim; i++) { - bits.set(i, adapter.get(array, i).bitValue()); - i++; + long[] bits = BitsUtil.zero(dim); + for(int i = 0; i < dim; i++) { + if(adapter.get(array, i).doubleValue() >= 0.5) { + BitsUtil.setI(bits, i); + } } return new BitVector(bits, dim); } @@ -232,20 +361,33 @@ public class BitVector extends AbstractNumberVector<Bit> { @Override public <A> BitVector newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter) { int dim = adapter.size(array); - BitSet bits = new BitSet(dim); - for (int i = 0; i < dim; i++) { - if (adapter.getDouble(array, i) >= 0.5) { - bits.set(i); + long[] bits = BitsUtil.zero(dim); + for(int i = 0; i < dim; i++) { + if(adapter.getDouble(array, i) >= 0.5) { + BitsUtil.setI(bits, i); } } return new BitVector(bits, dim); } @Override + public BitVector newNumberVector(TIntDoubleMap values, int maxdim) { + long[] bits = BitsUtil.zero(maxdim); + // Import and sort the indexes + for(TIntDoubleIterator iter = values.iterator(); iter.hasNext();) { + iter.advance(); + if(iter.value() != 0.) { + BitsUtil.setI(bits, iter.key()); + } + } + return new BitVector(bits, maxdim); + } + + @Override public ByteBufferSerializer<BitVector> getDefaultSerializer() { return SHORT_SERIALIZER; } - + @Override public Class<? super BitVector> getRestrictionClass() { return BitVector.class; @@ -280,23 +422,23 @@ public class BitVector extends AbstractNumberVector<Bit> { public BitVector fromByteBuffer(ByteBuffer buffer) throws IOException { short dimensionality = buffer.getShort(); final int len = ByteArrayUtil.SIZE_SHORT + (dimensionality + 7) / 8; - if (buffer.remaining() < len) { + if(buffer.remaining() < len) { throw new IOException("Not enough data for a bit vector!"); } // read values - BitSet values = new BitSet(dimensionality); + long[] bits = BitsUtil.zero(dimensionality); byte b = 0; - for (int i = 0; i < dimensionality; i++) { + for(int i = 0; i < dimensionality; i++) { // read the next byte when needed. - if ((i & 7) == 0) { + if((i & 7) == 0) { b = buffer.get(); } final byte bit = (byte) (1 << (i & 7)); - if ((b & bit) != 0) { - values.set(i + 1); + if((b & bit) != 0) { + BitsUtil.setI(bits, i); } } - return new BitVector(values, dimensionality); + return new BitVector(bits, dimensionality); } @Override @@ -304,7 +446,7 @@ public class BitVector extends AbstractNumberVector<Bit> { final int len = getByteSize(vec); assert (vec.getDimensionality() <= Short.MAX_VALUE); final short dim = (short) vec.getDimensionality(); - if (buffer.remaining() < len) { + if(buffer.remaining() < len) { throw new IOException("Not enough space for the bit vector!"); } // write size @@ -312,15 +454,16 @@ public class BitVector extends AbstractNumberVector<Bit> { // write values // Next byte to write: byte b = 0; - for (int i = 0; i < dim; i++) { + for(int i = 0; i < dim; i++) { final byte mask = (byte) (1 << (i & 7)); - if (vec.bits.get(i)) { + if(BitsUtil.get(vec.bits, i)) { b |= mask; - } else { + } + else { b &= ~mask; } // Write when appropriate - if ((i & 7) == 7 || i == dim - 1) { + if((i & 7) == 7 || i == dim - 1) { buffer.put(b); b = 0; } |