diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java | 302 |
1 files changed, 171 insertions, 131 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java index d6ce6a15..ebf588b6 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/statistics/DistanceStatisticsWithClasses.java @@ -42,6 +42,7 @@ import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; @@ -52,10 +53,11 @@ import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.StepProgress; import de.lmu.ifi.dbs.elki.math.DoubleMinMax; import de.lmu.ifi.dbs.elki.math.MeanVariance; -import de.lmu.ifi.dbs.elki.math.histograms.AggregatingHistogram; -import de.lmu.ifi.dbs.elki.math.histograms.FlexiHistogram; import de.lmu.ifi.dbs.elki.result.CollectionResult; import de.lmu.ifi.dbs.elki.result.HistogramResult; +import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjDynamicHistogram; +import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.LongArrayStaticHistogram; +import de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.ExceptionMessages; @@ -66,14 +68,13 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Parameter; -import de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair; -import de.lmu.ifi.dbs.elki.utilities.pairs.Pair; /** * Algorithm to gather statistics over the distance distribution in the data * set. * * @author Erich Schubert + * * @param <O> Object type * @param <D> Distance type */ @@ -84,22 +85,22 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex /** * The logger for this class. */ - private static final Logging logger = Logging.getLogger(DistanceStatisticsWithClasses.class); + private static final Logging LOG = Logging.getLogger(DistanceStatisticsWithClasses.class); /** * Flag to compute exact value range for binning. */ - public static final OptionID EXACT_ID = OptionID.getOrCreateOptionID("diststat.exact", "In a first pass, compute the exact minimum and maximum, at the cost of O(2*n*n) instead of O(n*n). The number of resulting bins is guaranteed to be as requested."); + public static final OptionID EXACT_ID = new OptionID("diststat.exact", "In a first pass, compute the exact minimum and maximum, at the cost of O(2*n*n) instead of O(n*n). The number of resulting bins is guaranteed to be as requested."); /** - * Flag to enable sampling + * Flag to enable sampling. */ - public static final OptionID SAMPLING_ID = OptionID.getOrCreateOptionID("diststat.sampling", "Enable sampling of O(n) size to determine the minimum and maximum distances approximately. The resulting number of bins can be larger than the given n."); + public static final OptionID SAMPLING_ID = new OptionID("diststat.sampling", "Enable sampling of O(n) size to determine the minimum and maximum distances approximately. The resulting number of bins can be larger than the given n."); /** * Option to configure the number of bins to use. */ - public static final OptionID HISTOGRAM_BINS_ID = OptionID.getOrCreateOptionID("diststat.bins", "Number of bins to use in the histogram. By default, it is only guaranteed to be within 1*n and 2*n of the given number."); + public static final OptionID HISTOGRAM_BINS_ID = new OptionID("diststat.bins", "Number of bins to use in the histogram. By default, it is only guaranteed to be within 1*n and 2*n of the given number."); /** * Number of bins to use in sampling. @@ -107,12 +108,12 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex private int numbin; /** - * Sampling + * Sampling flag. */ private boolean sampling = false; /** - * Sampling + * Compute exactly (slower). */ private boolean exact = false; @@ -136,7 +137,7 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex final Relation<O> relation = database.getRelation(getInputTypeRestriction()[0]); final DistanceQuery<O, D> distFunc = database.getDistanceQuery(relation, getDistanceFunction()); - final StepProgress stepprog = logger.isVerbose() ? new StepProgress("Distance statistics", 2) : null; + final StepProgress stepprog = LOG.isVerbose() ? new StepProgress("Distance statistics", 2) : null; // determine binning ranges. DoubleMinMax gminmax = new DoubleMinMax(); @@ -157,43 +158,71 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex MeanVariance momax = new MeanVariance(); MeanVariance modif = new MeanVariance(); // Histogram - final AggregatingHistogram<Pair<Long, Long>, Pair<Long, Long>> histogram; - if(stepprog != null) { - stepprog.beginStep(1, "Prepare histogram.", logger); + final ObjHistogram<long[]> histogram; + if (stepprog != null) { + stepprog.beginStep(1, "Prepare histogram.", LOG); } - if(exact) { + if (exact) { gminmax = exactMinMax(relation, distFunc); - histogram = AggregatingHistogram.LongSumLongSumHistogram(numbin, gminmax.getMin(), gminmax.getMax()); - } - else if(sampling) { + histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2); + } else if (sampling) { gminmax = sampleMinMax(relation, distFunc); - histogram = AggregatingHistogram.LongSumLongSumHistogram(numbin, gminmax.getMin(), gminmax.getMax()); - } - else { - histogram = FlexiHistogram.LongSumLongSumHistogram(numbin); + histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2); + } else { + histogram = new AbstractObjDynamicHistogram<long[]>(numbin) { + @Override + protected long[] downsample(Object[] data, int start, int end, int size) { + long[] ret = new long[2]; + for (int i = start; i < end; i++) { + long[] existing = (long[]) data[i]; + if (existing != null) { + for (int c = 0; c < 2; c++) { + ret[c] += existing[c]; + } + } + } + return ret; + } + + @Override + protected long[] aggregate(long[] first, long[] second) { + for (int c = 0; c < 2; c++) { + first[c] += second[c]; + } + return first; + } + + @Override + protected long[] cloneForCache(long[] data) { + return data.clone(); + } + + @Override + protected long[] makeObject() { + return new long[2]; + } + }; } - if(stepprog != null) { - stepprog.beginStep(2, "Build histogram.", logger); + if (stepprog != null) { + stepprog.beginStep(2, "Build histogram.", LOG); } - final FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), logger) : null; + final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), LOG) : null; // iterate per cluster - final Pair<Long, Long> incFirst = new Pair<Long, Long>(1L, 0L); - final Pair<Long, Long> incSecond = new Pair<Long, Long>(0L, 1L); - for(Cluster<?> c1 : split) { - for(DBIDIter iter = c1.getIDs().iter(); iter.valid(); iter.advance()) { - DBID id1 = iter.getDBID(); + final long[] incFirst = new long[] { 1L, 0L }; + final long[] incSecond = new long[] { 0L, 1L }; + for (Cluster<?> c1 : split) { + for (DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) { // in-cluster distances DoubleMinMax iminmax = new DoubleMinMax(); - for(DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) { - DBID id2 = iter2.getDBID(); + for (DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) { // skip the point itself. - if(id1.sameDBID(id2)) { + if (DBIDUtil.equal(id1, iter2)) { continue; } - double d = distFunc.distance(id1, id2).doubleValue(); + double d = distFunc.distance(id1, iter2).doubleValue(); - histogram.aggregate(d, incFirst); + histogram.putData(d, incFirst); iminmax.put(d); } @@ -207,19 +236,18 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex // other-cluster distances DoubleMinMax ominmax = new DoubleMinMax(); - for(Cluster<?> c2 : split) { - if(c2 == c1) { + for (Cluster<?> c2 : split) { + if (c2 == c1) { continue; } - for(DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) { - DBID id2 = iter2.getDBID(); + for (DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) { // skip the point itself (shouldn't happen though) - if(id1.sameDBID(id2)) { + if (DBIDUtil.equal(id1, iter2)) { continue; } - double d = distFunc.distance(id1, id2).doubleValue(); + double d = distFunc.distance(id1, iter2).doubleValue(); - histogram.aggregate(d, incSecond); + histogram.putData(d, incSecond); ominmax.put(d); } @@ -231,38 +259,39 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex // min/max gominmax.put(ominmax.getMin()); gominmax.put(ominmax.getMax()); - if(progress != null) { - progress.incrementProcessed(logger); + if (progress != null) { + progress.incrementProcessed(LOG); } } } - if(progress != null) { - progress.ensureCompleted(logger); + if (progress != null) { + progress.ensureCompleted(LOG); } // Update values (only needed for sampling case). gminmax.setFirst(Math.min(giminmax.getMin(), gominmax.getMin())); gminmax.setSecond(Math.max(giminmax.getMax(), gominmax.getMax())); - if(stepprog != null) { - stepprog.setCompleted(logger); + if (stepprog != null) { + stepprog.setCompleted(LOG); } // count the number of samples we have in the data long inum = 0; long onum = 0; - for(DoubleObjPair<Pair<Long, Long>> ppair : histogram) { - inum += ppair.getSecond().getFirst(); - onum += ppair.getSecond().getSecond(); + for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) { + inum += iter.getValue()[0]; + onum += iter.getValue()[1]; } long bnum = inum + onum; Collection<DoubleVector> binstat = new ArrayList<DoubleVector>(numbin); - for(DoubleObjPair<Pair<Long, Long>> ppair : histogram) { - final double icof = (inum == 0) ? 0 : ((double) ppair.getSecond().getFirst()) / inum / histogram.getBinsize(); - final double icaf = ((double) ppair.getSecond().getFirst()) / bnum / histogram.getBinsize(); - final double ocof = (onum == 0) ? 0 : ((double) ppair.getSecond().getSecond()) / onum / histogram.getBinsize(); - final double ocaf = ((double) ppair.getSecond().getSecond()) / bnum / histogram.getBinsize(); - DoubleVector row = new DoubleVector(new double[] { ppair.first, icof, icaf, ocof, ocaf }); + for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) { + final long[] value = iter.getValue(); + final double icof = (inum == 0) ? 0 : ((double) value[0]) / inum / histogram.getBinsize(); + final double icaf = ((double) value[0]) / bnum / histogram.getBinsize(); + final double ocof = (onum == 0) ? 0 : ((double) value[1]) / onum / histogram.getBinsize(); + final double ocaf = ((double) value[1]) / bnum / histogram.getBinsize(); + DoubleVector row = new DoubleVector(new double[] { iter.getCenter(), icof, icaf, ocof, ocaf }); binstat.add(row); } HistogramResult<DoubleVector> result = new HistogramResult<DoubleVector>("Distance Histogram", "distance-histogram", binstat); @@ -278,111 +307,121 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex return result; } - private DoubleMinMax sampleMinMax(Relation<O> database, DistanceQuery<O, D> distFunc) { - int size = database.size(); + /** + * Estimate minimum and maximum via sampling. + * + * @param relation Relation to process + * @param distFunc Distance function to use + * @return Minimum and maximum + */ + private DoubleMinMax sampleMinMax(Relation<O> relation, DistanceQuery<O, D> distFunc) { + int size = relation.size(); Random rnd = new Random(); // estimate minimum and maximum. - int k = (int) Math.max(25, Math.pow(database.size(), 0.2)); - TreeSet<DoubleObjPair<DBID>> minhotset = new TreeSet<DoubleObjPair<DBID>>(); - TreeSet<DoubleObjPair<DBID>> maxhotset = new TreeSet<DoubleObjPair<DBID>>(Collections.reverseOrder()); + int k = (int) Math.max(25, Math.pow(relation.size(), 0.2)); + TreeSet<DoubleDBIDPair> minhotset = new TreeSet<DoubleDBIDPair>(); + TreeSet<DoubleDBIDPair> maxhotset = new TreeSet<DoubleDBIDPair>(Collections.reverseOrder()); - int randomsize = (int) Math.max(25, Math.pow(database.size(), 0.2)); + int randomsize = (int) Math.max(25, Math.pow(relation.size(), 0.2)); double rprob = ((double) randomsize) / size; ArrayModifiableDBIDs randomset = DBIDUtil.newArray(randomsize); - DBIDIter iter = database.iterDBIDs(); - if(!iter.valid()) { + DBIDIter iter = relation.iterDBIDs(); + if (!iter.valid()) { throw new IllegalStateException(ExceptionMessages.DATABASE_EMPTY); } - DBID firstid = iter.getDBID(); + DBID firstid = DBIDUtil.deref(iter); iter.advance(); - minhotset.add(new DoubleObjPair<DBID>(Double.MAX_VALUE, firstid)); - maxhotset.add(new DoubleObjPair<DBID>(Double.MIN_VALUE, firstid)); - while(iter.valid()) { - DBID id1 = iter.getDBID(); - iter.advance(); + minhotset.add(DBIDUtil.newPair(Double.MAX_VALUE, firstid)); + maxhotset.add(DBIDUtil.newPair(Double.MIN_VALUE, firstid)); + for (; iter.valid(); iter.advance()) { // generate candidates for min distance. - ArrayList<DoubleObjPair<DBID>> np = new ArrayList<DoubleObjPair<DBID>>(k * 2 + randomsize * 2); - for(DoubleObjPair<DBID> pair : minhotset) { - DBID id2 = pair.getSecond(); + ArrayList<DoubleDBIDPair> np = new ArrayList<DoubleDBIDPair>(k * 2 + randomsize * 2); + for (DoubleDBIDPair pair : minhotset) { // skip the object itself - if(id1.compareTo(id2) == 0) { + if (DBIDUtil.equal(iter, pair)) { continue; } - double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new DoubleObjPair<DBID>(d, id1)); - np.add(new DoubleObjPair<DBID>(d, id2)); + double d = distFunc.distance(iter, pair).doubleValue(); + np.add(DBIDUtil.newPair(d, iter)); + np.add(DBIDUtil.newPair(d, pair)); } - for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { - DBID id2 = iter2.getDBID(); - double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new DoubleObjPair<DBID>(d, id1)); - np.add(new DoubleObjPair<DBID>(d, id2)); + for (DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { + double d = distFunc.distance(iter, iter2).doubleValue(); + np.add(DBIDUtil.newPair(d, iter)); + np.add(DBIDUtil.newPair(d, iter2)); } minhotset.addAll(np); shrinkHeap(minhotset, k); // generate candidates for max distance. - ArrayList<DoubleObjPair<DBID>> np2 = new ArrayList<DoubleObjPair<DBID>>(k * 2 + randomsize * 2); - for(DoubleObjPair<DBID> pair : minhotset) { - DBID id2 = pair.getSecond(); + ArrayList<DoubleDBIDPair> np2 = new ArrayList<DoubleDBIDPair>(k * 2 + randomsize * 2); + for (DoubleDBIDPair pair : minhotset) { // skip the object itself - if(id1.compareTo(id2) == 0) { + if (DBIDUtil.equal(iter, pair)) { continue; } - double d = distFunc.distance(id1, id2).doubleValue(); - np2.add(new DoubleObjPair<DBID>(d, id1)); - np2.add(new DoubleObjPair<DBID>(d, id2)); + double d = distFunc.distance(iter, pair).doubleValue(); + np2.add(DBIDUtil.newPair(d, iter)); + np2.add(DBIDUtil.newPair(d, pair)); } - for(DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { - DBID id2 = iter2.getDBID(); - double d = distFunc.distance(id1, id2).doubleValue(); - np.add(new DoubleObjPair<DBID>(d, id1)); - np.add(new DoubleObjPair<DBID>(d, id2)); + for (DBIDIter iter2 = randomset.iter(); iter2.valid(); iter2.advance()) { + double d = distFunc.distance(iter, iter2).doubleValue(); + np.add(DBIDUtil.newPair(d, iter)); + np.add(DBIDUtil.newPair(d, iter2)); } maxhotset.addAll(np2); shrinkHeap(maxhotset, k); // update random set - if(randomset.size() < randomsize) { - randomset.add(id1); - } - else if(rnd.nextDouble() < rprob) { - randomset.set((int) Math.floor(rnd.nextDouble() * randomsize), id1); + if (randomset.size() < randomsize) { + randomset.add(iter); + } else if (rnd.nextDouble() < rprob) { + randomset.set((int) Math.floor(rnd.nextDouble() * randomsize), iter); } } - return new DoubleMinMax(minhotset.first().first, maxhotset.first().first); + return new DoubleMinMax(minhotset.first().doubleValue(), maxhotset.first().doubleValue()); } - private DoubleMinMax exactMinMax(Relation<O> database, DistanceQuery<O, D> distFunc) { + /** + * Compute the exact maximum and minimum. + * + * @param relation Relation to process + * @param distFunc Distance function + * @return Exact maximum and minimum + */ + private DoubleMinMax exactMinMax(Relation<O> relation, DistanceQuery<O, D> distFunc) { DoubleMinMax minmax = new DoubleMinMax(); // find exact minimum and maximum first. - for(DBIDIter iditer = database.iterDBIDs(); iditer.valid(); iditer.advance()) { - DBID id1 = iditer.getDBID(); - for(DBIDIter iditer2 = database.iterDBIDs(); iditer2.valid(); iditer2.advance()) { - DBID id2 = iditer2.getDBID(); + for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { + for (DBIDIter iditer2 = relation.iterDBIDs(); iditer2.valid(); iditer2.advance()) { // skip the point itself. - if(id1.compareTo(id2) == 0) { + if (DBIDUtil.equal(iditer, iditer2)) { continue; } - double d = distFunc.distance(id1, id2).doubleValue(); + double d = distFunc.distance(iditer, iditer2).doubleValue(); minmax.put(d); } } return minmax; } - private void shrinkHeap(TreeSet<DoubleObjPair<DBID>> hotset, int k) { + /** + * Shrink the heap of "hot" (extreme) items. + * + * @param hotset Set of hot items + * @param k target size + */ + private static void shrinkHeap(TreeSet<DoubleDBIDPair> hotset, int k) { // drop duplicates ModifiableDBIDs seenids = DBIDUtil.newHashSet(2 * k); int cnt = 0; - for(Iterator<DoubleObjPair<DBID>> i = hotset.iterator(); i.hasNext();) { - DoubleObjPair<DBID> p = i.next(); - if(cnt > k || seenids.contains(p.getSecond())) { + for (Iterator<DoubleDBIDPair> i = hotset.iterator(); i.hasNext();) { + DoubleDBIDPair p = i.next(); + if (cnt > k || seenids.contains(p)) { i.remove(); - } - else { - seenids.add(p.getSecond()); + } else { + seenids.add(p); cnt++; } } @@ -395,7 +434,7 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex @Override protected Logging getLogger() { - return logger; + return LOG; } /** @@ -412,36 +451,37 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex private int numbin = 20; /** - * Sampling + * Sampling. */ private boolean sampling = false; /** - * Sampling + * Exactness flag. */ private boolean exact = false; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - final IntParameter numbinP = new IntParameter(HISTOGRAM_BINS_ID, new GreaterEqualConstraint(2), 20); - if(config.grab(numbinP)) { + final IntParameter numbinP = new IntParameter(HISTOGRAM_BINS_ID, 20); + numbinP.addConstraint(new GreaterEqualConstraint(2)); + if (config.grab(numbinP)) { numbin = numbinP.getValue(); } - final Flag EXACT_FLAG = new Flag(EXACT_ID); - if(config.grab(EXACT_FLAG)) { - exact = EXACT_FLAG.getValue(); + final Flag exactF = new Flag(EXACT_ID); + if (config.grab(exactF)) { + exact = exactF.getValue(); } - final Flag SAMPLING_FLAG = new Flag(SAMPLING_ID); - if(config.grab(SAMPLING_FLAG)) { - sampling = SAMPLING_FLAG.getValue(); + final Flag samplingF = new Flag(SAMPLING_ID); + if (config.grab(samplingF)) { + sampling = samplingF.getValue(); } - ArrayList<Parameter<?, ?>> exclusive = new ArrayList<Parameter<?, ?>>(); - exclusive.add(EXACT_FLAG); - exclusive.add(SAMPLING_FLAG); + ArrayList<Parameter<?>> exclusive = new ArrayList<Parameter<?>>(); + exclusive.add(exactF); + exclusive.add(samplingF); config.checkConstraint(new OnlyOneIsAllowedToBeSetGlobalConstraint(exclusive)); } @@ -450,4 +490,4 @@ public class DistanceStatisticsWithClasses<O, D extends NumberDistance<D, ?>> ex return new DistanceStatisticsWithClasses<O, D>(distanceFunction, numbin, exact, sampling); } } -}
\ No newline at end of file +} |