summaryrefslogtreecommitdiff
path: root/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/gdbscan/GeneralizedDBSCAN.java
blob: 86753fcdabf5cb68b14022514f68f475f03bc88c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
package de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan;

/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2015
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

import gnu.trove.list.array.TIntArrayList;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.CoreObjectsModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;

/**
 * Generalized DBSCAN, density-based clustering with noise.
 *
 * Reference:
 * <p>
 * Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu<br />
 * Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its
 * Applications<br />
 * In: Data Mining and Knowledge Discovery, 1998.
 * </p>
 *
 * @author Erich Schubert
 * @author Arthur Zimek
 * @since 0.5.0
 *
 * @apiviz.landmark
 *
 * @apiviz.has Instance
 * @apiviz.composedOf CorePredicate
 * @apiviz.composedOf NeighborPredicate
 */
@Reference(authors = "Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu", //
title = "Density-Based Clustering in Spatial Databases: The Algorithm GDBSCAN and Its Applications", //
booktitle = "Data Mining and Knowledge Discovery", //
url = "http://dx.doi.org/10.1023/A:1009745219419")
public class GeneralizedDBSCAN extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
  /**
   * Get a logger for this algorithm
   */
  private static final Logging LOG = Logging.getLogger(GeneralizedDBSCAN.class);

  /**
   * The neighborhood predicate factory.
   */
  protected NeighborPredicate npred;

  /**
   * The core predicate factory.
   */
  protected CorePredicate corepred;

  /**
   * Track which objects are "core" objects.
   */
  protected boolean coremodel = false;

  /**
   * Constructor for parameterized algorithm.
   *
   * @param npred Neighbor predicate.
   * @param corepred Core point predicate.
   * @param coremodel Keep track of core points.
   */
  public GeneralizedDBSCAN(NeighborPredicate npred, CorePredicate corepred, boolean coremodel) {
    super();
    this.npred = npred;
    this.corepred = corepred;
    this.coremodel = coremodel;
  }

  @Override
  public Clustering<Model> run(Database database) {
    // Pick the first output type of the neighbor predicate that the core
    // predicate can consume, and run on that representation.
    for(SimpleTypeInformation<?> t : npred.getOutputType()) {
      if(corepred.acceptsType(t)) {
        return new Instance<>(npred.instantiate(database, t), corepred.instantiate(database, t), coremodel).run();
      }
    }
    throw new AbortException("No compatible types found.");
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(npred.getInputTypeRestriction());
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Instance for a particular data set.
   *
   * Cluster membership is tracked via integer labels: {@link #UNPROCESSED},
   * {@link #NOISE}, or a cluster id &gt;= 2. A positive cluster id marks a
   * core point; the negated id marks a non-core (border) member of the same
   * cluster.
   *
   * @author Erich Schubert
   *
   * @apiviz.composedOf CorePredicate.Instance
   * @apiviz.composedOf NeighborPredicate.Instance
   */
  public static class Instance<T> {
    /**
     * Unprocessed IDs
     */
    protected static final int UNPROCESSED = 0;

    /**
     * Noise IDs
     */
    protected static final int NOISE = 1;

    /**
     * The neighborhood predicate
     */
    protected final NeighborPredicate.Instance<T> npred;

    /**
     * The core object property
     */
    protected final CorePredicate.Instance<? super T> corepred;

    /**
     * Track which objects are "core" objects.
     */
    protected boolean coremodel = false;

    /**
     * Full Constructor
     *
     * @param npred Neighborhood predicate
     * @param corepred Core object predicate
     * @param coremodel Keep track of core points.
     */
    public Instance(NeighborPredicate.Instance<T> npred, CorePredicate.Instance<? super T> corepred, boolean coremodel) {
      super();
      this.npred = npred;
      this.corepred = corepred;
      this.coremodel = coremodel;
    }

    /**
     * Run the actual GDBSCAN algorithm.
     *
     * @return Clustering result
     */
    public Clustering<Model> run() {
      final DBIDs ids = npred.getIDs();
      // Setup progress logging
      final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Generalized DBSCAN Clustering", ids.size(), LOG) : null;
      final IndefiniteProgress clusprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters found", LOG) : null;
      // (Temporary) store the cluster ID assigned.
      final WritableIntegerDataStore clusterids = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, UNPROCESSED);
      // Note: these are not exact, as objects may be stolen from noise.
      final TIntArrayList clustersizes = new TIntArrayList();
      clustersizes.add(0); // Unprocessed dummy value.
      clustersizes.add(0); // Noise counter.
      // Reused as the seed queue of each cluster expansion.
      final ArrayModifiableDBIDs activeSet = DBIDUtil.newArray();

      // Cluster ids start after the UNPROCESSED and NOISE sentinels.
      int clusterid = NOISE + 1;
      // Iterate over all objects in the database.
      for(DBIDIter id = ids.iter(); id.valid(); id.advance()) {
        // Skip already processed ids.
        if(clusterids.intValue(id) != UNPROCESSED) {
          continue;
        }
        // Evaluate Neighborhood predicate
        final T neighbors = npred.getNeighbors(id);
        // Evaluate Core-Point predicate:
        if(corepred.isCorePoint(id, neighbors)) {
          LOG.incrementProcessed(clusprogress);
          clustersizes.add(expandCluster(id, clusterid, clusterids, neighbors, activeSet, progress));
          // start next cluster on next iteration.
          ++clusterid;
        }
        else {
          // otherwise, it's a noise point (may later be stolen by a cluster)
          clusterids.putInt(id, NOISE);
          clustersizes.set(NOISE, clustersizes.get(NOISE) + 1);
        }
        // We've completed this element
        LOG.incrementProcessed(progress);
      }
      // Finish progress logging.
      LOG.ensureCompleted(progress);
      LOG.setCompleted(clusprogress);

      // Transform cluster ID mapping into a clustering result:
      ArrayModifiableDBIDs[] clusterlists = new ArrayModifiableDBIDs[clusterid];
      ArrayModifiableDBIDs[] corelists = coremodel ? new ArrayModifiableDBIDs[clusterid] : null;
      // add storage containers for clusters
      for(int i = 0; i < clustersizes.size(); i++) {
        clusterlists[i] = DBIDUtil.newArray(clustersizes.get(i));
        if(corelists != null) {
          corelists[i] = DBIDUtil.newArray(clustersizes.get(i));
        }
      }
      // do the actual inversion
      for(DBIDIter id = ids.iter(); id.valid(); id.advance()) {
        // Negative values are non-core (border) points of cluster -cid:
        final int cid = clusterids.intValue(id);
        final int cluster = cid < 0 ? -cid : cid;
        clusterlists[cluster].add(id);
        // Positive ids beyond NOISE mark core points.
        if(corelists != null && cid > NOISE) {
          corelists[cluster].add(id);
        }
      }
      clusterids.destroy();

      Clustering<Model> result = new Clustering<>("GDBSCAN", "gdbscan-clustering");
      for(int cid = NOISE; cid < clusterlists.length; cid++) {
        boolean isNoise = (cid == NOISE);
        Model m = coremodel ? new CoreObjectsModel(corelists[cid]) : ClusterModel.CLUSTER;
        result.addToplevelCluster(new Cluster<Model>(clusterlists[cid], isNoise, m));
      }
      return result;
    }

    /**
     * Set-based expand cluster implementation.
     *
     * @param seed Core point to start the cluster from.
     * @param clusterid ID of the current cluster.
     * @param clusterids Current object to cluster mapping.
     * @param neighbors Neighbors acquired by initial getNeighbors call.
     * @param activeSet Set to manage active candidates.
     * @param progress Progress logging
     * @return cluster size
     */
    protected int expandCluster(final DBIDRef seed, final int clusterid, final WritableIntegerDataStore clusterids, final T neighbors, ArrayModifiableDBIDs activeSet, final FiniteProgress progress) {
      assert activeSet.isEmpty();
      // The seed itself counts as 1; its neighbors are counted by the helper.
      int clustersize = 1 + processCorePoint(seed, neighbors, clusterid, clusterids, activeSet);
      // run expandCluster as long as there is another seed
      final DBIDVar id = DBIDUtil.newVar();
      while(!activeSet.isEmpty()) {
        activeSet.pop(id);
        // Evaluate Neighborhood predicate
        final T newneighbors = npred.getNeighbors(id);
        // Evaluate Core-Point predicate
        if(corepred.isCorePoint(id, newneighbors)) {
          clustersize += processCorePoint(id, newneighbors, clusterid, clusterids, activeSet);
        }
        LOG.incrementProcessed(progress);
      }
      return clustersize;
    }

    /**
     * Process a single core point.
     *
     * Marks the seed as a core member (positive cluster id), and its
     * unprocessed or noise neighbors as border members (negative cluster id);
     * unprocessed neighbors are queued for later core-point evaluation.
     *
     * @param seed Point to process
     * @param newneighbors New neighbors
     * @param clusterid Cluster to add to
     * @param clusterids Cluster assignment storage.
     * @param activeSet Active set of cluster seeds
     * @return Number of new points added to cluster
     */
    protected int processCorePoint(final DBIDRef seed, T newneighbors, final int clusterid, final WritableIntegerDataStore clusterids, ArrayModifiableDBIDs activeSet) {
      clusterids.putInt(seed, clusterid); // Core point now
      int clustersize = 0;
      // The recursion is unrolled into iteration over the active set.
      for(DBIDIter it = npred.iterDBIDs(newneighbors); it.valid(); it.advance()) {
        final int oldassign = clusterids.intValue(it);
        if(oldassign == UNPROCESSED) {
          activeSet.add(it);
        }
        else if(oldassign != NOISE) {
          continue; // Member of some cluster.
        }
        // Newly reached (or stolen from noise): border member for now.
        clustersize++;
        clusterids.putInt(it, -clusterid);
      }
      return clustersize;
    }
  }

  /**
   * Parameterization class
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    /**
     * Parameter for neighborhood predicate.
     */
    public static final OptionID NEIGHBORHOODPRED_ID = new OptionID("gdbscan.neighborhood", //
    "Neighborhood predicate for Generalized DBSCAN");

    /**
     * Parameter for core predicate.
     */
    public static final OptionID COREPRED_ID = new OptionID("gdbscan.core", //
    "Core point predicate for Generalized DBSCAN");

    /**
     * Flag to keep track of core points.
     */
    public static final OptionID COREMODEL_ID = new OptionID("gdbscan.core-model", //
    "Use a model that keeps track of core points. Needs more memory.");

    /**
     * Neighborhood predicate.
     */
    protected NeighborPredicate npred = null;

    /**
     * Core point predicate.
     */
    protected CorePredicate corepred = null;

    /**
     * Track which objects are "core" objects.
     */
    protected boolean coremodel = false;

    @Override
    protected void makeOptions(Parameterization config) {
      // Neighborhood predicate
      ObjectParameter<NeighborPredicate> npredOpt = new ObjectParameter<>(NEIGHBORHOODPRED_ID, NeighborPredicate.class, EpsilonNeighborPredicate.class);
      if(config.grab(npredOpt)) {
        npred = npredOpt.instantiateClass(config);
      }

      // Core point predicate
      ObjectParameter<CorePredicate> corepredOpt = new ObjectParameter<>(COREPRED_ID, CorePredicate.class, MinPtsCorePredicate.class);
      if(config.grab(corepredOpt)) {
        corepred = corepredOpt.instantiateClass(config);
      }

      Flag coremodelOpt = new Flag(COREMODEL_ID);
      if(config.grab(coremodelOpt)) {
        coremodel = coremodelOpt.isTrue();
      }
    }

    @Override
    protected GeneralizedDBSCAN makeInstance() {
      return new GeneralizedDBSCAN(npred, corepred, coremodel);
    }
  }
}