summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/correlation/CASH.java
blob: adef67ffd18d1142da821bf45af94e0fb9b8a983 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
package de.lmu.ifi.dbs.elki.algorithm.clustering.correlation;

/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2014
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.ArrayList;
import java.util.Arrays;

import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.DependencyDerivator;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHInterval;
import de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHIntervalSplit;
import de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.HyperBoundingBox;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.CorrelationAnalysisSolution;
import de.lmu.ifi.dbs.elki.data.model.LinearEquationModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.spatial.SpatialUtil;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.filter.normalization.NonNumericFeaturesException;
import de.lmu.ifi.dbs.elki.distance.distancefunction.MatrixWeightedDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Matrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.FirstNEigenPairFilter;
import de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredRunner;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.IntegerPriorityObject;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;

/**
 * The CASH algorithm is a subspace clustering algorithm based on the Hough
 * transform.
 * 
 * Reference:
 * <p>
 * E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek:<br />
 * Robust clustering in arbitrarily oriented subspaces. <br>
 * In Proc. 8th SIAM Int. Conf. on Data Mining (SDM'08), Atlanta, GA, 2008
 * </p>
 * 
 * @author Elke Achtert
 * 
 * @apiviz.has CASHInterval
 * @apiviz.has ParameterizationFunction
 * @apiviz.has LinearEquationModel
 * 
 * @param <V> Vector type
 */
// todo elke hierarchy (later)
@Title("CASH: Robust clustering in arbitrarily oriented subspaces")
@Description("Subspace clustering algorithm based on the Hough transform.")
@Reference(authors = "E. Achtert, C. Böhm, J. David, P. Kröger, A. Zimek", //
title = "Robust clustering in arbitraily oriented subspaces",//
booktitle = "Proc. 8th SIAM Int. Conf. on Data Mining (SDM'08), Atlanta, GA, 2008",//
url = "http://www.siam.org/proceedings/datamining/2008/dm08_69_AchtertBoehmDavidKroegerZimek.pdf")
public class CASH<V extends NumberVector> extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
  /**
   * The logger for this class.
   */
  private static final Logging LOG = Logging.getLogger(CASH.class);

  /**
   * Parameter to specify the threshold for minimum number of points in a
   * cluster, must be an integer greater than 0.
   * <p>
   * Key: {@code -cash.minpts}
   * </p>
   */
  public static final OptionID MINPTS_ID = new OptionID("cash.minpts", "Threshold for minimum number of points in a cluster.");

  /**
   * Parameter to specify the maximum level for splitting the hypercube, must be
   * an integer greater than 0.
   * <p>
   * Key: {@code -cash.maxlevel}
   * </p>
   */
  public static final OptionID MAXLEVEL_ID = new OptionID("cash.maxlevel", "The maximum level for splitting the hypercube.");

  /**
   * Parameter to specify the minimum dimensionality of the subspaces to be
   * found, must be an integer greater than 0.
   * <p>
   * Default value: {@code 1}
   * </p>
   * <p>
   * Key: {@code -cash.mindim}
   * </p>
   */
  public static final OptionID MINDIM_ID = new OptionID("cash.mindim", "The minimum dimensionality of the subspaces to be found.");

  /**
   * Parameter to specify the maximum jitter for distance values, must be a
   * double greater than 0.
   * <p>
   * Key: {@code -cash.jitter}
   * </p>
   */
  public static final OptionID JITTER_ID = new OptionID("cash.jitter", "The maximum jitter for distance values.");

  /**
   * Flag to indicate that an adjustment of the applied heuristic for choosing
   * an interval is performed after an interval is selected.
   * <p>
   * Key: {@code -cash.adjust}
   * </p>
   */
  public static final OptionID ADJUST_ID = new OptionID("cash.adjust", "Flag to indicate that an adjustment of the applied heuristic for choosing an interval " + "is performed after an interval is selected.");

  /**
   * Holds the value of {@link #MINPTS_ID}.
   */
  private int minPts;

  /**
   * Holds the value of {@link #MAXLEVEL_ID}.
   */
  private int maxLevel;

  /**
   * Holds the value of {@link #MINDIM_ID}.
   */
  private int minDim;

  /**
   * Holds the value of {@link #JITTER_ID}.
   */
  private double jitter;

  /**
   * Holds the value of {@link #ADJUST_ID}.
   */
  private boolean adjust;

  /**
   * Holds the dimensionality for noise.
   */
  private int noiseDim;

  /**
   * Holds a set of processed ids.
   */
  private ModifiableDBIDs processedIDs;

  /**
   * The entire relation.
   */
  private Relation<ParameterizationFunction> fulldatabase;

  /**
   * Constructor.
   * 
   * @param minPts MinPts parameter
   * @param maxLevel Maximum level
   * @param minDim Minimum dimensionality
   * @param jitter Jitter
   * @param adjust Adjust
   */
  public CASH(int minPts, int maxLevel, int minDim, double jitter, boolean adjust) {
    super();
    this.minPts = minPts;
    this.maxLevel = maxLevel;
    this.minDim = minDim;
    this.jitter = jitter;
    this.adjust = adjust;
  }

  /**
   * Run CASH on the relation.
   * 
   * @param database Database
   * @param vrel Relation
   * @return Clustering result
   */
  public Clustering<Model> run(Database database, Relation<V> vrel) {
    this.fulldatabase = preprocess(database, vrel);
    if(LOG.isVerbose()) {
      StringBuilder msg = new StringBuilder();
      msg.append("DB size: ").append(fulldatabase.size());
      msg.append("\nmin Dim: ").append(minDim);
      LOG.verbose(msg.toString());
    }

    processedIDs = DBIDUtil.newHashSet(fulldatabase.size());
    noiseDim = dimensionality(fulldatabase);

    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("CASH Clustering", fulldatabase.size(), LOG) : null;
    Clustering<Model> result = doRun(fulldatabase, progress);
    LOG.ensureCompleted(progress);

    if(LOG.isVerbose()) {
      StringBuilder msg = new StringBuilder();
      for(Cluster<Model> c : result.getAllClusters()) {
        if(c.getModel() instanceof LinearEquationModel) {
          LinearEquationModel s = (LinearEquationModel) c.getModel();
          msg.append("\n Cluster: Dim: " + s.getLes().subspacedim() + " size: " + c.size());
        }
        else {
          msg.append("\n Cluster: " + c.getModel().getClass().getName() + " size: " + c.size());
        }
      }
      LOG.verbose(msg.toString());
    }
    return result;
  }

  /**
   * Preprocess the dataset, precomputing the parameterization functions.
   * 
   * @param db Database
   * @param vrel Vector relation
   * @return Preprocessed relation
   */
  private Relation<ParameterizationFunction> preprocess(Database db, Relation<V> vrel) {
    DBIDs ids = vrel.getDBIDs();
    SimpleTypeInformation<ParameterizationFunction> type = new SimpleTypeInformation<>(ParameterizationFunction.class);
    MaterializedRelation<ParameterizationFunction> prep = new MaterializedRelation<>(db, type, ids);

    // Project
    for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      ParameterizationFunction f = new ParameterizationFunction(vrel.get(iter));
      prep.set(iter, f);
    }

    return prep;
  }

  /**
   * Runs the CASH algorithm on the specified database, this method is
   * recursively called until only noise is left.
   * 
   * @param relation the Relation to run the CASH algorithm on
   * @param progress the progress object for verbose messages
   * @return a mapping of subspace dimensionalities to clusters
   */
  private Clustering<Model> doRun(Relation<ParameterizationFunction> relation, FiniteProgress progress) {
    Clustering<Model> res = new Clustering<>("CASH clustering", "cash-clustering");

    final int dim = dimensionality(relation);

    // init heap
    ObjectHeap<IntegerPriorityObject<CASHInterval>> heap = new ComparableMinHeap<>();
    ModifiableDBIDs noiseIDs = DBIDUtil.newHashSet(relation.getDBIDs());
    initHeap(heap, relation, dim, noiseIDs);

    if(LOG.isDebugging()) {
      StringBuilder msg = new StringBuilder();
      msg.append("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
      msg.append("\nXXXX dim ").append(dim);
      msg.append("\nXXXX database.size ").append(relation.size());
      msg.append("\nXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
      LOG.debugFine(msg.toString());
    }
    else if(LOG.isVerbose()) {
      StringBuilder msg = new StringBuilder();
      msg.append("XXXX dim ").append(dim).append(" database.size ").append(relation.size());
      LOG.verbose(msg.toString());
    }

    // get the ''best'' d-dimensional intervals at max level
    while(!heap.isEmpty()) {
      CASHInterval interval = determineNextIntervalAtMaxLevel(heap);
      if(LOG.isDebugging()) {
        LOG.debugFine("next interval in dim " + dim + ": " + interval);
      }
      else if(LOG.isVerbose()) {
        LOG.verbose("next interval in dim " + dim + ": " + interval);
      }

      // only noise left
      if(interval == null) {
        break;
      }

      // do a dim-1 dimensional run
      ModifiableDBIDs clusterIDs = DBIDUtil.newHashSet();
      if(dim > minDim + 1) {
        ModifiableDBIDs ids;
        Matrix basis_dim_minus_1;
        if(adjust) {
          ids = DBIDUtil.newHashSet();
          basis_dim_minus_1 = runDerivator(relation, dim, interval, ids);
        }
        else {
          ids = interval.getIDs();
          basis_dim_minus_1 = determineBasis(SpatialUtil.centroid(interval));
        }

        if(ids.size() != 0) {
          MaterializedRelation<ParameterizationFunction> db = buildDB(dim, basis_dim_minus_1, ids, relation);
          // add result of dim-1 to this result
          Clustering<Model> res_dim_minus_1 = doRun(db, progress);
          for(Cluster<Model> cluster : res_dim_minus_1.getAllClusters()) {
            res.addToplevelCluster(cluster);
            noiseIDs.removeDBIDs(cluster.getIDs());
            clusterIDs.addDBIDs(cluster.getIDs());
            processedIDs.addDBIDs(cluster.getIDs());
          }
        }
      }
      // dim == minDim
      else {
        LinearEquationSystem les = runDerivator(relation, dim - 1, interval.getIDs());
        Cluster<Model> c = new Cluster<Model>(interval.getIDs(), new LinearEquationModel(les));
        res.addToplevelCluster(c);
        noiseIDs.removeDBIDs(interval.getIDs());
        clusterIDs.addDBIDs(interval.getIDs());
        processedIDs.addDBIDs(interval.getIDs());
      }

      // Rebuild heap
      ArrayList<IntegerPriorityObject<CASHInterval>> heapVector = new ArrayList<>(heap.size());
      for(ObjectHeap.UnsortedIter<IntegerPriorityObject<CASHInterval>> iter = heap.unsortedIter(); iter.valid(); iter.advance()) {
        heapVector.add(iter.get());
      }
      heap.clear();
      for(IntegerPriorityObject<CASHInterval> pair : heapVector) {
        CASHInterval currentInterval = pair.getObject();
        currentInterval.removeIDs(clusterIDs);
        if(currentInterval.getIDs().size() >= minPts) {
          heap.add(new IntegerPriorityObject<>(currentInterval.priority(), currentInterval));
        }
      }

      if(progress != null) {
        progress.setProcessed(processedIDs.size(), LOG);
      }
    }

    // put noise to clusters
    if(!noiseIDs.isEmpty()) {
      if(dim == noiseDim) {
        Cluster<Model> c = new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER);
        res.addToplevelCluster(c);
        processedIDs.addDBIDs(noiseIDs);
      }
      else if(noiseIDs.size() >= minPts) {
        LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs);
        Cluster<Model> c = new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les));
        res.addToplevelCluster(c);
        processedIDs.addDBIDs(noiseIDs);
      }
    }

    if(LOG.isDebugging()) {
      StringBuilder msg = new StringBuilder();
      msg.append("noise fuer dim ").append(dim).append(": ").append(noiseIDs.size());

      for(Cluster<Model> c : res.getAllClusters()) {
        if(c.getModel() instanceof LinearEquationModel) {
          LinearEquationModel s = (LinearEquationModel) c.getModel();
          msg.append("\n Cluster: Dim: " + s.getLes().subspacedim() + " size: " + c.size());
        }
        else {
          msg.append("\n Cluster: " + c.getModel().getClass().getName() + " size: " + c.size());
        }
      }
      LOG.debugFine(msg.toString());
    }

    if(progress != null) {
      progress.setProcessed(processedIDs.size(), LOG);
    }
    return res;
  }

  /**
   * Get the dimensionality of a vector field.
   * 
   * @param relation Relation
   * @return Dimensionality
   */
  private static int dimensionality(Relation<ParameterizationFunction> relation) {
    return relation.get(relation.iterDBIDs()).getDimensionality();
  }

  /**
   * Initializes the heap with the root intervals.
   * 
   * @param heap the heap to be initialized
   * @param relation the database storing the parameterization functions
   * @param dim the dimensionality of the database
   * @param ids the ids of the database
   */
  private void initHeap(ObjectHeap<IntegerPriorityObject<CASHInterval>> heap, Relation<ParameterizationFunction> relation, int dim, DBIDs ids) {
    CASHIntervalSplit split = new CASHIntervalSplit(relation, minPts);

    // determine minimum and maximum function value of all functions
    double[] minMax = determineMinMaxDistance(relation, dim);

    double d_min = minMax[0];
    double d_max = minMax[1];
    double dIntervalLength = d_max - d_min;
    int numDIntervals = (int) Math.ceil(dIntervalLength / jitter);
    double dIntervalSize = dIntervalLength / numDIntervals;
    double[] d_mins = new double[numDIntervals];
    double[] d_maxs = new double[numDIntervals];

    if(LOG.isDebugging()) {
      StringBuilder msg = new StringBuilder();
      msg.append("d_min ").append(d_min);
      msg.append("\nd_max ").append(d_max);
      msg.append("\nnumDIntervals ").append(numDIntervals);
      msg.append("\ndIntervalSize ").append(dIntervalSize);
      LOG.debugFine(msg.toString());
    }
    else if(LOG.isVerbose()) {
      StringBuilder msg = new StringBuilder();
      msg.append("d_min ").append(d_min);
      msg.append("\nd_max ").append(d_max);
      msg.append("\nnumDIntervals ").append(numDIntervals);
      msg.append("\ndIntervalSize ").append(dIntervalSize);
      LOG.verbose(msg.toString());
    }

    // alpha intervals
    double[] alphaMin = new double[dim - 1];
    double[] alphaMax = new double[dim - 1];
    Arrays.fill(alphaMax, Math.PI);

    for(int i = 0; i < numDIntervals; i++) {
      if(i == 0) {
        d_mins[i] = d_min;
      }
      else {
        d_mins[i] = d_maxs[i - 1];
      }

      if(i < numDIntervals - 1) {
        d_maxs[i] = d_mins[i] + dIntervalSize;
      }
      else {
        d_maxs[i] = d_max - d_mins[i];
      }

      HyperBoundingBox alphaInterval = new HyperBoundingBox(alphaMin, alphaMax);
      ModifiableDBIDs intervalIDs = split.determineIDs(ids, alphaInterval, d_mins[i], d_maxs[i]);
      if(intervalIDs != null && intervalIDs.size() >= minPts) {
        CASHInterval rootInterval = new CASHInterval(alphaMin, alphaMax, split, intervalIDs, -1, 0, d_mins[i], d_maxs[i]);
        heap.add(new IntegerPriorityObject<>(rootInterval.priority(), rootInterval));
      }
    }

    if(LOG.isDebuggingFiner()) {
      StringBuilder msg = new StringBuilder();
      msg.append("heap.size ").append(heap.size());
      LOG.debugFiner(msg.toString());
    }
  }

  /**
   * Builds a dim-1 dimensional database where the objects are projected into
   * the specified subspace.
   * 
   * @param dim the dimensionality of the database
   * @param basis the basis defining the subspace
   * @param ids the ids for the new database
   * @param relation the database storing the parameterization functions
   * @return a dim-1 dimensional database where the objects are projected into
   *         the specified subspace
   */
  private MaterializedRelation<ParameterizationFunction> buildDB(int dim, Matrix basis, DBIDs ids, Relation<ParameterizationFunction> relation) {
    ProxyDatabase proxy = new ProxyDatabase(ids);
    SimpleTypeInformation<ParameterizationFunction> type = new SimpleTypeInformation<>(ParameterizationFunction.class);
    MaterializedRelation<ParameterizationFunction> prep = new MaterializedRelation<>(proxy, type, ids);
    proxy.addRelation(prep);

    // Project
    for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      ParameterizationFunction f = project(basis, relation.get(iter));
      prep.set(iter, f);
    }

    if(LOG.isDebugging()) {
      LOG.debugFine("db fuer dim " + (dim - 1) + ": " + ids.size());
    }

    return prep;
  }

  /**
   * Projects the specified parameterization function into the subspace
   * described by the given basis.
   * 
   * @param basis the basis defining he subspace
   * @param f the parameterization function to be projected
   * @return the projected parameterization function
   */
  private ParameterizationFunction project(Matrix basis, ParameterizationFunction f) {
    // Matrix m = new Matrix(new
    // double[][]{f.getPointCoordinates()}).times(basis);
    Matrix m = f.getColumnVector().transposeTimes(basis);
    ParameterizationFunction f_t = new ParameterizationFunction(new DoubleVector(m.getColumnPackedCopy()));
    return f_t;
  }

  /**
   * Determines a basis defining a subspace described by the specified alpha
   * values.
   * 
   * @param alpha the alpha values
   * @return a basis defining a subspace described by the specified alpha values
   */
  private Matrix determineBasis(double[] alpha) {
    double[] nn = new double[alpha.length + 1];
    for(int i = 0; i < nn.length; i++) {
      double alpha_i = i == alpha.length ? 0 : alpha[i];
      nn[i] = sinusProduct(0, i, alpha) * StrictMath.cos(alpha_i);
    }
    Matrix n = new Matrix(nn, alpha.length + 1);
    return n.completeToOrthonormalBasis();
  }

  /**
   * Computes the product of all sinus values of the specified angles from start
   * to end index.
   * 
   * @param start the index to start
   * @param end the index to end
   * @param alpha the array of angles
   * @return the product of all sinus values of the specified angles from start
   *         to end index
   */
  private double sinusProduct(int start, int end, double[] alpha) {
    double result = 1;
    for(int j = start; j < end; j++) {
      result *= Math.sin(alpha[j]);
    }
    return result;
  }

  /**
   * Determines the next ''best'' interval at maximum level, i.e. the next
   * interval containing the most unprocessed objects.
   * 
   * @param heap the heap storing the intervals
   * @return the next ''best'' interval at maximum level
   */
  private CASHInterval determineNextIntervalAtMaxLevel(ObjectHeap<IntegerPriorityObject<CASHInterval>> heap) {
    CASHInterval next = doDetermineNextIntervalAtMaxLevel(heap);
    // noise path was chosen
    while(next == null) {
      if(heap.isEmpty()) {
        return null;
      }
      next = doDetermineNextIntervalAtMaxLevel(heap);
    }

    return next;
  }

  /**
   * Recursive helper method to determine the next ''best'' interval at maximum
   * level, i.e. the next interval containing the most unprocessed objects
   * 
   * @param heap the heap storing the intervals
   * @return the next ''best'' interval at maximum level
   */
  private CASHInterval doDetermineNextIntervalAtMaxLevel(ObjectHeap<IntegerPriorityObject<CASHInterval>> heap) {
    CASHInterval interval = heap.poll().getObject();
    int dim = interval.getDimensionality();
    while(true) {
      // max level is reached
      if(interval.getLevel() >= maxLevel && interval.getMaxSplitDimension() == (dim - 1)) {
        return interval;
      }

      if(heap.size() % 10000 == 0 && LOG.isVerbose()) {
        LOG.verbose("heap size " + heap.size());
      }

      if(heap.size() >= 40000) {
        LOG.warning("Heap size > 40.000!!!");
        heap.clear();
        return null;
      }

      if(LOG.isDebuggingFiner()) {
        LOG.debugFiner("split " + interval.toString() + " " + interval.getLevel() + "-" + interval.getMaxSplitDimension());
      }
      interval.split();

      // noise
      if(!interval.hasChildren()) {
        return null;
      }

      CASHInterval bestInterval;
      if(interval.getLeftChild() != null && interval.getRightChild() != null) {
        int comp = interval.getLeftChild().compareTo(interval.getRightChild());
        if(comp < 0) {
          bestInterval = interval.getRightChild();
          heap.add(new IntegerPriorityObject<>(interval.getLeftChild().priority(), interval.getLeftChild()));
        }
        else {
          bestInterval = interval.getLeftChild();
          heap.add(new IntegerPriorityObject<>(interval.getRightChild().priority(), interval.getRightChild()));
        }
      }
      else if(interval.getLeftChild() == null) {
        bestInterval = interval.getRightChild();
      }
      else {
        bestInterval = interval.getLeftChild();
      }

      interval = bestInterval;
    }
  }

  /**
   * Determines the minimum and maximum function value of all parameterization
   * functions stored in the specified database.
   * 
   * @param relation the database containing the parameterization functions.
   * @param dimensionality the dimensionality of the database
   * @return an array containing the minimum and maximum function value of all
   *         parameterization functions stored in the specified database
   */
  private double[] determineMinMaxDistance(Relation<ParameterizationFunction> relation, int dimensionality) {
    double[] min = new double[dimensionality - 1];
    double[] max = new double[dimensionality - 1];
    Arrays.fill(max, Math.PI);
    HyperBoundingBox box = new HyperBoundingBox(min, max);

    double d_min = Double.POSITIVE_INFINITY;
    double d_max = Double.NEGATIVE_INFINITY;
    for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      ParameterizationFunction f = relation.get(iditer);
      HyperBoundingBox minMax = f.determineAlphaMinMax(box);
      double f_min = f.function(SpatialUtil.getMin(minMax));
      double f_max = f.function(SpatialUtil.getMax(minMax));

      d_min = Math.min(d_min, f_min);
      d_max = Math.max(d_max, f_max);
    }
    return new double[] { d_min, d_max };
  }

  /**
   * Runs the derivator on the specified interval and assigns all points having
   * a distance less then the standard deviation of the derivator model to the
   * model to this model.
   * 
   * @param relation the database containing the parameterization functions
   * @param interval the interval to build the model
   * @param dim the dimensionality of the database
   * @param ids an empty set to assign the ids
   * @return a basis of the found subspace
   */
  private Matrix runDerivator(Relation<ParameterizationFunction> relation, int dim, CASHInterval interval, ModifiableDBIDs ids) {
    // build database for derivator
    Database derivatorDB = buildDerivatorDB(relation, interval);

    // set the parameters
    ListParameterization parameters = new ListParameterization();
    parameters.addParameter(PCAFilteredRunner.Parameterizer.PCA_EIGENPAIR_FILTER, FirstNEigenPairFilter.class.getName());
    parameters.addParameter(FirstNEigenPairFilter.EIGENPAIR_FILTER_N, Integer.toString(dim - 1));
    DependencyDerivator<DoubleVector> derivator = null;
    Class<DependencyDerivator<DoubleVector>> cls = ClassGenericsUtil.uglyCastIntoSubclass(DependencyDerivator.class);
    derivator = parameters.tryInstantiate(cls);

    CorrelationAnalysisSolution<DoubleVector> model = derivator.run(derivatorDB);

    Matrix weightMatrix = model.getSimilarityMatrix();
    DoubleVector centroid = new DoubleVector(model.getCentroid());
    DistanceQuery<DoubleVector> df = QueryUtil.getDistanceQuery(derivatorDB, new MatrixWeightedDistanceFunction(weightMatrix));
    double eps = .25;

    ids.addDBIDs(interval.getIDs());
    // Search for nearby vectors in original database
    for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      DoubleVector v = new DoubleVector(relation.get(iditer).getColumnVector());
      double d = df.distance(v, centroid);
      if(d <= eps) {
        ids.add(iditer);
      }
    }

    Matrix basis = model.getStrongEigenvectors();
    return basis.getMatrix(0, basis.getRowDimensionality() - 1, 0, dim - 2);
  }

  /**
   * Builds a database for the derivator consisting of the ids in the specified
   * interval.
   * 
   * @param relation the database storing the parameterization functions
   * @param interval the interval to build the database from
   * @return a database for the derivator consisting of the ids in the specified
   *         interval
   */
  private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, CASHInterval interval) {
    DBIDs ids = interval.getIDs();
    ProxyDatabase proxy = new ProxyDatabase(ids);
    int dim = dimensionality(relation);
    SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dim);
    MaterializedRelation<DoubleVector> prep = new MaterializedRelation<>(proxy, type, ids);
    proxy.addRelation(prep);

    // Project
    for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      DoubleVector v = new DoubleVector(relation.get(iter).getColumnVector().getArrayRef());
      prep.set(iter, v);
    }

    if(LOG.isDebugging()) {
      LOG.debugFine("db fuer derivator : " + prep.size());
    }

    return proxy;
  }

  /**
   * Runs the derivator on the specified interval and assigns all points having
   * a distance less then the standard deviation of the derivator model to the
   * model to this model.
   * 
   * @param relation the database containing the parameterization functions
   * @param ids the ids to build the model
   * @param dimensionality the dimensionality of the subspace
   * @return a basis of the found subspace
   */
  private LinearEquationSystem runDerivator(Relation<ParameterizationFunction> relation, int dimensionality, DBIDs ids) {
    try {
      // build database for derivator
      Database derivatorDB = buildDerivatorDB(relation, ids);

      ListParameterization parameters = new ListParameterization();
      parameters.addParameter(PCAFilteredRunner.Parameterizer.PCA_EIGENPAIR_FILTER, FirstNEigenPairFilter.class.getName());
      parameters.addParameter(FirstNEigenPairFilter.EIGENPAIR_FILTER_N, Integer.toString(dimensionality));
      DependencyDerivator<DoubleVector> derivator = null;
      Class<DependencyDerivator<DoubleVector>> cls = ClassGenericsUtil.uglyCastIntoSubclass(DependencyDerivator.class);
      derivator = parameters.tryInstantiate(cls);

      CorrelationAnalysisSolution<DoubleVector> model = derivator.run(derivatorDB);
      LinearEquationSystem les = model.getNormalizedLinearEquationSystem(null);
      return les;
    }
    catch(NonNumericFeaturesException e) {
      throw new IllegalStateException("Error during normalization" + e);
    }
  }

  /**
   * Builds a database for the derivator consisting of the ids in the specified
   * interval.
   * 
   * @param relation the database storing the parameterization functions
   * @param ids the ids to build the database from
   * @return a database for the derivator consisting of the ids in the specified
   *         interval
   */
  private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, DBIDs ids) {
    ProxyDatabase proxy = new ProxyDatabase(ids);
    int dim = dimensionality(relation);
    SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dim);
    MaterializedRelation<DoubleVector> prep = new MaterializedRelation<>(proxy, type, ids);
    proxy.addRelation(prep);

    // Project
    for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      DoubleVector v = new DoubleVector(relation.get(iter).getColumnVector().getArrayRef());
      prep.set(iter, v);
    }

    return proxy;
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   * 
   * @author Erich Schubert
   * 
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    protected int minpts;

    protected int maxlevel;

    protected int mindim;

    protected double jitter;

    protected boolean adjust;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      IntParameter minptsP = new IntParameter(MINPTS_ID);
      minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(minptsP)) {
        minpts = minptsP.getValue();
      }
      IntParameter maxlevelP = new IntParameter(MAXLEVEL_ID);
      maxlevelP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(maxlevelP)) {
        maxlevel = maxlevelP.getValue();
      }
      IntParameter mindimP = new IntParameter(MINDIM_ID, 1);
      mindimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(mindimP)) {
        mindim = mindimP.getValue();
      }
      DoubleParameter jitterP = new DoubleParameter(JITTER_ID);
      jitterP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
      if(config.grab(jitterP)) {
        jitter = jitterP.getValue();
      }
      Flag adjustF = new Flag(ADJUST_ID);
      if(config.grab(adjustF)) {
        adjust = adjustF.getValue();
      }
    }

    @Override
    protected CASH<NumberVector> makeInstance() {
      return new CASH<>(minpts, maxlevel, mindim, jitter, adjust);
    }
  }
}