summaryrefslogtreecommitdiff
path: root/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java
diff options
context:
space:
mode:
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java')
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java896
1 files changed, 896 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java
new file mode 100644
index 00000000..ceb13976
--- /dev/null
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java
@@ -0,0 +1,896 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2015
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.model.BiclusterWithInversionsModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution;
+import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution;
+import de.lmu.ifi.dbs.elki.utilities.BitsUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * Perform Cheng and Church biclustering.
+ *
+ * <p>
+ * Reference: <br>
+ * Y. Cheng and G. M. Church. Biclustering of expression data. In Proceedings of
+ * the 8th International Conference on Intelligent Systems for Molecular Biology
+ * (ISMB), San Diego, CA, 2000.
+ * </p>
+ *
+ * @author Erich Schubert
+ * @param <V> Vector type.
+ */
+@Reference(authors = "Y. Cheng, G. M. Church", title = "Biclustering of expression data", booktitle = "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)")
+public class ChengAndChurch<V extends NumberVector> extends AbstractBiclustering<V, BiclusterWithInversionsModel> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(ChengAndChurch.class);
+
+ /**
+ * The minimum number of columns that the database must have so that a removal
+ * of columns is performed in {@link #multipleNodeDeletion}.</p>
+ * <p>
+ * Just start deleting multiple columns when more than 100 columns are in the
+ * data matrix.
+ * </p>
+ */
+ private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100;
+
+ /**
+ * The minimum number of rows that the database must have so that a removal of
+ * rows is performed in {@link #multipleNodeDeletion}.
+ * <p>
+ * Just start deleting multiple rows when more than 100 rows are in the data
+ * matrix.
+ * </p>
+ * <!--
+ * <p>
+ * The value is set to 100 as this is not really described in the paper.
+ * </p>
+ * -->
+ */
+ private static final int MIN_ROW_REMOVE_THRESHOLD = 100;
+
+ /**
+ * Threshold for the score.
+ */
+ private double delta;
+
+ /**
+ * The parameter for multiple node deletion.</p>
+ * <p>
+ * It is used to magnify the {@link #delta} value in the
+ * {@link #multipleNodeDeletion} method.
+ * </p>
+ */
+ private double alpha;
+
+ /**
+ * Number of biclusters to be found.
+ */
+ private int n;
+
+ /**
+ * Allow inversion of rows in the last phase.
+ */
+ private boolean useinverted = true;
+
+ /**
+ * Distribution to sample random replacement values from.
+ */
+ private Distribution dist;
+
+ /**
+ * Constructor.
+ *
+ * @param delta Delta parameter: desired quality
+ * @param alpha Alpha parameter: controls switching to single node deletion
+ * approach
+ * @param n Number of clusters to detect
+ * @param dist Distribution of random values to insert
+ */
+ public ChengAndChurch(double delta, double alpha, int n, Distribution dist) {
+ super();
+ this.delta = delta;
+ this.alpha = alpha;
+ this.n = n;
+ this.dist = dist;
+ }
+
+ /**
+ * Visitor pattern for processing cells.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static interface CellVisitor {
+ /** Different modes of operation. */
+ int ALL = 0, SELECTED = 1, NOT_SELECTED = 2;
+
+ /**
+ * Visit a cell.
+ *
+ * @param val Value
+ * @param row Row Number
+ * @param col Column number
+ * @param selrow Boolean, whether row is selected
+ * @param selcol Boolean, whether column is selected
+ * @return Stop flag, return {@code true} to stop visiting
+ */
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol);
+ }
+
+ /**
+ * Bicluster candidate.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ protected static class BiclusterCandidate {
+ /**
+ * Cardinalities.
+ */
+ int rowcard, colcard;
+
+ /**
+ * Means.
+ */
+ double[] rowM, colM;
+
+ /**
+ * Row and column bitmasks.
+ */
+ long[] rows, irow, cols;
+
+ /**
+ * Mean of the current bicluster.
+ */
+ double allM;
+
+ /**
+ * The current bicluster score (mean squared residue).
+ */
+ double residue;
+
+ /**
+ * Constructor.
+ *
+ * @param rows Row dimensionality.
+ * @param cols Column dimensionality.
+ */
+ protected BiclusterCandidate(int rows, int cols) {
+ super();
+ this.rows = BitsUtil.ones(rows);
+ this.irow = BitsUtil.zero(rows);
+ this.rowcard = rows;
+ this.rowM = new double[rows];
+ this.cols = BitsUtil.ones(cols);
+ this.colcard = cols;
+ this.colM = new double[cols];
+ }
+
+ /**
+ * Resets the values for the next cluster search.
+ */
+ protected void reset() {
+ rows = BitsUtil.ones(rowM.length);
+ rowcard = rowM.length;
+ cols = BitsUtil.ones(colM.length);
+ colcard = colM.length;
+ BitsUtil.zeroI(irow);
+ }
+
+ /**
+ * Visit all selected cells in the data matrix.
+ *
+ * @param mat Data matrix
+ * @param mode Operation mode
+ * @param visitor Visitor function
+ */
+ protected void visitAll(double[][] mat, int mode, CellVisitor visitor) {
+ // For efficiency, we manually iterate over the rows and column bitmasks.
+ // This saves repeated shifting needed by the manual bit access.
+ for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
+ long rlong = rows[rlpos];
+ // Fast skip blocks of 64 masked values.
+ if((mode == CellVisitor.SELECTED && rlong == 0L) || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
+ boolean rselected = ((rlong & 1L) == 1L);
+ if((mode == CellVisitor.SELECTED && !rselected) || (mode == CellVisitor.NOT_SELECTED && rselected)) {
+ continue;
+ }
+ for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
+ long clong = cols[clpos];
+ if((mode == CellVisitor.SELECTED && clong == 0L) || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
+ boolean cselected = ((clong & 1L) == 1L);
+ if((mode == CellVisitor.SELECTED && !cselected) || (mode == CellVisitor.NOT_SELECTED && cselected)) {
+ continue;
+ }
+ boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Visit a column of the matrix.
+ *
+ * @param mat Data matrix
+ * @param col Column to visit
+ * @param mode Operation mode
+ * @param visitor Visitor function
+ */
+ protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) {
+ boolean cselected = BitsUtil.get(cols, col);
+ // For efficiency, we manually iterate over the rows and column bitmasks.
+ // This saves repeated shifting needed by the manual bit access.
+ for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) {
+ long rlong = rows[rlpos];
+ // Fast skip blocks of 64 masked values.
+ if(mode == CellVisitor.SELECTED && rlong == 0L) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && rlong == -1L) {
+ rpos += Long.SIZE;
+ continue;
+ }
+ for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) {
+ boolean rselected = ((rlong & 1L) == 1L);
+ if(mode == CellVisitor.SELECTED && !rselected) {
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && rselected) {
+ continue;
+ }
+ boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+
+ /**
+ * Visit a row of the data matrix.
+ *
+ * @param mat Data matrix
+ * @param row Row to visit
+ * @param visitor Visitor function
+ */
+ protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) {
+ boolean rselected = BitsUtil.get(rows, row);
+ final double[] rowdata = mat[row];
+ for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) {
+ long clong = cols[clpos];
+ // Fast skip blocks of 64 masked values.
+ if(mode == CellVisitor.SELECTED && clong == 0L) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && clong == -1L) {
+ cpos += Long.SIZE;
+ continue;
+ }
+ for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) {
+ boolean cselected = ((clong & 1L) == 1L);
+ if(mode == CellVisitor.SELECTED && !cselected) {
+ continue;
+ }
+ if(mode == CellVisitor.NOT_SELECTED && cselected) {
+ continue;
+ }
+ boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected);
+ if(stop) {
+ return;
+ }
+ }
+ }
+ }
+
+ /** Visitor for updating the means. */
+ private final CellVisitor MEANVISITOR = new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ if(selcol) {
+ rowM[row] += val;
+ }
+ if(selrow) {
+ colM[col] += val;
+ }
+ if(selcol && selrow) {
+ allM += val;
+ }
+ return false;
+ }
+ };
+
+ /**
+ * Update the row means and column means.
+ *
+ * @param mat Data matrix
+ * @param all Flag, to update all
+ * @return overall mean
+ */
+ protected double updateRowAndColumnMeans(final double[][] mat, boolean all) {
+ final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED;
+ Arrays.fill(rowM, 0.);
+ Arrays.fill(colM, 0.);
+ allM = 0.;
+ visitAll(mat, mode, MEANVISITOR);
+ visitColumn(mat, 0, mode, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ rowM[row] /= colcard;
+ return false;
+ }
+ });
+ visitRow(mat, 0, mode, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ colM[col] /= rowcard;
+ return false;
+ }
+ });
+ allM /= colcard * rowcard;
+ return allM;
+ }
+
+ /**
+ * Compute the mean square residue.
+ *
+ * @param mat Data matrix
+ * @return mean squared residue
+ */
+ protected double computeMeanSquaredDeviation(final double[][] mat) {
+ final Mean msr = new Mean();
+ visitAll(mat, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow && selcol);
+ double v = val - rowM[row] - colM[col] + allM;
+ msr.put(v * v);
+ return false;
+ }
+ });
+ residue = msr.getMean();
+ return residue;
+ }
+
+ /**
+ * Computes the <b>mean row residue</b> of the given <code>row</code>.
+ *
+ * @param mat Data matrix
+ * @param row The row who's residue should be computed.
+ * @param rowinverted Indicates if the row should be considered inverted.
+ * @return The row residue of the given <code>row</code>.
+ */
+ protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) {
+ final Mean rowResidue = new Mean();
+ visitRow(mat, row, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ final double rowMean = rowM[row];
+ final double colMean = colM[col];
+ double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM;
+ rowResidue.put(v * v);
+ return false;
+ }
+ });
+ return rowResidue.getMean();
+ }
+
+ /**
+ *
+ * Computes the <b>mean column residue</b> of the given <code>col</code>.
+ *
+ * @param col The column who's residue should be computed.
+ * @return The row residue of the given <code>col</code>um.
+ */
+ protected double computeColResidue(final double[][] mat, final int col) {
+ final double bias = colM[col] - allM;
+ final Mean colResidue = new Mean();
+ visitColumn(mat, col, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ final double rowMean = rowM[row];
+ double v = val - rowMean - bias;
+ colResidue.put(v * v);
+ return false;
+ }
+ });
+ return colResidue.getMean();
+ }
+
+ /**
+ * Updates the mask with replacement values for all data in the given rows
+ * and columns.
+ *
+ * @param mat Mask to update.
+ * @param replacement Distribution to sample replacement values from.
+ */
+ protected void maskMatrix(final double[][] mat, final Distribution replacement) {
+ visitAll(mat, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow && selcol);
+ mat[row][col] = replacement.nextRandom();
+ return false;
+ }
+ });
+ }
+
+ /**
+ * Select or deselect a column.
+ *
+ * @param cnum Column to select
+ * @param set Value to set
+ */
+ protected void selectColumn(int cnum, boolean set) {
+ if(set) {
+ BitsUtil.setI(cols, cnum);
+ colcard++;
+ }
+ else {
+ BitsUtil.clearI(cols, cnum);
+ colcard--;
+ }
+ }
+
+ /**
+ * Select or deselect a row.
+ *
+ * @param rnum Row to select
+ * @param set Value to set
+ */
+ protected void selectRow(int rnum, boolean set) {
+ if(set) {
+ BitsUtil.setI(rows, rnum);
+ rowcard++;
+ }
+ else {
+ BitsUtil.clearI(rows, rnum);
+ rowcard--;
+ }
+ }
+
+ protected void invertRow(int rnum, boolean b) {
+ BitsUtil.setI(irow, rnum);
+ }
+ }
+
+ @Override
+ public Clustering<BiclusterWithInversionsModel> biclustering() {
+ double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);
+
+ BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());
+
+ Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
+ ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());
+
+ FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
+ for(int i = 0; i < n; i++) {
+ cand.reset();
+ multipleNodeDeletion(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ singleNodeDeletion(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ nodeAddition(mat, cand);
+ if(LOG.isVeryVerbose()) {
+ LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ cand.maskMatrix(mat, dist);
+ BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
+ final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
+ noise.removeDBIDs(cids);
+ result.addToplevelCluster(new Cluster<>(cids, model));
+
+ if(LOG.isVerbose()) {
+ LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
+ LOG.verbose("Number of rows: " + cand.rowcard + "\n");
+ LOG.verbose("Number of columns: " + cand.colcard + "\n");
+ // LOG.verbose("Total number of masked values: " + maskedVals.size() +
+ // "\n");
+ }
+ LOG.incrementProcessed(prog);
+ }
+ // Add a noise cluster, full-dimensional.
+ if(!noise.isEmpty()) {
+ long[] allcols = BitsUtil.ones(getColDim());
+ BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
+ result.addToplevelCluster(new Cluster<>(noise, true, model));
+ }
+ LOG.ensureCompleted(prog);
+ return result;
+ }
+
+ /**
+ * Algorithm 1 of Cheng and Church:
+ *
+ * Remove single rows or columns.
+ *
+ * Inverted rows are not supported in this method.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
+ // Assume that cand.residue is up to date!
+ while(cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) {
+ // Store current maximum. Need final mutable, so use arrays.
+ final double[] max = { Double.NEGATIVE_INFINITY };
+ final int[] best = { -1, -1 };
+
+ // Test rows
+ if(cand.rowcard > 2) {
+ cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ double rowResidue = cand.computeRowResidue(mat, row, false);
+ if(max[0] < rowResidue) {
+ max[0] = rowResidue;
+ best[0] = row;
+ }
+ return false;
+ }
+ });
+ }
+
+ // Test columns:
+ if(cand.colcard > 2) {
+ cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ double colResidue = cand.computeColResidue(mat, col);
+ if(max[0] < colResidue) {
+ max[0] = colResidue;
+ best[1] = col;
+ }
+ return false;
+ }
+ });
+ }
+
+ if(best[1] >= 0) { // then override bestrow!
+ cand.selectColumn(best[1], false);
+ }
+ else {
+ assert (best[0] >= 0);
+ cand.selectRow(best[0], false);
+ }
+ // TODO: incremental update could be much faster?
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ }
+ }
+
+ //
+ /**
+ * Algorithm 2 of Cheng and Church.
+ *
+ * Remove all rows and columns that reduce the residue by alpha.
+ *
+ * Inverted rows are not supported in this method.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+
+ // Note: assumes that cand.residue = H(I,J)
+ while(cand.residue > delta) {
+ final boolean[] modified = { false, false };
+
+ // Step 2: remove rows above threshold
+ if(cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) {
+ final double alphaResidue = alpha * cand.residue;
+ cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selrow);
+ if(cand.computeRowResidue(mat, row, false) > alphaResidue) {
+ cand.selectRow(row, false);
+ modified[0] = true;
+ }
+ return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD);
+ }
+ });
+
+ // Step 3: update residue
+ if(modified[0]) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+ }
+
+ // Step 4: remove columns above threshold
+ if(cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) {
+ final double alphaResidue = alpha * cand.residue;
+ cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (selcol);
+ if(cand.computeColResidue(mat, col) > alphaResidue) {
+ cand.selectColumn(col, false);
+ modified[1] = true;
+ }
+ return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD);
+ }
+ });
+ if(modified[1]) {
+ cand.updateRowAndColumnMeans(mat, false);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+ }
+
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ // Step 5: if nothing has been removed, try removing single nodes.
+ if(!modified[0] && !modified[1]) {
+ break;
+ // Will be executed next in main loop, as per algorithm 4.
+ // singleNodeDeletion();
+ }
+ }
+ }
+
+ /**
+ * Algorithm 3 of Cheng and Church.
+ *
+ * Try to re-add rows or columns that decrease the overall score.
+ *
+ * Also try adding inverted rows.
+ *
+ * @param mat Data matrix
+ * @param cand Bicluster candidate
+ */
+ private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ while(true) {
+ // We need this to be final + mutable
+ final boolean[] added = new boolean[] { false, false };
+
+ // Step 2: add columns
+ cand.visitRow(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selcol);
+ if(cand.computeColResidue(mat, col) <= cand.residue) {
+ cand.selectColumn(col, true);
+ added[0] = true;
+ }
+ return false;
+ }
+ });
+
+ // Step 3: recompute values
+ if(added[0]) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ }
+
+ // Step 4: try adding rows.
+ cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selrow);
+ if(cand.computeRowResidue(mat, row, false) <= cand.residue) {
+ cand.selectRow(row, true);
+ added[1] = true;
+ }
+ return false;
+ }
+ });
+
+ // Step 5: try adding inverted rows.
+ if(useinverted) {
+ cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() {
+ @Override
+ public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) {
+ assert (!selrow);
+ if(cand.computeRowResidue(mat, row, true) <= cand.residue) {
+ cand.selectRow(row, true);
+ cand.invertRow(row, true);
+ added[1] = true;
+ }
+ return false;
+ }
+ });
+ }
+ if(added[1]) {
+ cand.updateRowAndColumnMeans(mat, true);
+ cand.computeMeanSquaredDeviation(mat);
+ if(LOG.isDebuggingFine()) {
+ LOG.debugFine("Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
+ }
+ }
+ if(!added[0] && !added[1]) {
+ break;
+ }
+ }
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ *
+ * @param <V> Vector type
+ */
+ public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer {
+ /**
+ * Parameter to specify the distribution of replacement values when masking
+ * a cluster.
+ */
+ public static final OptionID DIST_ID = new OptionID("chengandchurch.replacement", "Distribution of replacement values when masking found clusters.");
+
+ /**
+ * Threshold value to determine the maximal acceptable score (mean squared
+ * residue) of a bicluster.
+ * <p/>
+ * Key: {@code -chengandchurch.delta}
+ * </p>
+ */
+ public static final OptionID DELTA_ID = new OptionID("chengandchurch.delta", "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster.");
+
+ /**
+ * Parameter for multiple node deletion to accelerate the algorithm. (&gt;=
+ * 1)
+ * <p/>
+ * Key: {@code -chengandchurch.alpha}
+ * </p>
+ */
+ public static final OptionID ALPHA_ID = new OptionID("chengandchurch.alpha", "Parameter for multiple node deletion to accelerate the algorithm.");
+
+ /**
+ * Number of biclusters to be found.
+ * <p/>
+ * Default value: 1
+ * </p>
+ * <p/>
+ * Key: {@code -chengandchurch.n}
+ * </p>
+ */
+ public static final OptionID N_ID = new OptionID("chengandchurch.n", "The number of biclusters to be found.");
+
+ /**
+ * Threshold for the score ({@link #DELTA_ID}).
+ */
+ private double delta;
+
+ /**
+ * The parameter for multiple node deletion.</p>
+ * <p>
+ * It is used to magnify the {@link #delta} value in the
+ * {@link ChengAndChurch#multipleNodeDeletion} method.
+ * </p>
+ */
+ private double alpha;
+
+ /**
+ * Number of biclusters to be found.
+ */
+ private int n;
+
+ /**
+ * Distribution of replacement values.
+ */
+ private Distribution dist;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ DoubleParameter deltaP = new DoubleParameter(DELTA_ID);
+ if(config.grab(deltaP)) {
+ delta = deltaP.doubleValue();
+ }
+ deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE);
+
+ IntParameter nP = new IntParameter(N_ID, 1);
+ nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(nP)) {
+ n = nP.intValue();
+ }
+
+ DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.);
+ alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE);
+ if(config.grab(alphaP)) {
+ alpha = alphaP.doubleValue();
+ }
+
+ ObjectParameter<Distribution> distP = new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class);
+ if(config.grab(distP)) {
+ dist = distP.instantiateClass(config);
+ }
+ }
+
+ @Override
+ protected ChengAndChurch<V> makeInstance() {
+ return new ChengAndChurch<>(delta, alpha, n, dist);
+ }
+ }
+}