diff options
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java')
-rw-r--r-- | elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java | 896 |
1 files changed, 896 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java new file mode 100644 index 00000000..ceb13976 --- /dev/null +++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/biclustering/ChengAndChurch.java @@ -0,0 +1,896 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering.biclustering; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2015 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Arrays; + +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.model.BiclusterWithInversionsModel; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.math.Mean; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution; +import de.lmu.ifi.dbs.elki.utilities.BitsUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * Perform Cheng and Church biclustering. + * + * <p> + * Reference: <br> + * Y. Cheng and G. M. Church. Biclustering of expression data. In Proceedings of + * the 8th International Conference on Intelligent Systems for Molecular Biology + * (ISMB), San Diego, CA, 2000. + * </p> + * + * @author Erich Schubert + * @param <V> Vector type. + */ +@Reference(authors = "Y. Cheng, G. M. Church", title = "Biclustering of expression data", booktitle = "Proc. 8th International Conference on Intelligent Systems for Molecular Biology (ISMB)") +public class ChengAndChurch<V extends NumberVector> extends AbstractBiclustering<V, BiclusterWithInversionsModel> { + /** + * The logger for this class. + */ + private static final Logging LOG = Logging.getLogger(ChengAndChurch.class); + + /** + * The minimum number of columns that the database must have so that a removal + * of columns is performed in {@link #multipleNodeDeletion}.</p> + * <p> + * Just start deleting multiple columns when more than 100 columns are in the + * data matrix. + * </p> + */ + private static final int MIN_COLUMN_REMOVE_THRESHOLD = 100; + + /** + * The minimum number of rows that the database must have so that a removal of + * rows is performed in {@link #multipleNodeDeletion}. + * <p> + * Just start deleting multiple rows when more than 100 rows are in the data + * matrix. + * </p> + * <!-- + * <p> + * The value is set to 100 as this is not really described in the paper. + * </p> + * --> + */ + private static final int MIN_ROW_REMOVE_THRESHOLD = 100; + + /** + * Threshold for the score. + */ + private double delta; + + /** + * The parameter for multiple node deletion.</p> + * <p> + * It is used to magnify the {@link #delta} value in the + * {@link #multipleNodeDeletion} method. + * </p> + */ + private double alpha; + + /** + * Number of biclusters to be found. + */ + private int n; + + /** + * Allow inversion of rows in the last phase. + */ + private boolean useinverted = true; + + /** + * Distribution to sample random replacement values from. + */ + private Distribution dist; + + /** + * Constructor. + * + * @param delta Delta parameter: desired quality + * @param alpha Alpha parameter: controls switching to single node deletion + * approach + * @param n Number of clusters to detect + * @param dist Distribution of random values to insert + */ + public ChengAndChurch(double delta, double alpha, int n, Distribution dist) { + super(); + this.delta = delta; + this.alpha = alpha; + this.n = n; + this.dist = dist; + } + + /** + * Visitor pattern for processing cells. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static interface CellVisitor { + /** Different modes of operation. */ + int ALL = 0, SELECTED = 1, NOT_SELECTED = 2; + + /** + * Visit a cell. + * + * @param val Value + * @param row Row Number + * @param col Column number + * @param selrow Boolean, whether row is selected + * @param selcol Boolean, whether column is selected + * @return Stop flag, return {@code true} to stop visiting + */ + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol); + } + + /** + * Bicluster candidate. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + protected static class BiclusterCandidate { + /** + * Cardinalities. + */ + int rowcard, colcard; + + /** + * Means. + */ + double[] rowM, colM; + + /** + * Row and column bitmasks. + */ + long[] rows, irow, cols; + + /** + * Mean of the current bicluster. + */ + double allM; + + /** + * The current bicluster score (mean squared residue). + */ + double residue; + + /** + * Constructor. + * + * @param rows Row dimensionality. + * @param cols Column dimensionality. + */ + protected BiclusterCandidate(int rows, int cols) { + super(); + this.rows = BitsUtil.ones(rows); + this.irow = BitsUtil.zero(rows); + this.rowcard = rows; + this.rowM = new double[rows]; + this.cols = BitsUtil.ones(cols); + this.colcard = cols; + this.colM = new double[cols]; + } + + /** + * Resets the values for the next cluster search. + */ + protected void reset() { + rows = BitsUtil.ones(rowM.length); + rowcard = rowM.length; + cols = BitsUtil.ones(colM.length); + colcard = colM.length; + BitsUtil.zeroI(irow); + } + + /** + * Visit all selected cells in the data matrix. + * + * @param mat Data matrix + * @param mode Operation mode + * @param visitor Visitor function + */ + protected void visitAll(double[][] mat, int mode, CellVisitor visitor) { + // For efficiency, we manually iterate over the rows and column bitmasks. + // This saves repeated shifting needed by the manual bit access. + for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { + long rlong = rows[rlpos]; + // Fast skip blocks of 64 masked values. + if((mode == CellVisitor.SELECTED && rlong == 0L) || (mode == CellVisitor.NOT_SELECTED && rlong == -1L)) { + rpos += Long.SIZE; + continue; + } + for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { + boolean rselected = ((rlong & 1L) == 1L); + if((mode == CellVisitor.SELECTED && !rselected) || (mode == CellVisitor.NOT_SELECTED && rselected)) { + continue; + } + for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { + long clong = cols[clpos]; + if((mode == CellVisitor.SELECTED && clong == 0L) || (mode == CellVisitor.NOT_SELECTED && clong == -1L)) { + cpos += Long.SIZE; + continue; + } + for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { + boolean cselected = ((clong & 1L) == 1L); + if((mode == CellVisitor.SELECTED && !cselected) || (mode == CellVisitor.NOT_SELECTED && cselected)) { + continue; + } + boolean stop = visitor.visit(mat[rpos][cpos], rpos, cpos, rselected, cselected); + if(stop) { + return; + } + } + } + } + } + } + + /** + * Visit a column of the matrix. + * + * @param mat Data matrix + * @param col Column to visit + * @param mode Operation mode + * @param visitor Visitor function + */ + protected void visitColumn(double[][] mat, int col, int mode, CellVisitor visitor) { + boolean cselected = BitsUtil.get(cols, col); + // For efficiency, we manually iterate over the rows and column bitmasks. + // This saves repeated shifting needed by the manual bit access. + for(int rpos = 0, rlpos = 0; rlpos < rows.length; ++rlpos) { + long rlong = rows[rlpos]; + // Fast skip blocks of 64 masked values. + if(mode == CellVisitor.SELECTED && rlong == 0L) { + rpos += Long.SIZE; + continue; + } + if(mode == CellVisitor.NOT_SELECTED && rlong == -1L) { + rpos += Long.SIZE; + continue; + } + for(int i = 0; i < Long.SIZE && rpos < rowM.length; ++i, ++rpos, rlong >>>= 1) { + boolean rselected = ((rlong & 1L) == 1L); + if(mode == CellVisitor.SELECTED && !rselected) { + continue; + } + if(mode == CellVisitor.NOT_SELECTED && rselected) { + continue; + } + boolean stop = visitor.visit(mat[rpos][col], rpos, col, rselected, cselected); + if(stop) { + return; + } + } + } + } + + /** + * Visit a row of the data matrix. + * + * @param mat Data matrix + * @param row Row to visit + * @param visitor Visitor function + */ + protected void visitRow(double[][] mat, int row, int mode, CellVisitor visitor) { + boolean rselected = BitsUtil.get(rows, row); + final double[] rowdata = mat[row]; + for(int cpos = 0, clpos = 0; clpos < cols.length; ++clpos) { + long clong = cols[clpos]; + // Fast skip blocks of 64 masked values. + if(mode == CellVisitor.SELECTED && clong == 0L) { + cpos += Long.SIZE; + continue; + } + if(mode == CellVisitor.NOT_SELECTED && clong == -1L) { + cpos += Long.SIZE; + continue; + } + for(int j = 0; j < Long.SIZE && cpos < colM.length; ++j, ++cpos, clong >>>= 1) { + boolean cselected = ((clong & 1L) == 1L); + if(mode == CellVisitor.SELECTED && !cselected) { + continue; + } + if(mode == CellVisitor.NOT_SELECTED && cselected) { + continue; + } + boolean stop = visitor.visit(rowdata[cpos], row, cpos, rselected, cselected); + if(stop) { + return; + } + } + } + } + + /** Visitor for updating the means. */ + private final CellVisitor MEANVISITOR = new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + if(selcol) { + rowM[row] += val; + } + if(selrow) { + colM[col] += val; + } + if(selcol && selrow) { + allM += val; + } + return false; + } + }; + + /** + * Update the row means and column means. + * + * @param mat Data matrix + * @param all Flag, to update all + * @return overall mean + */ + protected double updateRowAndColumnMeans(final double[][] mat, boolean all) { + final int mode = all ? CellVisitor.ALL : CellVisitor.SELECTED; + Arrays.fill(rowM, 0.); + Arrays.fill(colM, 0.); + allM = 0.; + visitAll(mat, mode, MEANVISITOR); + visitColumn(mat, 0, mode, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + rowM[row] /= colcard; + return false; + } + }); + visitRow(mat, 0, mode, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + colM[col] /= rowcard; + return false; + } + }); + allM /= colcard * rowcard; + return allM; + } + + /** + * Compute the mean square residue. + * + * @param mat Data matrix + * @return mean squared residue + */ + protected double computeMeanSquaredDeviation(final double[][] mat) { + final Mean msr = new Mean(); + visitAll(mat, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow && selcol); + double v = val - rowM[row] - colM[col] + allM; + msr.put(v * v); + return false; + } + }); + residue = msr.getMean(); + return residue; + } + + /** + * Computes the <b>mean row residue</b> of the given <code>row</code>. + * + * @param mat Data matrix + * @param row The row who's residue should be computed. + * @param rowinverted Indicates if the row should be considered inverted. + * @return The row residue of the given <code>row</code>. + */ + protected double computeRowResidue(final double[][] mat, int row, final boolean rowinverted) { + final Mean rowResidue = new Mean(); + visitRow(mat, row, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selcol); + final double rowMean = rowM[row]; + final double colMean = colM[col]; + double v = ((!rowinverted) ? (val - rowMean) : (rowMean - val)) - colMean + allM; + rowResidue.put(v * v); + return false; + } + }); + return rowResidue.getMean(); + } + + /** + * + * Computes the <b>mean column residue</b> of the given <code>col</code>. + * + * @param col The column who's residue should be computed. + * @return The row residue of the given <code>col</code>um. + */ + protected double computeColResidue(final double[][] mat, final int col) { + final double bias = colM[col] - allM; + final Mean colResidue = new Mean(); + visitColumn(mat, col, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow); + final double rowMean = rowM[row]; + double v = val - rowMean - bias; + colResidue.put(v * v); + return false; + } + }); + return colResidue.getMean(); + } + + /** + * Updates the mask with replacement values for all data in the given rows + * and columns. + * + * @param mat Mask to update. + * @param replacement Distribution to sample replacement values from. + */ + protected void maskMatrix(final double[][] mat, final Distribution replacement) { + visitAll(mat, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow && selcol); + mat[row][col] = replacement.nextRandom(); + return false; + } + }); + } + + /** + * Select or deselect a column. + * + * @param cnum Column to select + * @param set Value to set + */ + protected void selectColumn(int cnum, boolean set) { + if(set) { + BitsUtil.setI(cols, cnum); + colcard++; + } + else { + BitsUtil.clearI(cols, cnum); + colcard--; + } + } + + /** + * Select or deselect a row. + * + * @param rnum Row to select + * @param set Value to set + */ + protected void selectRow(int rnum, boolean set) { + if(set) { + BitsUtil.setI(rows, rnum); + rowcard++; + } + else { + BitsUtil.clearI(rows, rnum); + rowcard--; + } + } + + protected void invertRow(int rnum, boolean b) { + BitsUtil.setI(irow, rnum); + } + } + + @Override + public Clustering<BiclusterWithInversionsModel> biclustering() { + double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs); + + BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim()); + + Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering"); + ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs()); + + FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null; + for(int i = 0; i < n; i++) { + cand.reset(); + multipleNodeDeletion(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + singleNodeDeletion(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + nodeAddition(mat, cand); + if(LOG.isVeryVerbose()) { + LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + cand.maskMatrix(mat, dist); + BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow)); + final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows); + noise.removeDBIDs(cids); + result.addToplevelCluster(new Cluster<>(cids, model)); + + if(LOG.isVerbose()) { + LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n"); + LOG.verbose("Number of rows: " + cand.rowcard + "\n"); + LOG.verbose("Number of columns: " + cand.colcard + "\n"); + // LOG.verbose("Total number of masked values: " + maskedVals.size() + + // "\n"); + } + LOG.incrementProcessed(prog); + } + // Add a noise cluster, full-dimensional. + if(!noise.isEmpty()) { + long[] allcols = BitsUtil.ones(getColDim()); + BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS); + result.addToplevelCluster(new Cluster<>(noise, true, model)); + } + LOG.ensureCompleted(prog); + return result; + } + + /** + * Algorithm 1 of Cheng and Church: + * + * Remove single rows or columns. + * + * Inverted rows are not supported in this method. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void singleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { + // Assume that cand.residue is up to date! + while(cand.residue > delta && (cand.colcard > 2 || cand.rowcard > 2)) { + // Store current maximum. Need final mutable, so use arrays. + final double[] max = { Double.NEGATIVE_INFINITY }; + final int[] best = { -1, -1 }; + + // Test rows + if(cand.rowcard > 2) { + cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow); + double rowResidue = cand.computeRowResidue(mat, row, false); + if(max[0] < rowResidue) { + max[0] = rowResidue; + best[0] = row; + } + return false; + } + }); + } + + // Test columns: + if(cand.colcard > 2) { + cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selcol); + double colResidue = cand.computeColResidue(mat, col); + if(max[0] < colResidue) { + max[0] = colResidue; + best[1] = col; + } + return false; + } + }); + } + + if(best[1] >= 0) { // then override bestrow! + cand.selectColumn(best[1], false); + } + else { + assert (best[0] >= 0); + cand.selectRow(best[0], false); + } + // TODO: incremental update could be much faster? + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + } + } + + // + /** + * Algorithm 2 of Cheng and Church. + * + * Remove all rows and columns that reduce the residue by alpha. + * + * Inverted rows are not supported in this method. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void multipleNodeDeletion(final double[][] mat, final BiclusterCandidate cand) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + + // Note: assumes that cand.residue = H(I,J) + while(cand.residue > delta) { + final boolean[] modified = { false, false }; + + // Step 2: remove rows above threshold + if(cand.rowcard > MIN_ROW_REMOVE_THRESHOLD) { + final double alphaResidue = alpha * cand.residue; + cand.visitColumn(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selrow); + if(cand.computeRowResidue(mat, row, false) > alphaResidue) { + cand.selectRow(row, false); + modified[0] = true; + } + return (cand.rowcard > MIN_ROW_REMOVE_THRESHOLD); + } + }); + + // Step 3: update residue + if(modified[0]) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + } + } + + // Step 4: remove columns above threshold + if(cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD) { + final double alphaResidue = alpha * cand.residue; + cand.visitRow(mat, 0, CellVisitor.SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (selcol); + if(cand.computeColResidue(mat, col) > alphaResidue) { + cand.selectColumn(col, false); + modified[1] = true; + } + return (cand.colcard > MIN_COLUMN_REMOVE_THRESHOLD); + } + }); + if(modified[1]) { + cand.updateRowAndColumnMeans(mat, false); + cand.computeMeanSquaredDeviation(mat); + } + } + + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + // Step 5: if nothing has been removed, try removing single nodes. + if(!modified[0] && !modified[1]) { + break; + // Will be executed next in main loop, as per algorithm 4. + // singleNodeDeletion(); + } + } + } + + /** + * Algorithm 3 of Cheng and Church. + * + * Try to re-add rows or columns that decrease the overall score. + * + * Also try adding inverted rows. + * + * @param mat Data matrix + * @param cand Bicluster candidate + */ + private void nodeAddition(final double[][] mat, final BiclusterCandidate cand) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + while(true) { + // We need this to be final + mutable + final boolean[] added = new boolean[] { false, false }; + + // Step 2: add columns + cand.visitRow(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selcol); + if(cand.computeColResidue(mat, col) <= cand.residue) { + cand.selectColumn(col, true); + added[0] = true; + } + return false; + } + }); + + // Step 3: recompute values + if(added[0]) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + } + + // Step 4: try adding rows. + cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selrow); + if(cand.computeRowResidue(mat, row, false) <= cand.residue) { + cand.selectRow(row, true); + added[1] = true; + } + return false; + } + }); + + // Step 5: try adding inverted rows. + if(useinverted) { + cand.visitColumn(mat, 0, CellVisitor.NOT_SELECTED, new CellVisitor() { + @Override + public boolean visit(double val, int row, int col, boolean selrow, boolean selcol) { + assert (!selrow); + if(cand.computeRowResidue(mat, row, true) <= cand.residue) { + cand.selectRow(row, true); + cand.invertRow(row, true); + added[1] = true; + } + return false; + } + }); + } + if(added[1]) { + cand.updateRowAndColumnMeans(mat, true); + cand.computeMeanSquaredDeviation(mat); + if(LOG.isDebuggingFine()) { + LOG.debugFine("Residue in Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard); + } + } + if(!added[0] && !added[1]) { + break; + } + } + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); + } + + @Override + protected Logging getLogger() { + return LOG; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + * + * @param <V> Vector type + */ + public static class Parameterizer<V extends NumberVector> extends AbstractParameterizer { + /** + * Parameter to specify the distribution of replacement values when masking + * a cluster. + */ + public static final OptionID DIST_ID = new OptionID("chengandchurch.replacement", "Distribution of replacement values when masking found clusters."); + + /** + * Threshold value to determine the maximal acceptable score (mean squared + * residue) of a bicluster. + * <p/> + * Key: {@code -chengandchurch.delta} + * </p> + */ + public static final OptionID DELTA_ID = new OptionID("chengandchurch.delta", "Threshold value to determine the maximal acceptable score (mean squared residue) of a bicluster."); + + /** + * Parameter for multiple node deletion to accelerate the algorithm. (>= + * 1) + * <p/> + * Key: {@code -chengandchurch.alpha} + * </p> + */ + public static final OptionID ALPHA_ID = new OptionID("chengandchurch.alpha", "Parameter for multiple node deletion to accelerate the algorithm."); + + /** + * Number of biclusters to be found. + * <p/> + * Default value: 1 + * </p> + * <p/> + * Key: {@code -chengandchurch.n} + * </p> + */ + public static final OptionID N_ID = new OptionID("chengandchurch.n", "The number of biclusters to be found."); + + /** + * Threshold for the score ({@link #DELTA_ID}). + */ + private double delta; + + /** + * The parameter for multiple node deletion.</p> + * <p> + * It is used to magnify the {@link #delta} value in the + * {@link ChengAndChurch#multipleNodeDeletion} method. + * </p> + */ + private double alpha; + + /** + * Number of biclusters to be found. + */ + private int n; + + /** + * Distribution of replacement values. + */ + private Distribution dist; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + DoubleParameter deltaP = new DoubleParameter(DELTA_ID); + if(config.grab(deltaP)) { + delta = deltaP.doubleValue(); + } + deltaP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + + IntParameter nP = new IntParameter(N_ID, 1); + nP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + if(config.grab(nP)) { + n = nP.intValue(); + } + + DoubleParameter alphaP = new DoubleParameter(ALPHA_ID, 1.); + alphaP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_DOUBLE); + if(config.grab(alphaP)) { + alpha = alphaP.doubleValue(); + } + + ObjectParameter<Distribution> distP = new ObjectParameter<>(DIST_ID, Distribution.class, UniformDistribution.class); + if(config.grab(distP)) { + dist = distP.instantiateClass(config); + } + } + + @Override + protected ChengAndChurch<V> makeInstance() { + return new ChengAndChurch<>(delta, alpha, n, dist); + } + } +} |