diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/datasource/filter')
19 files changed, 422 insertions, 206 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java index 2109761a..66707da6 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ByLabelFilter.java @@ -112,8 +112,9 @@ public class ByLabelFilter extends AbstractStreamFilter { Object l = source.data(lblcol); if(l instanceof LabelList) { boolean good = false; - for(String label : (LabelList) l) { - if(pattern.matcher(label).matches()) { + final LabelList ll = (LabelList) l; + for(int i = 0; i < ll.size(); i++) { + if(pattern.matcher(ll.get(i)).matches()) { good = true; break; } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java index e8dc69c3..020dcb31 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFilter.java @@ -34,7 +34,6 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; @@ -49,23 +48,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; */ public class ClassLabelFilter implements ObjectFilter { /** - * Optional parameter that specifies the index of the label to be used as - * class label, must be an integer equal to or greater than 0. 
-   * <p>
-   * Key: {@code -dbc.classLabelIndex}
-   * </p>
-   */
-  public static final OptionID CLASS_LABEL_INDEX_ID = new OptionID("dbc.classLabelIndex", "The index of the label to be used as class label.");
-
-  /**
-   * Parameter to specify the class of occurring class labels.
-   * <p>
-   * Key: {@code -dbc.classLabelClass}
-   * </p>
-   */
-  public static final OptionID CLASS_LABEL_CLASS_ID = new OptionID("dbc.classLabelClass", "Class label class to use.");
-
-  /**
    * The index of the label to be used as class label, null if no class label is
    * specified.
    */
@@ -94,10 +76,10 @@ public class ClassLabelFilter implements ObjectFilter {
     // Find a labellist column
     boolean done = false;
     boolean keeplabelcol = false;
-    for (int i = 0; i < objects.metaLength(); i++) {
+    for(int i = 0; i < objects.metaLength(); i++) {
       SimpleTypeInformation<?> meta = objects.meta(i);
       // Skip non-labellist columns - or if we already had a labellist
-      if (done || !LabelList.class.equals(meta.getRestrictionClass())) {
+      if(done || !LabelList.class.equals(meta.getRestrictionClass())) {
         bundle.appendColumn(meta, objects.getColumn(i));
         continue;
       }
@@ -107,28 +89,39 @@ public class ClassLabelFilter implements ObjectFilter {
       List<ClassLabel> clscol = new ArrayList<>(objects.dataLength());
       List<LabelList> lblcol = new ArrayList<>(objects.dataLength());
 
+      ArrayList<String> lbuf = new ArrayList<>();
       // Split the column
-      for (Object obj : objects.getColumn(i)) {
-        if (obj != null) {
+      for(Object obj : objects.getColumn(i)) {
+        if(obj != null) {
           LabelList ll = (LabelList) obj;
+          int off = (classLabelIndex >= 0) ? classLabelIndex : (ll.size() + classLabelIndex); // negative index counts back from the end: -1 is the last label
           try {
-            ClassLabel lbl = classLabelFactory.makeFromString(ll.remove(classLabelIndex));
+            ClassLabel lbl = classLabelFactory.makeFromString(ll.get(off));
             clscol.add(lbl);
-          } catch (Exception e) {
+          }
+          catch(Exception e) {
             throw new AbortException("Cannot initialize class labels: " + e.getMessage(), e);
           }
-          lblcol.add(ll);
-          if (ll.size() > 0) {
+          lbuf.clear();
+          for(int j = 0; j < ll.size(); j++) {
+            if(j == off) {
+              continue;
+            }
+            lbuf.add(ll.get(j));
+          }
+          lblcol.add(LabelList.make(lbuf));
+          if(lbuf.size() > 0) {
             keeplabelcol = true;
           }
-        } else {
+        }
+        else {
           clscol.add(null);
           lblcol.add(null);
         }
       }
       bundle.appendColumn(classLabelFactory.getTypeInformation(), clscol);
       // Only add the label column when it's not empty.
-      if (keeplabelcol) {
+      if(keeplabelcol) {
         bundle.appendColumn(meta, lblcol);
       }
     }
@@ -144,6 +137,23 @@ public class ClassLabelFilter implements ObjectFilter {
    */
   public static class Parameterizer extends AbstractParameterizer {
     /**
+     * Optional parameter that specifies the index of the label to be used as
+     * class label. The first label is 0; negative indexes count from the end.
+     * <p>
+     * Key: {@code -dbc.classLabelIndex}
+     * </p>
+     */
+    public static final OptionID CLASS_LABEL_INDEX_ID = new OptionID("dbc.classLabelIndex", "The index of the label to be used as class label. The first label is 0, negative indexes are relative to the end.");
+
+    /**
+     * Parameter to specify the class of occurring class labels.
+     * <p>
+     * Key: {@code -dbc.classLabelClass}
+     * </p>
+     */
+    public static final OptionID CLASS_LABEL_CLASS_ID = new OptionID("dbc.classLabelClass", "Class label class to use.");
+
+    /**
      * The index of the label to be used as class label, null if no class label
      * is specified.
*/ @@ -159,12 +169,11 @@ public class ClassLabelFilter implements ObjectFilter { super.makeOptions(config); // parameter class label index final IntParameter classLabelIndexParam = new IntParameter(CLASS_LABEL_INDEX_ID); - classLabelIndexParam.addConstraint(new GreaterEqualConstraint(0)); final ObjectParameter<ClassLabel.Factory<?>> classlabelClassParam = new ObjectParameter<>(CLASS_LABEL_CLASS_ID, ClassLabel.Factory.class, SimpleClassLabel.Factory.class); config.grab(classLabelIndexParam); config.grab(classlabelClassParam); - if (classLabelIndexParam.isDefined() && classlabelClassParam.isDefined()) { + if(classLabelIndexParam.isDefined() && classlabelClassParam.isDefined()) { classLabelIndex = classLabelIndexParam.intValue(); classLabelFactory = classlabelClassParam.instantiateClass(config); } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java index 97624ac8..517eb301 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ClassLabelFromPatternFilter.java @@ -129,8 +129,9 @@ public class ClassLabelFromPatternFilter extends AbstractStreamFilter { continue; } if (o instanceof LabelList) { - for (String l : (LabelList) o) { - if (pattern.matcher(l).find()) { + final LabelList ll = (LabelList) o; + for(int j = 0; j < ll.size(); j++) { + if (pattern.matcher(ll.get(j)).find()) { return positive; } } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/NaNFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/DropNaNFilter.java index 769f3009..fb9cf83e 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/NaNFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/DropNaNFilter.java @@ -43,11 +43,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; * * @author Erich Schubert */ -public class NaNFilter extends AbstractStreamFilter { +public class 
DropNaNFilter extends AbstractStreamFilter { /** * Class logger */ - private static final Logging LOG = Logging.getLogger(NaNFilter.class); + private static final Logging LOG = Logging.getLogger(DropNaNFilter.class); /** * Columns to check. @@ -57,7 +57,7 @@ public class NaNFilter extends AbstractStreamFilter { /** * Constructor. */ - public NaNFilter() { + public DropNaNFilter() { super(); } @@ -178,8 +178,8 @@ public class NaNFilter extends AbstractStreamFilter { */ public static class Parameterizer extends AbstractParameterizer { @Override - protected Object makeInstance() { - return new NaNFilter(); + protected DropNaNFilter makeInstance() { + return new DropNaNFilter(); } } } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java index 926ebe99..17538dc9 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ExternalIDFilter.java @@ -33,7 +33,6 @@ import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -48,15 +47,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; // TODO: use a non-string class for external ids? public class ExternalIDFilter implements ObjectFilter { /** - * Parameter that specifies the index of the label to be used as external Id, - * must be an integer equal to or greater than 0. 
-   * <p>
-   * Key: {@code -dbc.externalIdIndex}
-   * </p>
-   */
-  public static final OptionID EXTERNALID_INDEX_ID = new OptionID("dbc.externalIdIndex", "The index of the label to be used as external Id.");
-
-  /**
    * The index of the label to be used as external Id.
    */
   private final int externalIdIndex;
@@ -77,10 +67,10 @@ public class ExternalIDFilter implements ObjectFilter {
     // Find a labellist column
     boolean done = false;
     boolean keeplabelcol = false;
-    for (int i = 0; i < objects.metaLength(); i++) {
+    for(int i = 0; i < objects.metaLength(); i++) {
       SimpleTypeInformation<?> meta = objects.meta(i);
       // Skip non-labellist columns - or if we already had a labellist
-      if (done || !LabelList.class.equals(meta.getRestrictionClass())) {
+      if(done || !LabelList.class.equals(meta.getRestrictionClass())) {
         bundle.appendColumn(meta, objects.getColumn(i));
         continue;
       }
@@ -91,15 +81,25 @@ public class ExternalIDFilter implements ObjectFilter {
       List<LabelList> lblcol = new ArrayList<>(objects.dataLength());
 
       // Split the column
-      for (Object obj : objects.getColumn(i)) {
-        if (obj != null) {
+      ArrayList<String> lbuf = new ArrayList<>();
+      for(Object obj : objects.getColumn(i)) {
+        if(obj != null) {
           LabelList ll = (LabelList) obj;
-          eidcol.add(new ExternalID(ll.remove(externalIdIndex)));
-          lblcol.add(ll);
-          if (ll.size() > 0) {
+          int off = externalIdIndex >= 0 ? externalIdIndex : (ll.size() + externalIdIndex); // negative index counts back from the end: -1 is the last label
+          eidcol.add(new ExternalID(ll.get(off)));
+          lbuf.clear();
+          for(int j = 0; j < ll.size(); j++) {
+            if(j == off) {
+              continue;
+            }
+            lbuf.add(ll.get(j));
+          }
+          lblcol.add(LabelList.make(lbuf));
+          if(lbuf.size() > 0) { // test the remaining labels; ll itself is no longer shortened
             keeplabelcol = true;
           }
-        } else {
+        }
+        else {
           eidcol.add(null);
           lblcol.add(null);
         }
@@ -107,7 +107,7 @@ public class ExternalIDFilter implements ObjectFilter {
 
       bundle.appendColumn(TypeUtil.EXTERNALID, eidcol);
       // Only add the label column when it's not empty.
- if (keeplabelcol) { + if(keeplabelcol) { bundle.appendColumn(meta, lblcol); } } @@ -122,14 +122,22 @@ public class ExternalIDFilter implements ObjectFilter { * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { + /** + * Parameter that specifies the index of the label to be used as external + * Id, starting at 0. Negative numbers are counted from the end. + * <p> + * Key: {@code -dbc.externalIdIndex} + * </p> + */ + public static final OptionID EXTERNALID_INDEX_ID = new OptionID("dbc.externalIdIndex", "The index of the label to be used as external Id. The first label is 0; negative indexes are relative to the end."); + int externalIdIndex = -1; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter externalIdIndexParam = new IntParameter(EXTERNALID_INDEX_ID); - externalIdIndexParam.addConstraint(new GreaterEqualConstraint(0)); - if (config.grab(externalIdIndexParam)) { + if(config.grab(externalIdIndexParam)) { externalIdIndex = externalIdIndexParam.intValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java index 7f09b905..ce02fc29 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/FixedDBIDsFilter.java @@ -42,14 +42,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; */ public class FixedDBIDsFilter extends AbstractStreamFilter { /** - * Optional parameter to specify the first object ID to use. 
- * <p> - * Key: {@code -dbc.startid} - * </p> - */ - public static final OptionID IDSTART_ID = new OptionID("dbc.startid", "Object ID to start counting with"); - - /** * The filtered meta */ BundleMeta meta; @@ -109,6 +101,13 @@ public class FixedDBIDsFilter extends AbstractStreamFilter { * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { + /** + * Optional parameter to specify the first object ID to use. + * <p> + * Key: {@code -dbc.startid} + * </p> + */ + public static final OptionID IDSTART_ID = new OptionID("dbc.startid", "Object ID to start counting with"); int startid = -1; @Override diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java index 37f8f8d9..453d294e 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/HistogramJitterFilter.java @@ -31,7 +31,7 @@ import de.lmu.ifi.dbs.elki.utilities.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; @@ -75,7 +75,7 @@ public class HistogramJitterFilter<V extends NumberVector<?>> extends AbstractVe public HistogramJitterFilter(double jitter, RandomFactory rnd) { super(); this.jitter = jitter; - this.rnd = new ExponentialDistribution(1, rnd.getRandom()); + this.rnd = new ExponentialDistribution(1, rnd.getSingleThreadedRandom()); } 
@Override @@ -83,7 +83,7 @@ public class HistogramJitterFilter<V extends NumberVector<?>> extends AbstractVe final int dim = obj.getDimensionality(); // Compute the total sum. double osum = 0; - for (int i = 0; i < dim; i++) { + for(int i = 0; i < dim; i++) { osum += obj.doubleValue(i); } // Actual maximum jitter amount: @@ -91,13 +91,13 @@ public class HistogramJitterFilter<V extends NumberVector<?>> extends AbstractVe // Generate jitter vector double[] raw = new double[dim]; double jsum = 0; // Sum of jitter - for (int i = 0; i < raw.length; i++) { + for(int i = 0; i < raw.length; i++) { raw[i] = rnd.nextRandom() * maxjitter; jsum += raw[i]; } final double mix = jsum / osum; // Combine the two vector - for (int i = 0; i < raw.length; i++) { + for(int i = 0; i < raw.length; i++) { raw[i] = raw[i] + (1 - mix) * obj.doubleValue(i); } return factory.newNumberVector(raw); @@ -146,12 +146,12 @@ public class HistogramJitterFilter<V extends NumberVector<?>> extends AbstractVe protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleParameter jitterP = new DoubleParameter(JITTER_ID); - jitterP.addConstraint(new GreaterEqualConstraint(Double.valueOf(0.0))); - if (config.grab(jitterP)) { + jitterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + if(config.grab(jitterP)) { jitter = jitterP.getValue().doubleValue(); } RandomParameter rndP = new RandomParameter(SEED_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java index 5c8d07d0..a7e44d4d 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/RandomSamplingStreamFilter.java @@ -29,8 +29,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta; import de.lmu.ifi.dbs.elki.utilities.RandomFactory; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.LessEqualConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; @@ -60,7 +59,7 @@ public class RandomSamplingStreamFilter extends AbstractStreamFilter { public RandomSamplingStreamFilter(double prob, RandomFactory rnd) { super(); this.prob = prob; - this.random = rnd.getRandom(); + this.random = rnd.getSingleThreadedRandom(); } @Override @@ -75,15 +74,15 @@ public class RandomSamplingStreamFilter extends AbstractStreamFilter { @Override public Event nextEvent() { - while (true) { + while(true) { Event ev = source.nextEvent(); - switch(ev) { + switch(ev){ case END_OF_STREAM: return ev; case META_CHANGED: return ev; case NEXT_OBJECT: - if (random.nextDouble() < prob) { + if(random.nextDouble() < prob) { return ev; } continue; @@ -123,13 +122,13 @@ public class RandomSamplingStreamFilter extends AbstractStreamFilter { protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleParameter probP = new DoubleParameter(PROB_ID); - probP.addConstraint(new GreaterEqualConstraint(0.0)); - probP.addConstraint(new LessEqualConstraint(1.0)); - if (config.grab(probP)) { + probP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_DOUBLE); + probP.addConstraint(CommonConstraints.LESS_EQUAL_ONE_DOUBLE); + if(config.grab(probP)) { prob = probP.getValue().doubleValue(); } RandomParameter rndP = new RandomParameter(SEED_ID); - if (config.grab(rndP)) { + if(config.grab(rndP)) { rnd = rndP.getValue(); 
} } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java new file mode 100644 index 00000000..9029d8ea --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ReplaceNaNWithRandomFilter.java @@ -0,0 +1,220 @@ +package de.lmu.ifi.dbs.elki.datasource.filter; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2013 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.ArrayList; + +import de.lmu.ifi.dbs.elki.data.NumberVector; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; +import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta; +import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; + +/** + * A filter to replace all NaN values. + * + * Note: currently, only dense vector columns are supported. + * + * TODO: add support for sparse vectors. + * + * @author Erich Schubert + */ +public class ReplaceNaNWithRandomFilter extends AbstractStreamFilter { + /** + * Class logger + */ + private static final Logging LOG = Logging.getLogger(ReplaceNaNWithRandomFilter.class); + + /** + * Columns to check. + */ + private NumberVector.Factory<?, ?>[] densecols = null; + + /** + * Distribution to generate replacement values with. + */ + private Distribution dist; + + /** + * Row cache. + */ + private ArrayList<Object> rows = new ArrayList<>(); + + /** + * Constructor. 
+   *
+   * @param dist Distribution to sample replacement values from
+   */
+  public ReplaceNaNWithRandomFilter(Distribution dist) {
+    super();
+    this.dist = dist;
+  }
+
+  @Override
+  public BundleMeta getMeta() {
+    return source.getMeta();
+  }
+
+  @Override
+  public Object data(int rnum) {
+    return rows.get(rnum);
+  }
+
+  @Override
+  public Event nextEvent() {
+    while (true) {
+      Event ev = source.nextEvent();
+      switch(ev) {
+      case END_OF_STREAM:
+        return ev;
+      case META_CHANGED:
+        updateMeta(source.getMeta());
+        return ev;
+      case NEXT_OBJECT:
+        // The column layout may not have been announced via META_CHANGED yet.
+        if (densecols == null) {
+          updateMeta(source.getMeta());
+        }
+        // Cache the (possibly repaired) row so that data() can serve it.
+        rows.clear();
+        for (int j = 0; j < densecols.length; j++) {
+          Object o = source.data(j);
+          rows.add(densecols[j] != null ? replaceNaNs(densecols[j], (NumberVector<?>) o) : o);
+        }
+        return ev;
+      }
+    }
+  }
+
+  /**
+   * Replace the NaN values of a single vector with random values.
+   *
+   * @param factory Factory of the column, used to build the replacement vector
+   * @param v Vector to check, may be {@code null}
+   * @return {@code v} itself when it is {@code null} or contains no NaN,
+   *         otherwise a new vector in which every NaN was replaced by a value
+   *         drawn from {@link #dist}
+   */
+  private Object replaceNaNs(NumberVector.Factory<?, ?> factory, NumberVector<?> v) {
+    if (v == null) {
+      return null;
+    }
+    double[] ro = null; // replacement values, allocated on the first NaN only
+    for (int d = 0; d < v.getDimensionality(); d++) {
+      if (Double.isNaN(v.doubleValue(d))) {
+        if (ro == null) {
+          // NOTE(review): assumes getColumnVector() returns a copy, so the
+          // input vector is not modified in place - confirm.
+          ro = v.getColumnVector().getArrayRef();
+        }
+        ro[d] = dist.nextRandom();
+      }
+    }
+    // Only build a new vector when something was actually replaced.
+    return (ro != null) ? factory.newNumberVector(ro) : v;
+  }
+
+  /**
+   * Process an updated meta record: remember the vector factory of every dense
+   * float or double vector column; all other columns are left as {@code null}
+   * and passed through unmodified.
+   *
+   * @param meta Meta record
+   * @throws AbortException when the data contains a sparse vector column
+   */
+  private void updateMeta(BundleMeta meta) {
+    final int cols = meta.size();
+    densecols = new NumberVector.Factory<?, ?>[cols];
+    for (int i = 0; i < cols; i++) {
+      if (TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH.isAssignableFromType(meta.get(i))) {
+        throw new AbortException("Filtering sparse vectors is not yet supported by this filter. Please contribute.");
+      }
+      if (TypeUtil.FLOAT_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
+        VectorFieldTypeInformation<?> vmeta = (VectorFieldTypeInformation<?>) meta.get(i);
+        densecols[i] = (NumberVector.Factory<?, ?>) vmeta.getFactory();
+        continue;
+      }
+      if (TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(meta.get(i))) {
+        VectorFieldTypeInformation<?> vmeta = (VectorFieldTypeInformation<?>) meta.get(i);
+        densecols[i] = (NumberVector.Factory<?, ?>) vmeta.getFactory();
+        continue;
+      }
+    }
+  }
+
+  @Override
+  public MultipleObjectsBundle filter(final MultipleObjectsBundle objects) {
+    if (LOG.isDebuggingFinest()) {
+      LOG.debugFinest("Replacing NaN values with random values.");
+    }
+
+    updateMeta(objects.meta());
+    MultipleObjectsBundle bundle = new MultipleObjectsBundle();
+    for (int j = 0; j < objects.metaLength(); j++) {
+      bundle.appendColumn(objects.meta(j), new ArrayList<>());
+    }
+    for (int i = 0; i < objects.dataLength(); i++) {
+      final Object[] row = objects.getRow(i);
+      for (int j = 0; j < densecols.length; j++) {
+        if (densecols[j] != null) {
+          row[j] = replaceNaNs(densecols[j], (NumberVector<?>) row[j]);
+        }
+      }
+      bundle.appendSimple(row);
+    }
+    return bundle;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer extends AbstractParameterizer {
+    /**
+     * Parameter to specify the distribution to sample replacement values from.
+     */
+    public static final OptionID REPLACEMENT_DISTRIBUTION = new OptionID("nanfilter.replacement", "Distribution to sample replacement values from.");
+
+    /**
+     * Distribution to generate replacement values with.
+ */ + private Distribution dist; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + ObjectParameter<Distribution> distP = new ObjectParameter<>(REPLACEMENT_DISTRIBUTION, Distribution.class); + if (config.grab(distP)) { + dist = distP.instantiateClass(config); + } + } + + @Override + protected ReplaceNaNWithRandomFilter makeInstance() { + return new ReplaceNaNWithRandomFilter(dist); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java index b8bf968b..8afa8290 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/ShuffleObjectsFilter.java @@ -76,7 +76,7 @@ public class ShuffleObjectsFilter implements ObjectFilter { if (LOG.isDebugging()) { LOG.debug("Shuffling the data set"); } - final Random random = rnd.getRandom(); + final Random random = rnd.getSingleThreadedRandom(); final int size = objects.dataLength(); final int[] offsets = new int[size]; diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java index 8146bd5b..6ac046ec 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/SplitNumberVectorFilter.java @@ -35,8 +35,7 @@ import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.ListEachConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter; @@ -67,17 +66,17 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec @Override public MultipleObjectsBundle filter(MultipleObjectsBundle objects) { - if (objects.dataLength() == 0) { + if(objects.dataLength() == 0) { return objects; } MultipleObjectsBundle bundle = new MultipleObjectsBundle(); - for (int r = 0; r < objects.metaLength(); r++) { + for(int r = 0; r < objects.metaLength(); r++) { @SuppressWarnings("unchecked") SimpleTypeInformation<Object> type = (SimpleTypeInformation<Object>) objects.meta(r); @SuppressWarnings("unchecked") final List<Object> column = (List<Object>) objects.getColumn(r); - if (!getInputTypeRestriction().isAssignableFromType(type)) { + if(!getInputTypeRestriction().isAssignableFromType(type)) { bundle.appendColumn(type, column); continue; } @@ -98,16 +97,16 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec int[] odims = new int[vtype.getDimensionality() - dims.length]; { int i = 0; - for (int d = 0; d < vtype.getDimensionality(); d++) { + for(int d = 0; d < vtype.getDimensionality(); d++) { boolean found = false; - for (int j = 0; j < dims.length; j++) { - if (dims[j] == d) { + for(int j = 0; j < dims.length; j++) { + if(dims[j] == d) { found = true; break; } } - if (!found) { - if (i >= odims.length) { + if(!found) { + if(i >= odims.length) { throw new AbortException("Dimensionalities not proper!"); } odims[i] = d; @@ -116,15 +115,15 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec } } // Splitting scan. 
- for (int i = 0; i < objects.dataLength(); i++) { + for(int i = 0; i < objects.dataLength(); i++) { @SuppressWarnings("unchecked") final V obj = (V) column.get(i); double[] part1 = new double[dims.length]; double[] part2 = new double[obj.getDimensionality() - dims.length]; - for (int d = 0; d < dims.length; d++) { + for(int d = 0; d < dims.length; d++) { part1[d] = obj.doubleValue(dims[d]); } - for (int d = 0; d < odims.length; d++) { + for(int d = 0; d < odims.length; d++) { part2[d] = obj.doubleValue(odims[d]); } col1.add(factory.newNumberVector(part1)); @@ -142,7 +141,7 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec private TypeInformation getInputTypeRestriction() { // Find maximum dimension requested int m = dims[0]; - for (int i = 1; i < dims.length; i++) { + for(int i = 1; i < dims.length; i++) { m = Math.max(dims[i], m); } return new VectorFieldTypeInformation<>(NumberVector.class, m, Integer.MAX_VALUE); @@ -170,11 +169,11 @@ public class SplitNumberVectorFilter<V extends NumberVector<?>> implements Objec protected void makeOptions(Parameterization config) { super.makeOptions(config); IntListParameter selectedAttributesP = new IntListParameter(SELECTED_ATTRIBUTES_ID); - selectedAttributesP.addConstraint(new ListEachConstraint<Integer>(new GreaterEqualConstraint(0))); - if (config.grab(selectedAttributesP)) { + selectedAttributesP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT_LIST); + if(config.grab(selectedAttributesP)) { List<Integer> dimensionList = selectedAttributesP.getValue(); dims = new int[dimensionList.size()]; - for (int i = 0; i < dimensionList.size(); i++) { + for(int i = 0; i < dimensionList.size(); i++) { dims[i] = dimensionList.get(i).intValue(); } } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java index 8fd46336..dd86cc5a 100644 --- 
a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseCDFNormalization.java @@ -123,7 +123,7 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements // We iterate over dimensions, this kind of filter needs fast random // access. - Adapter<V> adapter = new Adapter<>(); + Adapter adapter = new Adapter(); for (int d = 0; d < dim; d++) { adapter.dim = d; if (estimators.size() == 1) { @@ -208,50 +208,56 @@ public class AttributeWiseCDFNormalization<V extends NumberVector<?>> implements return result.toString(); } - private static class Adapter<V extends NumberVector<?>> implements NumberArrayAdapter<Double, List<V>> { + /** + * Array adapter class for vectors. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + private static class Adapter implements NumberArrayAdapter<Double, List<? extends NumberVector<?>>> { /** * Dimension to process. */ - int dim; @Override - public int size(List<V> array) { + public int size(List<? extends NumberVector<?>> array) { return array.size(); } @Override - public Double get(List<V> array, int off) throws IndexOutOfBoundsException { + public Double get(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException { return getDouble(array, off); } @Override - public double getDouble(List<V> array, int off) throws IndexOutOfBoundsException { + public double getDouble(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException { return array.get(off).doubleValue(dim); } @Override - public float getFloat(List<V> array, int off) throws IndexOutOfBoundsException { + public float getFloat(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException { return array.get(off).floatValue(dim); } @Override - public int getInteger(List<V> array, int off) throws IndexOutOfBoundsException { + public int getInteger(List<? 
extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException { return array.get(off).intValue(dim); } @Override - public short getShort(List<V> array, int off) throws IndexOutOfBoundsException { + public short getShort(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException { return array.get(off).shortValue(dim); } @Override - public long getLong(List<V> array, int off) throws IndexOutOfBoundsException { + public long getLong(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException { return array.get(off).longValue(dim); } @Override - public byte getByte(List<V> array, int off) throws IndexOutOfBoundsException { + public byte getByte(List<? extends NumberVector<?>> array, int off) throws IndexOutOfBoundsException { return array.get(off).byteValue(dim); } } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java index 31f72660..47b6db5f 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseMinMaxNormalization.java @@ -23,8 +23,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import java.util.ArrayList; - import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; @@ -38,8 +36,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.AllOrNoneMustBeS import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.EqualSizeGlobalConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ListParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Parameter; /** * Class to perform and undo a normalization on real vectors with respect to @@ -97,24 +93,24 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends @Override protected void prepareProcessInstance(V featureVector) { // First object? Then initialize. - if (minima.length == 0 || maxima.length == 0) { + if(minima.length == 0 || maxima.length == 0) { int dimensionality = featureVector.getDimensionality(); minima = new double[dimensionality]; maxima = new double[dimensionality]; - for (int i = 0; i < dimensionality; i++) { + for(int i = 0; i < dimensionality; i++) { maxima[i] = -Double.MAX_VALUE; minima[i] = Double.MAX_VALUE; } } - if (minima.length != featureVector.getDimensionality()) { + if(minima.length != featureVector.getDimensionality()) { throw new IllegalArgumentException("FeatureVectors differ in length."); } - for (int d = 0; d < featureVector.getDimensionality(); d++) { + for(int d = 0; d < featureVector.getDimensionality(); d++) { final double val = featureVector.doubleValue(d); - if (val > maxima[d]) { + if(val > maxima[d]) { maxima[d] = val; } - if (val < minima[d]) { + if(val < minima[d]) { minima[d] = val; } } @@ -123,10 +119,10 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends @Override protected V 
filterSingleObject(V featureVector) { double[] values = new double[featureVector.getDimensionality()]; - if (minima.length != featureVector.getDimensionality()) { + if(minima.length != featureVector.getDimensionality()) { throw new IllegalArgumentException("FeatureVectors and given Minima/Maxima differ in length."); } - for (int d = 0; d < featureVector.getDimensionality(); d++) { + for(int d = 0; d < featureVector.getDimensionality(); d++) { values[d] = (featureVector.doubleValue(d) - minima[d]) / factor(d); } return factory.newNumberVector(values); @@ -134,13 +130,14 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends @Override public V restore(V featureVector) throws NonNumericFeaturesException { - if (featureVector.getDimensionality() == maxima.length && featureVector.getDimensionality() == minima.length) { + if(featureVector.getDimensionality() == maxima.length && featureVector.getDimensionality() == minima.length) { double[] values = new double[featureVector.getDimensionality()]; - for (int d = 0; d < featureVector.getDimensionality(); d++) { + for(int d = 0; d < featureVector.getDimensionality(); d++) { values[d] = (featureVector.doubleValue(d) * (factor(d)) + minima[d]); } return factory.newNumberVector(values); - } else { + } + else { throw new NonNumericFeaturesException("Attributes cannot be resized: current dimensionality: " + featureVector.getDimensionality() + " former dimensionality: " + maxima.length); } } @@ -166,10 +163,10 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends int[] row = linearEquationSystem.getRowPermutations(); int[] col = linearEquationSystem.getColumnPermutations(); - for (int i = 0; i < coeff.length; i++) { - for (int r = 0; r < coeff.length; r++) { + for(int i = 0; i < coeff.length; i++) { + for(int r = 0; r < coeff.length; r++) { double sum = 0.0; - for (int c = 0; c < coeff[0].length; c++) { + for(int c = 0; c < coeff[0].length; c++) { sum += minima[c] * 
coeff[row[r]][col[c]] / factor(c); coeff[row[r]][col[c]] = coeff[row[r]][col[c]] / factor(c); } @@ -224,23 +221,16 @@ public class AttributeWiseMinMaxNormalization<V extends NumberVector<?>> extends protected void makeOptions(Parameterization config) { super.makeOptions(config); DoubleListParameter minimaP = new DoubleListParameter(MINIMA_ID, true); - if (config.grab(minimaP)) { + if(config.grab(minimaP)) { minima = ArrayLikeUtil.toPrimitiveDoubleArray(minimaP.getValue()); } DoubleListParameter maximaP = new DoubleListParameter(MAXIMA_ID, true); - if (config.grab(maximaP)) { + if(config.grab(maximaP)) { maxima = ArrayLikeUtil.toPrimitiveDoubleArray(maximaP.getValue()); } - ArrayList<Parameter<?>> global_1 = new ArrayList<>(); - global_1.add(minimaP); - global_1.add(maximaP); - config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(global_1)); - - ArrayList<ListParameter<?>> global = new ArrayList<>(); - global.add(minimaP); - global.add(maximaP); - config.checkConstraint(new EqualSizeGlobalConstraint(global)); + config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(minimaP, maximaP)); + config.checkConstraint(new EqualSizeGlobalConstraint(minimaP, maximaP)); } @Override diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java index 072d1a68..a24cae25 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/AttributeWiseVarianceNormalization.java @@ -23,8 +23,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization; along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -import java.util.ArrayList; - import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; @@ -40,8 +38,6 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.AllOrNoneMustBeS import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.EqualSizeGlobalConstraint; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleListParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ListParameter; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Parameter; /** * Class to perform and undo a normalization on real vectors with respect to @@ -186,6 +182,7 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten /** * Restore a single dimension. + * * @param d Dimension * @param val Value * @return Normalized value @@ -280,15 +277,8 @@ public class AttributeWiseVarianceNormalization<V extends NumberVector<?>> exten } } - ArrayList<Parameter<?>> global_1 = new ArrayList<>(); - global_1.add(meanP); - global_1.add(stddevP); - config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(global_1)); - - ArrayList<ListParameter<?>> global = new ArrayList<>(); - global.add(meanP); - global.add(stddevP); - config.checkConstraint(new EqualSizeGlobalConstraint(global)); + config.checkConstraint(new AllOrNoneMustBeSetGlobalConstraint(meanP, stddevP)); + config.checkConstraint(new EqualSizeGlobalConstraint(meanP, stddevP)); } @Override diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java index 94bcb32f..21263890 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java +++ 
b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/InverseDocumentFrequencyNormalization.java @@ -26,9 +26,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization; import gnu.trove.iterator.TIntDoubleIterator; import gnu.trove.map.TIntDoubleMap; import gnu.trove.map.hash.TIntDoubleHashMap; - -import java.util.BitSet; - import de.lmu.ifi.dbs.elki.data.SparseNumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; @@ -78,10 +75,10 @@ public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector< @Override protected void prepareProcessInstance(V featureVector) { - BitSet b = featureVector.getNotNullMask(); - for(int i = b.nextSetBit(0); i >= 0; i = b.nextSetBit(i + 1)) { - if(featureVector.doubleValue(i) >= 0.0) { - idf.put(i, idf.get(i) + 1); + for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) { + if(featureVector.iterDoubleValue(it) >= 0.) { + final int dim = featureVector.iterDim(it); + idf.put(dim, idf.get(dim) + 1); } } objcnt += 1; @@ -100,20 +97,20 @@ public class InverseDocumentFrequencyNormalization<V extends SparseNumberVector< @Override protected V filterSingleObject(V featureVector) { - BitSet b = featureVector.getNotNullMask(); TIntDoubleHashMap vals = new TIntDoubleHashMap(); - for(int i = b.nextSetBit(0); i >= 0; i = b.nextSetBit(i + 1)) { - vals.put(i, (float) (featureVector.doubleValue(i) * idf.get(i))); + for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) { + final int dim = featureVector.iterDim(it); + vals.put(dim, featureVector.iterDoubleValue(it) * idf.get(dim)); } return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality()); } @Override public V restore(V featureVector) { - BitSet b = featureVector.getNotNullMask(); TIntDoubleHashMap vals = new TIntDoubleHashMap(); - for(int i = b.nextSetBit(0); i >= 0; i 
= b.nextSetBit(i + 1)) { - vals.put(i, (float) (featureVector.doubleValue(i) / idf.get(i))); + for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) { + final int dim = featureVector.iterDim(it); + vals.put(dim, featureVector.iterDoubleValue(it) / idf.get(dim)); } return ((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality()); } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java index 5110d6fe..09b73aa4 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/normalization/TFIDFNormalization.java @@ -24,9 +24,6 @@ package de.lmu.ifi.dbs.elki.datasource.filter.normalization; */ import gnu.trove.map.hash.TIntDoubleHashMap; - -import java.util.BitSet; - import de.lmu.ifi.dbs.elki.data.SparseNumberVector; import de.lmu.ifi.dbs.elki.logging.Logging; @@ -58,17 +55,17 @@ public class TFIDFNormalization<V extends SparseNumberVector<?>> extends Inverse @Override protected V filterSingleObject(V featureVector) { - BitSet b = featureVector.getNotNullMask(); double sum = 0.0; - for(int i = b.nextSetBit(0); i >= 0; i = b.nextSetBit(i + 1)) { - sum += featureVector.doubleValue(i); + for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) { + sum += featureVector.iterDoubleValue(it); } if(sum <= 0) { sum = 1.0; } TIntDoubleHashMap vals = new TIntDoubleHashMap(); - for(int i = b.nextSetBit(0); i >= 0; i = b.nextSetBit(i + 1)) { - vals.put(i, (float) (featureVector.doubleValue(i) / sum * idf.get(i))); + for(int it = featureVector.iter(); featureVector.iterValid(it); it = featureVector.iterAdvance(it)) { + final int dim = featureVector.iterDim(it); + vals.put(dim, featureVector.iterDoubleValue(it) / sum * idf.get(dim)); } return 
((SparseNumberVector.Factory<V, ?>) factory).newNumberVector(vals, featureVector.getDimensionality()); } diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java index 742eb977..462db9eb 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/AbstractSupervisedProjectionVectorFilter.java @@ -46,7 +46,7 @@ import de.lmu.ifi.dbs.elki.math.linearalgebra.Vector; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
@@ -62,7 +62,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; */
public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberVector<?>> implements ObjectFilter {
/**
- * r: the dimension to which the data should be reduced
+ * The dimensionality to which the data should be reduced.
*/
protected int tdim;
@@ -79,23 +79,23 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV @Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
final int dataLength = objects.dataLength();
- if (dataLength == 0) {
+ if(dataLength == 0) {
return objects;
}
List<? extends ClassLabel> classcolumn = null;
// First of all, identify a class label column.
- for (int r = 0; r < objects.metaLength(); r++) {
+ for(int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = objects.meta(r);
List<?> column = objects.getColumn(r);
- if (TypeUtil.CLASSLABEL.isAssignableFromType(type)) {
+ if(TypeUtil.CLASSLABEL.isAssignableFromType(type)) {
@SuppressWarnings("unchecked")
final List<? extends ClassLabel> castcolumn = (List<? extends ClassLabel>) column;
classcolumn = castcolumn;
break;
}
}
- if (classcolumn == null) {
+ if(classcolumn == null) {
getLogger().warning("No class label column found (try " + ClassLabelFilter.class.getSimpleName() + ") -- cannot run " + this.getClass().getSimpleName());
return objects;
}
@@ -103,10 +103,10 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV boolean somesuccess = false;
MultipleObjectsBundle bundle = new MultipleObjectsBundle();
// Secondly, look for columns to train the projection on.
- for (int r = 0; r < objects.metaLength(); r++) {
+ for(int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = objects.meta(r);
List<?> column = objects.getColumn(r);
- if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
+ if(!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
bundle.appendColumn(type, column);
continue;
}
@@ -117,8 +117,8 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV NumberVector.Factory<V, ?> factory = (NumberVector.Factory<V, ?>) vtype.getFactory();
int dim = vtype.getDimensionality();
- if (tdim > dim) {
- if (getLogger().isVerbose()) {
+ if(tdim > dim) {
+ if(getLogger().isVerbose()) {
getLogger().verbose("Setting projection dimension to original dimension: projection dimension: " + tdim + " larger than original dimension: " + dim);
}
tdim = dim;
@@ -126,21 +126,22 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV try {
Matrix proj = computeProjectionMatrix(vectorcolumn, classcolumn, dim);
- for (int i = 0; i < dataLength; i++) {
+ for(int i = 0; i < dataLength; i++) {
final Vector pv = proj.times(vectorcolumn.get(i).getColumnVector());
V filteredObj = factory.newNumberVector(pv, ArrayLikeUtil.VECTORADAPTER);
vectorcolumn.set(i, filteredObj);
}
bundle.appendColumn(convertedType(type, factory), column);
somesuccess = true;
- } catch (Exception e) {
+ }
+ catch(Exception e) {
getLogger().error("Projection failed -- continuing with unprojected data!", e);
bundle.appendColumn(type, column);
continue;
}
}
- if (!somesuccess) {
+ if(!somesuccess) {
getLogger().warning("No vector field of fixed dimensionality found.");
return objects;
}
@@ -179,15 +180,15 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV * Partition the bundle based on the class label.
*
* @param classcolumn
- * @return
+ * @return Partitioned data set.
*/
protected <O> Map<O, TIntList> partition(List<? extends O> classcolumn) {
Map<O, TIntList> classes = new HashMap<>();
Iterator<? extends O> iter = classcolumn.iterator();
- for (int i = 0; iter.hasNext(); i++) {
+ for(int i = 0; iter.hasNext(); i++) {
O lbl = iter.next();
TIntList ids = classes.get(lbl);
- if (ids == null) {
+ if(ids == null) {
ids = new TIntArrayList();
classes.put(lbl, ids);
}
@@ -220,9 +221,9 @@ public abstract class AbstractSupervisedProjectionVectorFilter<V extends NumberV protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter dimP = new IntParameter(P_ID, 2);
- dimP.addConstraint(new GreaterConstraint(0));
+ dimP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
- if (config.grab(dimP)) {
+ if(config.grab(dimP)) {
tdim = dimP.getValue();
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java index 720c88df..e6d0d15d 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorFeatureSelectionFilter.java @@ -35,8 +35,7 @@ import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.datasource.filter.AbstractVectorStreamConversionFilter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.ListEachConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntListParameter;
@@ -150,11 +149,11 @@ public class NumberVectorFeatureSelectionFilter<V extends NumberVector<?>> exten protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntListParameter selectedAttributesP = new IntListParameter(SELECTED_ATTRIBUTES_ID);
- selectedAttributesP.addConstraint(new ListEachConstraint<Integer>(new GreaterEqualConstraint(0)));
- if (config.grab(selectedAttributesP)) {
+ selectedAttributesP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT_LIST);
+ if(config.grab(selectedAttributesP)) {
selectedAttributes = new BitSet();
List<Integer> dimensionList = selectedAttributesP.getValue();
- for (int d : dimensionList) {
+ for(int d : dimensionList) {
selectedAttributes.set(d);
}
}
diff --git a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java index 9b1ddbff..4086270c 100644 --- a/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java +++ b/src/de/lmu/ifi/dbs/elki/datasource/filter/transform/NumberVectorRandomFeatureSelectionFilter.java @@ -36,7 +36,7 @@ import de.lmu.ifi.dbs.elki.utilities.RandomFactory; import de.lmu.ifi.dbs.elki.utilities.Util;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
-import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
@@ -106,7 +106,7 @@ public class NumberVectorRandomFeatureSelectionFilter<V extends NumberVector<?>> */
void initializeRandomAttributes(SimpleTypeInformation<V> in) {
int d = ((VectorFieldTypeInformation<V>) in).getDimensionality();
- selectedAttributes = Util.randomBitSet(k, d, rnd.getRandom());
+ selectedAttributes = Util.randomBitSet(k, d, rnd.getSingleThreadedRandom());
}
/**
@@ -156,12 +156,12 @@ public class NumberVectorRandomFeatureSelectionFilter<V extends NumberVector<?>> protected void makeOptions(Parameterization config) {
super.makeOptions(config);
IntParameter kP = new IntParameter(NUMBER_SELECTED_ATTRIBUTES_ID, 1);
- kP.addConstraint(new GreaterEqualConstraint(1));
- if (config.grab(kP)) {
+ kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
k = kP.getValue().intValue();
}
RandomParameter rndP = new RandomParameter(SEED_ID);
- if (config.grab(rndP)) {
+ if(config.grab(rndP)) {
rnd = rndP.getValue();
}
}
|