From 3570efdfeff7fdcb069c6da96379e7466f584efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Villemot?= Date: Fri, 10 Dec 2021 11:48:12 -0300 Subject: Import octave-statistics_1.4.3-1.debian.tar.xz [dgit import tarball octave-statistics 1.4.3-1 octave-statistics_1.4.3-1.debian.tar.xz] --- changelog | 403 ++++++++++++++++++++++++++++++++++++ clean | 8 + control | 27 +++ copyright | 131 ++++++++++++ gbp.conf | 4 + octave-statistics.lintian-overrides | 3 + patches/data-files-for-tests.diff | 22 ++ patches/series | 2 + patches/t_test-skip.patch | 34 +++ rules | 14 ++ source/format | 1 + upstream/metadata | 5 + watch | 3 + 13 files changed, 657 insertions(+) create mode 100644 changelog create mode 100644 clean create mode 100644 control create mode 100644 copyright create mode 100644 gbp.conf create mode 100644 octave-statistics.lintian-overrides create mode 100644 patches/data-files-for-tests.diff create mode 100644 patches/series create mode 100644 patches/t_test-skip.patch create mode 100755 rules create mode 100644 source/format create mode 100644 upstream/metadata create mode 100644 watch diff --git a/changelog b/changelog new file mode 100644 index 0000000..38fc406 --- /dev/null +++ b/changelog @@ -0,0 +1,403 @@ +octave-statistics (1.4.3-1) unstable; urgency=medium + + [ Jenkins ] + * Remove constraints unnecessary since stretch + + [ Rafael Laboissière ] + * d/rules: Use execute_before_dh_installdeb instead of override_dh_installdeb + * d/octave-statistics.lintian-overrides: New file + * d/copyright: Accentuate my family name + * d/control: Bump Standards-Version to 4.6.0 (no changes needed) + + [ Sébastien Villemot ] + * New upstream version 1.4.3 + * d/copyright: reflect upstream changes + * expose-tbl-delim-tests.patch: drop patch, applied upstream + + -- Sébastien Villemot Fri, 10 Dec 2021 15:48:12 +0100 + +octave-statistics (1.4.2-2) unstable; urgency=medium + + * d/control: Bump debhelper compatibility level to 13 + * d/u/metadata: Drop Name and Contact fields + * d/p/expose-tbl-delim-tests.patch: Add URL to Forwarded field + * d/p/t_test-skip.patch: Set Forwarded: not-needed + + -- Rafael Laboissière Wed, 29 Jul 2020 18:30:24 -0300 + +octave-statistics (1.4.2-1) unstable; urgency=medium + + * New upstream version 1.4.2 + * d/copyright: Reflect upstream changes + * d/u/metadata: New file + * d/control: Bump Standards-Version to 4.5.0 (no changes needed) + + -- Rafael Laboissière Wed, 01 Apr 2020 08:49:50 -0300 + +octave-statistics (1.4.1-3) unstable; urgency=medium + + * d/control: + + Bump Standards-Version to 4.4.1 (no changes needed) + + Bump dependency on dh-octave to >= 0.7.1 + This allows the injection of the virtual package octave-abi-N + into the package's list of dependencies. + + -- Rafael Laboissiere Sat, 09 Nov 2019 03:40:37 -0300 + +octave-statistics (1.4.1-2) unstable; urgency=medium + + * Upload to unstable + + [ Rafael Laboissiere ] + * d/copyright: Fix license name for the AppStream file + * d/rules: Use override_dh_installdeb instead of _auto_install + + [ Sébastien Villemot ] + * Bump S-V to 4.4.0 + + -- Sébastien Villemot Mon, 08 Jul 2019 15:25:03 +0200 + +octave-statistics (1.4.1-1) experimental; urgency=medium + + * New upstream version 1.4.1 + * d/p/ttest-tolerance.patch: Drop patch (applied upstream) + * d/p/t_test-skip.patch: Refresh for new upstream version + * d/copyright: Reflect upstream changes + + -- Rafael Laboissiere Sat, 13 Apr 2019 05:21:17 -0300 + +octave-statistics (1.4.0-5) unstable; urgency=medium + + * d/control: + + Bump Standards-Version to 4.3.0 (no changes needed) + + Bump to debhelper compat level 12 + * Build-depend on debhelper-compat instead of using d/compat + + -- Rafael Laboissiere Wed, 02 Jan 2019 22:57:51 -0200 + +octave-statistics (1.4.0-4) unstable; urgency=medium + + * d/control: Bump Standards-Version to 4.2.1 (no changes needed) + * d/p/t_test-increase-tol.patch: Drop patch + * d/p/t_test-skip.patch: New patch + + -- Rafael Laboissiere Wed, 12 Sep 2018 04:18:45 -0300 + +octave-statistics (1.4.0-3) unstable; urgency=medium + + [ Sébastien Villemot ] + * Revert "d/rules: Use override_dh_installdeb for post-install actions" + + [ Rafael Laboissiere ] + * d/control: + + Add Rules-Requires-Root: no + + Bump Standards-Version to 4.2.0 + * d/p/ttest-tolerance.patch: New patch. + Allow a tolerance so ttest can pass on i386 + Thanks to Graham Inggs (Closes: #906820) + * d/p/t_test-increase-tol.patch: New patch + + -- Rafael Laboissiere Wed, 22 Aug 2018 17:15:38 -0300 + +octave-statistics (1.4.0-2) unstable; urgency=medium + + * d/control: Build depends on octave >= 4.4 + + -- Rafael Laboissiere Mon, 11 Jun 2018 15:20:15 -0300 + +octave-statistics (1.4.0-1) unstable; urgency=medium + + [ Mike Miller ] + * d/control, d/copyright: Use secure URL for upstream source. + + [ Rafael Laboissiere ] + * d/rules: Use override_dh_installdeb for post-install actions + * d/control: Bump Standards-Version to 4.1.4 (no changes needed) + + [ Sébastien Villemot ] + * New upstream version 1.4.0 + * d/copyright: reflect upstream changes + * Remove patches applied upstream + - pval-in-ttest-unit-test.patch + - tsquare-in-princomp-unit-tests.patch + - xtests-in-grp2idx.patch + * Add some generated files to debian/clean + + -- Sébastien Villemot Sun, 20 May 2018 13:01:46 +0200 + +octave-statistics (1.3.0-4) unstable; urgency=medium + + * Use dh-octave for building the package + * d/control: + + Use Debian's GitLab URLs in Vcs-* headers + + Change Maintainer to team+pkg-octave-team@tracker.debian.org + + -- Rafael Laboissiere Sat, 10 Feb 2018 07:38:51 -0200 + +octave-statistics (1.3.0-3) unstable; urgency=medium + + * Use the dh-based version of octave-pkg-dev + * Set debhelper compatibility level to >= 11 + * d/control: + + Bump Standards-Version to 4.1.3 (no changes needed) + + Add Testsuite field + + -- Rafael Laboissiere Fri, 29 Dec 2017 22:14:48 -0200 + +octave-statistics (1.3.0-2) unstable; urgency=medium + + [ Sébastien Villemot ] + * d/copyright: use secure URL for format. + * d/watch: bump to format version 4. + + [ Rafael Laboissiere ] + * d/control: + + Use cgit instead of gitweb in Vcs-Browser URL + + Build-conflict with octave-nan (colision with function 'mad') + + Bump Standards-Version to 4.1.0 (no changes needed) + + Build-depends on octave-common > 4.2.1-3. This is necessary + because this version of the octave-common package + contains the function corrcoef, necessary for the unit tests. + * New patches (avoid FTBFS with octave-pkg-dev 1.5.0, closes: #874141): + + d/p/pval-in-ttest-unit-test.patch + + d/p/xtests-in-grp2idx.patch + + d/p/tsquare-in-princomp-unit-tests.patch + + -- Rafael Laboissiere Mon, 11 Sep 2017 01:03:46 -0300 + +octave-statistics (1.3.0-1) unstable; urgency=medium + + [ Sébastien Villemot ] + * New upstream version 1.3.0 + * Bump debhelper compat level to 10. + * d/copyright: reflect upstream changes. + * Drop patches applied upstream. + + d/p/combnk-implicit-conversion-warn.patch + + d/p/expected-linkage-warning.patch + + d/p/small-k-in-gev-functions.patch + + d/p/use-corr-instead-of-cor.patch + + d/p/use-corr-instead-of-corrcoef.patch + * d/rules: add workaround for wrong perms in upstream tarball. + + [ Rafael Laboissiere ] + * d/control: Use secure URIs in the Vcs-* fields + * Bump Standards-Version to 3.9.8 (no changes needed) + + -- Sébastien Villemot Thu, 27 Oct 2016 14:32:52 +0200 + +octave-statistics (1.2.4-1) unstable; urgency=medium + + [ Rafael Laboissiere ] + * Imported Upstream version 1.2.4 + * d/copyright: Reflect upstream changes + * Bump Standards-Version to 3.9.6 (no changes needed) + * Bump build-dependency on octave-pkg-dev, for proper unit testing + * Bump Build-Depends on octave-io to >> 2.2.4-1 + * d/p/autoload-yes.patch: Remove patch (deprecated upstream) + * d/p/use-corr-instead-of-cor.patch: New patch + * d/p/small-k-in-gev-functions.patch: New patch + * d/p/use-corr-instead-of-corrcoef.patch: New patch + * d/p/expected-linkage-warning.patch: Add patch + * d/p/combnk-implicit-conversion-warn.patch: Add patch + * Add myself to the Uploaders list + + [ Sébastien Villemot ] + * Remove Thomas Weber from Uploaders. + + -- Rafael Laboissiere Sun, 13 Sep 2015 04:00:51 -0300 + +octave-statistics (1.2.3-1) unstable; urgency=medium + + [ Sébastien Villemot ] + * Imported Upstream version 1.2.3 + * debian/copyright: reflect upstream changes. + * princomp-one-arg.patch: new patch, fixes princomp with only one arg. + (Closes: #731992) + + [ Rafael Laboissiere ] + * Bump to Standards-Version 3.9.5, no changes needed + + -- Sébastien Villemot Sun, 02 Feb 2014 10:09:28 +0100 + +octave-statistics (1.2.2-1) unstable; urgency=low + + * Imported Upstream version 1.2.2 + + -- Sébastien Villemot Fri, 16 Aug 2013 23:37:27 +0200 + +octave-statistics (1.2.1-1) unstable; urgency=low + + [ Sébastien Villemot ] + * Imported Upstream version 1.2.1 + * debian/copyright: reflect upstream changes + + [ Thomas Weber ] + * debian/control: Use canonical URLs in Vcs-* fields + + -- Sébastien Villemot Sat, 27 Jul 2013 08:28:41 +0200 + +octave-statistics (1.2.0-2) unstable; urgency=low + + * expose-tbl-delim-tests.patch: new patch, makes sure tbl_delim.m is tested + (Closes: #672756) + + -- Sébastien Villemot Sat, 18 May 2013 15:09:00 +0200 + +octave-statistics (1.2.0-1) experimental; urgency=low + + [ Rafael Laboissiere ] + * Imported Upstream version 1.2.0 + * Bump Standards-Version to 3.9.4 (no changes needed) + * Use Sébastien Villemot's @debian.org email address + * Remove obsolete DM-Upload-Allowed flag + * debian/copyright: + + Reflect upstream changes + + Use the octave-maintainers mailing list as upstream contact + + [ Sébastien Villemot ] + * debian/patches/autoload-yes.patch: new patch + + -- Sébastien Villemot Wed, 16 Jan 2013 10:19:41 +0100 + +octave-statistics (1.1.3-1) unstable; urgency=low + + * Imported Upstream version 1.1.3 + + -- Sébastien Villemot Sun, 13 May 2012 14:50:15 +0200 + +octave-statistics (1.1.2-1) unstable; urgency=low + + * Imported Upstream version 1.1.2 + * debian/control: no longer Build-Depends on octave-miscellaneous + * add-dependency-on-io.patch: remove patch, applied upstream + * debian/copyright: reflect upstream changes + + -- Sébastien Villemot Wed, 02 May 2012 22:00:26 +0200 + +octave-statistics (1.1.1-1) unstable; urgency=low + + * Imported Upstream version 1.1.1 + * Remove patches applied upstream: + + normalise_distribution-index-scalar.patch + + remove-zscore.patch + + combnk-cells.patch + + linkage-clear-lastwarn.patch + * debian/copyright: reflect upstream changes + * debian/watch: use SourceForge redirector + * add-dependency-on-io.patch: new patch, adds dependency on io + * debian/control: Build-Depends on octave-io >= 1.0.18 + + -- Sébastien Villemot Tue, 17 Apr 2012 15:14:12 +0200 + +octave-statistics (1.1.0-1) unstable; urgency=low + + [ Sébastien Villemot ] + * Imported Upstream version 1.1.0 + * Bump to debhelper compat level 9 + * debian/control: + + Build-depend on octave-pkg-dev >= 1.0.1, to compile against Octave 3.6 + + Remove Depends and Build-Depends-Indep on octave-gsl + (expm1 is in Octave core since version 3.2) + + Add Sébastien Villemot to Uploaders + + Simplify long description (Octave-Forge was mentionned twice) + + Remove shlibs:Depends, makes no sense on arch:all package + + Bump to Standards-Version 3.9.3, no changes needed + * debian/copyright: upgrade to machine-readable format 1.0 + * debian/source.lintian-overrides: remove file + (tag build-depends-without-arch-dep no longer exists) + * debian/patches/remove-zscore.patch: new patch taken from upstream + * debian/patches/normalise_distribution-index-scalar.patch: new patch + * debian/patches/combnk-cells.patch: new patch + * debian/patches/linkage-clear-lastwarn.patch: new patch + + -- Thomas Weber Wed, 14 Mar 2012 22:30:15 +0100 + +octave-statistics (1.0.10-1) unstable; urgency=low + + * New upstream release + * debian/control: + - Remove Rafael Laboissiere from Uploaders (Closes: #571910) + - Remove Ólafur Jens Sigurðsson from Uploaders + * Bump Standards-Version to 3.8.4 (no changes needed) + * Switch to dpkg-source 3.0 (quilt) format + * Dropped patches (applied upstream): + - fix_bug_with_sprintf + - remove_outdated_dmult + + -- Thomas Weber Sun, 16 May 2010 17:48:37 +0200 + +octave-statistics (1.0.9-1) unstable; urgency=low + + [ Rafael Laboissiere ] + * debian/patches/fix-pdist-test.diff: Add patch for making the tests of + function pdist succeed + * debian/control: Build-depend on octave-pkg-dev >= 0.7.0, such that the + package is built against octave3.2 + + [ Thomas Weber ] + * New upstream release + * Drop patch fix-pdist-test.diff, applied upstream + * New patches: + + remove_outdated_dmult: Replace the call to the outdated dmult(). This + fixes a test failure. Taken from upstream, SVN rev 6725 + + fix_bug_with_sprintf: Correct usage of sprintf, fixing a test failure. + Taken from upstream, SVN rev 6726. + + -- Thomas Weber Wed, 13 Jan 2010 01:00:18 +0100 + +octave-statistics (1.0.8-1) unstable; urgency=low + + * New upstream version + * debian/patches/data-files-for-tests.diff: New patch to add the + missing *.dat files, needed for the tests of functions caseread and + tblread. + * debian/clean: Add file for removing the *write*.dat files created when + building the package + * debian/compat: Bump the compatibility level of debhelper to 7, + otherwise debian/clean does not work + * debian/rules: Include patchsys-quilt.mk + * debian/control: + + (Build-Depends): + - Add quilt + - Bump the dependency on debhelper to >= 7 + + (Standards-Version): Bump to 3.8.1 (no changes needed) + + (Depends): Add ${misc:Depends} and octave-gsl (for expm1) + + (Vcs-Git, Vcs-Browser): Adjust to new Git repository + + (Build-Depends-Indep): Add octave-gsl (for expm1, needed in testing + copulapdf) + * debian/source.lintian-overrides: Add file for overriding Lintian + warnings build-depends-without-arch-dep + * debian/copyright: Use DEP5 URL in Format-Specification + * debian/README.source: Add file explaining the quilt patch system, as + required by the Policy + + -- Rafael Laboissiere Sun, 24 May 2009 20:23:57 +0200 + +octave-statistics (1.0.7-1) unstable; urgency=low + + [ Ólafur Jens Sigurðsson ] + * debian/control: Bumped Standards-Version to 3.8.0 (no changes + needed) + + [ Rafael Laboissiere ] + * debian/copyright: Add header + * debian/control: Bump build-dependency on octave-pkg-dev to >= 0.6.4, + such that the package is built with the versioned packages directory + + [ Thomas Weber ] + * New upstream release + + -- Thomas Weber Mon, 06 Apr 2009 23:51:36 +0200 + +octave-statistics (1.0.6-1) unstable; urgency=low + + [ Ólafur Jens Sigurðsson ] + * New upstream version + + -- Thomas Weber Thu, 15 May 2008 12:43:43 +0200 + +octave-statistics (1.0.5-1) unstable; urgency=low + + * Initial release (closes: #468526) + + -- Ólafur Jens Sigurðsson Fri, 16 Nov 2007 00:35:04 +0100 diff --git a/clean b/clean new file mode 100644 index 0000000..2ffff67 --- /dev/null +++ b/clean @@ -0,0 +1,8 @@ +*write*.dat +inst/test/caseread.dat +inst/test/tblread-space.dat +inst/test/tblread-tab.dat +INDEX +src/Makefile +src/config.log +src/config.status diff --git a/control b/control new file mode 100644 index 0000000..13890a1 --- /dev/null +++ b/control @@ -0,0 +1,27 @@ +Source: octave-statistics +Section: math +Priority: optional +Maintainer: Debian Octave Group +Uploaders: Sébastien Villemot , + Rafael Laboissière +Build-Depends: debhelper-compat (= 13), + dh-octave (>= 0.7.1), + octave, + octave-io +Build-Conflicts: octave-nan +Standards-Version: 4.6.0 +Homepage: https://octave.sourceforge.io/statistics/ +Vcs-Git: https://salsa.debian.org/pkg-octave-team/octave-statistics.git +Vcs-Browser: https://salsa.debian.org/pkg-octave-team/octave-statistics +Testsuite: autopkgtest-pkg-octave +Rules-Requires-Root: no + +Package: octave-statistics +Architecture: all +Depends: ${misc:Depends}, ${octave:Depends} +Description: additional statistical functions for Octave + This package provides additional statistical functions for Octave, including + mean and variance for several distributions (geometric, hypergeometric, + exponential, lognormal and others). + . + This Octave add-on package is part of the Octave-Forge project. diff --git a/copyright b/copyright new file mode 100644 index 0000000..13c883b --- /dev/null +++ b/copyright @@ -0,0 +1,131 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: Statistics package for Octave +Upstream-Contact: The Octave Community +Source: https://octave.sourceforge.io/statistics/ + +Files: * +Copyright: 2006-2008, 2012 Arno Onken + 2001, 2005-2017 Paul Kienzle + 2008 Bill Denney + 2007, 2009-2017 Søren Hauberg + 2002-2003 Alberto Terruzzi + 2011 Alexander Klein + 2008 Francesco Potortì + 2006 Frederick (Rick) A Niles + 2006 Alberto Pose + 2011 Pascal Dupuis + 2012, 2014, 2016-2017 Juan Pablo Carbajal + 2003 Alois Schloegl + 2003-2005 Andy Adler + 2011-2015 Carnë Draug + 2011-2021 Nir Krakauer + 2012 Daniel Ward + 2003 Iain Murray + 2011 Kyle Winfree + 2009 Levente Torok + 2005-2006 William Poetra Yoga Hadisoeseno + 2008 Sylvain Pelissier + 2012 Iñigo Urteaga + 2012-2019 Fernando Damian Nieuwveldt + 2014 JD Walsh + 2014 Maria L. Rizzo + 2014 Gabor J. Szekely + 2014-2019 Piotr Dollar + 2014 Tony Richardson + 2016 Dag Lyberg + 1995-2019 Kurt Hornik + 2016 Andreas Stahel + 2016 Pascal Dupuis + 2015-2017 Lachlan Andrew + 2018-2020 John Donoghue + 2009-2010 VZLU Prague + 1996-2017 John W. Eaton + 2016-2017 Guillaume Flandin + 2017, 2021 Nicholas R. Jankowski + 2013-2017 Julien Bect + 2012, 2017-2018 Rik Wehbring + 2008-2017 Jaroslav Hajek + 2007-2017 David Bateman + 2008-2017 Ben Abbott + 1995-2017 Friedrich Leisch + 2010 Christos Dimitrakakis + 2014 Mike Giles + 2018 Olaf Till + 2017-2018 Oliver Heimlich + 2019 Anthony Morast + 2020-2021 Stefano Guidoni + 2020-2021 Andreas Bertsatos + 2020 Philip Nienhuis (prnienhuis@users.sf.net) + 1993-2021 The Octave Project Developers + 2018 gold holk + 2016 Nan Zhou + 2014 by Mikael Kurula + 2014 Björn Vennberg + 2015 Michael Leitner +License: GPL-3+ + +Files: Makefile + octave-statistics.metainfo.xml +Copyright: 2016 Carnë Draug +License: FSFAP + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. This file is offered as-is, + without any warranty. + +Files: src/install-sh +Copyright: 1994 X Consortium +License: Expat + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal in the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + . + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- + TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + . + Except as contained in this notice, the name of the X Consortium shall not + be used in advertising or otherwise to promote the sale, use or other deal- + ings in this Software without prior written authorization from the X Consor- + tium. + +Files: inst/anderson_darling_cdf.m inst/anderson_darling_test.m inst/ff2n.m + inst/fullfact.m inst/gamfit.m inst/gamlike.m inst/mvnpdf.m inst/normplot.m +Copyright: public-domain +License: public-domain + This code is granted to the public domain. + +Files: debian/* +Copyright: 2007-2008 Ólafur Jens Sigurðsson + 2008-2009, 2013-2020 Rafael Laboissière + 2008-2012 Thomas Weber + 2011-2021 Sébastien Villemot +License: GPL-3+ + +License: GPL-3+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + . + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License + along with this program; if not, see . + . + On Debian systems, the complete text of the GNU General Public + License, version 3, can be found in the file + `/usr/share/common-licenses/GPL-3'. + \ No newline at end of file diff --git a/gbp.conf b/gbp.conf new file mode 100644 index 0000000..6b65fe0 --- /dev/null +++ b/gbp.conf @@ -0,0 +1,4 @@ +[DEFAULT] +debian-branch = debian/latest +upstream-branch = upstream/latest +pristine-tar = True diff --git a/octave-statistics.lintian-overrides b/octave-statistics.lintian-overrides new file mode 100644 index 0000000..aae03e1 --- /dev/null +++ b/octave-statistics.lintian-overrides @@ -0,0 +1,3 @@ +# File fisheriris.txt is not a documentation file, buyt contains data +# for used in the unit tests for some functions. +package-contains-documentation-outside-usr-share-doc usr/share/octave/packages/statistics-*/fisheriris.txt diff --git a/patches/data-files-for-tests.diff b/patches/data-files-for-tests.diff new file mode 100644 index 0000000..00cc4d4 --- /dev/null +++ b/patches/data-files-for-tests.diff @@ -0,0 +1,22 @@ +Add missing *.dat files, needed for the tests of functions caseread and tblread. + + -- Rafael Laboissiere Sun, 23 May 2009 13:28:28 +0200 + +--- /dev/null ++++ b/tblread-space.dat +@@ -0,0 +1,3 @@ ++ a bc ++de 1 2 ++f 3 4 +--- /dev/null ++++ b/tblread-tab.dat +@@ -0,0 +1,3 @@ ++ a bc ++de 1 2 ++f 3 4 +--- /dev/null ++++ b/caseread.dat +@@ -0,0 +1,3 @@ ++a ++bcd ++ef diff --git a/patches/series b/patches/series new file mode 100644 index 0000000..bcf45cb --- /dev/null +++ b/patches/series @@ -0,0 +1,2 @@ +data-files-for-tests.diff +t_test-skip.patch diff --git a/patches/t_test-skip.patch b/patches/t_test-skip.patch new file mode 100644 index 0000000..3f0f064 --- /dev/null +++ b/patches/t_test-skip.patch @@ -0,0 +1,34 @@ +Description: Skip unit tests with non-deterministic results +Author: Rafael Laboissiere +Forwarded: not-needed +Last-Update: 2018-09-12 + +--- octave-statistics-1.4.0.orig/install-conditionally/tests/t_test.m ++++ octave-statistics-1.4.0/install-conditionally/tests/t_test.m +@@ -81,7 +81,7 @@ function [pval, t, df] = t_test (x, m, a + endfunction + + +-%!test ++%!xtest + %! ## Two-sided (also the default option) + %! x = rand (10,1); n = length (x); + %! u0 = 0.5; # true mean +@@ -95,7 +95,7 @@ endfunction + %! unew = tval * std(x)/sqrt(n) + u0; + %! assert (xbar, unew, 1e6*eps); + +-%!test ++%!xtest + %! x = rand (10,1); n = length (x); + %! u0 = 0.5; + %! pval = t_test (x, u0, ">"); +@@ -103,7 +103,7 @@ endfunction + %! unew = tval * std(x)/sqrt(n) + u0; + %! assert (mean (x), unew, 1e6*eps); + +-%!test ++%!xtest + %! x = rand (10,1); n = length (x); + %! u0 = 0.5; + %! pval = t_test (x, u0, "<"); diff --git a/rules b/rules new file mode 100755 index 0000000..56dfda3 --- /dev/null +++ b/rules @@ -0,0 +1,14 @@ +#!/usr/bin/make -f +# -*- makefile -*- +#!/usr/bin/make -f +# -*- makefile -*- + +%: + dh $@ --buildsystem=octave --with=octave + +# Workaround for wrong perms in upstream tarball 1.3.0 +execute_before_dh_installdeb: + @echo "*********************************************************" + @echo "Warning: check whether the following fix is still needed:" + @echo "*********************************************************" + find debian/octave-statistics/usr/share/octave/packages -type f -exec chmod -x '{}' ';' diff --git a/source/format b/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/upstream/metadata b/upstream/metadata new file mode 100644 index 0000000..06a647c --- /dev/null +++ b/upstream/metadata @@ -0,0 +1,5 @@ +Bug-Database: https://savannah.gnu.org/bugs/?group=octave +Bug-Submit: https://savannah.gnu.org/bugs/?func=additem&group=octave +Repository: http://hg.code.sf.net/p/octave/statistics +Repository-Browse: https://sourceforge.net/p/octave/statistics/ci/default/tree/ + diff --git a/watch b/watch new file mode 100644 index 0000000..95eb1f4 --- /dev/null +++ b/watch @@ -0,0 +1,3 @@ +version=4 +http://sf.net/octave/statistics-(.+)\.tar\.gz + -- cgit v1.2.3 From 8efbbd2be481c00b4f4742315a54859c830f8e59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Villemot?= Date: Fri, 10 Dec 2021 11:48:12 -0300 Subject: Import octave-statistics_1.4.3.orig.tar.gz [dgit import orig octave-statistics_1.4.3.orig.tar.gz] --- COPYING | 139 + DESCRIPTION | 11 + INDEX.in | 142 + Makefile | 255 ++ NEWS | 471 +++ PKG_ADD | 19 + PKG_DEL | 19 + README.crosscompilation | 2 + inst/@cvpartition/cvpartition.m | 186 + inst/@cvpartition/display.m | 50 + inst/@cvpartition/get.m | 40 + inst/@cvpartition/repartition.m | 70 + inst/@cvpartition/set.m | 45 + inst/@cvpartition/test.m | 47 + inst/@cvpartition/training.m | 47 + inst/ConfusionMatrixChart.m | 901 +++++ inst/anderson_darling_cdf.m | 131 + inst/anderson_darling_test.m | 153 + inst/anova1.m | 260 ++ inst/anovan.m | 359 ++ inst/bbscdf.m | 107 + inst/bbsinv.m | 115 + inst/bbspdf.m | 113 + inst/bbsrnd.m | 145 + inst/betastat.m | 129 + inst/binostat.m | 128 + inst/binotest.m | 142 + inst/boxplot.m | 911 +++++ inst/burrcdf.m | 98 + inst/burrinv.m | 102 + inst/burrpdf.m | 100 + inst/burrrnd.m | 141 + inst/canoncorr.m | 94 + inst/carbig.mat | Bin 0 -> 14012 bytes inst/caseread.m | 62 + inst/casewrite.m | 70 + inst/cdf.m | 109 + inst/chi2stat.m | 92 + inst/cl_multinom.m | 124 + inst/cluster.m | 198 + inst/clusterdata.m | 117 + inst/cmdscale.m | 149 + inst/combnk.m | 92 + inst/confusionchart.m | 293 ++ inst/confusionmat.m | 224 ++ inst/cophenet.m | 136 + inst/copulacdf.m | 288 ++ inst/copulapdf.m | 194 + inst/copularnd.m | 281 ++ inst/crossval.m | 174 + inst/datasample.m | 230 ++ inst/dcov.m | 142 + inst/dendrogram.m | 410 ++ inst/evalclusters.m | 362 ++ inst/expfit.m | 267 ++ inst/explike.m | 87 + inst/expstat.m | 93 + inst/ff2n.m | 13 + inst/fisheriris.mat | Bin 0 -> 1224 bytes inst/fisheriris.txt | 153 + inst/fitgmdist.m | 533 +++ inst/fstat.m | 130 + inst/fullfact.m | 27 + inst/gamfit.m | 69 + inst/gamlike.m | 21 + inst/gamstat.m | 123 + inst/geomean.m | 34 + inst/geostat.m | 93 + inst/gevcdf.m | 131 + inst/gevfit.m | 97 + inst/gevfit_lmom.m | 113 + inst/gevinv.m | 100 + inst/gevlike.m | 369 ++ inst/gevpdf.m | 130 + inst/gevrnd.m | 121 + inst/gevstat.m | 90 + inst/gmdistribution.m | 349 ++ inst/gpcdf.m | 176 + inst/gpinv.m | 162 + inst/gppdf.m | 170 + inst/gprnd.m | 174 + inst/grp2idx.m | 131 + inst/gscatter.m | 247 ++ inst/harmmean.m | 34 + inst/hist3.m | 419 +++ inst/histfit.m | 69 + inst/hmmestimate.m | 338 ++ inst/hmmgenerate.m | 251 ++ inst/hmmviterbi.m | 249 ++ inst/hygestat.m | 133 + inst/inconsistent.m | 118 + inst/iwishpdf.m | 67 + inst/iwishrnd.m | 69 + inst/jackknife.m | 141 + inst/jsucdf.m | 61 + inst/jsupdf.m | 62 + inst/kmeans.m | 564 +++ inst/kruskalwallis.m | 287 ++ inst/linkage.m | 263 ++ inst/lognstat.m | 123 + inst/mahal.m | 93 + inst/mhsample.m | 365 ++ inst/mnpdf.m | 134 + inst/mnrnd.m | 184 + inst/monotone_smooth.m | 162 + inst/mvncdf.m | 173 + inst/mvnpdf.m | 132 + inst/mvnrnd.m | 140 + inst/mvtcdf.m | 166 + inst/mvtpdf.m | 105 + inst/mvtrnd.m | 144 + inst/nakacdf.m | 103 + inst/nakainv.m | 102 + inst/nakapdf.m | 95 + inst/nakarnd.m | 137 + inst/nanmax.m | 53 + inst/nanmean.m | 36 + inst/nanmedian.m | 69 + inst/nanmin.m | 54 + inst/nanstd.m | 88 + inst/nansum.m | 52 + inst/nanvar.m | 65 + inst/nbinstat.m | 124 + inst/ncx2pdf.m | 45 + inst/normalise_distribution.m | 284 ++ inst/normplot.m | 75 + inst/normstat.m | 122 + inst/optimalleaforder.m | 326 ++ inst/pca.m | 534 +++ inst/pcacov.m | 74 + inst/pcares.m | 89 + inst/pdf.m | 109 + inst/pdist.m | 229 ++ inst/pdist2.m | 176 + inst/plsregress.m | 124 + inst/poisstat.m | 92 + inst/princomp.m | 176 + inst/private/CalinskiHarabaszEvaluation.m | 205 + inst/private/ClusterCriterion.m | 239 ++ inst/private/DaviesBouldinEvaluation.m | 208 ++ inst/private/GapEvaluation.m | 364 ++ inst/private/SilhouetteEvaluation.m | 257 ++ inst/private/tbl_delim.m | 73 + inst/qrandn.m | 93 + inst/random.m | 171 + inst/randsample.m | 127 + inst/raylcdf.m | 117 + inst/raylinv.m | 123 + inst/raylpdf.m | 116 + inst/raylrnd.m | 157 + inst/raylstat.m | 94 + inst/regress.m | 214 ++ inst/regress_gp.m | 136 + inst/repanova.m | 100 + inst/runstest.m | 107 + inst/sigma_pts.m | 123 + inst/signtest.m | 163 + inst/silhouette.m | 203 + inst/slicesample.m | 305 ++ inst/squareform.m | 122 + inst/stepwisefit.m | 168 + inst/tabulate.m | 129 + inst/tblread.m | 99 + inst/tblwrite.m | 208 ++ inst/tricdf.m | 131 + inst/triinv.m | 119 + inst/trimmean.m | 58 + inst/tripdf.m | 111 + inst/trirnd.m | 153 + inst/tstat.m | 98 + inst/ttest.m | 161 + inst/ttest2.m | 149 + inst/unidstat.m | 94 + inst/unifstat.m | 122 + inst/vartest.m | 117 + inst/vartest2.m | 121 + inst/violin.m | 329 ++ inst/vmpdf.m | 46 + inst/vmrnd.m | 76 + inst/wblplot.m | 380 ++ inst/wblstat.m | 124 + inst/wishpdf.m | 67 + inst/wishrnd.m | 85 + inst/ztest.m | 115 + install-conditionally/INDEX.in | 161 + install-conditionally/README | 14 + install-conditionally/base/center.m | 94 + install-conditionally/base/cloglog.m | 56 + install-conditionally/base/corr.m | 109 + install-conditionally/base/corrcoef.m | 250 ++ install-conditionally/base/cov.m | 565 +++ install-conditionally/base/crosstab.m | 131 + install-conditionally/base/gls.m | 144 + install-conditionally/base/histc.m | 188 + install-conditionally/base/iqr.m | 98 + install-conditionally/base/ismissing.m | 136 + install-conditionally/base/kendall.m | 151 + install-conditionally/base/kurtosis.m | 168 + install-conditionally/base/logit.m | 59 + install-conditionally/base/lscov.m | 191 + install-conditionally/base/mad.m | 107 + install-conditionally/base/mean.m | 265 ++ install-conditionally/base/meansq.m | 90 + install-conditionally/base/median.m | 239 ++ install-conditionally/base/mode.m | 447 +++ install-conditionally/base/moment.m | 209 ++ install-conditionally/base/ols.m | 173 + install-conditionally/base/ppplot.m | 89 + install-conditionally/base/prctile.m | 184 + install-conditionally/base/probit.m | 43 + install-conditionally/base/qqplot.m | 117 + install-conditionally/base/quantile.m | 488 +++ install-conditionally/base/range.m | 61 + install-conditionally/base/ranks.m | 102 + install-conditionally/base/rmmissing.m | 185 + install-conditionally/base/run_count.m | 110 + install-conditionally/base/runlength.m | 70 + install-conditionally/base/skewness.m | 167 + install-conditionally/base/spearman.m | 117 + install-conditionally/base/statistics.m | 97 + install-conditionally/base/std.m | 241 ++ install-conditionally/base/var.m | 241 ++ install-conditionally/base/zscore.m | 106 + install-conditionally/distributions/betacdf.m | 90 + install-conditionally/distributions/betainv.m | 134 + install-conditionally/distributions/betapdf.m | 127 + install-conditionally/distributions/betarnd.m | 131 + install-conditionally/distributions/binocdf.m | 126 + install-conditionally/distributions/binoinv.m | 196 + install-conditionally/distributions/binopdf.m | 109 + install-conditionally/distributions/binornd.m | 150 + install-conditionally/distributions/cauchy_cdf.m | 89 + install-conditionally/distributions/cauchy_inv.m | 96 + install-conditionally/distributions/cauchy_pdf.m | 95 + install-conditionally/distributions/cauchy_rnd.m | 130 + install-conditionally/distributions/chi2cdf.m | 70 + install-conditionally/distributions/chi2inv.m | 68 + install-conditionally/distributions/chi2pdf.m | 68 + install-conditionally/distributions/chi2rnd.m | 114 + install-conditionally/distributions/discrete_cdf.m | 78 + install-conditionally/distributions/discrete_inv.m | 92 + install-conditionally/distributions/discrete_pdf.m | 82 + install-conditionally/distributions/discrete_rnd.m | 101 + .../distributions/empirical_cdf.m | 59 + .../distributions/empirical_inv.m | 58 + .../distributions/empirical_pdf.m | 69 + .../distributions/empirical_rnd.m | 65 + install-conditionally/distributions/expcdf.m | 87 + install-conditionally/distributions/expinv.m | 92 + install-conditionally/distributions/exppdf.m | 81 + install-conditionally/distributions/exprnd.m | 114 + install-conditionally/distributions/fcdf.m | 93 + install-conditionally/distributions/finv.m | 90 + install-conditionally/distributions/fpdf.m | 102 + install-conditionally/distributions/frnd.m | 129 + install-conditionally/distributions/gamcdf.m | 88 + install-conditionally/distributions/gaminv.m | 129 + install-conditionally/distributions/gampdf.m | 100 + install-conditionally/distributions/gamrnd.m | 129 + install-conditionally/distributions/geocdf.m | 89 + install-conditionally/distributions/geoinv.m | 85 + install-conditionally/distributions/geopdf.m | 85 + install-conditionally/distributions/geornd.m | 125 + install-conditionally/distributions/hygecdf.m | 107 + install-conditionally/distributions/hygeinv.m | 113 + install-conditionally/distributions/hygepdf.m | 115 + install-conditionally/distributions/hygernd.m | 145 + .../distributions/kolmogorov_smirnov_cdf.m | 94 + install-conditionally/distributions/laplace_cdf.m | 53 + install-conditionally/distributions/laplace_inv.m | 61 + install-conditionally/distributions/laplace_pdf.m | 53 + install-conditionally/distributions/laplace_rnd.m | 71 + install-conditionally/distributions/logistic_cdf.m | 53 + install-conditionally/distributions/logistic_inv.m | 66 + install-conditionally/distributions/logistic_pdf.m | 53 + install-conditionally/distributions/logistic_rnd.m | 70 + install-conditionally/distributions/logncdf.m | 98 + install-conditionally/distributions/logninv.m | 97 + install-conditionally/distributions/lognpdf.m | 94 + install-conditionally/distributions/lognrnd.m | 129 + install-conditionally/distributions/nbincdf.m | 101 + install-conditionally/distributions/nbininv.m | 168 + install-conditionally/distributions/nbinpdf.m | 98 + install-conditionally/distributions/nbinrnd.m | 134 + install-conditionally/distributions/normcdf.m | 96 + install-conditionally/distributions/norminv.m | 90 + install-conditionally/distributions/normpdf.m | 95 + install-conditionally/distributions/normrnd.m | 128 + install-conditionally/distributions/poisscdf.m | 86 + install-conditionally/distributions/poissinv.m | 210 ++ install-conditionally/distributions/poisspdf.m | 82 + install-conditionally/distributions/poissrnd.m | 117 + .../distributions/stdnormal_cdf.m | 54 + .../distributions/stdnormal_inv.m | 54 + .../distributions/stdnormal_pdf.m | 54 + .../distributions/stdnormal_rnd.m | 71 + install-conditionally/distributions/tcdf.m | 215 ++ install-conditionally/distributions/tinv.m | 107 + install-conditionally/distributions/tpdf.m | 90 + install-conditionally/distributions/trnd.m | 115 + install-conditionally/distributions/unidcdf.m | 86 + install-conditionally/distributions/unidinv.m | 78 + install-conditionally/distributions/unidpdf.m | 84 + install-conditionally/distributions/unidrnd.m | 109 + install-conditionally/distributions/unifcdf.m | 95 + install-conditionally/distributions/unifinv.m | 87 + install-conditionally/distributions/unifpdf.m | 90 + install-conditionally/distributions/unifrnd.m | 135 + install-conditionally/distributions/wblcdf.m | 112 + install-conditionally/distributions/wblinv.m | 96 + install-conditionally/distributions/wblpdf.m | 111 + install-conditionally/distributions/wblrnd.m | 129 + install-conditionally/distributions/wienrnd.m | 54 + install-conditionally/models/logistic_regression.m | 190 + .../private/logistic_regression_derivatives.m | 42 + .../private/logistic_regression_likelihood.m | 38 + install-conditionally/tests/anova.m | 120 + install-conditionally/tests/bartlett_test.m | 65 + .../tests/chisquare_test_homogeneity.m | 66 + .../tests/chisquare_test_independence.m | 52 + install-conditionally/tests/cor_test.m | 133 + install-conditionally/tests/f_test_regression.m | 75 + install-conditionally/tests/hotelling_test.m | 70 + install-conditionally/tests/hotelling_test_2.m | 84 + .../tests/kolmogorov_smirnov_test.m | 124 + .../tests/kolmogorov_smirnov_test_2.m | 103 + install-conditionally/tests/kruskal_wallis_test.m | 97 + install-conditionally/tests/manova.m | 162 + install-conditionally/tests/mcnemar_test.m | 62 + install-conditionally/tests/prop_test_2.m | 79 + install-conditionally/tests/run_test.m | 58 + install-conditionally/tests/sign_test.m | 90 + install-conditionally/tests/t_test.m | 112 + install-conditionally/tests/t_test_2.m | 100 + install-conditionally/tests/t_test_regression.m | 96 + install-conditionally/tests/u_test.m | 85 + install-conditionally/tests/var_test.m | 96 + install-conditionally/tests/welch_test.m | 84 + install-conditionally/tests/wilcoxon_test.m | 91 + install-conditionally/tests/z_test.m | 117 + install-conditionally/tests/z_test_2.m | 142 + octave-statistics.metainfo.xml | 38 + src/Makefile.in | 69 + src/bootstrap | 1 + src/config.guess | 1462 ++++++++ src/config.sub | 1825 +++++++++ src/configure | 3933 ++++++++++++++++++++ src/configure.ac | 58 + src/install-sh | 501 +++ test/caseread.dat | 3 + test/tblread-space.dat | 3 + test/tblread-tab.dat | 3 + utils/conditional_installation.m | 153 + utils/functions_to_install | 167 + utils/update_index_file.m | 161 + 355 files changed, 55569 insertions(+) create mode 100644 COPYING create mode 100644 DESCRIPTION create mode 100644 INDEX.in create mode 100644 Makefile create mode 100644 NEWS create mode 100644 PKG_ADD create mode 100644 PKG_DEL create mode 100644 README.crosscompilation create mode 100644 inst/@cvpartition/cvpartition.m create mode 100644 inst/@cvpartition/display.m create mode 100644 inst/@cvpartition/get.m create mode 100644 inst/@cvpartition/repartition.m create mode 100644 inst/@cvpartition/set.m create mode 100644 inst/@cvpartition/test.m create mode 100644 inst/@cvpartition/training.m create mode 100644 inst/ConfusionMatrixChart.m create mode 100644 inst/anderson_darling_cdf.m create mode 100644 inst/anderson_darling_test.m create mode 100644 inst/anova1.m create mode 100644 inst/anovan.m create mode 100644 inst/bbscdf.m create mode 100644 inst/bbsinv.m create mode 100644 inst/bbspdf.m create mode 100644 inst/bbsrnd.m create mode 100644 inst/betastat.m create mode 100644 inst/binostat.m create mode 100644 inst/binotest.m create mode 100644 inst/boxplot.m create mode 100644 inst/burrcdf.m create mode 100644 inst/burrinv.m create mode 100644 inst/burrpdf.m create mode 100644 inst/burrrnd.m create mode 100644 inst/canoncorr.m create mode 100644 inst/carbig.mat create mode 100644 inst/caseread.m create mode 100644 inst/casewrite.m create mode 100644 inst/cdf.m create mode 100644 inst/chi2stat.m create mode 100644 inst/cl_multinom.m create mode 100644 inst/cluster.m create mode 100644 inst/clusterdata.m create mode 100644 inst/cmdscale.m create mode 100644 inst/combnk.m create mode 100644 inst/confusionchart.m create mode 100644 inst/confusionmat.m create mode 100644 inst/cophenet.m create mode 100644 inst/copulacdf.m create mode 100644 inst/copulapdf.m create mode 100644 inst/copularnd.m create mode 100644 inst/crossval.m create mode 100644 inst/datasample.m create mode 100644 inst/dcov.m create mode 100644 inst/dendrogram.m create mode 100644 inst/evalclusters.m create mode 100644 inst/expfit.m create mode 100644 inst/explike.m create mode 100644 inst/expstat.m create mode 100644 inst/ff2n.m create mode 100644 inst/fisheriris.mat create mode 100644 inst/fisheriris.txt create mode 100644 inst/fitgmdist.m create mode 100644 inst/fstat.m create mode 100644 inst/fullfact.m create mode 100644 inst/gamfit.m create mode 100644 inst/gamlike.m create mode 100644 inst/gamstat.m create mode 100644 inst/geomean.m create mode 100644 inst/geostat.m create mode 100644 inst/gevcdf.m create mode 100644 inst/gevfit.m create mode 100644 inst/gevfit_lmom.m create mode 100644 inst/gevinv.m create mode 100644 inst/gevlike.m create mode 100644 inst/gevpdf.m create mode 100644 inst/gevrnd.m create mode 100644 inst/gevstat.m create mode 100644 inst/gmdistribution.m create mode 100644 inst/gpcdf.m create mode 100644 inst/gpinv.m create mode 100644 inst/gppdf.m create mode 100644 inst/gprnd.m create mode 100644 inst/grp2idx.m create mode 100644 inst/gscatter.m create mode 100644 inst/harmmean.m create mode 100644 inst/hist3.m create mode 100644 inst/histfit.m create mode 100644 inst/hmmestimate.m create mode 100644 inst/hmmgenerate.m create mode 100644 inst/hmmviterbi.m create mode 100644 inst/hygestat.m create mode 100644 inst/inconsistent.m create mode 100644 inst/iwishpdf.m create mode 100644 inst/iwishrnd.m create mode 100644 inst/jackknife.m create mode 100644 inst/jsucdf.m create mode 100644 inst/jsupdf.m create mode 100644 inst/kmeans.m create mode 100644 inst/kruskalwallis.m create mode 100644 inst/linkage.m create mode 100644 inst/lognstat.m create mode 100644 inst/mahal.m create mode 100644 inst/mhsample.m create mode 100644 inst/mnpdf.m create mode 100644 inst/mnrnd.m create mode 100644 inst/monotone_smooth.m create mode 100644 inst/mvncdf.m create mode 100644 inst/mvnpdf.m create mode 100644 inst/mvnrnd.m create mode 100644 inst/mvtcdf.m create mode 100644 inst/mvtpdf.m create mode 100644 inst/mvtrnd.m create mode 100644 inst/nakacdf.m create mode 100644 inst/nakainv.m create mode 100644 inst/nakapdf.m create mode 100644 inst/nakarnd.m create mode 100644 inst/nanmax.m create mode 100644 inst/nanmean.m create mode 100644 inst/nanmedian.m create mode 100644 inst/nanmin.m create mode 100644 inst/nanstd.m create mode 100644 inst/nansum.m create mode 100644 inst/nanvar.m create mode 100644 inst/nbinstat.m create mode 100644 inst/ncx2pdf.m create mode 100644 inst/normalise_distribution.m create mode 100644 inst/normplot.m create mode 100644 inst/normstat.m create mode 100644 inst/optimalleaforder.m create mode 100644 inst/pca.m create mode 100644 inst/pcacov.m create mode 100644 inst/pcares.m create mode 100644 inst/pdf.m create mode 100644 inst/pdist.m create mode 100644 inst/pdist2.m create mode 100644 inst/plsregress.m create mode 100644 inst/poisstat.m create mode 100644 inst/princomp.m create mode 100644 inst/private/CalinskiHarabaszEvaluation.m create mode 100644 inst/private/ClusterCriterion.m create mode 100644 inst/private/DaviesBouldinEvaluation.m create mode 100644 inst/private/GapEvaluation.m create mode 100644 inst/private/SilhouetteEvaluation.m create mode 100644 inst/private/tbl_delim.m create mode 100644 inst/qrandn.m create mode 100644 inst/random.m create mode 100644 inst/randsample.m create mode 100644 inst/raylcdf.m create mode 100644 inst/raylinv.m create mode 100644 inst/raylpdf.m create mode 100644 inst/raylrnd.m create mode 100644 inst/raylstat.m create mode 100644 inst/regress.m create mode 100644 inst/regress_gp.m create mode 100644 inst/repanova.m create mode 100644 inst/runstest.m create mode 100644 inst/sigma_pts.m create mode 100644 inst/signtest.m create mode 100644 inst/silhouette.m create mode 100644 inst/slicesample.m create mode 100644 inst/squareform.m create mode 100644 inst/stepwisefit.m create mode 100644 inst/tabulate.m create mode 100644 inst/tblread.m create mode 100644 inst/tblwrite.m create mode 100644 inst/tricdf.m create mode 100644 inst/triinv.m create mode 100644 inst/trimmean.m create mode 100644 inst/tripdf.m create mode 100644 inst/trirnd.m create mode 100644 inst/tstat.m create mode 100644 inst/ttest.m create mode 100644 inst/ttest2.m create mode 100644 inst/unidstat.m create mode 100644 inst/unifstat.m create mode 100644 inst/vartest.m create mode 100644 inst/vartest2.m create mode 100644 inst/violin.m create mode 100644 inst/vmpdf.m create mode 100644 inst/vmrnd.m create mode 100644 inst/wblplot.m create mode 100644 inst/wblstat.m create mode 100644 inst/wishpdf.m create mode 100644 inst/wishrnd.m create mode 100644 inst/ztest.m create mode 100644 install-conditionally/INDEX.in create mode 100644 install-conditionally/README create mode 100644 install-conditionally/base/center.m create mode 100644 install-conditionally/base/cloglog.m create mode 100644 install-conditionally/base/corr.m create mode 100644 install-conditionally/base/corrcoef.m create mode 100644 install-conditionally/base/cov.m create mode 100644 install-conditionally/base/crosstab.m create mode 100644 install-conditionally/base/gls.m create mode 100644 install-conditionally/base/histc.m create mode 100644 install-conditionally/base/iqr.m create mode 100644 install-conditionally/base/ismissing.m create mode 100644 install-conditionally/base/kendall.m create mode 100644 install-conditionally/base/kurtosis.m create mode 100644 install-conditionally/base/logit.m create mode 100644 install-conditionally/base/lscov.m create mode 100644 install-conditionally/base/mad.m create mode 100644 install-conditionally/base/mean.m create mode 100644 install-conditionally/base/meansq.m create mode 100644 install-conditionally/base/median.m create mode 100644 install-conditionally/base/mode.m create mode 100644 install-conditionally/base/moment.m create mode 100644 install-conditionally/base/ols.m create mode 100644 install-conditionally/base/ppplot.m create mode 100644 install-conditionally/base/prctile.m create mode 100644 install-conditionally/base/probit.m create mode 100644 install-conditionally/base/qqplot.m create mode 100644 install-conditionally/base/quantile.m create mode 100644 install-conditionally/base/range.m create mode 100644 install-conditionally/base/ranks.m create mode 100644 install-conditionally/base/rmmissing.m create mode 100644 install-conditionally/base/run_count.m create mode 100644 install-conditionally/base/runlength.m create mode 100644 install-conditionally/base/skewness.m create mode 100644 install-conditionally/base/spearman.m create mode 100644 install-conditionally/base/statistics.m create mode 100644 install-conditionally/base/std.m create mode 100644 install-conditionally/base/var.m create mode 100644 install-conditionally/base/zscore.m create mode 100644 install-conditionally/distributions/betacdf.m create mode 100644 install-conditionally/distributions/betainv.m create mode 100644 install-conditionally/distributions/betapdf.m create mode 100644 install-conditionally/distributions/betarnd.m create mode 100644 install-conditionally/distributions/binocdf.m create mode 100644 install-conditionally/distributions/binoinv.m create mode 100644 install-conditionally/distributions/binopdf.m create mode 100644 install-conditionally/distributions/binornd.m create mode 100644 install-conditionally/distributions/cauchy_cdf.m create mode 100644 install-conditionally/distributions/cauchy_inv.m create mode 100644 install-conditionally/distributions/cauchy_pdf.m create mode 100644 install-conditionally/distributions/cauchy_rnd.m create mode 100644 install-conditionally/distributions/chi2cdf.m create mode 100644 install-conditionally/distributions/chi2inv.m create mode 100644 install-conditionally/distributions/chi2pdf.m create mode 100644 install-conditionally/distributions/chi2rnd.m create mode 100644 install-conditionally/distributions/discrete_cdf.m create mode 100644 install-conditionally/distributions/discrete_inv.m create mode 100644 install-conditionally/distributions/discrete_pdf.m create mode 100644 install-conditionally/distributions/discrete_rnd.m create mode 100644 install-conditionally/distributions/empirical_cdf.m create mode 100644 install-conditionally/distributions/empirical_inv.m create mode 100644 install-conditionally/distributions/empirical_pdf.m create mode 100644 install-conditionally/distributions/empirical_rnd.m create mode 100644 install-conditionally/distributions/expcdf.m create mode 100644 install-conditionally/distributions/expinv.m create mode 100644 install-conditionally/distributions/exppdf.m create mode 100644 install-conditionally/distributions/exprnd.m create mode 100644 install-conditionally/distributions/fcdf.m create mode 100644 install-conditionally/distributions/finv.m create mode 100644 install-conditionally/distributions/fpdf.m create mode 100644 install-conditionally/distributions/frnd.m create mode 100644 install-conditionally/distributions/gamcdf.m create mode 100644 install-conditionally/distributions/gaminv.m create mode 100644 install-conditionally/distributions/gampdf.m create mode 100644 install-conditionally/distributions/gamrnd.m create mode 100644 install-conditionally/distributions/geocdf.m create mode 100644 install-conditionally/distributions/geoinv.m create mode 100644 install-conditionally/distributions/geopdf.m create mode 100644 install-conditionally/distributions/geornd.m create mode 100644 install-conditionally/distributions/hygecdf.m create mode 100644 install-conditionally/distributions/hygeinv.m create mode 100644 install-conditionally/distributions/hygepdf.m create mode 100644 install-conditionally/distributions/hygernd.m create mode 100644 install-conditionally/distributions/kolmogorov_smirnov_cdf.m create mode 100644 install-conditionally/distributions/laplace_cdf.m create mode 100644 install-conditionally/distributions/laplace_inv.m create mode 100644 install-conditionally/distributions/laplace_pdf.m create mode 100644 install-conditionally/distributions/laplace_rnd.m create mode 100644 install-conditionally/distributions/logistic_cdf.m create mode 100644 install-conditionally/distributions/logistic_inv.m create mode 100644 install-conditionally/distributions/logistic_pdf.m create mode 100644 install-conditionally/distributions/logistic_rnd.m create mode 100644 install-conditionally/distributions/logncdf.m create mode 100644 install-conditionally/distributions/logninv.m create mode 100644 install-conditionally/distributions/lognpdf.m create mode 100644 install-conditionally/distributions/lognrnd.m create mode 100644 install-conditionally/distributions/nbincdf.m create mode 100644 install-conditionally/distributions/nbininv.m create mode 100644 install-conditionally/distributions/nbinpdf.m create mode 100644 install-conditionally/distributions/nbinrnd.m create mode 100644 install-conditionally/distributions/normcdf.m create mode 100644 install-conditionally/distributions/norminv.m create mode 100644 install-conditionally/distributions/normpdf.m create mode 100644 install-conditionally/distributions/normrnd.m create mode 100644 install-conditionally/distributions/poisscdf.m create mode 100644 install-conditionally/distributions/poissinv.m create mode 100644 install-conditionally/distributions/poisspdf.m create mode 100644 install-conditionally/distributions/poissrnd.m create mode 100644 install-conditionally/distributions/stdnormal_cdf.m create mode 100644 install-conditionally/distributions/stdnormal_inv.m create mode 100644 install-conditionally/distributions/stdnormal_pdf.m create mode 100644 install-conditionally/distributions/stdnormal_rnd.m create mode 100644 install-conditionally/distributions/tcdf.m create mode 100644 install-conditionally/distributions/tinv.m create mode 100644 install-conditionally/distributions/tpdf.m create mode 100644 install-conditionally/distributions/trnd.m create mode 100644 install-conditionally/distributions/unidcdf.m create mode 100644 install-conditionally/distributions/unidinv.m create mode 100644 install-conditionally/distributions/unidpdf.m create mode 100644 install-conditionally/distributions/unidrnd.m create mode 100644 install-conditionally/distributions/unifcdf.m create mode 100644 install-conditionally/distributions/unifinv.m create mode 100644 install-conditionally/distributions/unifpdf.m create mode 100644 install-conditionally/distributions/unifrnd.m create mode 100644 install-conditionally/distributions/wblcdf.m create mode 100644 install-conditionally/distributions/wblinv.m create mode 100644 install-conditionally/distributions/wblpdf.m create mode 100644 install-conditionally/distributions/wblrnd.m create mode 100644 install-conditionally/distributions/wienrnd.m create mode 100644 install-conditionally/models/logistic_regression.m create mode 100644 install-conditionally/models/private/logistic_regression_derivatives.m create mode 100644 install-conditionally/models/private/logistic_regression_likelihood.m create mode 100644 install-conditionally/tests/anova.m create mode 100644 install-conditionally/tests/bartlett_test.m create mode 100644 install-conditionally/tests/chisquare_test_homogeneity.m create mode 100644 install-conditionally/tests/chisquare_test_independence.m create mode 100644 install-conditionally/tests/cor_test.m create mode 100644 install-conditionally/tests/f_test_regression.m create mode 100644 install-conditionally/tests/hotelling_test.m create mode 100644 install-conditionally/tests/hotelling_test_2.m create mode 100644 install-conditionally/tests/kolmogorov_smirnov_test.m create mode 100644 install-conditionally/tests/kolmogorov_smirnov_test_2.m create mode 100644 install-conditionally/tests/kruskal_wallis_test.m create mode 100644 install-conditionally/tests/manova.m create mode 100644 install-conditionally/tests/mcnemar_test.m create mode 100644 install-conditionally/tests/prop_test_2.m create mode 100644 install-conditionally/tests/run_test.m create mode 100644 install-conditionally/tests/sign_test.m create mode 100644 install-conditionally/tests/t_test.m create mode 100644 install-conditionally/tests/t_test_2.m create mode 100644 install-conditionally/tests/t_test_regression.m create mode 100644 install-conditionally/tests/u_test.m create mode 100644 install-conditionally/tests/var_test.m create mode 100644 install-conditionally/tests/welch_test.m create mode 100644 install-conditionally/tests/wilcoxon_test.m create mode 100644 install-conditionally/tests/z_test.m create mode 100644 install-conditionally/tests/z_test_2.m create mode 100644 octave-statistics.metainfo.xml create mode 100644 src/Makefile.in create mode 100755 src/bootstrap create mode 100755 src/config.guess create mode 100755 src/config.sub create mode 100755 src/configure create mode 100644 src/configure.ac create mode 100755 src/install-sh create mode 100644 test/caseread.dat create mode 100644 test/tblread-space.dat create mode 100644 test/tblread-tab.dat create mode 100644 utils/conditional_installation.m create mode 100644 utils/functions_to_install create mode 100644 utils/update_index_file.m diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..b028429 --- /dev/null +++ b/COPYING @@ -0,0 +1,139 @@ +inst/private/tbl_delim.m.m GPLv3+ +inst/anderson_darling_cdf.m public domain +inst/anderson_darling_test.m public domain +inst/anovan.m GPLv3+ +inst/bbscdf.m GPLv3+ +inst/bbsinv.m GPLv3+ +inst/bbspdf.m GPLv3+ +inst/bbsrnd.m GPLv3+ +inst/betastat.m GPLv3+ +inst/binostat.m GPLv3+ +inst/binotest.m GPLv3+ +inst/boxplot.m GPLv3+ +inst/burrcdf.m GPLv3+ +inst/burrinv.m GPLv3+ +inst/burrpdf.m GPLv3+ +inst/burrrnd.m GPLv3+ +inst/caseread.m GPLv3+ +inst/casewrite.m GPLv3+ +inst/cdf.m GPLv3+ +inst/chi2stat.m GPLv3+ +inst/cl_multinom.m GPLv3+ +inst/cmdscale.m GPLv3+ +inst/combnk.m GPLv3+ +inst/copulacdf.m GPLv3+ +inst/copulapdf.m GPLv3+ +inst/copularnd.m GPLv3+ +inst/crossval.m GPLv3+ +inst/dcov.m GPLv3+ +inst/dendogram.m GPLv3+ +inst/expstat.m GPLv3+ +inst/ff2n.m public domain +inst/fitgmdist.m GPLv3+ +inst/fstat.m GPLv3+ +inst/fullfact.m public domain +inst/gamfit.m public domain +inst/gamlike.m public domain +inst/gamstat.m GPLv3+ +inst/geomean.m GPLv3+ +inst/geostat.m GPLv3+ +inst/gevcdf.m GPLv3+ +inst/gevfit_lmom.m GPLv3+ +inst/gevfit.m GPLv3+ +inst/gevinv.m GPLv3+ +inst/gevlike.m GPLv3+ +inst/gevpdf.m GPLv3+ +inst/gevrnd.m GPLv3+ +inst/gevstat.m GPLv3+ +inst/gmdistribution.m GPLv3+ +inst/gpcdf.m GPLv3+ +inst/gpinv.m GPLv3+ +inst/gppdf.m GPLv3+ +inst/gprnd.m GPLv3+ +inst/grp2idx.m GPLv3+ +inst/harmmean.m GPLv3+ +inst/hist3.m GPLv3+ +inst/histfit.m GPLv3+ +inst/hmmestimate.m GPLv3+ +inst/hmmgenerate.m GPLv3+ +inst/hmmviterbi.m GPLv3+ +inst/iwishpdf.m GPLv3+ +inst/iwishrnd.m GPLv3+ +inst/hygestat.m GPLv3+ +inst/jackknife.m GPLv3+ +inst/jsucdf.m GPLv3+ +inst/jsupdf.m GPLv3+ +inst/kmeans.m GPLv3+ +inst/linkage.m GPLv3+ +inst/lognstat.m GPLv3+ +inst/mad.m GPLv3+ +inst/mahal.m GPLv3+ +inst/mnpdf.m GPLv3+ +inst/mnrnd.m GPLv3+ +inst/monotone_smooth.m GPLv3+ +inst/mvncdf.m GPLv3+ +inst/mvnpdf.m public domain +inst/mvnrnd.m GPLv3+ +inst/mvtcdf.m GPLv3+ +inst/mvtpdf.m GPLv3+ +inst/mvtrnd.m GPLv3+ +inst/nakacdf.m GPLv3+ +inst/nakainv.m GPLv3+ +inst/nakapdf.m GPLv3+ +inst/nakarnd.m GPLv3+ +inst/nanmax.m GPLv3+ +inst/nanmean.m GPLv3+ +inst/nanmedian.m GPLv3+ +inst/nanmin.m GPLv3+ +inst/nanstd.m GPLv3+ +inst/nansum.m GPLv3+ +inst/nanvar.m GPLv3+ +inst/nbinstat.m GPLv3+ +inst/normalise_distribution.m GPLv3+ +inst/normplot.m public domain +inst/normstat.m GPLv3+ +inst/pcacov.m GPLv3+ +inst/pcares.m GPLv3+ +inst/pdf.m GPLv3+ +inst/pdist.m GPLv3+ +inst/pdist2.m GPLv3+ +inst/plsregress.m GPLv3+ +inst/poisstat.m GPLv3+ +inst/princomp.m public domain +inst/qrandn.m GPLv3+ +inst/random.m GPLv3+ +inst/randsample.m GPLv3+ +inst/raylcdf.m GPLv3+ +inst/raylinv.m GPLv3+ +inst/raylpdf.m GPLv3+ +inst/raylrnd.m GPLv3+ +inst/raylstat.m GPLv3+ +inst/regress_gp.m GPLv3+ +inst/regress.m GPLv3+ +inst/repanova.m.m GPLv3+ +inst/runtest.m GPLv3+ +inst/signtest.m GPLv3+ +inst/squareform.m GPLv3+ +inst/stepwisefit.m GPLv3+ +inst/tabulate.m GPLv3+ +inst/tblread.m GPLv3+ +inst/tblwrite.m GPLv3+ +inst/tricdf.m GPLv3+ +inst/triinv.m GPLv3+ +inst/trimmean.m GPLv3+ +inst/tripdf.m GPLv3+ +inst/trirnd.m GPLv3+ +inst/tstat.m GPLv3+ +inst/ttest.m GPLv3+ +inst/ttest2.m GPLv3+ +inst/unidstat.m GPLv3+ +inst/unifstat.m GPLv3+ +inst/vartest.m GPLv3+ +inst/vartest2.m GPLv3+ +inst/violin.m GPLv3+ +inst/vmpdf.m GPLv3+ +inst/vmrnd.m GPLv3+ +inst/wblstat.m GPLv3+ +inst/wishpdf.m GPLv3+ +inst/wishrnd.m GPLv3+ +inst/ztest.m GPLv3+ diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..26cfd7a --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,11 @@ +Name: statistics +Version: 1.4.3 +Date: 2021-12-03 +Author: various authors +Maintainer: Octave-Forge community +Title: Statistics +Description: Additional statistics functions for Octave. +Categories: Statistics +Depends: octave (>= 4.0.0), io (>= 1.0.18) +License: GPLv3+, public domain +Url: http://octave.sf.net diff --git a/INDEX.in b/INDEX.in new file mode 100644 index 0000000..6d39105 --- /dev/null +++ b/INDEX.in @@ -0,0 +1,142 @@ +statistics >> Statistics +Distributions + anderson_darling_cdf + bbscdf bbsinv bbspdf bbsrnd + betastat + binostat binotest + burrcdf burrinv burrpdf burrrnd + cdf + chi2stat + cl_multinom + copulacdf copulapdf copularnd + datasample + expfit explike expstat + fstat + gamlike + gamstat + geostat + gevcdf gevfit gevfit_lmom gevinv gevlike gevpdf gevrnd gevstat + gmdistribution + gpcdf gpinv gppdf gprnd + hygestat + iwishpdf iwishrnd + jsucdf + jsupdf + lognstat + mnpdf mnrnd + mvnpdf mvnrnd mvncdf + mvtcdf mvtpdf mvtrnd + nakacdf nakainv nakapdf nakarnd + nbinstat + ncx2pdf + normalise_distribution + normstat + pdf + poisstat + qrandn + random + randsample + raylcdf raylinv raylpdf raylrnd raylstat + tricdf triinv tripdf trirnd + tstat + unidstat + unifstat + vmpdf vmrnd + wblstat + wishpdf wishrnd +Descriptive statistics + combnk + dcov + geomean + harmmean + jackknife + nanmax + nanmean + nanmedian + nanmin + nanstd + nansum + nanvar + trimmean + tabulate +Experimental design + fullfact ff2n +Regression + anova1 + anovan + canoncorr + crossval + monotone_smooth + pca + pcacov + pcares + plsregress + princomp + regress + regress_gp + stepwisefit +Plots + boxplot + confusionchart + dendrogram + gscatter + histfit + hist3 + normplot + repanova + silhouette + violin + wblplot +Models + hmmestimate + hmmgenerate + hmmviterbi + mhsample + slicesample +Hypothesis testing + anderson_darling_test + kruskalwallis + runstest + signtest + ttest + ttest2 + vartest + vartest2 + ztest +Fitting + fitgmdist + gamfit +Clustering + cluster + clusterdata + cmdscale + cophenet + evalclusters + inconsistent + kmeans + linkage + mahal + optimalleaforder + pdist + pdist2 + squareform +Reading and Writing + caseread + casewrite + tblread + tblwrite +Cvpartition (class of set partitions for cross-validation, used in crossval) + @cvpartition/cvpartition + @cvpartition/display + @cvpartition/get + @cvpartition/repartition + @cvpartition/set + @cvpartition/test + @cvpartition/training +Categorical data + grp2idx +Classification Performance Evaluation + confusionchart + confusionmat +Other + sigma_pts diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a2d94cd --- /dev/null +++ b/Makefile @@ -0,0 +1,255 @@ +## Copyright 2015-2016 Carnë Draug +## Copyright 2015-2016 Oliver Heimlich +## Copyright 2017 Julien Bect +## Copyright 2017 Olaf Till +## Copyright 2018 John Donoghue +## +## Copying and distribution of this file, with or without modification, +## are permitted in any medium without royalty provided the copyright +## notice and this notice are preserved. This file is offered as-is, +## without any warranty. +TOPDIR := $(shell pwd) + +## Some basic tools (can be overriden using environment variables) +SED ?= sed +TAR ?= tar +GREP ?= grep +CUT ?= cut +TR ?= tr + +## Note the use of ':=' (immediate set) and not just '=' (lazy set). +## http://stackoverflow.com/a/448939/1609556 +package := $(shell $(GREP) "^Name: " DESCRIPTION | $(CUT) -f2 -d" " | \ +$(TR) '[:upper:]' '[:lower:]') +version := $(shell $(GREP) "^Version: " DESCRIPTION | $(CUT) -f2 -d" ") + +## These are the paths that will be created for the releases. +target_dir := target +release_dir := $(target_dir)/$(package)-$(version) +release_tarball := $(target_dir)/$(package)-$(version).tar.gz +html_dir := $(target_dir)/$(package)-html +html_tarball := $(target_dir)/$(package)-html.tar.gz +## Using $(realpath ...) avoids problems with symlinks due to bug +## #50994 in Octaves scripts/pkg/private/install.m. But at least the +## release directory above is needed in the relative form, for 'git +## archive --format=tar --prefix=$(release_dir). +real_target_dir := $(realpath .)/$(target_dir) +installation_dir := $(real_target_dir)/.installation +package_list := $(installation_dir)/.octave_packages +install_stamp := $(installation_dir)/.install_stamp + +## These can be set by environment variables which allow to easily +## test with different Octave versions. +ifndef OCTAVE +OCTAVE := octave +endif +OCTAVE := $(OCTAVE) --no-gui --silent --no-history --norc +MKOCTFILE ?= mkoctfile + +## Command used to set permissions before creating tarballs +FIX_PERMISSIONS ?= chmod -R a+rX,u+w,go-w,ug-s + +## Detect which VCS is used +vcs := $(if $(wildcard .hg),hg,$(if $(wildcard .git),git,unknown)) +ifeq ($(vcs),hg) +release_dir_dep := .hg/dirstate +endif +ifeq ($(vcs),git) +release_dir_dep := .git/index +endif + +HG := hg +HG_CMD = $(HG) --config alias.$(1)=$(1) --config defaults.$(1)= $(1) +HG_ID := $(shell $(call HG_CMD,identify) --id | sed -e 's/+//' ) +HG_TIMESTAMP := $(firstword $(shell $(call HG_CMD,log) --rev $(HG_ID) --template '{date|hgdate}')) + +TAR_REPRODUCIBLE_OPTIONS := --sort=name --mtime="@$(HG_TIMESTAMP)" --owner=0 --group=0 --numeric-owner +TAR_OPTIONS := --format=ustar $(TAR_REPRODUCIBLE_OPTIONS) + +## .PHONY indicates targets that are not filenames +## (https://www.gnu.org/software/make/manual/html_node/Phony-Targets.html) +.PHONY: help + +## make will display the command before runnning them. Use @command +## to not display it (makes specially sense for echo). +help: + @echo "Targets:" + @echo " dist - Create $(release_tarball) for release." + @echo " html - Create $(html_tarball) for release." + @echo " release - Create both of the above and show md5sums." + @echo " install - Install the package in $(installation_dir), where it is not visible in a normal Octave session." + @echo " check - Execute package tests." + @echo " doctest - Test the help texts with the doctest package." + @echo " run - Run Octave with the package installed in $(installation_dir) in the path." + @echo " clean - Remove everything made with this Makefile." + + +## +## Recipes for release tarballs (package + html) +## + +.PHONY: release dist html clean-tarballs clean-unpacked-release + +## To make a release, build the distribution and html tarballs. +release: dist html + md5sum $(release_tarball) $(html_tarball) + @echo "Upload @ https://sourceforge.net/p/octave/package-releases/new/" + @echo " and note the changeset the release corresponds to" + +## dist and html targets are only PHONY/alias targets to the release +## and html tarballs. +dist: $(release_tarball) +html: $(html_tarball) + +## An implicit rule with a recipe to build the tarballs correctly. +%.tar.gz: % + $(TAR) -cf - $(TAR_OPTIONS) -C "$(target_dir)/" "$(notdir $<)" | gzip -9n > "$@" + +clean-tarballs: + @echo "## Cleaning release tarballs (package + html)..." + -$(RM) $(release_tarball) $(html_tarball) + @echo + +## Create the unpacked package. +## +## Notes: +## * having ".hg/dirstate" (or ".git/index") as a prerequesite means it is +## only rebuilt if we are at a different commit. +## * the variable RM usually defaults to "rm -f" +## * having this recipe separate from the one that makes the tarball +## makes it easy to have packages in alternative formats (such as zip) +## * note that if a commands needs to be run in a specific directory, +## the command to "cd" needs to be on the same line. Each line restores +## the original working directory. +$(release_dir): $(release_dir_dep) + -$(RM) -r "$@" +ifeq (${vcs},hg) + hg archive --exclude ".hg*" --type files "$@" +endif +ifeq (${vcs},git) + git archive --format=tar --prefix="$@/" HEAD | $(TAR) -x + $(RM) "$@/.gitignore" +endif +## Don't fall back to run the supposed necessary contents of +## 'bootstrap' here. Users are better off if they provide +## 'bootstrap'. Administrators, checking build reproducibility, can +## put in the missing 'bootstrap' file if they feel they know its +## necessary contents. +ifneq (,$(wildcard src/bootstrap)) + cd "$@/src" && ./bootstrap && $(RM) -r "autom4te.cache" +endif +## Uncomment this if your src/Makefile.in has these targets for +## pre-building something for the release (e.g. documentation). +# cd "$@/src" && ./configure && $(MAKE) prebuild && \ +# $(MAKE) distclean && $(RM) Makefile +## + ${FIX_PERMISSIONS} "$@" + +run_in_place = $(OCTAVE) --eval ' pkg ("local_list", "$(package_list)"); ' \ + --eval ' pkg ("load", "$(package)"); ' + +html_options = --eval 'options = get_html_options ("octave-forge");' +## Uncomment this for package documentation. +# html_options = --eval 'options = get_html_options ("octave-forge");' \ +# --eval 'options.package_doc = "$(package).texi";' +$(html_dir): $(install_stamp) + $(RM) -r "$@"; + $(run_in_place) \ + --eval ' pkg load generate_html; ' \ + $(html_options) \ + --eval ' generate_package_html ("$(package)", "$@", options); '; + $(FIX_PERMISSIONS) "$@"; + +clean-unpacked-release: + @echo "## Cleaning unpacked release tarballs (package + html)..." + -$(RM) -r $(release_dir) $(html_dir) + @echo + +## +## Recipes for installing the package. +## + +.PHONY: install clean-install + +octave_install_commands = \ +' llist_path = pkg ("local_list"); \ + mkdir ("$(installation_dir)"); \ + load (llist_path); \ + local_packages(cellfun (@ (x) strcmp ("$(package)", x.name), local_packages)) = []; \ + save ("$(package_list)", "local_packages"); \ + pkg ("local_list", "$(package_list)"); \ + pkg ("prefix", "$(installation_dir)", "$(installation_dir)"); \ + pkg ("install", "-local", "-verbose", "$(release_tarball)"); ' + +## Install unconditionally. Maybe useful for testing installation with +## different versions of Octave. +install: $(release_tarball) + @echo "Installing package under $(installation_dir) ..." + $(OCTAVE) --eval $(octave_install_commands) + touch $(install_stamp) + +## Install only if installation (under target/...) is not current. +$(install_stamp): $(release_tarball) + @echo "Installing package under $(installation_dir) ..." + $(OCTAVE) --eval $(octave_install_commands) + touch $(install_stamp) + +clean-install: + @echo "## Cleaning installation under $(installation_dir) ..." + -$(RM) -r $(installation_dir) + @echo + + +## +## Recipes for testing purposes +## + +.PHONY: run doctest check clean-check + +## Start an Octave session with the package directories on the path for +## interactice test of development sources. +run: $(install_stamp) + $(run_in_place) --persist + +## Test example blocks in the documentation. Needs doctest package +## https://octave.sourceforge.io/doctest/index.html +doctest: $(install_stamp) + $(run_in_place) --eval 'pkg load doctest;' \ + --eval "targets = '$(shell (ls inst; ls src | $(GREP) .oct) | $(CUT) -f2 -d@ | $(CUT) -f1 -d.)';" \ + --eval "targets = strsplit (targets, ' '); doctest (targets);" + + +## Test package. +orig_octave_test_commands = \ +' dirs = {"inst", "src"}; \ + dirs(cellfun (@ (x) ! isdir (x), dirs)) = []; \ + if (isempty (dirs)) error ("no \"inst\" or \"src\" directory"); exit (1); \ + else __run_test_suite__ (dirs, {}); endif ' +octave_test_commands = \ +' pkgs = pkg("list", "$(package)"); \ + dirs = {pkgs{1}.dir}; \ + __run_test_suite__ (dirs, {}); ' +## the following works, too, but provides no overall summary output as +## __run_test_suite__ does: +## +## else cellfun (@runtests, horzcat (cellfun (@ (dir) ostrsplit (([~, dirs] = system (sprintf ("find %s -type d", dir))), "\n\r", true), dirs, "UniformOutput", false){:})); endif ' +check: $(install_stamp) + $(run_in_place) --eval $(octave_test_commands) + +clean-check: + @echo "## Removing fntests.log ..." + test -e $(target_dir)/fntests.log && rm -f $(target_dir)/fntests.log || true + +## +## CLEAN +## + +.PHONY: clean + +clean: clean-tarballs clean-unpacked-release clean-install clean-check + @echo "## Removing target directory (if empty)..." + test -e $(target_dir) && rmdir $(target_dir) || true + @echo + @echo "## Cleaning done" + @echo + diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..7fccb9b --- /dev/null +++ b/NEWS @@ -0,0 +1,471 @@ +Summary of important user-visible changes for statistics 1.4.3: +------------------------------------------------------------------- + + New functions: + ============== + + ** anova1 (patch #10127) + kruskalwallis + + ** cluster (patch #10009) + + ** clusterdata (patch #10012) + + ** confusionchart (patch #9985) + + ** confusionmat (patch #9971) + + ** cophenet (patch #10040) + + ** datasample (patch #10050) + + ** evalclusters (patch #10052) + + ** expfit (patch #10092) + explike + + ** gscatter (patch #10043) + + ** ismissing (patch #10102) + + ** inconsistent (patch #10008) + + ** mhsample.m (patch #10016) + + ** ncx2pdf (patch #9711) + + ** optimalleaforder.m (patch #10034) + + ** pca (patch #10104) + + ** rmmissing (patch #10102) + + ** silhouette (patch #9743) + + ** slicesample (patch #10019) + + ** wblplot (patch #8579) + + Improvements: + ============= + + ** anovan.m: use double instead of toascii (bug #60514) + + ** binocdf: new option "upper" (bug #43721) + + ** boxplot: better Matlab compatibility; several Matlab-compatible + plot options added (OutlierTags, Sample_IDs, BoxWidth, Widths, + BoxStyle, Positions, Labels, Colors) and an Octave-specific one + (CapWidhts); demos added; texinfo improved (patch #9930) + + ** auto MPG (carbig) sample dataset added from + https://archive.ics.uci.edu/ml/datasets/Auto+MPG (patch #10045) + + ** crosstab.m: make n-dimensional (patch #10014) + + ** dendrogram.m: many improvements (patch #10036) + + ** fitgmdist.m: fix typo in ComponentProportion (bug #59386) + + ** gevfit: change orientation of results for Matlab compatibility (bug #47369) + + ** hygepdf: avoid overflow for certain inputs (bug #35827) + + ** kmeans: efficiency and compatibility tweaks (patch #10042) + + ** pdist: option for squared Euclidean distance (patch #10051) + + ** stepwisefit.m: give another option to select predictors (patch #8584) + + ** tricdf, triinv: fixes (bug #60113) + + +Summary of important user-visible changes for statistics 1.4.2: +------------------------------------------------------------------- + + ** canoncorr: allow more variables than observations + + ** fitgmdist: return fitgmdist parameters (Bug #57917) + + ** gamfit: invert parameter per docs (Bug #57849) + + ** geoXXX: update docs 'number of failures (X-1)' => 'number of failures (X)' (Bug #57606) + + ** kolmogorov_smirnov_test.m: update function handle usage from octave6+ (Bug #57351) + + ** linkage.m: fix octave6+ parse error (Bug #57348) + + ** unifrnd: changed unifrnd(a,a) to return a 0 rather than NaN (Bug #56342) + + ** updates for usage of depreciated octave functions + +Summary of important user-visible changes for statistics 1.4.1: +------------------------------------------------------------------- + ** update install scripts for octave 5.0 depreciated functions + + ** bug fixes to the following functions: + pdist2.m: use max in distEucSq (Bug #50377) + normpdf: use eps tolerance in tests (Bug #51963) + fitgmdist: fix an output bug in fitgmdist + t_test: Set tolerance on t_test BISTS (Bug #54557) + gpXXXXX: change order of inputs to match matlab (Bug #54009) + bartlett_test: df = k-1 (Bug #45894) + gppdf: apply scale factor (Bug #54009) + gmdistribution: updates for bug #54278, ##54279 + wishrnd: Bug #55860 + +Summary of important user-visible changes for statistics 1.4.0: +------------------------------------------------------------------- + + ** The following functions are new: + + canoncorr + fitgmdist + gmdistribution + sigma_pts + + ** The following functions have been moved from the statistics package but are + conditionally installed: + + mad + + ** The following functions have been moved from octave to be conditionally + installed: + + BASE + cloglog + logit + prctile + probit + qqplot + table (renamed to crosstab) + + DISTRIBUTIONS + betacdf + betainv + betapdf + betarnd + binocdf + binoinv + binopdf + binornd + cauchy_cdf + cauchy_inv + cauchy_pdf + cauchy_rnd + chi2cdf + chi2inv + chi2pdf + chi2rnd + expcdf + expinv + exppdf + exprnd + fcdf + finv + fpdf + frnd + gamcdf + gaminv + gampdf + gamrnd + geocdf + geoinv + geopdf + geornd + hygecdf + hygeinv + hygepdf + hygernd + kolmogorov_smirnov_cdf + laplace_cdf + laplace_inv + laplace_pdf + laplace_rnd + logistic_cdf + logistic_inv + logistic_pdf + logistic_rnd + logncdf + logninv + lognpdf + lognrnd + nbincdf + nbininv + nbinpdf + nbinrnd + normcdf + norminv + normpdf + normrnd + poisscdf + poissinv + poisspdf + poissrnd + stdnormal_cdf + stdnormal_inv + stdnormal_pdf + stdnormal_rnd + tcdf + tinv + tpdf + trnd + unidcdf + unidinv + unidpdf + unidrnd + unifcdf + unifinv + unifpdf + unifrnd + wblcdf + wblinv + wblpdf + wblrnd + wienrnd + + MODELS + logistic_regression + + TESTS + anova + bartlett_test + chisquare_test_homogeneity + chisquare_test_independence + cor_test + f_test_regression + hotelling_test + hotelling_test_2 + kolmogorov_smirnov_test + kolmogorov_smirnov_test_2 + kruskal_wallis_test + manova + mcnemar_test + prop_test_2 + run_test + sign_test + t_test + t_test_2 + t_test_regression + u_test + var_test + welch_test + wilcoxon_test + z_test + z_test_2 + + ** Functions marked with known test failures: + grp2idx: bug #51928 + gevfir_lmom: bug #31070 + + ** Other functions that have been changed for smaller bugfixes, increased + Matlab compatibility, or performance: + + dcov: returned dcov instead of dcor. added demo. + violin: can be used with subplots. violin quality improved. + princomp: Fix expected values of tsquare in unit tests + fitgmdist: test number inputs to function + hist3: fix removal of rows with NaN values + + ** added the packages test data to install + + +Summary of important user-visible changes for statistics 1.3.0: +------------------------------------------------------------------- + + ** The following functions are new: + + bbscdf bbsinv bbspdf bbsrnd + binotest + burrcdf burrinv burrpdf burrrnd + gpcdf gpinv gppdf gprnd + grp2idx + mahal + mvtpdf + nakacdf nakainv nakapdf nakarnd + pdf + tricdf triinv tripdf trirnd + violin + + ** Other functions that have been changed for smaller bugfixes, increased + Matlab compatibility, or performance: + + betastat + binostat + cdf + combnk + gevfit + hist3 + kmeans + linkage + randsample + squareform + ttest + + +Summary of important user-visible changes for statistics 1.2.4: +------------------------------------------------------------------- + + ** Made princomp work with nargout < 2. + + ** Renamed dendogram to dendrogram. + + ** Added isempty check to kmeans. + + ** Transposed output of hist3. + + ** Converted calculation in hmmviterbi to log space. + + ** Bug fixes for + stepwisefit wishrnd. + + ** Rewrite of cmdscale for improved compatibility. + + ** Fix in squareform for improved compatibility. + + ** New cvpartition class, with methods: + + display repartition test training + + ** New sample data file fisheriris.txt for tests + + ** The following functions are new: + + cdf crossval dcov pdist2 qrandn randsample signtest ttest ttest2 + vartest vartest2 ztest + + +Summary of important user-visible changes for statistics 1.2.3: +------------------------------------------------------------------- + + ** Made sure that output of nanstd is real. + + ** Fixed second output of nanmax and nanmin. + + ** Corrected handle for outliers in boxplot. + + ** Bug fix and enhanced functionality for mvnrnd. + + ** The following functions are new: + + wishrnd iwishrnd wishpdf iwishpdf cmdscale + +Summary of important user-visible changes for statistics 1.2.2: +------------------------------------------------------------------- + + ** Fixed documentation of dendogram and hist3 to work with TexInfo 5. + +Summary of important user-visible changes for statistics 1.2.1: +------------------------------------------------------------------- + + ** The following functions are new: + + pcares pcacov runstest stepwisefit hist3 + + ** dendogram now returns the leaf node numbers and order that the nodes were displayed in. + + ** New faster implementation of princomp. + +Summary of important user-visible changes for statistics 1.2.0: +------------------------------------------------------------------- + + ** The following functions are new: + + regress_gp dendogram plsregress + + ** New functions for the generalized extreme value (GEV) distribution: + + gevcdf gevfit gevfit_lmom gevinv gevlike gevpdf gevrnd gevstat + + ** The interface of the following functions has been modified: + + mvnrnd + + ** `kmeans' has been fixed to deal with clusters that contain only + one element. + + ** `normplot' has been fixed to avoid use of functions that have been + removed from Octave core. Also, the plot produced should now display some + aesthetic elements and appropriate legends. + + ** The help text of `mvtrnd' has been improved. + + ** Package is no longer autoloaded. + +Summary of important user-visible changes for statistics 1.1.3: +------------------------------------------------------------------- + + ** The following functions are new in 1.1.3: + + copularnd mvtrnd + + ** The functions mnpdf and mnrnd are now also usable for greater numbers + of categories for which the rows do not exactly sum to 1. + +Summary of important user-visible changes for statistics 1.1.2: +------------------------------------------------------------------- + + ** The following functions are new in 1.1.2: + + mnpdf mnrnd + + ** The package is now dependent on the io package (version 1.0.18 or + later) since the functions that it depended of from miscellaneous + package have been moved to io. + + ** The function `kmeans' now accepts the 'emptyaction' property with + the 'singleton' value. This allows for the kmeans algorithm to handle + empty cluster better. It also throws an error if the user does not + request an empty cluster handling, and there is an empty cluster. + Plus, the returned items are now a closer match to Matlab. + +Summary of important user-visible changes for statistics 1.1.1: +------------------------------------------------------------------- + + ** The following functions are new in 1.1.1: + + monotone_smooth kmeans jackknife + + ** Bug fixes on the functions: + + normalise_distribution combnk + repanova + + ** The following functions were removed since equivalents are now + part of GNU octave core: + + zscore + + ** boxplot.m now returns a structure with handles to the plot elemenets. + +Summary of important user-visible changes for statistics 1.1.0: +------------------------------------------------------------------- + + ** IMPORTANT note about `fstat' shadowing core library function: + + GNU octave's 3.2 release added a new function `fstat' to return + information of a file. Statistics' `fstat' computes F mean and + variance. Since MatLab's `fstat' is the equivalent to statistics' + `fstat' (not to core's `fstat'), and to avoid problems with the + statistics package, `fstat' has been deprecated in octave 3.4 + and will be removed in Octave 3.8. In the mean time, please + ignore this warning when installing the package. + + ** The following functions are new in 1.1.0: + + normalise_distribution repanova combnk + + ** The following functions were removed since equivalents are now + part of GNU octave core: + + prctile + + ** The __tbl_delim__ function is now private. + + ** The function `boxplot' now accepts named arguments. + + ** Bug fixes on the functions: + + harmmean nanmax nanmin regress + + ** Small improvements on help text. diff --git a/PKG_ADD b/PKG_ADD new file mode 100644 index 0000000..71bcd06 --- /dev/null +++ b/PKG_ADD @@ -0,0 +1,19 @@ +## three problems: +## - the directory returned by 'mfilename' must not be added also +## (endless loop) +## - 'genpath' of Octave 4.0 includes directories indiscriminately, +## e.g. 'private' directories +## - PKG_ADD (and PKG_DEL?) is run during installation, too, from the +## root directory of the package, where no such subdirectories +## exist. +if exist ("isfolder") == 0 + if (isdir (fullfile (fileparts (mfilename ("fullpath")), "base"))) + addpath (fullfile (fileparts (mfilename ("fullpath")), + {"base", "distributions", "models", "tests"}){:}); + endif +else + if (isfolder (fullfile (fileparts (mfilename ("fullpath")), "base"))) + addpath (fullfile (fileparts (mfilename ("fullpath")), + {"base", "distributions", "models", "tests"}){:}); + endif +endif diff --git a/PKG_DEL b/PKG_DEL new file mode 100644 index 0000000..a2f4668 --- /dev/null +++ b/PKG_DEL @@ -0,0 +1,19 @@ +## three problems: +## - the directory returned by 'mfilename' must not be added also +## (endless loop) +## - 'genpath' of Octave 4.0 includes directories indiscriminately, +## e.g. 'private' directories +## - PKG_ADD (and PKG_DEL?) is run during installation, too, from the +## root directory of the package, where no such subdirectories +## exist. +if exist ("isfolder") == 0 + if (isdir (fullfile (fileparts (mfilename ("fullpath")), "base"))) + rmpath (fullfile (fileparts (mfilename ("fullpath")), + {"base", "distributions", "models", "tests"}){:}); + endif +else + if (isfolder (fullfile (fileparts (mfilename ("fullpath")), "base"))) + rmpath (fullfile (fileparts (mfilename ("fullpath")), + {"base", "distributions", "models", "tests"}){:}); + endif +endif diff --git a/README.crosscompilation b/README.crosscompilation new file mode 100644 index 0000000..78df829 --- /dev/null +++ b/README.crosscompilation @@ -0,0 +1,2 @@ +Instructions for cross-compiling are in +utils/conditional_installation.m . diff --git a/inst/@cvpartition/cvpartition.m b/inst/@cvpartition/cvpartition.m new file mode 100644 index 0000000..7691c0c --- /dev/null +++ b/inst/@cvpartition/cvpartition.m @@ -0,0 +1,186 @@ +## Copyright (C) 2014 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; If not, see . + +## -*- texinfo -*- +## @deftypefn{Function File}{@var{C} =} cvpartition (@var{X}, [@var{partition_type}, [@var{k}]]) +## Create a partition object for cross validation. +## +## @var{X} may be a positive integer, interpreted as the number of values @var{n} to partition, or a vector of length @var{n} containing class designations for the elements, in which case the partitioning types @var{KFold} and @var{HoldOut} attempt to ensure each partition represents the classes proportionately. +## +## @var{partition_type} must be one of the following: +## +## @table @asis +## @item @samp{KFold} +## Divide set into @var{k} equal-size subsets (this is the default, with @var{k}=10). +## @item @samp{HoldOut} +## Divide set into two subsets, "training" and "validation". If @var{k} is a fraction, that is the fraction of values put in the validation subset; if it is a positive integer, that is the number of values in the validation subset (by default @var{k}=0.1). +## @item @samp{LeaveOut} +## Leave-one-out partition (each element is placed in its own subset). +## @item @samp{resubstitution} +## Training and validation subsets that both contain all the original elements. +## @item @samp{Given} +## Subset indices are as given in @var{X}. +## @end table +## +## The following fields are defined for the @samp{cvpartition} class: +## +## @table @asis +## @item @samp{classes} +## Class designations for the elements. +## @item @samp{inds} +## Subset indices for the elements. +## @item @samp{n_classes} +## Number of different classes. +## @item @samp{NumObservations} +## @var{n}, number of elements in data set. +## @item @samp{NumTestSets} +## Number of testing subsets. +## @item @samp{TestSize} +## Number of elements in (each) testing subset. +## @item @samp{TrainSize} +## Number of elements in (each) training subset. +## @item @samp{Type} +## Partition type. +## @end table +## +## @seealso{crossval} +## @end deftypefn + +## Author: Nir Krakauer + +function C = cvpartition (X, partition_type = 'KFold', k = []) + + if (nargin < 1 || nargin > 3 || !isvector(X)) + print_usage (); + endif + + if isscalar (X) + n = X; + n_classes = 1; + else + n = numel (X); + endif + + switch tolower(partition_type) + case {'kfold' 'holdout' 'leaveout' 'resubstitution' 'given'} + otherwise + warning ('unrecognized type, using KFold') + partition_type = 'KFold'; + endswitch + + switch tolower(partition_type) + case {'kfold' 'holdout' 'given'} + if !isscalar (X) + [y, ~, j] = unique (X(:)); + n_per_class = accumarray (j, 1); + n_classes = numel (n_per_class); + endif + endswitch + + C = struct ("classes", [], "inds", [], "n_classes", [], "NumObservations", [], "NumTestSets", [], "TestSize", [], "TrainSize", [], "Type", []); + #The non-Matlab fields classes, inds, n_classes are only useful for some methods + + switch tolower(partition_type) + case 'kfold' + if isempty (k) + k = 10; + endif + if n_classes == 1 + inds = floor((0:(n-1))' * (k / n)) + 1; + else + inds = nan(n, 1); + for i = 1:n_classes + if mod (i, 2) #alternate ordering over classes so that the subsets are more nearly the same size + inds(j == i) = floor((0:(n_per_class(i)-1))' * (k / n_per_class(i))) + 1; + else + inds(j == i) = floor(((n_per_class(i)-1):-1:0)' * (k / n_per_class(i))) + 1; + endif + endfor + endif + C.inds = inds; + C.NumTestSets = k; + [~, ~, jj] = unique (inds); + n_per_subset = accumarray (jj, 1); + C.TrainSize = n - n_per_subset; + C.TestSize = n_per_subset; + case 'given' + C.inds = j; + C.NumTestSets = n_classes; + C.TrainSize = n - n_per_class; + C.TestSize = n_per_class; + case 'holdout' + if isempty (k) + k = 0.1; + endif + if k < 1 + f = k; #target fraction to sample + k = round (k * n); #number of samples + else + f = k / n; + endif + inds = zeros (n, 1, "logical"); + if n_classes == 1 + inds(randsample(n, k)) = true; #indices for test set + else #sample from each class + k_check = 0; + for i = 1:n_classes + ki = round(f*n_per_class(i)); + inds(find(j == i)(randsample(n_per_class(i), ki))) = true; + k_check += ki; + endfor + if k_check < k #add random elements to test set to make it k + inds(find(!inds)(randsample(n - k_check, k - k_check))) = true; + elseif k_check > k #remove random elements from test set + inds(find(inds)(randsample(k_check, k_check - k))) = false; + endif + C.classes = j; + endif + C.n_classes = n_classes; + C.TrainSize = n - k; + C.TestSize = k; + C.NumTestSets = 1; + C.inds = inds; + case 'leaveout' + C.TrainSize = ones (n, 1); + C.TestSize = (n-1) * ones (n, 1); + C.NumTestSets = n; + case 'resubstitution' + C.TrainSize = C.TestSize = n; + C.NumTestSets = 1; + endswitch + + C.NumObservations = n; + C.Type = tolower (partition_type); + + C = class (C, "cvpartition"); + +endfunction + + +%!demo +%! # Partition with Fisher iris dataset (n = 150) +%! # Stratified by species +%! load fisheriris.txt +%! y = fisheriris(:, 1); +%! # 10-fold cross-validation partition +%! c = cvpartition (y, 'KFold', 10) +%! # leave-10-out partition +%! c1 = cvpartition (y, 'HoldOut', 10) +%! idx1 = test (c, 2); +%! idx2 = training (c, 2); +%! # another leave-10-out partition +%! c2 = repartition (c1) +#plot(struct(c).inds, '*') + diff --git a/inst/@cvpartition/display.m b/inst/@cvpartition/display.m new file mode 100644 index 0000000..a296659 --- /dev/null +++ b/inst/@cvpartition/display.m @@ -0,0 +1,50 @@ +## Copyright (C) 2014 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; If not, see . + +## -*- texinfo -*- +## @deftypefn{Function File} display (@var{C}) +## Display a cvpartition object. +## +## @seealso{cvpartition} +## @end deftypefn + +## Author: Nir Krakauer + +function display (C) + + if nargin != 1 + print_usage (); + endif + + switch C.Type + case 'kfold' + str = 'K-fold'; + case 'given' + str = 'Given'; + case 'holdout' + str = 'HoldOut'; + case 'leaveout' + str = 'Leave-One-Out'; + case 'resubstitution' + str = 'Resubstitution'; + otherwise + str = 'Unknown-type'; + endswitch + +disp([str ' cross validation partition']) +disp([' N: ' num2str(C.NumObservations)]) +disp(['NumTestSets: ' num2str(C.NumTestSets)]) +disp([' TrainSize: ' num2str(C.TrainSize')]) +disp([' TestSize: ' num2str(C.TestSize')]) diff --git a/inst/@cvpartition/get.m b/inst/@cvpartition/get.m new file mode 100644 index 0000000..22dff0c --- /dev/null +++ b/inst/@cvpartition/get.m @@ -0,0 +1,40 @@ +## Copyright (C) 2014 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; If not, see . + +## -*- texinfo -*- +## @deftypefn{Function File}@var{s} = get (@var{c}, [@var{f}]) +## Get a field from a @samp{cvpartition} object. +## +## @seealso{cvpartition} +## @end deftypefn + +function s = get (c, f) + if (nargin == 1) + s = c; + elseif (nargin == 2) + if (ischar (f)) + switch (f) + case {"classes", "inds", "n_classes", "NumObservations", "NumTestSets", "TestSize", "TrainSize", "Type"} + s = eval(["struct(c)." f]); + otherwise + error ("get: invalid property %s", f); + endswitch + else + error ("get: expecting the property to be a string"); + endif + else + print_usage (); + endif +endfunction diff --git a/inst/@cvpartition/repartition.m b/inst/@cvpartition/repartition.m new file mode 100644 index 0000000..d0da58b --- /dev/null +++ b/inst/@cvpartition/repartition.m @@ -0,0 +1,70 @@ +## Copyright (C) 2014 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; If not, see . + +## -*- texinfo -*- +## @deftypefn{Function File}{@var{Cnew} =} repartition (@var{C}) +## Return a new cvpartition object. +## +## @var{C} should be a cvpartition object. @var{Cnew} will use the same partition_type as @var{C} but redo any randomization performed (currently, only the HoldOut type uses randomization). +## +## @seealso{cvpartition} +## @end deftypefn + +## Author: Nir Krakauer + +function Cnew = repartition (C) + + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + Cnew = C; + + switch C.Type + case 'kfold' + case 'given' + case 'holdout' #currently, only the HoldOut method uses randomization + n = C.NumObservations; + k = C.TestSize; + n_classes = C.n_classes; + if k < 1 + f = k; #target fraction to sample + k = round (k * n); #number of samples + else + f = k / n; + endif + inds = zeros (n, 1, "logical"); + if n_classes == 1 + inds(randsample(n, k)) = true; #indices for test set + else #sample from each class + j = C.classes; #integer class labels + n_per_class = accumarray (j, 1); + n_classes = numel (n_per_class); + k_check = 0; + for i = 1:n_classes + ki = round(f*n_per_class(i)); + inds(find(j == i)(randsample(n_per_class(i), ki))) = true; + k_check += ki; + endfor + if k_check < k #add random elements to test set to make it k + inds(find(!inds)(randsample(n - k_check, k - k_check))) = true; + elseif k_check > k #remove random elements from test set + inds(find(inds)(randsample(k_check, k_check - k))) = false; + endif + endif + Cnew.inds = inds; + case 'leaveout' + case 'resubstitution' + endswitch diff --git a/inst/@cvpartition/set.m b/inst/@cvpartition/set.m new file mode 100644 index 0000000..892d303 --- /dev/null +++ b/inst/@cvpartition/set.m @@ -0,0 +1,45 @@ +## Copyright (C) 2014 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; If not, see . + +## -*- texinfo -*- +## @deftypefn{Function File}@var{s} = set (@var{c}, @var{varargin}) +## Set field(s) in a @samp{cvpartition} object. +## +## @seealso{cvpartition} +## @end deftypefn + +function s = set (c, varargin) + s = struct(c); + if (length (varargin) < 2 || rem (length (varargin), 2) != 0) + error ("set: expecting property/value pairs"); + endif + while (length (varargin) > 1) + prop = varargin{1}; + val = varargin{2}; + varargin(1:2) = []; + if (ischar (prop)) + switch (prop) + case {"classes", "inds", "n_classes", "NumObservations", "NumTestSets", "TestSize", "TrainSize", "Type"} + s = setfield (s, prop, val); + otherwise + error ("set: invalid property %s", f); + endswitch + else + error ("set: expecting the property to be a string"); + endif + endwhile + s = class (s, "cvpartition"); +endfunction + diff --git a/inst/@cvpartition/test.m b/inst/@cvpartition/test.m new file mode 100644 index 0000000..57cc830 --- /dev/null +++ b/inst/@cvpartition/test.m @@ -0,0 +1,47 @@ +## Copyright (C) 2014 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; If not, see . + +## -*- texinfo -*- +## @deftypefn{Function File}{@var{inds} =} test (@var{C}, [@var{i}]) +## Return logical vector for testing-subset indices from a cvpartition object. +## +## @var{C} should be a cvpartition object. @var{i} is the fold index (default is 1). +## +## @seealso{cvpartition, @@cvpartition/training} +## @end deftypefn + +## Author: Nir Krakauer + +function inds = test (C, i = []) + + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + if nargin < 2 || isempty (i) + i = 1; + endif + + switch C.Type + case {'kfold' 'given'} + inds = C.inds == i; + case 'holdout' + inds = C.inds; + case 'leaveout' + inds = zeros(C.NumObservations, 1, "logical"); + inds(i) = true; + case 'resubstitution' + inds = ones(C.NumObservations, 1, "logical"); + endswitch diff --git a/inst/@cvpartition/training.m b/inst/@cvpartition/training.m new file mode 100644 index 0000000..ae61142 --- /dev/null +++ b/inst/@cvpartition/training.m @@ -0,0 +1,47 @@ +## Copyright (C) 2014 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; If not, see . + +## -*- texinfo -*- +## @deftypefn{Function File}{@var{inds} =} training (@var{C}, [@var{i}]) +## Return logical vector for training-subset indices from a cvpartition object. +## +## @var{C} should be a cvpartition object. @var{i} is the fold index (default is 1). +## +## @seealso{cvpartition, @@cvpartition/test} +## @end deftypefn + +## Author: Nir Krakauer + +function inds = training (C, i = []) + + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + if nargin < 2 || isempty (i) + i = 1; + endif + + switch C.Type + case {'kfold' 'given'} + inds = C.inds != i; + case 'holdout' + inds = !C.inds; + case 'leaveout' + inds = ones (C.NumObservations, 1, "logical"); + inds(i) = false; + case 'resubstitution' + inds = ones (C.NumObservations, 1, "logical"); + endswitch diff --git a/inst/ConfusionMatrixChart.m b/inst/ConfusionMatrixChart.m new file mode 100644 index 0000000..9a7cd5c --- /dev/null +++ b/inst/ConfusionMatrixChart.m @@ -0,0 +1,901 @@ +## Copyright (C) 2020-2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## +## Author: Stefano Guidoni + +classdef ConfusionMatrixChart < handle + + ## -*- texinfo -*- + ## @deftypefn {} {@var{p} =} ConfusionMatrixChart () + ## Create object @var{p}, a Confusion Matrix Chart object. + ## + ## @table @asis + ## @item @qcode{"DiagonalColor"} + ## The color of the patches on the diagonal, default is [0.0, 0.4471, 0.7412]. + ## + ## @item @qcode{"OffDiagonalColor"} + ## The color of the patches off the diagonal, default is [0.851, 0.3255, 0.098]. + ## + ## @item @qcode{"GridVisible"} + ## Available values: @qcode{on} (default), @qcode{off}. + ## + ## @item @qcode{"Normalization"} + ## Available values: @qcode{absolute} (default), @qcode{column-normalized}, + ## @qcode{row-normalized}, @qcode{total-normalized}. + ## + ## @item @qcode{"ColumnSummary"} + ## Available values: @qcode{off} (default), @qcode{absolute}, + ## @qcode{column-normalized},@qcode{total-normalized}. + ## + ## @item @qcode{"RowSummary"} + ## Available values: @qcode{off} (default), @qcode{absolute}, + ## @qcode{row-normalized}, @qcode{total-normalized}. + ## @end table + ## + ## MATLAB compatibility -- the not implemented properties are: FontColor, + ## PositionConstraint, InnerPosition, Layout. + ## + ## @seealso{confusionchart} + ## @end deftypefn + + properties (Access = public) + ## text properties + XLabel = "Predicted Class"; + YLabel = "True Class"; + Title = ""; + + FontName = ""; + FontSize = 0; + + ## chart colours + DiagonalColor = [0 0.4471 0.7412]; + OffDiagonalColor = [0.8510 0.3255 0.0980]; + + ## data visualization + Normalization = "absolute"; + ColumnSummary = "off"; + RowSummary = "off"; + + GridVisible = "on"; + + HandleVisibility = ""; + OuterPosition = []; + Position = []; + Units = ""; + endproperties + + properties (GetAccess = public, SetAccess = private) + ClassLabels = {}; # a string cell array of classes + NormalizedValues = []; # the normalized confusion matrix + Parent = 0; # a handle to the parent object + endproperties + + properties (Access = protected) + hax = 0.0; # a handle to the axes + ClassN = 0; # the number of classes + AbsoluteValues = []; # the original confusion matrix + ColumnSummaryAbsoluteValues = []; # default values of the column summary + RowSummaryAbsoluteValues = []; # default values of the row summary + endproperties + + methods (Access = public) + ## class constructor + ## inputs: axis handle, a confusion matrix, a list of class labels, + ## an array of optional property-value pairs. + function this = ConfusionMatrixChart (hax, cm, cl, args) + ## class initialization + this.hax = hax; + this.Parent = get (this.hax, "parent"); + this.ClassLabels = cl; + this.NormalizedValues = cm; + this.AbsoluteValues = cm; + this.ClassN = rows (cm); + this.FontName = get (this.hax, "fontname"); + this.FontSize = get (this.hax, "fontsize"); + + set (this.hax, "xlabel", this.XLabel); + set (this.hax, "ylabel", this.YLabel); + + ## draw the chart + draw (this); + + ## apply paired properties + if (! isempty (args)) + pair_idx = 1; + while (pair_idx < length (args)) + switch (args{pair_idx}) + case "XLabel" + this.XLabel = args{pair_idx + 1}; + case "YLabel" + this.YLabel = args{pair_idx + 1}; + case "Title" + this.Title = args{pair_idx + 1}; + case "FontName" + this.FontName = args{pair_idx + 1}; + case "FontSize" + this.FontSize = args{pair_idx + 1}; + case "DiagonalColor" + this.DiagonalColor = args{pair_idx + 1}; + case "OffDiagonalColor" + this.OffDiagonalColor = args{pair_idx + 1}; + case "Normalization" + this.Normalization = args{pair_idx + 1}; + case "ColumnSummary" + this.ColumnSummary = args{pair_idx + 1}; + case "RowSummary" + this.RowSummary = args{pair_idx + 1}; + case "GridVisible" + this.GridVisible = args{pair_idx + 1}; + case "HandleVisibility" + this.HandleVisibility = args{pair_idx + 1}; + case "OuterPosition" + this.OuterPosition = args{pair_idx + 1}; + case "Position" + this.Position = args{pair_idx + 1}; + case "Units" + this.Units = args{pair_idx + 1}; + otherwise + close (this.Parent); + error ("confusionchart: invalid property %s", args{pair_idx}); + endswitch + + pair_idx += 2; + endwhile + endif + + ## init the color map + updateColorMap (this); + endfunction + + ## set functions + function set.XLabel (this, string) + if (! ischar (string)) + close (this.Parent); + error ("confusionchart: XLabel must be a string."); + endif + + this.XLabel = updateAxesProperties (this, "xlabel", string); + endfunction + + function set.YLabel (this, string) + if (! ischar (string)) + close (this.Parent); + error ("confusionchart: YLabel must be a string."); + endif + + this.YLabel = updateAxesProperties (this, "ylabel", string); + endfunction + + function set.Title (this, string) + if (! ischar (string)) + close (this.Parent); + error ("confusionchart: Title must be a string."); + endif + + this.Title = updateAxesProperties (this, "title", string); + endfunction + + function set.FontName (this, string) + if (! ischar (string)) + close (this.Parent); + error ("confusionchart: FontName must be a string."); + endif + + this.FontName = updateTextProperties (this, "fontname", string); + endfunction + + function set.FontSize (this, value) + if (! isnumeric (value)) + close (this.Parent); + error ("confusionchart: FontSize must be numeric."); + endif + + this.FontSize = updateTextProperties (this, "fontsize", value); + endfunction + + function set.DiagonalColor (this, color) + if (ischar (color)) + color = this.convertNamedColor (color); + endif + + if (! (isvector (color) && length (color) == 3 )) + close (this.Parent); + error ("confusionchart: DiagonalColor must be a color."); + endif + + this.DiagonalColor = color; + updateColorMap (this); + endfunction + + function set.OffDiagonalColor (this, color) + if (ischar (color)) + color = this.convertNamedColor (color); + endif + + if (! (isvector (color) && length (color) == 3)) + close (this.Parent); + error ("confusionchart: OffDiagonalColor must be a color."); + endif + + this.OffDiagonalColor = color; + updateColorMap (this); + endfunction + + function set.Normalization (this, string) + if (! any (strcmp (string, {"absolute", "column-normalized",... + "row-normalized", "total-normalized"}))) + close (this.Parent); + error ("confusionchart: invalid value for Normalization."); + endif + + this.Normalization = string; + updateChart (this); + endfunction + + function set.ColumnSummary (this, string) + if (! any (strcmp (string, {"off", "absolute", "column-normalized",... + "total-normalized"}))) + close (this.Parent); + error ("confusionchart: invalid value for ColumnSummary."); + endif + + this.ColumnSummary = string; + updateChart (this); + endfunction + + function set.RowSummary (this, string) + if (! any (strcmp (string, {"off", "absolute", "row-normalized",... + "total-normalized"}))) + close (this.Parent); + error ("confusionchart: invalid value for RowSummary."); + endif + + this.RowSummary = string; + updateChart (this); + endfunction + + function set.GridVisible (this, string) + if (! any (strcmp (string, {"off", "on"}))) + close (this.Parent); + error ("confusionchart: invalid value for GridVisible."); + endif + + this.GridVisible = string; + setGridVisibility (this); + endfunction + + function set.HandleVisibility (this, string) + if (! any (strcmp (string, {"off", "on", "callback"}))) + close (this.Parent); + error ("confusionchart: invalid value for HandleVisibility"); + endif + + set (this.hax, "handlevisibility", string); + endfunction + + function set.OuterPosition (this, vector) + if (! isvector (vector) || ! isnumeric (vector) || length (vector) != 4) + close (this.Parent); + error ("confusionchart: invalid value for OuterPosition"); + endif + + set (this.hax, "outerposition", vector); + endfunction + + function set.Position (this, vector) + if (! isvector (vector) || ! isnumeric (vector) || length (vector) != 4) + close (this.Parent); + error ("confusionchart: invalid value for Position"); + endif + + set (this.hax, "position", vector); + endfunction + + function set.Units (this, string) + if (! any (strcmp (string, {"centimeters", "characters", "inches", ... + "normalized", "pixels", "points"}))) + close (this.Parent); + error ("confusionchart: invalid value for Units"); + endif + + set (this.hax, "units", string); + endfunction + + ## display method + ## MATLAB compatibility, this tries to mimic the MATLAB behaviour + function disp (this) + nv_sizes = size (this.NormalizedValues); + cl_sizes = size (this.ClassLabels); + + printf ("%s with properties:\n\n", class (this)); + printf ("\tNormalizedValues: [ %dx%d %s ]\n", nv_sizes(1), nv_sizes(2),... + class (this.NormalizedValues)); + printf ("\tClassLabels: { %dx%d %s }\n\n", cl_sizes(1), cl_sizes(2),... + class (this.ClassLabels)); + endfunction + + ## sortClasses + ## reorder the chart + function sortClasses (this, order) + ## -*- texinfo -*- + ## @deftypefn {} {} sortClasses (@var{cm},@var{order}) + ## Sort the classes of the @code{ConfusionMatriChart} object @var{cm} + ## according to @var{order}. + ## + ## Valid values for @var{order} can be an array or cell array including + ## the same class labels as @var{cm}, or a value like @code{'auto'}, + ## @code{'ascending-diagonal'}, @code{'descending-diagonal'} and + ## @code{'cluster'}. + ## + ## @end deftypefn + ## + ## @seealso{confusionchart, linkage, pdist} + + ## check the input parameters + if (nargin != 2) + print_usage (); + endif + + cl = this.ClassLabels; + cm_size = this.ClassN; + nv = this.NormalizedValues; + av = this.AbsoluteValues; + cv = this.ColumnSummaryAbsoluteValues; + rv = this.RowSummaryAbsoluteValues; + + scl = {}; + Idx = []; + + if (strcmp (order, "auto")) + [scl, Idx] = sort (cl); + elseif (strcmp (order, "ascending-diagonal")) + [s, Idx] = sort (diag (nv)); + scl = cl(Idx); + elseif (strcmp (order, "descending-diagonal")) + [s, Idx] = sort (diag (nv)); + Idx = flip (Idx); + scl = cl(Idx); + elseif (strcmp (order, "cluster")) + ## the classes are all grouped together + ## this way one can visually evaluate which are the most similar classes + ## according to the learning algorithm + D = zeros (1, ((cm_size - 1) * cm_size / 2)); # a pdist like vector + maxD = 2 * max (max (av)); + k = 1; # better than computing the index at every cycle + for i = 1 : (cm_size - 1) + for j = (i + 1) : cm_size + D(k++) = maxD - (av(i, j) + av(j, i)); # distance + endfor + endfor + tree = linkage (D, "average"); # clustering + ## we could have optimal leaf ordering with + Idx = optimalleaforder (tree, D); # optimal clustering + ## [sorted_v Idx] = sort (cluster (tree, )); + nodes_to_visit = 2 * cm_size - 1; + nodecount = 0; + while (! isempty (nodes_to_visit)) + current_node = nodes_to_visit(1); + nodes_to_visit(1) = []; + if (current_node > cm_size) + node = current_node - cm_size; + nodes_to_visit = [tree(node,[2 1]) nodes_to_visit]; + end + + if (current_node <= cm_size) + nodecount++; + Idx(nodecount) = current_node; + end + end + ## + scl = cl(Idx); + else + ## must be an array or cell array of labels + if (! iscellstr (order)) + if (! ischar (order)) + if (isrow (order)) + order = vec (order); + endif + order = num2str (order); + endif + + scl = cellstr (order); + endif + + if (length (scl) != length (cl)) + error ("sortClasses: wrong size for order.") + endif + + Idx = zeros (length (scl), 1); + + for i = 1 : length (scl) + Idx(i) = find (strcmp (cl, scl{i})); + endfor + endif + + ## rearrange the normalized values... + nv = nv(Idx, :); + nv = nv(:, Idx); + this.NormalizedValues = nv; + + ## ...and the absolute values... + av = av(Idx, :); + av = av(:, Idx); + this.AbsoluteValues = av; + + cv = cv([Idx ( Idx + cm_size )]); + this.ColumnSummaryAbsoluteValues = cv; + + rv = rv([Idx ( Idx + cm_size )]); + this.RowSummaryAbsoluteValues = rv; + + ## ...and the class labels + this.ClassLabels = scl; + + ## update the axes + set (this.hax, "xtick", (0.5 : 1 : (cm_size - 0.5)), "xticklabel", scl,... + "ytick", (0.5 : 1 : (cm_size - 0.5)), "yticklabel", scl); + + ## get text and patch handles + kids = get (this.hax, "children"); + t_kids = kids(find (isprop (kids, "fontname"))); # hack to find texts + m_kid = kids(find (strcmp (get (kids, "userdata"), "MainChart"))); + c_kid = kids(find (strcmp (get (kids, "userdata"), "ColumnSummary"))); + r_kid = kids(find (strcmp (get (kids, "userdata"), "RowSummary"))); + + ## re-assign colors to the main chart + cdata_m = reshape (get (m_kid, "cdata"), cm_size, cm_size); + cdata_m = cdata_m(Idx, :); + cdata_m = cdata_m(:, Idx); + + cdata_v = vec (cdata_m); + + set (m_kid, "cdata", cdata_v); + + ## re-assign colors to the column summary + cdata_m = reshape (transpose (get (c_kid, "cdata")), cm_size, 2); + cdata_m = cdata_m(Idx, :); + + cdata_v = vec (cdata_m); + + set (c_kid, "cdata", cdata_v); + + ## re-assign colors to the row summary + cdata_m = reshape (get (r_kid, "cdata"), cm_size, 2); + cdata_m = cdata_m(Idx, :); + + cdata_v = vec (cdata_m); + + set (r_kid, "cdata", cdata_v); + + ## move the text labels + for i = 1:length (t_kids) + t_pos = get (t_kids(i), "userdata"); + + if (t_pos(2) > cm_size) + ## row summary + t_pos(1) = find (Idx == (t_pos(1) + 1)) - 1; + set (t_kids(i), "userdata", t_pos); + + t_pos = t_pos([2 1]) + 0.5; + set (t_kids(i), "position", t_pos); + elseif (t_pos(1) > cm_size) + ## column summary + t_pos(2) = find (Idx == (t_pos(2) + 1)) - 1; + set (t_kids(i), "userdata", t_pos); + + t_pos = t_pos([2 1]) + 0.5; + set (t_kids(i), "position", t_pos); + else + ## main chart + t_pos(1) = find (Idx == (t_pos(1) + 1)) - 1; + t_pos(2) = find (Idx == (t_pos(2) + 1)) - 1; + set (t_kids(i), "userdata", t_pos); + + t_pos = t_pos([2 1]) + 0.5; + set (t_kids(i), "position", t_pos); + endif + endfor + + updateChart (this); + endfunction + endmethods + + methods (Access = private) + ## convertNamedColor + ## convert a named colour to a colour triplet + function ret = convertNamedColor (this, color) + vColorNames = ["ymcrgbwk"]'; + vColorTriplets = [1 1 0; 1 0 1; 0 1 1; 1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0]; + if (strcmp (color, "black")) + color = 'k'; + endif + + index = find (vColorNames == color(1)); + if (! isempty (index)) + ret = vColorTriplets(index, :); + else + ret = []; # trigger an error message + endif + endfunction + + ## updateAxesProperties + ## update the properties of the axes + function ret = updateAxesProperties (this, prop, value) + set (this.hax, prop, value); + + ret = value; + endfunction + + ## updateTextProperties + ## set the properties of the texts + function ret = updateTextProperties (this, prop, value) + hax_kids = get (this.hax, "children"); + text_kids = hax_kids(isprop (hax_kids , "fontname")); # hack to find texts + text_kids(end + 1) = get (this.hax, "xlabel"); + text_kids(end + 1) = get (this.hax, "ylabel"); + text_kids(end + 1) = get (this.hax, "title"); + + updateAxesProperties (this, prop, value); + set (text_kids, prop, value); + + ret = value; + endfunction + + ## setGridVisibility + ## toggle the visibility of the grid + function setGridVisibility (this) + kids = get (this.hax, "children"); + kids = kids(find (isprop (kids, "linestyle"))); + + if (strcmp (this.GridVisible, "on")) + set (kids, "linestyle", "-"); + else + set (kids, "linestyle", "none"); + endif + endfunction + + ## updateColorMap + ## change the colormap and, accordingly, the text colors + function updateColorMap (this) + cm_size = this.ClassN; + d_color = this.DiagonalColor; + o_color = this.OffDiagonalColor; + + ## quick hack + d_color(find (d_color == 1.0)) = 0.999; + o_color(find (o_color == 1.0)) = 0.999; + + ## 64 shades for each color + cm_colormap(1:64,:) = [1.0 : (-(1.0 - o_color(1)) / 63) : o_color(1);... + 1.0 : (-(1.0 - o_color(2)) / 63) : o_color(2);... + 1.0 : (-(1.0 - o_color(3)) / 63) : o_color(3)]'; + cm_colormap(65:128,:) = [1.0 : (-(1.0 - d_color(1)) / 63) : d_color(1);... + 1.0 : (-(1.0 - d_color(2)) / 63) : d_color(2);... + 1.0 : (-(1.0 - d_color(3)) / 63) : d_color(3)]'; + + colormap (this.hax, cm_colormap); + + ## update text colors + kids = get (this.hax, "children"); + t_kids = kids(find (isprop (kids, "fontname"))); # hack to find texts + m_patch = kids(find (strcmp (get (kids, "userdata"), "MainChart"))); + c_patch = kids(find (strcmp (get (kids, "userdata"), "ColumnSummary"))); + r_patch = kids(find (strcmp (get (kids, "userdata"), "RowSummary"))); + + m_colors = get (m_patch, "cdata"); + c_colors = get (c_patch, "cdata"); + r_colors = get (r_patch, "cdata"); + + ## when a patch is dark, let's use a pale color for the text + for i = 1 : length (t_kids) + t_pos = get (t_kids(i), "userdata"); + color_idx = 1; + + if (t_pos(2) > cm_size) + ## row summary + idx = (t_pos(2) - cm_size - 1) * cm_size + t_pos(1) + 1; + color_idx = r_colors(idx) + 1; + elseif (t_pos(1) > cm_size) + ## column summary + idx = (t_pos(1) - cm_size - 1) * cm_size + t_pos(2) + 1; + color_idx = c_colors(idx) + 1; + else + ## main chart + idx = t_pos(2) * cm_size + t_pos(1) + 1; + color_idx = m_colors(idx) + 1; + endif + + if (sum (cm_colormap(color_idx, :)) < 1.8) + set (t_kids(i), "color", [.97 .97 1.0]); + else + set (t_kids(i), "color", [.15 .15 .15]); + endif + endfor + endfunction + + ## updateChart + ## update the text labels and the NormalizedValues property + function updateChart (this) + cm_size = this.ClassN; + cm = this.AbsoluteValues; + l_cs = this.ColumnSummaryAbsoluteValues; + l_rs = this.RowSummaryAbsoluteValues; + + kids = get (this.hax, "children"); + t_kids = kids(find (isprop (kids, "fontname"))); # hack to find texts + + normalization = this.Normalization; + column_summary = this.ColumnSummary; + row_summary = this.RowSummary; + + ## normalization for labelling + row_totals = sum (cm, 2); + col_totals = sum (cm, 1); + mat_total = sum (col_totals); + cm_labels = cm; + add_percent = true; + + if (strcmp (normalization, "column-normalized")) + for i = 1 : cm_size + cm_labels(:,i) = cm_labels(:,i) ./ col_totals(i); + endfor + elseif (strcmp (normalization, "row-normalized")) + for i = 1 : cm_size + cm_labels(i,:) = cm_labels(i,:) ./ row_totals(i); + endfor + elseif (strcmp (normalization, "total-normalized")) + cm_labels = cm_labels ./ mat_total; + else + add_percent = false; + endif + + ## update NormalizedValues + this.NormalizedValues = cm_labels; + + ## update axes + last_row = cm_size; + last_col = cm_size; + userdata = cell2mat (get (t_kids, "userdata")); + + cs_kids = t_kids(find (userdata(:,1) > cm_size)); + cs_kids(end + 1) = kids(find (strcmp (get (kids, "userdata"),... + "ColumnSummary"))); + + if (! strcmp ("off", column_summary)) + set (cs_kids, "visible", "on"); + last_row += 3; + else + set (cs_kids, "visible", "off"); + endif + + rs_kids = t_kids(find (userdata(:,2) > cm_size)); + rs_kids(end + 1) = kids(find (strcmp (get (kids, "userdata"),... + "RowSummary"))); + + if (! strcmp ("off", row_summary)) + set (rs_kids, "visible", "on"); + last_col += 3; + else + set (rs_kids, "visible", "off"); + endif + + axis (this.hax, [0 last_col 0 last_row]); + + ## update column summary data + cs_add_percent = true; + if (! strcmp (column_summary, "off")) + if (strcmp (column_summary, "column-normalized")) + for i = 1 : cm_size + if (col_totals(i) == 0) + ## avoid division by zero + l_cs([i (cm_size + i)]) = 0; + else + l_cs([i, cm_size + i]) = l_cs([i, cm_size + i]) ./ col_totals(i); + endif + endfor + elseif strcmp (column_summary, "total-normalized") + l_cs = l_cs ./ mat_total; + else + cs_add_percent = false; + endif + endif + + ## update row summary data + rs_add_percent = true; + if (! strcmp (row_summary, "off")) + if (strcmp (row_summary, "row-normalized")) + for i = 1 : cm_size + if (row_totals(i) == 0) + ## avoid division by zero + l_rs([i (cm_size + i)]) = 0; + else + l_rs([i, cm_size + i]) = l_rs([i, cm_size + i]) ./ row_totals(i); + endif + endfor + elseif (strcmp (row_summary, "total-normalized")) + l_rs = l_rs ./ mat_total; + else + rs_add_percent = false; + endif + endif + + ## update text + label_list = vec (cm_labels); + + for i = 1 : length (t_kids) + t_pos = get (t_kids(i), "userdata"); + new_string = ""; + + if (t_pos(2) > cm_size) + ## this is the row summary + idx = (t_pos(2) - cm_size - 1) * cm_size + t_pos(1) + 1; + + if (rs_add_percent) + new_string = num2str (100.0 * l_rs(idx), "%3.1f"); + new_string = [new_string "%"]; + else + new_string = num2str (l_rs(idx)); + endif + elseif (t_pos(1) > cm_size) + ## this is the column summary + idx = (t_pos(1) - cm_size - 1) * cm_size + t_pos(2) + 1; + + if (cs_add_percent) + new_string = num2str (100.0 * l_cs(idx), "%3.1f"); + new_string = [new_string "%"]; + else + new_string = num2str (l_cs(idx)); + endif + else + ## this is the main chart + idx = t_pos(2) * cm_size + t_pos(1) + 1; + + if (add_percent) + new_string = num2str (100.0 * label_list(idx), "%3.1f"); + new_string = [new_string "%"]; + else + new_string = num2str (label_list(idx)); + endif + endif + + set (t_kids(i), "string", new_string); + endfor + + endfunction + + ## draw + ## draw the chart + function draw (this) + cm = this.AbsoluteValues; + cl = this.ClassLabels; + cm_size = this.ClassN; + + ## set up the axes + set (this.hax, "xtick", (0.5 : 1 : (cm_size - 0.5)), "xticklabel", cl,... + "ytick", (0.5 : 1 : (cm_size - 0.5)), "yticklabel", cl ); + axis ("ij"); + axis (this.hax, [0 cm_size 0 cm_size]); + + ## prepare the patches + indices_b = 0 : (cm_size -1); + indices_v = repmat (indices_b, cm_size, 1); + indices_vx = transpose (vec (indices_v)); + indices_vy = vec (indices_v', 2); + indices_ex = vec ((cm_size + 1) * [1; 2] .* ones (2, cm_size), 2); + + ## normalization for colorization + ## it is used a colormap of 128 shades of two colors, 64 shades for each + ## color + normal = max (max (cm)); + cm_norm = round (63 * cm ./ normal); + cm_norm = cm_norm + 64 * eye (cm_size); + + ## default normalization: absolute + cm_labels = vec (cm); + + ## the patches of the main chart + x_patch = [indices_vx; + ( indices_vx + 1 ); + ( indices_vx + 1 ); + indices_vx]; + y_patch = [indices_vy; + indices_vy; + ( indices_vy + 1 ); + ( indices_vy + 1 )]; + c_patch = vec (cm_norm(1 : cm_size, 1 : cm_size)); + + ## display the patches + ph = patch (this.hax, x_patch, y_patch, c_patch); + + set (ph, "userdata", "MainChart"); + + ## display the labels + userdata = [indices_vy; indices_vx]'; + nonzero_idx = find (cm_labels != 0); + th = text ((x_patch(1, nonzero_idx) + 0.5), (y_patch(1, nonzero_idx) +... + 0.5), num2str (cm_labels(nonzero_idx)), "parent", this.hax ); + + set (th, "horizontalalignment", "center"); + for i = 1 : length (nonzero_idx) + set (th(i), "userdata", userdata(nonzero_idx(i), :)); + endfor + + ## patches for the summaries + main_values = diag (cm); + ct_values = sum (cm)'; + rt_values = sum (cm, 2); + cd_values = ct_values - main_values; + rd_values = rt_values - main_values; + + ## column summary + x_cs = [[indices_b indices_b]; + ( [indices_b indices_b] + 1 ); + ( [indices_b indices_b] + 1 ); + [indices_b indices_b]]; + y_cs = [(repmat ([1 1 2 2]', 1, cm_size)) (repmat ([2 2 3 3]', 1, cm_size))] +... + cm_size; + c_cs = [(round (63 * (main_values ./ ct_values)) + 64); + (round (63 * (cd_values ./ ct_values)))]; + c_cs(isnan (c_cs)) = 0; + l_cs = [main_values; cd_values]; + + ph = patch (this.hax, x_cs, y_cs, c_cs); + + set (ph, "userdata", "ColumnSummary"); + set (ph, "visible", "off" ); + + userdata = [y_cs(1,:); x_cs(1,:)]'; + nonzero_idx = find (l_cs != 0); + th = text ((x_cs(1,nonzero_idx) + 0.5), (y_cs(1,nonzero_idx) + 0.5),... + num2str (l_cs(nonzero_idx)), "parent", this.hax); + + set (th, "horizontalalignment", "center"); + for i = 1 : length (nonzero_idx) + set (th(i), "userdata", userdata(nonzero_idx(i), :)); + endfor + set (th, "visible", "off"); + + ## row summary + x_rs = y_cs; + y_rs = x_cs; + c_rs = [(round (63 * (main_values ./ rt_values)) + 64); + (round (63 * (rd_values ./ rt_values)))]; + c_rs(isnan (c_rs)) = 0; + l_rs = [main_values; rd_values]; + + ph = patch (this.hax, x_rs, y_rs, c_rs); + + set (ph, "userdata", "RowSummary"); + set (ph, "visible", "off"); + + userdata = [y_rs(1,:); x_rs(1,:)]'; + nonzero_idx = find (l_rs != 0); + th = text ((x_rs(1,nonzero_idx) + 0.5), (y_rs(1,nonzero_idx) + 0.5),... + num2str (l_rs(nonzero_idx)), "parent", this.hax); + + set (th, "horizontalalignment", "center"); + for i = 1 : length (nonzero_idx) + set (th(i), "userdata", userdata(nonzero_idx(i), :)); + endfor + set (th, "visible", "off"); + + this.ColumnSummaryAbsoluteValues = l_cs; + this.RowSummaryAbsoluteValues = l_rs; + endfunction + endmethods + +endclassdef + diff --git a/inst/anderson_darling_cdf.m b/inst/anderson_darling_cdf.m new file mode 100644 index 0000000..67c602e --- /dev/null +++ b/inst/anderson_darling_cdf.m @@ -0,0 +1,131 @@ +## Author: Paul Kienzle +## This program is granted to the public domain. + +## -*- texinfo -*- +## @deftypefn {Function File} @var{p} = anderson_darling_cdf (@var{A}, @var{n}) +## +## Return the CDF for the given Anderson-Darling coefficient @var{A} +## computed from @var{n} values sampled from a distribution. For a +## vector of random variables @var{x} of length @var{n}, compute the CDF +## of the values from the distribution from which they are drawn. +## You can uses these values to compute @var{A} as follows: +## +## @example +## @var{A} = -@var{n} - sum( (2*i-1) .* (log(@var{x}) + log(1 - @var{x}(@var{n}:-1:1,:))) )/@var{n}; +## @end example +## +## From the value @var{A}, @code{anderson_darling_cdf} returns the probability +## that @var{A} could be returned from a set of samples. +## +## The algorithm given in [1] claims to be an approximation for the +## Anderson-Darling CDF accurate to 6 decimal points. +## +## Demonstrate using: +## +## @example +## n = 300; reps = 10000; +## z = randn(n, reps); +## x = sort ((1 + erf (z/sqrt (2)))/2); +## i = [1:n]' * ones (1, size (x, 2)); +## A = -n - sum ((2*i-1) .* (log (x) + log (1 - x (n:-1:1, :))))/n; +## p = anderson_darling_cdf (A, n); +## hist (100 * p, [1:100] - 0.5); +## @end example +## +## You will see that the histogram is basically flat, which is to +## say that the probabilities returned by the Anderson-Darling CDF +## are distributed uniformly. +## +## You can easily determine the extreme values of @var{p}: +## +## @example +## [junk, idx] = sort (p); +## @end example +## +## The histograms of various @var{p} aren't very informative: +## +## @example +## histfit (z (:, idx (1)), linspace (-3, 3, 15)); +## histfit (z (:, idx (end/2)), linspace (-3, 3, 15)); +## histfit (z (:, idx (end)), linspace (-3, 3, 15)); +## @end example +## +## More telling is the qqplot: +## +## @example +## qqplot (z (:, idx (1))); hold on; plot ([-3, 3], [-3, 3], ';;'); hold off; +## qqplot (z (:, idx (end/2))); hold on; plot ([-3, 3], [-3, 3], ';;'); hold off; +## qqplot (z (:, idx (end))); hold on; plot ([-3, 3], [-3, 3], ';;'); hold off; +## @end example +## +## Try a similarly analysis for @var{z} uniform: +## +## @example +## z = rand (n, reps); x = sort(z); +## @end example +## +## and for @var{z} exponential: +## +## @example +## z = rande (n, reps); x = sort (1 - exp (-z)); +## @end example +## +## [1] Marsaglia, G; Marsaglia JCW; (2004) "Evaluating the Anderson Darling +## distribution", Journal of Statistical Software, 9(2). +## +## @seealso{anderson_darling_test} +## @end deftypefn + +function y = anderson_darling_cdf(z,n) + y = ADinf(z); + y += ADerrfix(y,n); +end + +function y = ADinf(z) + y = zeros(size(z)); + + idx = (z < 2); + if any(idx(:)) + p = [.00168691, -.0116720, .0347962, -.0649821, .247105, 2.00012]; + z1 = z(idx); + y(idx) = exp(-1.2337141./z1)./sqrt(z1).*polyval(p,z1); + end + + idx = (z >= 2); + if any(idx(:)) + p = [-.0003146, +.008056, -.082433, +.43424, -2.30695, 1.0776]; + y(idx) = exp(-exp(polyval(p,z(idx)))); + end +end + +function y = ADerrfix(x,n) + if isscalar(n), n = n*ones(size(x)); + elseif isscalar(x), x = x*ones(size(n)); + end + y = zeros(size(x)); + c = .01265 + .1757./n; + + idx = (x >= 0.8); + if any(idx(:)) + p = [255.7844, -1116.360, 1950.646, -1705.091, 745.2337, -130.2137]; + g3 = polyval(p,x(idx)); + y(idx) = g3./n(idx); + end + + idx = (x < 0.8 & x > c); + if any(idx(:)) + p = [1.91864, -8.259, 14.458, -14.6538, 6.54034, -.00022633]; + n1 = 1./n(idx); + c1 = c(idx); + g2 = polyval(p,(x(idx)-c1)./(.8-c1)); + y(idx) = (.04213 + .01365*n1).*n1 .* g2; + end + + idx = (x <= c); + if any(idx(:)) + x1 = x(idx)./c(idx); + n1 = 1./n(idx); + g1 = sqrt(x1).*(1-x1).*(49*x1-102); + y(idx) = ((.0037*n1+.00078).*n1+.00006).*n1 .* g1; + end +end diff --git a/inst/anderson_darling_test.m b/inst/anderson_darling_test.m new file mode 100644 index 0000000..802d0d0 --- /dev/null +++ b/inst/anderson_darling_test.m @@ -0,0 +1,153 @@ +## Author: Paul Kienzle +## This program is granted to the public domain. + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{q}, @var{Asq}, @var{info}] = } = @ +## anderson_darling_test (@var{x}, @var{distribution}) +## +## Test the hypothesis that @var{x} is selected from the given distribution +## using the Anderson-Darling test. If the returned @var{q} is small, reject +## the hypothesis at the @var{q}*100% level. +## +## The Anderson-Darling @math{@var{A}^2} statistic is calculated as follows: +## +## @example +## @iftex +## A^2_n = -n - \sum_{i=1}^n (2i-1)/n log(z_i (1-z_{n-i+1})) +## @end iftex +## @ifnottex +## n +## A^2_n = -n - SUM (2i-1)/n log(@math{z_i} (1 - @math{z_@{n-i+1@}})) +## i=1 +## @end ifnottex +## @end example +## +## where @math{z_i} is the ordered position of the @var{x}'s in the CDF of the +## distribution. Unlike the Kolmogorov-Smirnov statistic, the +## Anderson-Darling statistic is sensitive to the tails of the +## distribution. +## +## The @var{distribution} argument must be a either @t{"uniform"}, @t{"normal"}, +## or @t{"exponential"}. +## +## For @t{"normal"}' and @t{"exponential"} distributions, estimate the +## distribution parameters from the data, convert the values +## to CDF values, and compare the result to tabluated critical +## values. This includes an correction for small @var{n} which +## works well enough for @var{n} >= 8, but less so from smaller @var{n}. The +## returned @code{info.Asq_corrected} contains the adjusted statistic. +## +## For @t{"uniform"}, assume the values are uniformly distributed +## in (0,1), compute @math{@var{A}^2} and return the corresponding @math{p}-value from +## @code{1-anderson_darling_cdf(A^2,n)}. +## +## If you are selecting from a known distribution, convert your +## values into CDF values for the distribution and use @t{"uniform"}. +## Do not use @t{"uniform"} if the distribution parameters are estimated +## from the data itself, as this sharply biases the @math{A^2} statistic +## toward smaller values. +## +## [1] Stephens, MA; (1986), "Tests based on EDF statistics", in +## D'Agostino, RB; Stephens, MA; (eds.) Goodness-of-fit Techinques. +## New York: Dekker. +## +## @seealso{anderson_darling_cdf} +## @end deftypefn + +function [q,Asq,info] = anderson_darling_test(x,dist) + + if size(x,1) == 1, x=x(:); end + x = sort(x); + n = size(x,1); + use_cdf = 0; + + # Compute adjustment and critical values to use for stats. + switch dist + case 'normal', + # This expression for adj is used in R. + # Note that the values from NIST dataplot don't work nearly as well. + adj = 1 + (.75 + 2.25/n)/n; + qvals = [ 0.1, 0.05, 0.025, 0.01 ]; + Acrit = [ 0.631, 0.752, 0.873, 1.035]; + x = stdnormal_cdf(zscore(x)); + + case 'uniform', + ## Put invalid data at the limits of the distribution + ## This will drive the statistic to infinity. + x(x<0) = 0; + x(x>1) = 1; + adj = 1.; + qvals = [ 0.1, 0.05, 0.025, 0.01 ]; + Acrit = [ 1.933, 2.492, 3.070, 3.857 ]; + use_cdf = 1; + + case 'XXXweibull', + adj = 1 + 0.2/sqrt(n); + qvals = [ 0.1, 0.05, 0.025, 0.01 ]; + Acrit = [ 0.637, 0.757, 0.877, 1.038]; + ## XXX FIXME XXX how to fit alpha and sigma? + x = wblcdf (x, ones(n,1)*sigma, ones(n,1)*alpha); + + case 'exponential', + adj = 1 + 0.6/n; + qvals = [ 0.1, 0.05, 0.025, 0.01 ]; + # Critical values depend on n. Choose the appropriate critical set. + # These values come from NIST dataplot/src/dp8.f. + Acritn = [ + 0, 1.022, 1.265, 1.515, 1.888 + 11, 1.045, 1.300, 1.556, 1.927; + 21, 1.062, 1.323, 1.582, 1.945; + 51, 1.070, 1.330, 1.595, 1.951; + 101, 1.078, 1.341, 1.606, 1.957; + ]; + # FIXME: consider interpolating in the critical value table. + Acrit = Acritn(lookup(Acritn(:,1),n),2:5); + + lambda = 1./mean(x); # exponential parameter estimation + x = expcdf(x, 1./(ones(n,1)*lambda)); + + otherwise + # FIXME consider implementing more of distributions; a number + # of them are defined in NIST dataplot/src/dp8.f. + error("Anderson-Darling test for %s not implemented", dist); + endswitch + + if any(x<0 | x>1) + error('Anderson-Darling test requires data in CDF form'); + endif + + i = [1:n]'*ones(1,size(x,2)); + Asq = -n - sum( (2*i-1) .* (log(x) + log(1-x(n:-1:1,:))) )/n; + + # Lookup adjusted critical value in the cdf (if uniform) or in the + # the critical table. + if use_cdf + q = 1-anderson_darling_cdf(Asq*adj, n); + else + idx = lookup([-Inf,Acrit],Asq*adj); + q = [1,qvals](idx); + endif + + if nargout > 2, + info.Asq = Asq; + info.Asq_corrected = Asq*adj; + info.Asq_critical = [100*(1-qvals); Acrit]'; + info.p = 1-q; + info.p_is_precise = use_cdf; + endif +endfunction + +%!demo +%! c = anderson_darling_test(10*rande(12,10000),'exponential'); +%! tabulate(100*c,100*[unique(c),1]); +%! % The Fc column should report 100, 250, 500, 1000, 10000 more or less. + +%!demo +%! c = anderson_darling_test(randn(12,10000),'normal'); +%! tabulate(100*c,100*[unique(c),1]); +%! % The Fc column should report 100, 250, 500, 1000, 10000 more or less. + +%!demo +%! c = anderson_darling_test(rand(12,10000),'uniform'); +%! hist(100*c,1:2:99); +%! % The histogram should be flat more or less. diff --git a/inst/anova1.m b/inst/anova1.m new file mode 100644 index 0000000..2b04d22 --- /dev/null +++ b/inst/anova1.m @@ -0,0 +1,260 @@ +## Copyright (C) 2021 Andreas Bertsatos +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} anova1 (@var{x}) +## @deftypefnx {Function File} {@var{p} =} anova1 (@var{x}, @var{group}) +## @deftypefnx {Function File} {@var{p} =} anova1 (@var{x}, @var{group}, @var{displayopt}) +## @deftypefnx {Function File} {[@var{p}, @var{atab}] =} anova1 (@var{x}, @dots{}) +## @deftypefnx {Function File} {[@var{p}, @var{atab}, @var{stats}] =} anova1 (@var{x}, @dots{}) +## +## Perform a one-way analysis of variance (ANOVA) for comparing the means of two +## or more groups of data under the null hypothesis that the groups are drawn +## from the same distribution, i.e. the group means are equal. +## +## anova1 can take up to three input arguments: +## +## @itemize +## @item +## @var{x} contains the data and it can either be a vector or matrix. +## If @var{x} is a matrix, then each column is treated as a separate group. +## If @var{x} is a vector, then the @var{group} argument is mandatory. +## @item +## @var{group} contains the names for each group. If @var{x} is a matrix, then +## @var{group} can either be a cell array of strings of a character array, with +## one row per column of @var{x}. If you want to omit this argument, enter an +## empty array ([]). If @var{x} is a vector, then @var{group} must be a vector +## of the same length, or a string array or cell array of strings with one row +## for each element of @var{x}. @var{x} values corresponding to the same value +## of @var{group} are placed in the same group. +## @item +## @var{displayopt} is an optional parameter for displaying the groups contained +## in the data in a boxplot. If omitted, it is 'on' by default. If group names +## are defined in @var{group}, these are used to identify the groups in the +## boxplot. Use 'off' to omit displaying this figure. +## @end itemize +## +## anova1 can return up to three output arguments: +## +## @itemize +## @item +## @var{p} is the p-value of the null hypothesis that all group means are equal. +## @item +## @var{atab} is a cell array containing the results in a standard ANOVA table. +## @item +## @var{stats} is a structure containing statistics useful for performing +## a multiple comparison of means with the MULTCOMPARE function. +## @end itemize +## +## If anova1 is called without any output arguments, then it prints the results +## in a one-way ANOVA table to the standard output. It is also printed when +## @var{displayopt} is 'on'. +## +## +## Examples: +## +## @example +## x = meshgrid (1:6); +## x = x + normrnd (0, 1, 6, 6); +## anova1 (x, [], 'off'); +## [p, atab] = anova1(x); +## @end example +## +## +## @example +## x = ones (50, 4) .* [-2, 0, 1, 5]; +## x = x + normrnd (0, 2, 50, 4); +## groups = @{"A", "B", "C", "D"@}; +## anova1 (x, groups); +## @end example +## +## @end deftypefn + +function [p, anovatab, stats] = anova1 (x, group, displayopt) + + ## check for valid number of input arguments + narginchk (1, 3); + ## add defaults + if (nargin < 2) + group = []; + endif + if (nargin < 3) + displayopt = 'on'; + endif + plotdata = ~(strcmp (displayopt, 'off')); + + ## Convert group to cell array from character array, make it a column + if (! isempty (group) && ischar (group)) + group = cellstr(group); + endif + if (size (group, 1) == 1) + group = group'; + endif + + ## If X is a matrix, convert it to column vector and create a + ## corresponging column vector for groups + if (length (x) < prod (size (x))) + [n, m] = size (x); + x = x(:); + gi = reshape (repmat ((1:m), n, 1), n*m, 1); + if (length (group) == 0) ## no group names are provided + group = gi; + elseif (size (group, 1) == m) ## group names exist and match columns + group = group(gi,:); + else + error("X columns and GROUP length do not match."); + endif + endif + + ## Identify NaN values (if any) and remove them from X along with + ## their corresponding values from group vector + nonan = ~isnan (x); + x = x(nonan); + group = group(nonan, :); + + ## Convert group to indices and separate names + [group_id, group_names] = grp2idx (group); + group_id = group_id(:); + named = 1; + + ## Center data to improve accuracy and keep uncentered data for ploting + xorig = x; + mu = mean(x); + x = x - mu; + xr = x; + + ## Get group size and mean for each group + groups = size (group_names, 1); + xs = zeros (1, groups); + xm = xs; + for j = 1:groups + group_size = find (group_id == j); + xs(j) = length (group_size); + xm(j) = mean (xr(group_size)); + endfor + + ## Calculate statistics + lx = length (xr); ## Number of samples in groups + gm = mean (xr); ## Grand mean of groups + dfm = length (xm) - 1; ## degrees of freedom for model + dfe = lx - dfm - 1; ## degrees of freedom for error + SSM = xs .* (xm - gm) * (xm - gm)'; ## Sum of Squares for Model + SST = (xr(:) - gm)' * (xr(:) - gm); ## Sum of Squares Total + SSE = SST - SSM; ## Sum of Squares Error + if (dfm > 0) + MSM = SSM / dfm; ## Mean Square for Model + else + MSM = NaN; + endif + if (dfe > 0) + MSE = SSE / dfe; ## Mean Square for Error + else + MSE = NaN; + endif + ## Calculate F statistic + if (SSE != 0) ## Regular Matrix case. + F = (SSM / dfm) / MSE; + p = 1 - fcdf (F, dfm, dfe); ## Probability of F given equal means. + elseif (SSM == 0) ## Constant Matrix case. + F = 0; + p = 1; + else ## Perfect fit case. + F = Inf; + p = 0; + end + + ## Create results table (if requested) + if (nargout > 1) + anovatab = {"Source", "SS", "df", "MS", "F", "Prob>F"; ... + "Groups", SSM, dfm, MSM, F, p; ... + "Error", SSE, dfe, MSE, "", ""; ... + "Total", SST, dfm + dfe, "", "", ""}; + endif + ## Create stats structure (if requested) for MULTCOMPARE + if (nargout > 2) + if (length (group_names) > 0) + stats.gnames = group_names; + else + stats.gnames = strjust (num2str ((1:length (xm))'), 'left'); + end + stats.n = xs; + stats.source = 'anova1'; + stats.means = xm + mu; + stats.df = dfe; + stats.s = sqrt (MSE); + endif + ## Print results table on screen if no output argument was requested + if (nargout == 0 || plotdata) + printf(" ANOVA Table\n"); + printf("Source SS df MS F Prob>F\n"); + printf("------------------------------------------------------\n"); + printf("Columns %10.4f %5.0f %10.4f %8.2f %9.4f\n", SSM, dfm, MSM, F, p); + printf("Error %10.4f %5.0f %10.4f\n", SSE, dfe, MSE); + printf("Total %10.4f %5.0f\n", SST, dfm + dfe); + endif + ## Plot data using BOXPLOT (unless opted out) + if (plotdata) + boxplot (x, group_id, 'Notch', "on", 'Labels', group_names); + endif +endfunction + + +%!demo +%! x = meshgrid (1:6); +%! x = x + normrnd (0, 1, 6, 6); +%! anova1 (x, [], 'off'); + +%!demo +%! x = meshgrid (1:6); +%! x = x + normrnd (0, 1, 6, 6); +%! [p, atab] = anova1(x); + +%!demo +%! x = ones (50, 4) .* [-2, 0, 1, 5]; +%! x = x + normrnd (0, 2, 50, 4); +%! groups = {"A", "B", "C", "D"}; +%! anova1 (x, groups); + +## testing against GEAR.DAT data file and results for one-factor ANOVA from +## https://www.itl.nist.gov/div898/handbook/eda/section3/eda354.htm +%!test +%! data = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000, ... +%! 0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988, ... +%! 0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996, ... +%! 1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996, ... +%! 0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996, ... +%! 1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996, ... +%! 0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002, ... +%! 0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006, ... +%! 1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991, ... +%! 0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997]; +%! group = [1:10] .* ones (10,10); +%! group = group(:); +%! [p, tbl] = anova1 (data, group, "off"); +%! assert (p, 0.022661, 1e-6); +%! assert (tbl{2,5}, 2.2969, 1e-4); +%! assert (tbl{2,3}, 9, 0); +%! assert (tbl{4,2}, 0.003903, 1e-6); +%! data = reshape (data, 10, 10); +%! [p, tbl, stats] = anova1 (data, [], "off"); +%! assert (p, 0.022661, 1e-6); +%! assert (tbl{2,5}, 2.2969, 1e-4); +%! assert (tbl{2,3}, 9, 0); +%! assert (tbl{4,2}, 0.003903, 1e-6); +%! means = [0.998, 0.9991, 0.9954, 0.9982, 0.9919, 0.9988, 1.0015, 1.0004, 0.9983, 0.9948]; +%! N = 10 * ones (1, 10); +%! assert (stats.means, means, 1e-6); +%! assert (length (stats.gnames), 10, 0); +%! assert (stats.n, N, 0); diff --git a/inst/anovan.m b/inst/anovan.m new file mode 100644 index 0000000..bac8de7 --- /dev/null +++ b/inst/anovan.m @@ -0,0 +1,359 @@ +## Copyright (C) 2003-2005 Andy Adler +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{pval}, @var{f}, @var{df_b}, @var{df_e}] =} anovan (@var{data}, @var{grps}) +## @deftypefnx {Function File} {[@var{pval}, @var{f}, @var{df_b}, @var{df_e}] =} anovan (@var{data}, @var{grps}, 'param1', @var{value1}) +## Perform a multi-way analysis of variance (ANOVA). The goal is to test +## whether the population means of data taken from @var{k} different +## groups are all equal. +## +## Data is a single vector @var{data} with groups specified by +## a corresponding matrix of group labels @var{grps}, where @var{grps} +## has the same number of rows as @var{data}. For example, if +## @var{data} = [1.1;1.2]; @var{grps}= [1,2,1; 1,5,2]; +## then data point 1.1 was measured under conditions 1,2,1 and +## data point 1.2 was measured under conditions 1,5,2. +## Note that groups do not need to be sequentially numbered. +## +## By default, a 'linear' model is used, computing the N main effects +## with no interactions. this may be modified by param 'model' +## +## p= anovan(data,groups, 'model', modeltype) +## - modeltype = 'linear': compute N main effects +## - modeltype = 'interaction': compute N effects and +## N*(N-1) two-factor interactions +## - modeltype = 'full': compute interactions at all levels +## +## Under the null of constant means, the statistic @var{f} follows an F +## distribution with @var{df_b} and @var{df_e} degrees of freedom. +## +## The p-value (1 minus the CDF of this distribution at @var{f}) is +## returned in @var{pval}. +## +## If no output argument is given, the standard one-way ANOVA table is +## printed. +## +## BUG: DFE is incorrect for modeltypes != full +## @end deftypefn + +## Author: Andy Adler +## Based on code by: KH +## $Id$ +## +## TESTING RESULTS: +## 1. ANOVA ACCURACY: www.itl.nist.gov/div898/strd/anova/anova.html +## Passes 'easy' test. Comes close on 'Average'. Fails 'Higher'. +## This could be fixed with higher precision arithmetic +## 2. Matlab anova2 test +## www.mathworks.com/access/helpdesk/help/toolbox/stats/anova2.html +## % From web site: +## popcorn= [ 5.5 4.5 3.5; 5.5 4.5 4.0; 6.0 4.0 3.0; +## 6.5 5.0 4.0; 7.0 5.5 5.0; 7.0 5.0 4.5]; +## % Define groups so reps = 3 +## groups = [ 1 1;1 2;1 3;1 1;1 2;1 3;1 1;1 2;1 3; +## 2 1;2 2;2 3;2 1;2 2;2 3;2 1;2 2;2 3 ]; +## anovan( vec(popcorn'), groups, 'model', 'full') +## % Results same as Matlab output +## 3. Matlab anovan test +## www.mathworks.com/access/helpdesk/help/toolbox/stats/anovan.html +## % From web site +## y = [52.7 57.5 45.9 44.5 53.0 57.0 45.9 44.0]'; +## g1 = [1 2 1 2 1 2 1 2]; +## g2 = {'hi';'hi';'lo';'lo';'hi';'hi';'lo';'lo'}; +## g3 = {'may'; 'may'; 'may'; 'may'; 'june'; 'june'; 'june'; 'june'}; +## anovan( y', [g1',g2',g3']) +## % Fails because we always do interactions + +function [PVAL, FSTAT, DF_B, DFE] = anovan (data, grps, varargin) + + if nargin <= 1 + usage ("anovan (data, grps)"); + end + + # test supplied parameters + modeltype= 'linear'; + for idx= 3:2:nargin + param= varargin{idx-2}; + value= varargin{idx-1}; + + if strcmp(param, 'model') + modeltype= value; +# elseif strcmp(param # add other parameters here + else + error(sprintf('parameter %s is not supported', param)); + end + end + + if ~isvector (data) + error ("anova: for `anova (data, grps)', data must be a vector"); + endif + + nd = size (grps,1); # number of data points + nw = size (grps,2); # number of anova "ways" + if (~ isvector (data) || (length(data) ~= nd)) + error ("anova: grps must be a matrix of the same number of rows as data"); + endif + + [g,grp_map] = relabel_groups (grps); + if strcmp(modeltype, 'linear') + max_interact = 1; + elseif strcmp(modeltype,'interaction') + max_interact = 2; + elseif strcmp(modeltype,'full') + max_interact = rows(grps); + else + error(sprintf('modeltype %s is not supported', modeltype)); + end + ng = length(grp_map); + int_tbl = interact_tbl (nw, ng, max_interact ); + [gn, gs, gss] = raw_sums(data, g, ng, int_tbl); + + stats_tbl = int_tbl(2:size(int_tbl,1),:)>0; + nstats= size(stats_tbl,1); + stats= zeros( nstats+1, 5); # SS, DF, MS, F, p + for i= 1:nstats + [SS, DF, MS]= factor_sums( gn, gs, gss, stats_tbl(i,:), ng, nw); + stats(i,1:3)= [SS, DF, MS]; + end + + # The Mean squared error is the data - avg for each possible measurement + # This calculation doesn't work unless there is replication for all grps +# SSE= sum( gss(sel) ) - sum( gs(sel).^2 ./ gn(sel) ); + SST= gss(1) - gs(1)^2/gn(1); + SSE= SST - sum(stats(:,1)); + sel = select_pat( ones(1,nw), ng, nw); %incorrect for modeltypes != full + DFE= sum( (gn(sel)-1).*(gn(sel)>0) ); + MSE= SSE/DFE; + stats(nstats+1,1:3)= [SSE, DFE, MSE]; + + for i= 1:nstats + MS= stats(i,3); + DF= stats(i,2); + F= MS/MSE; + pval = 1 - fcdf (F, DF, DFE); + stats(i,4:5)= [F, pval]; + end + + if nargout==0; + printout( stats, stats_tbl ); + else + PVAL= stats(1:nstats,5); + FSTAT=stats(1:nstats,4); + DF_B= stats(1:nstats,2); + DF_E= DFE; + end +endfunction + + +# relabel groups to a mapping from 1 to ng +# Input +# grps input grouping +# Output +# g relabelled grouping +# grp_map map from output to input grouping +function [g,grp_map] = relabel_groups(grps) + grp_vec= vec(grps); + s= sort (grp_vec); + uniq = 1+[0;find(diff(s))]; + # mapping from new grps to old groups + grp_map = s(uniq); + # create new group g + ngroups= length(uniq); + g= zeros(size(grp_vec)); + for i = 1:ngroups + g( find( grp_vec== grp_map(i) ) ) = i; + end + g= reshape(g, size(grps)); +endfunction + +# Create interaction table +# +# Input: +# nw number of "ways" +# ng number of ANOVA groups +# max_interact maximum number of interactions to consider +# default is nw +function int_tbl =interact_tbl(nw, ng, max_interact) + combin= 2^nw; + inter_tbl= zeros( combin, nw); + idx= (0:combin-1)'; + for i=1:nw; + inter_tbl(:,i) = ( rem(idx,2^i) >= 2^(i-1) ); + end + + # find elements with more than max_interact 1's + idx = ( sum(inter_tbl',1) > max_interact ); + inter_tbl(idx,:) =[]; + combin= size(inter_tbl,1); # update value + + #scale inter_tbl + # use ng+1 to map combinations of groups to integers + # this would be lots easier with a hash data structure + int_tbl = inter_tbl .* (ones(combin,1) * (ng+1).^(0:nw-1) ); +endfunction + +# Calculate sums for each combination +# +# Input: +# g relabelled grouping matrix +# ng number of ANOVA groups +# max_interact +# +# Output (virtual (ng+1)x(nw) matrices): +# gn number of data sums in each group +# gs sum of data in each group +# gss sumsqr of data in each group +function [gn, gs, gss] = raw_sums(data, g, ng, int_tbl); + nw= size(g,2); + ndata= size(g,1); + gn= gs= gss= zeros((ng+1)^nw, 1); + for i=1:ndata + # need offset by one for indexing + datapt= data(i); + idx = 1+ int_tbl*g(i,:)'; + gn(idx) +=1; + gs(idx) +=datapt; + gss(idx) +=datapt^2; + end +endfunction + +# Calcualte the various factor sums +# Input: +# gn number of data sums in each group +# gs sum of data in each group +# gss sumsqr of data in each group +# select binary vector of factor for this "way"? +# ng number of ANOVA groups +# nw number of ways + +function [SS,DF]= raw_factor_sums( gn, gs, gss, select, ng, nw); + sel= select_pat( select, ng, nw); + ss_raw= gs(sel).^2 ./ gn(sel); + SS= sum( ss_raw( ~isnan(ss_raw) )); + if length(find(select>0))==1 + DF= sum(gn(sel)>0)-1; + else + DF= 1; #this isn't the real DF, but needed to multiply + end +endfunction + +function [SS, DF, MS]= factor_sums( gn, gs, gss, select, ng, nw); + SS=0; + DF=1; + + ff = find(select); + lff= length(ff); + # zero terms added, one term subtracted, two added, etc + for i= 0:2^lff-1 + remove= find( rem( floor( i * 2.^(-lff+1:0) ), 2) ); + sel1= select; + if ~isempty(remove) + sel1( ff( remove ) )=0; + end + [raw_sum,raw_df]= raw_factor_sums(gn,gs,gss,sel1,ng,nw); + + add_sub= (-1)^length(remove); + SS+= add_sub*raw_sum; + DF*= raw_df; + end + + MS= SS/DF; +endfunction + +# Calcualte the various factor sums +# Input: +# select binary vector of factor for this "way"? +# ng number of ANOVA groups +# nw number of ways +function sel= select_pat( select, ng, nw); + # if select(i) is zero, remove nonzeros + # if select(i) is zero, remove zero terms for i + field=[]; + + if length(select) ~= nw; + error("length of select must be = nw"); + end + ng1= ng+1; + + if isempty(field) + # expand 0:(ng+1)^nw in base ng+1 + field= (0:(ng1)^nw-1)'* ng1.^(-nw+1:0); + field= rem( floor( field), ng1); + # select zero or non-zero elements + field= field>0; + end + sel= find( all( field == ones(ng1^nw,1)*select(:)', 2) ); +endfunction + + +function printout( stats, stats_tbl ); + nw= size( stats_tbl,2); + [jnk,order]= sort( sum(stats_tbl,2) ); + + printf('\n%d-way ANOVA Table (Factors A%s):\n\n', nw, ... + sprintf(',%c',double('A')+(1:nw-1)) ); + printf('Source of Variation Sum Sqr df MeanSS Fval p-value\n'); + printf('*********************************************************************\n'); + printf('Error %10.2f %4d %10.2f\n', stats( size(stats,1),1:3)); + + for i= order(:)' + str= sprintf(' %c x',double('A')+find(stats_tbl(i,:)>0)-1 ); + str= str(1:length(str)-2); # remove x + printf('Factor %15s %10.2f %4d %10.2f %7.3f %7.6f\n', ... + str, stats(i,:) ); + end + printf('\n'); +endfunction + +#{ +# Test Data from http://maths.sci.shu.ac.uk/distance/stats/14.shtml +data=[7 9 9 8 12 10 ... + 9 8 10 11 13 13 ... + 9 10 10 12 10 12]'; +grp = [1,1; 1,1; 1,2; 1,2; 1,3; 1,3; + 2,1; 2,1; 2,2; 2,2; 2,3; 2,3; + 3,1; 3,1; 3,2; 3,2; 3,3; 3,3]; +data=[7 9 9 8 12 10 9 8 ... + 9 8 10 11 13 13 10 11 ... + 9 10 10 12 10 12 10 12]'; +grp = [1,4; 1,4; 1,5; 1,5; 1,6; 1,6; 1,7; 1,7; + 2,4; 2,4; 2,5; 2,5; 2,6; 2,6; 2,7; 2,7; + 3,4; 3,4; 3,5; 3,5; 3,6; 3,6; 3,7; 3,7]; +# Test Data from http://maths.sci.shu.ac.uk/distance/stats/9.shtml +data=[9.5 11.1 11.9 12.8 ... + 10.9 10.0 11.0 11.9 ... + 11.2 10.4 10.8 13.4]'; +grp= [1:4,1:4,1:4]'; +# Test Data from http://maths.sci.shu.ac.uk/distance/stats/13.shtml +data=[7.56 9.68 11.65 ... + 9.98 9.69 10.69 ... + 7.23 10.49 11.77 ... + 8.22 8.55 10.72 ... + 7.59 8.30 12.36]'; +grp = [1,1;1,2;1,3; + 2,1;2,2;2,3; + 3,1;3,2;3,3; + 4,1;4,2;4,3; + 5,1;5,2;5,3]; +# Test Data from www.mathworks.com/ +# access/helpdesk/help/toolbox/stats/linear10.shtml +data=[23 27 43 41 15 17 3 9 20 63 55 90]; +grp= [ 1 1 1 1 2 2 2 2 3 3 3 3; + 1 1 2 2 1 1 2 2 1 1 2 2]'; +#} + + + diff --git a/inst/bbscdf.m b/inst/bbscdf.m new file mode 100644 index 0000000..db6bf18 --- /dev/null +++ b/inst/bbscdf.m @@ -0,0 +1,107 @@ +## Copyright (C) 2018 John Donoghue +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} bbscdf (@var{x}, @var{shape}, @var{scale}, @var{location}) +## For each element of @var{x}, compute the cumulative distribution function +## (CDF) at @var{x} of the Birnbaum-Saunders distribution with parameters +## @var{location}, @var{scale} and @var{shape}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: CDF of the Birnbaum-Saunders distribution + +function cdf = bbscdf (x, shape, scale, location) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (location) || ! isscalar (scale) || ! isscalar(shape)) + [retval, x, location, scale, shape] = ... + common_size (x, location, scale, shape); + if (retval > 0) + error ("bbscdf: X, LOCATION, SCALE and SHAPE must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (location) || iscomplex (scale) ... + || iscomplex(shape)) + error ("bbscdf: X, LOCATION, SCALE and SHAPE must not be complex"); + endif + + if (isa (x, "single") || isa (location, "single") || isa (scale, "single") ... + || isa (shape, "single")) + cdf = zeros (size (x), "single"); + else + cdf = zeros (size (x)); + endif + + k = isnan(x) | ! (-Inf < location) | ! (location < Inf) ... + | ! (scale > 0) | ! (scale < Inf) | ! (shape > 0) | ! (shape < Inf); + cdf(k) = NaN; + + k = (x > location) & (x <= Inf) & (-Inf < location) & (location < Inf) ... + & (0 < scale) & (scale < Inf) & (0 < shape) & (shape < Inf); + if (isscalar (location) && isscalar(scale) && isscalar(shape)) + a = x(k) - location; + b = sqrt(a ./ scale); + cdf(k) = normcdf ((b - b.^-1) / shape); + else + a = x(k) - location(k); + b = sqrt(a ./ scale(k)); + cdf(k) = normcdf ((b - b.^-1) ./ shape(k)); + endif +endfunction + + +%!shared x,y +%! x = [-1, 0, 1, 2, Inf]; +%! y = [0, 0, 1/2, 0.76024993890652337, 1]; +%!assert (bbscdf (x, ones (1,5), ones (1,5), zeros (1,5)), y, eps) +%!assert (bbscdf (x, 1, 1, zeros (1,5)), y, eps) +%!assert (bbscdf (x, 1, ones (1,5), 0), y, eps) +%!assert (bbscdf (x, ones (1,5), 1, 0), y, eps) +%!assert (bbscdf (x, 1, 1, 0), y, eps) +%!assert (bbscdf (x, 1, 1, [0, 0, NaN, 0, 0]), [y(1:2), NaN, y(4:5)], eps) +%!assert (bbscdf (x, 1, [1, 1, NaN, 1, 1], 0), [y(1:2), NaN, y(4:5)], eps) +%!assert (bbscdf (x, [1, 1, NaN, 1, 1], 1, 0), [y(1:2), NaN, y(4:5)], eps) +%!assert (bbscdf ([x, NaN], 1, 1, 0), [y, NaN], eps) + +## Test class of input preserved +%!assert (bbscdf (single ([x, NaN]), 1, 1, 0), single ([y, NaN]), eps('single')) +%!assert (bbscdf ([x, NaN], 1, 1, single (0)), single ([y, NaN]), eps('single')) +%!assert (bbscdf ([x, NaN], 1, single (1), 0), single ([y, NaN]), eps('single')) +%!assert (bbscdf ([x, NaN], single (1), 1, 0), single ([y, NaN]), eps('single')) + +## Test input validation +%!error bbscdf () +%!error bbscdf (1) +%!error bbscdf (1,2,3) +%!error bbscdf (1,2,3,4,5) +%!error bbscdf (ones (3), ones (2), ones(2), ones(2)) +%!error bbscdf (ones (2), ones (3), ones(2), ones(2)) +%!error bbscdf (ones (2), ones (2), ones(3), ones(2)) +%!error bbscdf (ones (2), ones (2), ones(2), ones(3)) +%!error bbscdf (i, 4, 3, 2) +%!error bbscdf (1, i, 3, 2) +%!error bbscdf (1, 4, i, 2) +%!error bbscdf (1, 4, 3, i) + diff --git a/inst/bbsinv.m b/inst/bbsinv.m new file mode 100644 index 0000000..141e41e --- /dev/null +++ b/inst/bbsinv.m @@ -0,0 +1,115 @@ +## Copyright (C) 2018 John Donoghue +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} bbsinv (@var{x}, @var{shape}, @var{scale}, @var{location}) +## For each element of @var{x}, compute the quantile (the inverse of the CDF) +## at @var{x} of the Birnbaum-Saunders distribution with parameters +## @var{location}, @var{scale}, and @var{shape}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Quantile function of the Birnbaum-Saunders distribution + +function inv = bbsinv (x, shape, scale, location) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (location) || ! isscalar (scale) || ! isscalar(shape)) + [retval, x, location, scale, shape] = ... + common_size (x, location, scale, shape); + if (retval > 0) + error ("bbsinv: X, LOCATION, SCALE and SHAPE must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (location) ... + || iscomplex (scale) || iscomplex(shape)) + error ("bbsinv: X, LOCATION, SCALE and SHAPE must not be complex"); + endif + + if (isa (x, "single") || isa (location, "single") ... + || isa (scale, "single") || isa (shape, "single")) + inv = zeros (size (x), "single"); + else + inv = zeros (size (x)); + endif + + k = isnan (x) | (x < 0) | (x > 1) | ! (-Inf < location) | ! (location < Inf) ... + | ! (scale > 0) | ! (scale < Inf) | ! (shape > 0) | ! (shape < Inf); + inv(k) = NaN; + + k = (x <= 0) & (-Inf < location) & (location < Inf) ... + & (scale > 0) & (scale < Inf) & (shape > 0) & (shape < Inf); + inv(k) = 0; + + k = (x == 1) & (-Inf < location) & (location < Inf) ... + & (scale > 0) & (scale < Inf) & (shape > 0) & (shape < Inf); + inv(k) = Inf; + + k = (0 < x) & (x < 1) & (location < Inf) & (0 < scale) & (scale < Inf) ... + & (0 < shape) & (shape < Inf); + if (isscalar (location) && isscalar(scale) && isscalar(shape)) + y = shape * norminv (x(k)); + inv(k) = location + scale * (y + sqrt (4 + y.^2)).^2 / 4; + else + y = shape(k) .* norminv (x(k)); + inv(k) = location(k) + scale(k) .* (y + sqrt (4 + y.^2)).^2 ./ 4; + endif + +endfunction + + +%!shared x,y,f +%! f = @(x,a,b,c) (a + b * (c * norminv (x) + sqrt (4 + (c * norminv(x))^2))^2) / 4; +%! x = [-1, 0, 1/4, 1/2, 1, 2]; +%! y = [0, 0, f(1/4, 0, 1, 1), 1, Inf, NaN]; +%!assert (bbsinv (x, ones (1,6), ones (1,6), zeros (1,6)), y) +%!assert (bbsinv (x, 1, 1, zeros (1,6)), y) +%!assert (bbsinv (x, 1, ones (1,6), 0), y) +%!assert (bbsinv (x, ones (1,6), 1, 0), y) +%!assert (bbsinv (x, 1, 1, 0), y) +%!assert (bbsinv (x, 1, 1, [0, 0, 0, NaN, 0, 0]), [y(1:3), NaN, y(5:6)]) +%!assert (bbsinv (x, 1, [1, 1, 1, NaN, 1, 1], 0), [y(1:3), NaN, y(5:6)]) +%!assert (bbsinv (x, [1, 1, 1, NaN, 1, 1], 1, 0), [y(1:3), NaN, y(5:6)]) +%!assert (bbsinv ([x, NaN], 1, 1, 0), [y, NaN]) + +## Test class of input preserved +%!assert (bbsinv (single ([x, NaN]), 1, 1, 0), single ([y, NaN])) +%!assert (bbsinv ([x, NaN], 1, 1, single (0)), single ([y, NaN])) +%!assert (bbsinv ([x, NaN], 1, single (1), 0), single ([y, NaN])) +%!assert (bbsinv ([x, NaN], single (1), 1, 0), single ([y, NaN])) + +## Test input validation +%!error bbsinv () +%!error bbsinv (1) +%!error bbsinv (1,2,3) +%!error bbsinv (1,2,3,4,5) +%!error bbsinv (ones (3), ones (2), ones(2), ones(2)) +%!error bbsinv (ones (2), ones (3), ones(2), ones(2)) +%!error bbsinv (ones (2), ones (2), ones(3), ones(2)) +%!error bbsinv (ones (2), ones (2), ones(2), ones(3)) +%!error bbsinv (i, 4, 3, 2) +%!error bbsinv (1, i, 3, 2) +%!error bbsinv (1, 4, i, 2) +%!error bbsinv (1, 4, 3, i) + diff --git a/inst/bbspdf.m b/inst/bbspdf.m new file mode 100644 index 0000000..f1764fc --- /dev/null +++ b/inst/bbspdf.m @@ -0,0 +1,113 @@ +## Copyright (C) 2018 John Donoghue +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} bbspdf (@var{x}, @var{shape}, @var{scale}, @var{location}) +## For each element of @var{x}, compute the probability density function (PDF) +## at @var{x} of the Birnbaum-Saunders distribution with parameters +## @var{location}, @var{scale} and @var{shape}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: PDF of the Birnbaum-Saunders distribution + +function pdf = bbspdf (x, shape, scale, location) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (location) || ! isscalar (scale) || ! isscalar(shape)) + [retval, x, location, scale, shape] = ... + common_size (x, location, scale, shape); + if (retval > 0) + error ("bbspdf: X, LOCATION, SCALE and SHAPE must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (location) ... + || iscomplex (scale) || iscomplex(shape)) + error ("bbspdf: X, LOCATION, SCALE and SHAPE must not be complex"); + endif + + if (isa (x, "single") || isa (location, "single") || isa (scale, "single") ... + || isa (shape, "single")) + pdf = zeros (size (x), "single"); + else + pdf = zeros (size (x)); + endif + + k = isnan (x) | ! (-Inf < location) | ! (location < Inf) ... + | ! (scale > 0) | ! (scale < Inf) ... + | ! (shape > 0) | ! (shape < Inf); + + pdf(k) = NaN; + + k = (x > location) & (x < Inf) & (-Inf < location) ... + & (location < Inf) & (0 < scale) & (scale < Inf) ... + & (0 < shape) & (shape < Inf); + + if (isscalar (location) && isscalar(scale) && isscalar(shape)) + a = x(k) - location; + b = sqrt(a ./ scale); + pdf(k) = ((b + b.^-1) ./ (2 * shape * a)) ... + .* normpdf ((b - b.^-1) / shape); + else + a = x(k) - location(k); + b = sqrt(a ./ scale(k)); + pdf(k) = ((b + b.^-1) ./ (2 * shape(k).* a)) ... + .* normpdf ((b - b.^-1) ./ shape(k)); + endif +endfunction + + +%!shared x,y +%! x = [-1, 0, 1, 2, Inf]; +%! y = [0, 0, 0.3989422804014327, 0.1647717335503959, 0]; +%!assert (bbspdf (x, ones (1,5), ones (1,5), zeros (1,5)), y, eps) +%!assert (bbspdf (x, 1, 1, zeros (1,5)), y, eps) +%!assert (bbspdf (x, 1, ones (1,5), 0), y, eps) +%!assert (bbspdf (x, ones (1,5), 1, 0), y, eps) +%!assert (bbspdf (x, 1, 1, 0), y, eps) +%!assert (bbspdf (x, 1, 1, [0, 0, NaN, 0, 0]), [y(1:2), NaN, y(4:5)], eps) +%!assert (bbspdf (x, 1, [1, 1, NaN, 1, 1], 0), [y(1:2), NaN, y(4:5)], eps) +%!assert (bbspdf (x, [1, 1, NaN, 1, 1], 1, 0), [y(1:2), NaN, y(4:5)], eps) +%!assert (bbspdf ([x, NaN], 1, 1, 0), [y, NaN], eps) + +## Test class of input preserved +%!assert (bbspdf (single ([x, NaN]), 1, 1, 0), single ([y, NaN]), eps('single')) +%!assert (bbspdf ([x, NaN], 1, 1, single (0)), single ([y, NaN]), eps('single')) +%!assert (bbspdf ([x, NaN], 1, single (1), 0), single ([y, NaN]), eps('single')) +%!assert (bbspdf ([x, NaN], single (1), 1, 0), single ([y, NaN]), eps('single')) + +## Test input validation +%!error bbspdf () +%!error bbspdf (1) +%!error bbspdf (1,2,3) +%!error bbspdf (1,2,3,4,5) +%!error bbspdf (ones (3), ones (2), ones(2), ones(2)) +%!error bbspdf (ones (2), ones (3), ones(2), ones(2)) +%!error bbspdf (ones (2), ones (2), ones(3), ones(2)) +%!error bbspdf (ones (2), ones (2), ones(2), ones(3)) +%!error bbspdf (i, 4, 3, 2) +%!error bbspdf (1, i, 3, 2) +%!error bbspdf (1, 4, i, 2) +%!error bbspdf (1, 4, 3, i) + diff --git a/inst/bbsrnd.m b/inst/bbsrnd.m new file mode 100644 index 0000000..a88cf1c --- /dev/null +++ b/inst/bbsrnd.m @@ -0,0 +1,145 @@ +## Copyright (C) 2018 John Donoghue +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} bbsrnd (@var{shape}, @var{scale}, @var{location}) +## @deftypefnx {} {} bbsrnd (@var{shape}, @var{scale}, @var{location}, @var{r}) +## @deftypefnx {} {} bbsrnd (@var{shape}, @var{scale}, @var{location}, @var{r}, @var{c}, @dots{}) +## @deftypefnx {} {} bbsrnd (@var{shape}, @var{scale}, @var{location}, [@var{sz}]) +## Return a matrix of random samples from the Birnbaum-Saunders +## distribution with parameters @var{location}, @var{scale} and @var{shape}. +## +## When called with a single size argument, return a square matrix with +## the dimension specified. When called with more than one scalar argument the +## first two arguments are taken as the number of rows and columns and any +## further arguments specify additional matrix dimensions. The size may also +## be specified with a vector of dimensions @var{sz}. +## +## If no size arguments are given then the result matrix is the common size of +## @var{location}, @var{scale} and @var{shape}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Random deviates from the Birnbaum-Saunders distribution + +function rnd = bbsrnd (shape, scale, location, varargin) + + if (nargin < 3) + print_usage (); + endif + + if (! isscalar (location) || ! isscalar (scale) || ! isscalar (shape)) + [retval, location, scale, shape] = common_size (location, scale, shape); + if (retval > 0) + error ("bbsrnd: LOCATION, SCALE and SHAPE must be of common size or scalars"); + endif + endif + + if (iscomplex (location) || iscomplex (scale) || iscomplex (shape)) + error ("bbsrnd: LOCATION, SCALE and SHAPE must not be complex"); + endif + + if (nargin == 3) + sz = size (location); + elseif (nargin == 4) + if (isscalar (varargin{1}) && varargin{1} >= 0) + sz = [varargin{1}, varargin{1}]; + elseif (isrow (varargin{1}) && all (varargin{1} >= 0)) + sz = varargin{1}; + else + error ("bbsrnd: dimension vector must be row vector of non-negative integers"); + endif + elseif (nargin > 3) + if (any (cellfun (@(x) (! isscalar (x) || x < 0), varargin))) + error ("bbsrnd: dimensions must be non-negative integers"); + endif + sz = [varargin{:}]; + endif + + if (! isscalar (location) && ! isequal (size (location), sz)) + error ("bbsrnd: LOCATION, SCALE and SHAPE must be scalar or of size SZ"); + endif + + if (isa (location, "single") || isa (scale, "single") || isa (shape, "single")) + cls = "single"; + else + cls = "double"; + endif + + if (isscalar (location) && isscalar (scale) && isscalar (shape)) + if ((-Inf < location) && (location < Inf) ... + && (0 < scale) && (scale < Inf) ... + && (0 < shape) && (shape < Inf)) + rnd = rand(sz,cls); + y = shape * norminv (rnd); + rnd = location + scale * (y + sqrt (4 + y.^2)).^2 / 4; + else + rnd = NaN (sz, cls); + endif + else + rnd = NaN (sz, cls); + + k = (-Inf < location) & (location < Inf) ... + & (0 < scale) & (scale < Inf) ... + & (0 < shape) & (shape < Inf); + rnd(k) = rand(sum(k(:)),1); + y = shape(k) .* norminv (rnd(k)); + rnd(k) = location(k) + scale(k) .* (y + sqrt (4 + y.^2)).^2 / 4; + endif +endfunction + + +%!assert (size (bbsrnd (1, 1, 0)), [1 1]) +%!assert (size (bbsrnd (1, 1, zeros (2,1))), [2, 1]) +%!assert (size (bbsrnd (1, 1, zeros (2,2))), [2, 2]) +%!assert (size (bbsrnd (1, ones (2,1), 0)), [2, 1]) +%!assert (size (bbsrnd (1, ones (2,2), 0)), [2, 2]) +%!assert (size (bbsrnd (ones (2,1), 1, 0)), [2, 1]) +%!assert (size (bbsrnd (ones (2,2), 1, 0)), [2, 2]) +%!assert (size (bbsrnd (1, 1, 0, 3)), [3, 3]) +%!assert (size (bbsrnd (1, 1, 0, [4 1])), [4, 1]) +%!assert (size (bbsrnd (1, 1, 0, 4, 1)), [4, 1]) + +## Test class of input preserved +%!assert (class (bbsrnd (1,1,0)), "double") +%!assert (class (bbsrnd (1, 1, single (0))), "single") +%!assert (class (bbsrnd (1, 1, single ([0 0]))), "single") +%!assert (class (bbsrnd (1, single (1), 0)), "single") +%!assert (class (bbsrnd (1, single ([1 1]), 0)), "single") +%!assert (class (bbsrnd (single (1), 1, 0)), "single") +%!assert (class (bbsrnd (single ([1 1]), 1, 0)), "single") + +## Test input validation +%!error bbsrnd () +%!error bbsrnd (1) +%!error bbsrnd (1,2) +%!error bbsrnd (ones (3), ones (2), ones (2), 2) +%!error bbsrnd (ones (2), ones (3), ones (2), 2) +%!error bbsrnd (ones (2), ones (2), ones (3), 2) +%!error bbsrnd (i, 2, 3) +%!error bbsrnd (1, i, 3) +%!error bbsrnd (1, 2, i) +%!error bbsrnd (1,2,3, -1) +%!error bbsrnd (1,2,3, ones (2)) +%!error bbsrnd (1,2,3, [2 -1 2]) +%!error bbsrnd (2, 1, ones (2), 3) +%!error bbsrnd (2, 1, ones (2), [3, 2]) +%!error bbsrnd (2, 1, ones (2), 3, 2) + diff --git a/inst/betastat.m b/inst/betastat.m new file mode 100644 index 0000000..7b94ba6 --- /dev/null +++ b/inst/betastat.m @@ -0,0 +1,129 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} betastat (@var{a}, @var{b}) +## Compute mean and variance of the beta distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{a} is the first parameter of the beta distribution. @var{a} must be +## positive +## +## @item +## @var{b} is the second parameter of the beta distribution. @var{b} must be +## positive +## @end itemize +## @var{a} and @var{b} must be of common size or one of them must be scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the beta distribution +## +## @item +## @var{v} is the variance of the beta distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## a = 1:6; +## b = 1:0.2:2; +## [m, v] = betastat (a, b) +## @end group +## +## @group +## [m, v] = betastat (a, 1.5) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the beta distribution + +function [m, v] = betastat (a, b) + + if (nargin != 2) + print_usage (); + elseif (! isscalar (a) && ! isscalar (b) && ! size_equal (a, b)) + error ("betastat: a and b must be of common size or scalar"); + endif + + k = find (! (a > 0 & b > 0)); + + # Calculate moments + a_b = a + b; + m = a ./ (a_b); + m(k) = NaN; + + if (nargout > 1) + v = (a .* b) ./ ((a_b .^ 2) .* (a_b + 1)); + v(k) = NaN; + endif + +endfunction + +%!test +%! a = -2:6; +%! b = 0.4:0.2:2; +%! [m, v] = betastat (a, b); +%! expected_m = [NaN NaN NaN 1/2 2/3.2 3/4.4 4/5.6 5/6.8 6/8]; +%! expected_v = [NaN NaN NaN 0.0833, 0.0558, 0.0402, 0.0309, 0.0250, 0.0208]; +%! assert (m, expected_m, eps*100); +%! assert (v, expected_v, 0.001); + +%!test +%! a = -2:1:6; +%! [m, v] = betastat (a, 1.5); +%! expected_m = [NaN NaN NaN 1/2.5 2/3.5 3/4.5 4/5.5 5/6.5 6/7.5]; +%! expected_v = [NaN NaN NaN 0.0686, 0.0544, 0.0404, 0.0305, 0.0237, 0.0188]; +%! assert (m, expected_m); +%! assert (v, expected_v, 0.001); + +%!test +%! a = [14 Inf 10 NaN 10]; +%! b = [12 9 NaN Inf 12]; +%! [m, v] = betastat (a, b); +%! expected_m = [14/26 NaN NaN NaN 10/22]; +%! expected_v = [168/18252 NaN NaN NaN 120/11132]; +%! assert (m, expected_m); +%! assert (v, expected_v); + +%!assert (nthargout (1:2, @betastat, 5, []), {[], []}) +%!assert (nthargout (1:2, @betastat, [], 5), {[], []}) +%!assert (nthargout (1:2, @betastat, "", 5), {[], []}) +%!assert (nthargout (1:2, @betastat, true, 5), {1/6, 5/252}) + +%!assert (size (betastat (rand (10, 5, 4), rand (10, 5, 4))), [10 5 4]) +%!assert (size (betastat (rand (10, 5, 4), 7)), [10 5 4]) + diff --git a/inst/binostat.m b/inst/binostat.m new file mode 100644 index 0000000..f1bc371 --- /dev/null +++ b/inst/binostat.m @@ -0,0 +1,128 @@ +## Copyright (C) 2006, 2007 Arno Onken +## Copyright (C) 2015 Carnë Draug +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} binostat (@var{n}, @var{p}) +## Compute mean and variance of the binomial distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{n} is the first parameter of the binomial distribution. The elements +## of @var{n} must be natural numbers +## +## @item +## @var{p} is the second parameter of the binomial distribution. The +## elements of @var{p} must be probabilities +## @end itemize +## @var{n} and @var{p} must be of common size or one of them must be scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the binomial distribution +## +## @item +## @var{v} is the variance of the binomial distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## n = 1:6; +## p = 0:0.2:1; +## [m, v] = binostat (n, p) +## @end group +## +## @group +## [m, v] = binostat (n, 0.5) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the binomial distribution + +function [m, v] = binostat (n, p) + + if (nargin != 2) + print_usage (); + elseif (! isscalar (n) && ! isscalar (p) && ! size_equal (n, p)) + error ("binostat: N and P must be of common size or scalar"); + endif + + k = find (! (n > 0 & fix (n) == n & p >= 0 & p <= 1)); + + # Calculate moments + m = n .* p; + m(k) = NaN; + + if (nargout > 1) + v = m .* (1 - p); + v(k) = NaN; + endif + +endfunction + +%!test +%! n = 1:6; +%! p = 0:0.2:1; +%! [m, v] = binostat (n, p); +%! expected_m = [0.00, 0.40, 1.20, 2.40, 4.00, 6.00]; +%! expected_v = [0.00, 0.32, 0.72, 0.96, 0.80, 0.00]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! n = 1:6; +%! [m, v] = binostat (n, 0.5); +%! expected_m = [0.50, 1.00, 1.50, 2.00, 2.50, 3.00]; +%! expected_v = [0.25, 0.50, 0.75, 1.00, 1.25, 1.50]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! n = [-Inf -3 5 0.5 3 NaN 100, Inf]; +%! [m, v] = binostat (n, 0.5); +%! assert (isnan (m), [true true false true false true false false]) +%! assert (isnan (v), [true true false true false true false false]) +%! assert (m(end), Inf); +%! assert (v(end), Inf); + +%!assert (nthargout (1:2, @binostat, 5, []), {[], []}) +%!assert (nthargout (1:2, @binostat, [], 5), {[], []}) +%!assert (nthargout (1:2, @binostat, "", 5), {[], []}) +%!assert (nthargout (1:2, @binostat, true, 5), {NaN, NaN}) +%!assert (nthargout (1:2, @binostat, 5, true), {5, 0}) + +%!assert (size (binostat (randi (100, 10, 5, 4), rand (10, 5, 4))), [10 5 4]) +%!assert (size (binostat (randi (100, 10, 5, 4), 7)), [10 5 4]) diff --git a/inst/binotest.m b/inst/binotest.m new file mode 100644 index 0000000..ecdff54 --- /dev/null +++ b/inst/binotest.m @@ -0,0 +1,142 @@ +## Copyright (C) 2016 Andreas Stahel +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{h}, @var{pval}, @var{ci}] =} binotest (@var{pos},@var{N},@var{p0}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}] =} binotest (@var{pos},@var{N},@var{p0},@var{Name},@var{Value}) +## Test for probability @var{p} of a binomial sample +## +## Perform a test of the null hypothesis @var{p} == @var{p0} for a sample +## of size @var{N} with @var{pos} positive results +## +## +## Name-Value pair arguments can be used to set various options. +## @qcode{"alpha"} can be used to specify the significance level +## of the test (the default value is 0.05). The option @qcode{"tail"}, +## can be used to select the desired alternative hypotheses. If the +## value is @qcode{"both"} (default) the null is tested against the two-sided +## alternative @code{@var{p} != @var{p0}}. The value of @var{pval} is +## determined by adding the probabilities of all event less or equally +## likely than the observed number @var{pos} of positive events. +## If the value of @qcode{"tail"} is @qcode{"right"} +## the one-sided alternative @code{@var{p} > @var{p0}} is considered. +## Similarly for @qcode{"left"}, the one-sided alternative +## @code{@var{p} < @var{p0}} is considered. +## +## If @var{h} is 0 the null hypothesis is accepted, if it is 1 the null +## hypothesis is rejected. The p-value of the test is returned in @var{pval}. +## A 100(1-alpha)% confidence interval is returned in @var{ci}. +## +## @end deftypefn + + +## Author: Andreas Stahel + +function [h, p, ci] = binotest(pos,n,p0,varargin) + + % Set default arguments + alpha = 0.05; + tail = 'both'; + + i = 1; + while ( i <= length(varargin) ) + switch lower(varargin{i}) + case 'alpha' + i = i + 1; + alpha = varargin{i}; + case 'tail' + i = i + 1; + tail = varargin{i}; + otherwise + error('Invalid Name argument.',[]); + end + i = i + 1; + end + + if ~isa(tail,'char') + error('tail argument to vartest must be a string\n',[]); + end + + if (n<=0) + error('binotest: required n>0\n',[]); + end + if (p0<0)|(p0>1) + error('binotest: required 0<= p0 <= 1\n',[]); + end + if (pos<0)|(pos>n) + error('binotest: required 0<= pos <= n\n',[]); + end + + % Based on the "tail" argument determine the P-value, the critical values, + % and the confidence interval. + switch lower(tail) + case 'both' + A_low = binoinv(alpha/2,n,p0)/n; + A_high = binoinv(1-alpha/2,n,p0)/n; + p_pos = binopdf(pos,n,p0); + p_all = binopdf([0:n],n,p0); + ind = find(p_all <=p_pos); +% p = min(1,sum(p_all(ind))); + p = sum(p_all(ind)); + if pos==0 p_low = 0; + else p_low = fzero(@(pl)1-binocdf(pos-1,n,pl)-alpha/2,[0 1]); + endif + if pos==n p_high = 1; + else p_high = fzero(@(ph) binocdf(pos,n,ph) -alpha/2,[0,1]); + endif + ci = [p_low,p_high]; + case 'left' + p = 1-binocdf(pos-1,n,p0); + if pos==n p_high = 1; + else p_high = fzero(@(ph) binocdf(pos,n,ph) -alpha,[0,1]); + endif + ci = [0, p_high]; + case 'right' + p = binocdf(pos,n,p0); + if pos==0 p_low = 0; + else p_low = fzero(@(pl)1-binocdf(pos-1,n,pl)-alpha,[0 1]); + endif + ci = [p_low 1]; + otherwise + error('Invalid fifth (tail) argument to binotest\n',[]); + end + + % Determine the test outcome + % MATLAB returns this a double instead of a logical array + h = double(p < alpha); +end + +%!demo +%! % flip a coin 1000 times, showing 475 heads +%! % Hypothesis: coin is fair, i.e. p=1/2 +%! [h,p_val,ci] = binotest(475,1000,0.5) +%! % Result: h = 0 : null hypothesis not rejected, coin could be fair +%! % P value 0.12, i.e. hypothesis not rejected for alpha up to 12% +%! % 0.444 <= p <= 0.506 with 95% confidence + +%!demo +%! % flip a coin 100 times, showing 65 heads +%! % Hypothesis: coin shows less than 50% heads, i.e. p<=1/2 +%! [h,p_val,ci] = binotest(65,100,0.5,'tail','left','alpha',0.01) +%! % Result: h = 1 : null hypothesis is rejected, i.e. coin shows more heads than tails +%! % P value 0.0018, i.e. hypothesis not rejected for alpha up to 0.18% +%! % 0 <= p <= 0.76 with 99% confidence + +%!test #example from https://en.wikipedia.org/wiki/Binomial_test +%! [h,p_val,ci] = binotest (51,235,1/6); +%! assert (p_val, 0.0437, 0.00005) +%! [h,p_val,ci] = binotest (51,235,1/6,'tail','left'); +%! assert (p_val, 0.027, 0.0005) diff --git a/inst/boxplot.m b/inst/boxplot.m new file mode 100644 index 0000000..7d9adce --- /dev/null +++ b/inst/boxplot.m @@ -0,0 +1,911 @@ +## Copyright (C) 2002 Alberto Terruzzi +## Copyright (C) 2006 Alberto Pose +## Copyright (C) 2011 Pascal Dupuis +## Copyright (C) 2012 Juan Pablo Carbajal +## Copyright (C) 2016 Pascal Dupuis +## Copyright (C) 2020 Andreas Bertsatos +## Copyright (C) 2020 Philip Nienhuis (prnienhuis@users.sf.net) +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{s} =} boxplot (@var{data}) +## @deftypefnx {Function File} {@var{s} =} boxplot (@var{data}, @var{group}) +## @deftypefnx {Function File} {@var{s} =} boxplot (@var{data}, @var{notched}, @var{symbol}, @var{orientation}, @var{whisker}, @dots{}) +## @deftypefnx {Function File} {@var{s} =} boxplot (@var{data}, @var{group}, @var{notched}, @var{symbol}, @var{orientation}, @var{whisker}, @dots{}) +## @deftypefnx {Function File} {@var{s} =} boxplot (@var{data}, @var{options}) +## @deftypefnx {Function File} {@var{s} =} boxplot (@var{data}, @var{group}, @var{options}, @dots{}) +## @deftypefnx {Function File} {[@dots{} @var{h}]=} boxplot (@var{data}, @dots{}) +## Produce a box plot. +## +## A box plot is a graphical display that simultaneously describes several +## important features of a data set, such as center, spread, departure from +## symmetry, and identification of observations that lie unusually far from +## the bulk of the data. +## +## Input arguments (case-insensitive) recognized by boxplot are: +## +## @itemize +## @item +## @var{data} is a matrix with one column for each data set, or a cell vector +## with one cell for each data set. Each cell must contain a numerical row or +## column vector (NaN and NA are ignored) and not a nested vector of cells. +## +## @item +## @var{notched} = 1 produces a notched-box plot. Notches represent a robust +## estimate of the uncertainty about the median. +## +## @var{notched} = 0 (default) produces a rectangular box plot. +## +## @var{notched} within the interval (0,1) produces a notch of the specified +## depth. Notched values outside (0,1) are amusing if not exactly impractical. +## +## @item +## @var{symbol} sets the symbol for the outlier values. The default symbol +## for points that lie outside 3 times the interquartile range is 'o'; +## the default symbol for points between 1.5 and 3 times the interquartile +## range is '+'. @* +## Alternative @var{symbol} settings: +## +## @var{symbol} = '.': points between 1.5 and 3 times the IQR are marked with +## '.' and points outside 3 times IQR with 'o'. +## +## @var{symbol} = ['x','*']: points between 1.5 and 3 times the IQR are marked +## with 'x' and points outside 3 times IQR with '*'. +## +## @item +## @var{orientation} = 0 makes the boxes horizontally. @* +## @var{orientation} = 1 plots the boxes vertically (default). Alternatively, +## orientation can be passed as a string, e.g., 'vertical' or 'horizontal'. +## +## @item +## @var{whisker} defines the length of the whiskers as a function of the IQR +## (default = 1.5). If @var{whisker} = 0 then @code{boxplot} displays all data +## values outside the box using the plotting symbol for points that lie +## outside 3 times the IQR. +## +## @item +## @var{group} may be passed as an optional argument only in the second +## position after @var{data}. @var{group} contains a numerical vector defining +## separate categories, each plotted in a different box, for each set of +## @var{DATA} values that share the same @var{group} value or values. With +## the formalism (@var{data}, @var{group}), both must be vectors of the same +## length. +## +## @item +## @var{options} are additional paired arguments passed with the formalism +## (Name, Value) that provide extra functionality as listed below. +## @var{options} can be passed at any order after the initial arguments and +## are case-insensitive. +## +## @multitable {Name} {Value} {description} @columnfractions .2 .2 .6 +## @item 'Notch' @tab 'on' @tab Notched by 0.25 of the boxes width. +## @item @tab 'off' @tab Produces a straight box. +## @item @tab scalar @tab Proportional width of the notch. +## +## @item 'Symbol' @tab '.' @tab Defines only outliers between 1.5 and 3 IQR. +## @item @tab ['x','*'] @tab 2nd character defines outliers > 3 IQR +## +## @item 'Orientation' @tab 'vertical' @tab Default value, can also be defined +## with numerical 1. +## @item @tab 'horizontal' @tab Can also be defined with numerical 0. +## +## @item 'Whisker' @tab scalar @tab Multiplier of IQR (default is 1.5). +## +## @item 'OutlierTags' @tab 'on' or 1 @tab Plot the vector index of the outlier +## value next to its point. +## @item @tab 'off' or 0 @tab No tags are plotted (default value). +## +## @item 'Sample_IDs' @tab 'cell' @tab A cell vector with one cell for each +## data set containing a nested cell vector with each sample's ID (should be +## a string). If this option is passed, then all outliers are tagged with +## their respective sample's ID string instead of their vector's index. +## +## @item 'BoxWidth' @tab 'proportional' @tab Create boxes with their width +## proportional to the number of samples in their respective dataset (default +## value). +## @item @tab 'fixed' @tab Make all boxes with equal width. +## +## @item 'Widths' @tab scalar @tab Scaling factor for box widths (default +## value is 0.4). +## +## @item 'CapWidths' @tab scalar @tab Scaling factor for whisker cap widths +## (default value is 1, which results to 'Widths'/8 halflength) +## +## @item 'BoxStyle' @tab 'outline' @tab Draw boxes as outlines (default value). +## @item @tab 'filled' @tab Fill boxes with a color (outlines are still +## plotted). +## +## @item 'Positions' @tab vector @tab Numerical vector that defines the +## position of each data set. It must have the same length as the number of +## groups in a desired manner. This vector merely defines the points along +## the group axis, which by default is [1:number of groups]. +## +## @item 'Labels' @tab cell @tab A cell vector of strings containing the names +## of each group. By default each group is labeled numerically according to +## its order in the data set +## +## @item 'Colors' @tab character string or Nx3 numerical matrix @tab If just +## one character or 1x3 vector of RGB values, specify the fill color of all +## boxes when BoxStyle = 'filled'. If a character string or Nx3 matrix is +## entered, box #1's fill color corrresponds to the first character or first +## matrix row, and the next boxes' fill colors corresponds to the next +## characters or rows. If the char string or Nx3 array is exhausted the color +## selection wraps around. +## @end multitable +## @end itemize +## +## Supplemental arguments not described above (@dots{}) are concatenated and +## passed to the plot() function. +## +## The returned matrix @var{s} has one column for each data set as follows: +## +## @multitable @columnfractions .1 .8 +## @item 1 @tab Minimum +## @item 2 @tab 1st quartile +## @item 3 @tab 2nd quartile (median) +## @item 4 @tab 3rd quartile +## @item 5 @tab Maximum +## @item 6 @tab Lower confidence limit for median +## @item 7 @tab Upper confidence limit for median +## @end multitable +## +## The returned structure @var{h} contains handles to the plot elements, +## allowing customization of the visualization using set/get functions. +## +## Example +## +## @example +## title ("Grade 3 heights"); +## axis ([0,3]); +## set(gca (), "xtick", [1 2], "xticklabel", @{"girls", "boys"@}); +## boxplot (@{randn(10,1)*5+140, randn(13,1)*8+135@}); +## @end example +## +## @end deftypefn + +function [s_o, hs_o] = boxplot (data, varargin) + + ## Assign parameter defaults + if (nargin < 1) + print_usage; + endif + + ## Check data + if (! (isnumeric (data) || iscell (data))) + error ("boxplot: numerical array or cell array containing data expected."); + elseif (iscell (data)) + ## Check if cell contain numerical data + if (! all (cellfun ("isnumeric", data))) + error ("boxplot: data cells must contain numerical data."); + endif + endif + + ## Default values + maxwhisker = 1.5; + orientation = 1; + symbol = ["+", "o"]; + notched = 0; + plot_opts = {}; + groups = []; + sample_IDs = {}; + outlier_tags = 0; + box_width = "proportional"; + widths = 0.4; + capwid = 1; + box_style = 0; + positions = []; + labels = {}; + nug = 0; + bcolor = "y"; + + ## Optional arguments analysis + numarg = nargin - 1; + indopt = 1; + group_exists = 0; + while (numarg) + dummy = varargin{indopt++}; + if ((! ischar (dummy) || iscellstr (dummy)) && indopt < 6) + ## MATLAB allows passing the second argument as a grouping vector + if (length (dummy) > 1) + if (2 != indopt) + error ("boxplot: grouping vector may only be passed as second arg."); + endif + if (isnumeric (dummy)) + groups = dummy; + group_exists = 1; + else + error ("boxplot: grouping vector must be numerical"); + endif + elseif (length (dummy) == 1) + ## Old way: positional argument + switch indopt - group_exists + case 2 + notched = dummy; + case 4 + orientation = dummy; + case 5 + maxwhisker = dummy; + otherwise + error("boxplot: no positional argument allowed at position %d", ... + --indopt); + endswitch + endif + numarg--; + continue; + else + if (3 == (indopt - group_exists) && length (dummy) <= 2) + symbol = dummy; + numarg--; + continue; + else + ## Check for additional paired arguments + switch lower (dummy) + case "notch" + notched = varargin{indopt}; + ## Check for string input: "on" or "off" + if (ischar (notched)) + if (strcmpi (notched, "on")) + notched = 1; + elseif (strcmpi (notched, "off")) + notched = 0; + else + msg = ["boxplot: 'Notch' input argument accepts only 'on',", ... + " 'off' or a numeric scalar as value"]; + error (msg); + endif + elseif (! (isnumeric (notched) && isreal (notched))) + error ("boxplot: illegal Notch value"); + endif + + case "symbol" + symbol = varargin{indopt}; + if (! ischar (symbol)) + error ("boxplot; Symbol(s) must be character(s)"); + endif + + case "orientation" + orientation = varargin{indopt}; + if (ischar (orientation)) + ## Check for string input: "vertical" or "horizontal" + if (strcmpi (orientation, "vertical")) + orientation = 1; + elseif (strcmpi (orientation, "horizontal")) + orientation = 0; + else + msg = ["boxplot: 'Orientation' input argument accepts only", ... + " 'vertical' (or 1) or 'horizontal' (or 0) as value"]; + error (msg); + endif + elseif (! (isnumeric (orientation) && isreal (orientation))) + error ("boxplot: illegal Orientation value"); + endif + + case "whisker" + maxwhisker = varargin{indopt}; + if (! isscalar (maxwhisker) || ... + ! (isnumeric (maxwhisker) && isreal (maxwhisker))) + msg = ["boxplot: 'Whisker' input argument accepts only", ... + " a real scalar value as input parameter"]; + error(msg); + endif + + case "outliertags" + outlier_tags = varargin{indopt}; + ## Check for string input: "on" or "off" + if (ischar (outlier_tags)) + if (strcmpi (outlier_tags, "on")) + outlier_tags = 1; + elseif (strcmpi (outlier_tags, "off")) + outlier_tags = 0; + else + msg = ["boxplot: 'OutlierTags' input argument accepts only", ... + " 'on' (or 1) or 'off' (or 0) as value"]; + error (msg); + endif + elseif (! (isnumeric (outlier_tags) && isreal (outlier_tags))) + error ("boxplot: illegal OutlierTags value"); + endif + + case "sample_ids" + sample_IDs = varargin{indopt}; + if (! iscell (sample_IDs)) + msg = ["boxplot: 'Sample_IDs' input argument accepts only", ... + " a cell array as value"]; + error (msg); + endif + outlier_tags = 1; + + case "boxwidth" + box_width = varargin{indopt}; + ## Check for string input: "fixed" or "proportional" + if (! ischar (box_width) || ... + ! ismember (lower (box_width), {"fixed", "proportional"})) + msg = ["boxplot: 'BoxWidth' input argument accepts only", ... + " 'fixed' or 'proportional' as value"]; + error (msg); + endif + box_width = lower (box_width); + + case "widths" + widths = varargin{indopt}; + if (! isscalar (widths) || ! (isnumeric (widths) && isreal (widths))) + msg = ["boxplot: 'Widths' input argument accepts only", ... + " a real scalar value as value"]; + error (msg); + endif + + case "capwidths" + capwid = varargin{indopt}; + if (! isscalar (capwid) || ! (isnumeric (capwid) && isreal (capwid))) + msg = ["boxplot: 'CapWidths' input argument accepts only", ... + " a real scalar value as value"]; + error (msg); + endif + + case "boxstyle" + box_style = varargin{indopt}; + ## Check for string input: "outline" or "filled" + if (! ischar (box_style) || ... + ! ismember (lower (box_style), {"outline", "filled"})) + msg = ["boxplot: 'BoxStyle' input argument accepts only", ... + " 'outline' or 'filled' as value"]; + error (msg); + endif + box_style = lower (box_style); + + case "positions" + positions = varargin{indopt}; + if (! isvector (positions) || ! isnumeric (positions)) + msg = ["boxplot: 'Positions' input argument accepts only", ... + " a numeric vector as value"]; + error (msg); + endif + + case "labels" + labels = varargin{indopt}; + if (! iscellstr (labels)) + msg = ["boxplot: 'Labels' input argument accepts only", ... + " a cellstr array as value"]; + error (msg); + endif + + case "colors" + bcolor = varargin{indopt}; + if (! (ischar (bcolor) || ... + (isnumeric (bcolor) && size (bcolor, 2) == 3))) + msg = ["boxplot: 'Colors' input argument accepts only", ... + " a character (string) or Nx3 numeric array as value"]; + error (msg); + endif + + otherwise + ## Take two args and append them to plot_opts + plot_opts(1, end+1:end+2) = {dummy, varargin{indopt}}; + endswitch + endif + numarg -= 2; + indopt++; + endif + endwhile + + if (1 == length (symbol)) + symbol(2) = symbol(1); + endif + + if (1 == notched) + notched = 0.25; + endif + a = 1-notched; + + ## Figure out how many data sets we have + if (isempty (groups)) + if (iscell (data)) + nc = nug = length (data); + for ind_c = (1:nc) + lc(ind_c) = length (data{ind_c}); + endfor + else + if (isvector (data)) + data = data(:); + endif + nc = nug = columns (data); + lc = ones (1, nc) * rows (data); + endif + groups = (1:nc); + ## In case sample_IDs exists. check that it has same size as data + if (! isempty (sample_IDs) && length (sample_IDs) == 1) + for ind_c = (1:nc) + if (lc(ind_c) != length (sample_IDs)) + error ("boxplot: Sample_IDs must match the data"); + endif + endfor + elseif (! isempty (sample_IDs) && length (sample_IDs) == nc) + for ind_c = (1:nc) + if (lc(ind_c) != length (sample_IDs{ind_c})) + error ("boxplot: Sample_IDs must match the data"); + endif + endfor + elseif (! isempty (sample_IDs) && length (sample_IDs) != nc) + error ("boxplot: Sample_IDs must match the data"); + endif + ## Create labels according to number of datasets as ordered in data + ## in case they are not provided by the user as optional argument + if (isempty (labels)) + for i = 1:nc + column_label = num2str (groups(i)); + labels(i) = {column_label}; + endfor + endif + else + if (! isvector (data)) + error ("boxplot: with the formalism (data, group), both must be vectors"); + endif + ## If sample IDs given, check that their size matches the data + if (! isempty (sample_IDs)) + if (length (sample_IDs) != 1 || length (sample_IDs{1}) != length (data)) + error ("boxplot: Sample_IDs must match the data"); + endif + nug = unique (groups); + dummy_data = cell (1, length (nug)); + dummy_sIDs = cell (1, length (nug)); + ## Check if groups are parsed as a numeric vector + if (isnumeric (groups)) + for ind_c = (1:length (nug)) + dummy_data(ind_c) = data(groups == nug(ind_c)); + dummy_sIDs(ind_c) = {sample_IDs{1}(groups == nug(ind_c))}; + endfor + ## Create labels according to unique numeric groups in case + ## they are not provided by the user as optional argument + if (isempty (labels)) + for i = 1:nug + column_label = num2str (groups(i)); + labels(i) = {column_label}; + endfor + endif + ## Check if groups are parsed as a cell string vector + elseif iscellstr (groups) + for ind_c = (1:length (nug)) + dummy_data(ind_c) = data(ismember (group, nug(ind_c))); + dummy_sIDs(ind_c) = {sample_IDs{1}(ismember (group, nug(ind_c)))}; + endfor + ## Create labels according to unique cell string groups in case + ## they are not provided by the user as optional argument + if (isempty (labels)) + labels = nug; + endif + else + error ("boxplot: group argument must be numeric or cell string vector"); + endif + data = dummy_data; + groups = nug(:).'; + nc = length (nug); + sample_IDs = dummy_sIDs; + else + nug = unique (groups); + dummy_data = cell (1, length (nug)); + ## Check if groups are parsed as a numeric vector + if (isnumeric (groups)) + for ind_c = (1:length (nug)) + dummy_data(ind_c) = data(groups == nug(ind_c)); + endfor + ## Create labels according to unique numeric groups in case + ## they are not provided by the user as optional argument + if (isempty (labels)) + for i = 1:nug + column_label = num2str (groups(i)); + labels(i) = {column_label}; + endfor + endif + ## Check if groups are parsed as a cell string vector + elseif (iscellstr (groups)) + for ind_c = (1:length (nug)) + dummy_data(ind_c) = data(ismember (group, nug(ind_c))); + endfor + ## Create labels according to unique cell string groups in case + ## they are not provided by the user as optional argument + if (isempty (labels)) + labels = nug; + endif + else + error ("boxplot: group argument must be numeric vector or cell string"); + endif + data = dummy_data; + nc = length (nug); + if (iscell (groups)) + groups = [1:nc]; + else + groups = nug(:).'; + endif + endif + endif + + ## Compute statistics. + ## s will contain + ## 1,5 min and max + ## 2,3,4 1st, 2nd and 3rd quartile + ## 6,7 lower and upper confidence intervals for median + s = zeros (7, nc); + box = zeros (1, nc); + ## Arrange the boxes into desired positions (if requested, otherwise leave + ## default 1:nc) + if (! isempty (positions)) + groups = positions; + endif + ## Initialize whisker matrices to correct size and all necessary outlier + ## variables + whisker_x = ones (2, 1) * [groups, groups]; + whisker_y = zeros (2, 2 * nc); + outliers_x = []; + outliers_y = []; + outliers_idx = []; + outliers_IDs = {}; + outliers2_x = []; + outliers2_y = []; + outliers2_idx = []; + outliers2_IDs = {}; + + for indi = (1:nc) + ## Get the next data set from the array or cell array + if (iscell (data)) + col = data{indi}(:); + if (!isempty (sample_IDs)) + sIDs = sample_IDs{indi}; + else + sIDs = num2cell([1:length(col)]); + endif + else + col = data(:, indi); + sIDs = num2cell([1:length(col)]); + endif + ## Skip missing data (NaN, NA) and remove respective sample IDs. + ## Do this only on nonempty data + if (length (col) > 0) + remove_samples = find (col(isnan (col) | isna (col))); + if (length (remove_samples) > 0) + col(remove_samples) = []; + sIDs(remove_samples) = []; + endif + endif + ## Remember data length + nd = length (col); + box(indi) = nd; + if (nd > 1) + ## Min, max and quartiles + s(1:5, indi) = statistics (col)(1:5); + ## Confidence interval for the median + est = 1.57 * (s(4, indi) - s(2, indi)) / sqrt (nd); + s(6, indi) = max ([s(3, indi) - est, s(2, indi)]); + s(7, indi) = min ([s(3, indi) + est, s(4, indi)]); + ## Whiskers out to the last point within the desired inter-quartile range + IQR = maxwhisker * (s(4, indi) - s(2, indi)); + whisker_y(:, indi) = [min(col(col >= s(2, indi) - IQR)); s(2, indi)]; + whisker_y(:, nc+indi) = [max(col(col <= s(4, indi) + IQR)); s(4, indi)]; + ## Outliers beyond 1 and 2 inter-quartile ranges + outliers = col((col < s(2, indi) - IQR & col >= s(2, indi) - 2 * IQR) | ... + (col > s(4, indi) + IQR & col <= s(4, indi) + 2 * IQR)); + outliers2 = col(col < s(2, indi) - 2 * IQR | col > s(4, indi) + 2 * IQR); + ## Get outliers indices from this dataset + if (length (outliers) > 0) + for out_i = 1:length (outliers) + outliers_idx = [outliers_idx; (find (col == outliers(out_i)))]; + outliers_IDs = {outliers_IDs{:}, sIDs{(find (col == outliers(out_i)))}}; + endfor + endif + if (length (outliers2) > 0) + for out_i = 1:length (outliers2) + outliers2_idx = [outliers2_idx; find(col == outliers2(out_i))]; + outliers2_IDs = {outliers2_IDs{:}, sIDs{find(col == outliers2(out_i))}}; + endfor + endif + outliers_x = [outliers_x; (groups(indi) * ones (size (outliers)))]; + outliers_y = [outliers_y; outliers]; + outliers2_x = [outliers2_x; (groups(indi) * ones (size (outliers2)))]; + outliers2_y = [outliers2_y; outliers2]; + elseif (1 == nd) + ## All statistics collapse to the value of the point + s(:, indi) = col; + ## Single point data sets are plotted as outliers. + outliers_x = [outliers_x; groups(indi)]; + outliers_y = [outliers_y; col]; + ## Append the single point's index to keep the outliers' vector aligned + outliers_idx = [outliers_idx; 1]; + outliers_IDs = {outliers_IDs{:}, sIDs{:}}; + else + ## No statistics if no points + s(:, indi) = NaN; + endif + endfor + + ## Note which boxes don't have enough stats + chop = find (box <= 1); + + ## Replicate widths (if scalar or shorter vector) to match the number of boxes + widths = widths(repmat (1:length (widths), 1, nc)); + ## Truncate just in case :) + widths([nc+1:end]) = []; + ## Draw a box around the quartiles, with box width being fixed or proportional + ## to the number of items in the box. + if (strcmpi (box_width, "proportional")) + box = box .* (widths ./ max (box)); + else + box = box .* (widths ./ box); + endif + ## Draw notches if desired. + quartile_x = ones (11, 1) * groups + ... + [-a; -1; -1; 1 ; 1; a; 1; 1; -1; -1; -a] * box; + quartile_y = s([3, 7, 4, 4, 7, 3, 6, 2, 2, 6, 3], :); + + ## Draw a line through the median + median_x = ones (2, 1) * groups + [-a; +a] * box; + median_y = s([3, 3], :); + + ## Chop all boxes which don't have enough stats + quartile_x(:, chop) = []; + quartile_y(:, chop) = []; + whisker_x(:, [chop, chop + nc]) = []; + whisker_y(:, [chop, chop + nc]) = []; + median_x(:, chop) = []; + median_y(:, chop) = []; + box(chop) = []; + + ## Add caps to the remaining whiskers + cap_x = whisker_x; + if (strcmpi (box_width, "proportional")) + cap_x(1, :) -= repmat (((capwid * box .* (widths ./ max (box))) / 8), 1, 2); + cap_x(2, :) += repmat (((capwid * box .* (widths ./ max (box))) / 8), 1, 2); + else + cap_x(1, :) -= repmat ((capwid * widths / 8), 1, 2); + cap_x(2, :) += repmat ((capwid * widths / 8), 1, 2); + endif + cap_y = whisker_y([1, 1], :); + + ## Calculate coordinates for outlier tags + outliers_tags_x = outliers_x + 0.08; + outliers_tags_y = outliers_y; + outliers2_tags_x = outliers2_x + 0.08; + outliers2_tags_y = outliers2_y; + + ## Do the plot + if (orientation) + ## Define outlier_tags' vertical alignment + outlier_tags_alignment = {"horizontalalignment", "left"}; + if (box_style) + f = fillbox (quartile_x, quartile_y, bcolor); + endif + h = plot (quartile_x, quartile_y, "b;;", + whisker_x, whisker_y, "b;;", + cap_x, cap_y, "b;;", + median_x, median_y, "r;;", + outliers_x, outliers_y, [symbol(1), "r;;"], + outliers2_x, outliers2_y, [symbol(2), "r;;"], plot_opts{:}); + ## Print outlier tags + if (outlier_tags == 1 && outliers_x > 0) + t1 = plot_tags (outliers_tags_x, outliers_tags_y, outliers_idx, + outliers_IDs, sample_IDs, outlier_tags_alignment); + endif + if (outlier_tags == 1 && outliers2_x > 0) + t2 = plot_tags (outliers2_tags_x, outliers2_tags_y, outliers2_idx, + outliers2_IDs, sample_IDs, outlier_tags_alignment); + endif + else + ## Define outlier_tags' horizontal alignment + outlier_tags_alignment = {"horizontalalignment", "left", "rotation", 90}; + if (box_style) + f = fillbox (quartile_y, quartile_x, bcolor); + endif + h = plot (quartile_y, quartile_x, "b;;", + whisker_y, whisker_x, "b;;", + cap_y, cap_x, "b;;", + median_y, median_x, "r;;", + outliers_y, outliers_x, [symbol(1), "r;;"], + outliers2_y, outliers2_x, [symbol(2), "r;;"], plot_opts{:}); + ## Print outlier tags + if (outlier_tags == 1 && outliers_x > 0) + t1 = plot_tags (outliers_tags_y, outliers_tags_x, outliers_idx, + outliers_IDs, sample_IDs, outlier_tags_alignment); + endif + if (outlier_tags == 1 && outliers2_x > 0) + t2 = plot_tags (outliers2_tags_y, outliers2_tags_x, outliers2_idx, + outliers2_IDs, sample_IDs, outlier_tags_alignment); + endif + endif + + ## Distribute handles for box outlines and box fill (if any) + nq = 1 : size (quartile_x, 2); + hs.box = h(nq); + if (box_style) + nf = 1 : length (groups); + hs.box_fill = f(nf); + else + hs.box_fill = []; + endif + + ## Distribute handles for whiskers (including caps) and median lines + nw = nq(end) + [1 : 2 * (size (whisker_x, 2))]; + hs.whisker = h(nw); + nm = nw(end)+ [1 : (size (median_x, 2))]; + hs.median = h(nm); + ## Distribute handles for outliers (if any) and their respective tags + ## (if applicable) + no = nm; + if (! isempty (outliers_y)) + no = nm(end) + [1 : size(outliers_y, 2)]; + hs.outliers = h(no); + if (outlier_tags == 1) + nt = 1 : length (outliers_tags_y); + hs.out_tags = t1(nt); + else + hs.out_tags = []; + endif + else + hs.outliers = []; + hs.out_tags = []; + endif + ## Distribute handles for extreme outliers (if any) and their respective tags + ## (if applicable) + if (! isempty (outliers2_y)) + no2 = no(end) + [1 : size(outliers2_y, 2)]; + hs.outliers2 = h(no2); + if (outlier_tags == 1) + nt2 = 1 : length (outliers2_tags_y); + hs.out_tags2 = t2(nt2); + else + hs.out_tags2 = []; + endif + else + hs.outliers2 = []; + hs.out_tags2 = []; + end + + ## Redraw the median lines to avoid colour overlapping in case of 'filled' + ## BoxStyle + if (box_style) + set (hs.median, "color", "r"); + endif + + ## Print labels according to orientation and return handle + if (orientation) + set (gca(), "xtick", groups, "xticklabel", labels); + hs.labels = get (gcf, "currentaxes"); + else + set (gca(), "ytick", groups, "yticklabel", labels); + hs.labels = get (gcf, "currentaxes"); + endif + hold off; + + ## Return output arguments if desired + if (nargout >= 1) + s_o = s; + endif + if (nargout == 2) + hs_o = hs; + endif + +endfunction + + +function htags = plot_tags (out_tags_x, out_tags_y, out_idx, out_IDs, ... + sample_IDs, opt) + + for i=1 : length (out_tags_x) + if (! isempty (sample_IDs)) + htags(i) = text (out_tags_x(i), out_tags_y(i), out_IDs{i}, opt{:}); + else + htags(i) = text (out_tags_x(i), out_tags_y(i), num2str (out_idx(i)), ... + opt{:}); + endif + endfor + +endfunction + + +function f = fillbox (quartile_y, quartile_x, bcolor) + + f = []; + for icol = 1 : columns (quartile_x) + if (ischar (bcolor)) + f = [ f; fill(quartile_y(:, icol), quartile_x(:, icol), ... + bcolor(mod (icol-1, numel (bcolor))+1)) ]; + else + f = [ f; fill(quartile_y(:, icol), quartile_x(:, icol), ... + bcolor(mod (icol-1, size (bcolor, 1))+1, :)) ]; + endif + hold on; + endfor + +endfunction + + +%!demo +%! axis ([0, 3]); +%! boxplot ({(randn(10, 1) * 5 + 140), (randn (13, 1) * 8 + 135)}); +%! set (gca (), "xtick", [1 2], "xticklabel", {"girls", "boys"}) +%! title ("Grade 3 heights"); + +%!demo +%! data = [(randn (10, 1) * 5 + 140); (randn (25, 1) * 8 + 135); ... +%! (randn (20, 1) * 6 + 165)]; +%! groups = [(ones (10, 1)); (ones (25, 1) * 2); (ones (20, 1) * 3)]; +%! labels = {"Team A", "Team B", "Team C"}; +%! pos = [2, 1, 3]; +%! boxplot (data, groups, "Notch", "on", "Labels", labels, "Positions", pos, ... +%! "OutlierTags", "on", "BoxStyle", "filled"); +%! title ("Example of Group splitting with paired vectors"); + +%!demo +%! boxplot (randn (100, 9), "notch", "on", "boxstyle", "filled", ... +%! "colors", "ygcwkmb", "whisker", 1.2); +%! title ("Example of different colors specified with characters"); + +%!demo +%! colors = [0.7 0.7 0.7; ... +%! 0.0 0.4 0.9; ... +%! 0.7 0.4 0.3; ... +%! 0.7 0.1 0.7; ... +%! 0.8 0.7 0.4; ... +%! 0.1 0.8 0.5; ... +%! 0.9 0.9 0.2]; +%! boxplot (randn (100, 13), "notch", "on", "boxstyle", "filled", ... +%! "colors", colors, "whisker", 1.3, "boxwidth", "proportional"); +%! title ("Example of different colors specified as RGB values"); + +%% Input data validation +%!error boxplot ("a") +%!error boxplot ({[1 2 3], "a"}) +%!error boxplot ([1 2 3], 1, {2, 3}) +%!error boxplot ([1 2 3], {"a", "b"}) +%!error <'Notch' input argument accepts> boxplot ([1:10], "notch", "any") +%!error boxplot ([1:10], "notch", i) +%!error boxplot ([1:10], "notch", {}) +%!error boxplot (1, "symbol", 1) +%!error <'Orientation' input argument accepts only> boxplot (1, "orientation", "diagonal") +%!error boxplot (1, "orientation", {}) +%!error <'Whisker' input argument accepts only> boxplot (1, "whisker", "a") +%!error <'Whisker' input argument accepts only> boxplot (1, "whisker", [1 3]) +%!error <'OutlierTags' input argument accepts only> boxplot (3, "OutlierTags", "maybe") +%!error boxplot (3, "OutlierTags", {}) +%!error <'Sample_IDs' input argument accepts only> boxplot (1, "sample_IDs", 1) +%!error <'BoxWidth' input argument accepts only> boxplot (1, "boxwidth", 2) +%!error <'BoxWidth' input argument accepts only> boxplot (1, "boxwidth", "anything") +%!error <'Widths' input argument accepts only> boxplot (5, "widths", "a") +%!error <'Widths' input argument accepts only> boxplot (5, "widths", [1:4]) +%!error <'Widths' input argument accepts only> boxplot (5, "widths", []) +%!error <'CapWidths' input argument accepts only> boxplot (5, "capwidths", "a") +%!error <'CapWidths' input argument accepts only> boxplot (5, "capwidths", [1:4]) +%!error <'CapWidths' input argument accepts only> boxplot (5, "capwidths", []) +%!error <'BoxStyle' input argument accepts only> boxplot (1, "Boxstyle", 1) +%!error <'BoxStyle' input argument accepts only> boxplot (1, "Boxstyle", "garbage") +%!error <'Positions' input argument accepts only> boxplot (1, "positions", "aa") +%!error <'Labels' input argument accepts only> boxplot (3, "labels", [1 5]) +%!error <'Colors' input argument accepts only> boxplot (1, "colors", {}) +%!error <'Colors' input argument accepts only> boxplot (2, "colors", [1 2 3 4]) +%!error boxplot (randn (10, 3), 'Sample_IDs', {"a", "b"}) +%!error boxplot (rand (3, 3), [1 2]) + +%!test +%! h = figure ("visible", "off"); +%! [a, b] = boxplot (rand (10, 3)); +%! close (h); +%! assert (size (a), [7, 3]); +%! assert (numel (b.box), 3); +%! assert (numel (b.whisker), 12); +%! assert (numel (b.median), 3); + +%!test +%! h = figure ("visible", "off"); +%! [~, b] = boxplot (rand (10, 3), "BoxStyle", "filled", "colors", "ybc"); +%! close (h); +%! assert (numel (b.box_fill), 3); + diff --git a/inst/burrcdf.m b/inst/burrcdf.m new file mode 100644 index 0000000..0143ad0 --- /dev/null +++ b/inst/burrcdf.m @@ -0,0 +1,98 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} burrcdf (@var{x}, @var{c}, @var{k}) +## For each element of @var{x}, compute the cumulative distribution function +## (CDF) at @var{x} of the Burr distribution with scale parameter @var{alpha} +## and shape parameters @var{c} and @var{k}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: CDF of the Burr distribution + +function cdf = burrcdf (x, alpha, c, k) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (alpha) || ! isscalar (c) || ! isscalar (k) ) + [retval, x, alpha, c, k] = common_size (x, alpha, c, k); + if (retval > 0) + error ("burrcdf: X, ALPHA, C AND K must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex(alpha) || iscomplex (c) || iscomplex (k)) + error ("burrcdf: X, ALPHA, C AND K must not be complex"); + endif + + if (isa (x, "single") || isa (alpha, "single") || isa (c, "single") ... + || isa (k, "single")) + cdf = zeros (size (x), "single"); + else + cdf = zeros (size (x)); + endif + + j = isnan (x) | ! (alpha > 0) | ! (c > 0) | ! (k > 0); + cdf(j) = NaN; + + j = (x > 0) & (0 < alpha) & (alpha < Inf) & (0 < c) & (c < Inf) ... + & (0 < k) & (k < Inf); + if (isscalar (alpha) && isscalar(c) && isscalar(k)) + cdf(j) = 1 - (1 + (x(j) / alpha).^c).^(-k); + else + cdf(j) = 1 - (1 + (x(j) ./ alpha(j)).^c(j)).^(-k(j)); + endif + +endfunction + + +%!shared x,y +%! x = [-1, 0, 1, 2, Inf]; +%! y = [0, 0, 1/2, 2/3, 1]; +%!assert (burrcdf (x, ones(1,5), ones (1,5), ones (1,5)), y, eps) +%!assert (burrcdf (x, 1, 1, 1), y, eps) +%!assert (burrcdf (x, [1, 1, NaN, 1, 1], 1, 1), [y(1:2), NaN, y(4:5)], eps) +%!assert (burrcdf (x, 1, [1, 1, NaN, 1, 1], 1), [y(1:2), NaN, y(4:5)], eps) +%!assert (burrcdf (x, 1, 1, [1, 1, NaN, 1, 1]), [y(1:2), NaN, y(4:5)], eps) +%!assert (burrcdf ([x, NaN], 1, 1, 1), [y, NaN], eps) + +## Test class of input preserved +%!assert (burrcdf (single ([x, NaN]), 1, 1, 1), single ([y, NaN]), eps('single')) +%!assert (burrcdf ([x, NaN], single (1), 1, 1), single ([y, NaN]), eps('single')) +%!assert (burrcdf ([x, NaN], 1, single (1), 1), single ([y, NaN]), eps('single')) +%!assert (burrcdf ([x, NaN], 1, 1, single (1)), single ([y, NaN]), eps('single')) + +## Test input validation +%!error burrcdf () +%!error burrcdf (1) +%!error burrcdf (1,2) +%!error burrcdf (1,2,3) +%!error burrcdf (1,2,3,4,5) +%!error burrcdf (ones (3), ones (2), ones(2), ones(2)) +%!error burrcdf (ones (2), ones (3), ones(2), ones(2)) +%!error burrcdf (ones (2), ones (2), ones(3), ones(2)) +%!error burrcdf (ones (2), ones (2), ones(2), ones(3)) +%!error burrcdf (i, 2, 2, 2) +%!error burrcdf (2, i, 2, 2) +%!error burrcdf (2, 2, i, 2) +%!error burrcdf (2, 2, 2, i) + diff --git a/inst/burrinv.m b/inst/burrinv.m new file mode 100644 index 0000000..af0f778 --- /dev/null +++ b/inst/burrinv.m @@ -0,0 +1,102 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} burrinv (@var{x}, @var{alpha}, @var{c}, @var{k}) +## For each element of @var{x}, compute the quantile (the inverse of the CDF) +## at @var{x} of the Burr distribution with scale parameter @var{alpha} and +## shape parameters @var{c} and @var{k}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Quantile function of the Burr distribution + +function inv = burrinv (x, alpha, c, k) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (alpha) || ! isscalar (c) || ! isscalar (k) ) + [retval, x, alpha, c, k] = common_size (x, alpha, c, k); + if (retval > 0) + error ("burrinv: X, ALPHA, C AND K must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex(alpha) || iscomplex (c) || iscomplex (k)) + error ("burrinv: X, ALPHA, C AND K must not be complex"); + endif + + if (isa (x, "single") || isa (alpha, "single") || isa (c, "single") ... + || isa (k, "single")) + inv = zeros (size (x), "single"); + else + inv = zeros (size (x)); + endif + + j = isnan (x) | (x < 0) | (x > 1) | ! (alpha > 0) | ! (c > 0) | ! (k > 0); + inv(j) = NaN; + + j = (x == 1) & (0 < alpha) & (alpha < Inf) & (0 < c) & (c < Inf) ... + & (0 < k) & (k < Inf); + inv(j) = Inf; + + j = (0 < x) & (x < 1) & (0 < alpha) & (alpha < Inf) & (0 < c) & (c < Inf) ... + & (0 < k) & (k < Inf); + if (isscalar (alpha) && isscalar(c) && isscalar(k)) + inv(j) = ((1 - x(j) / alpha).^(-1 / k) - 1).^(1 / c) ; + else + inv(j) = ((1 - x(j) ./ alpha(j)).^(-1 ./ k(j)) - 1).^(1 ./ c(j)) ; + endif + +endfunction + + +%!shared x,y +%! x = [-Inf, -1, 0, 1/2, 1, 2, Inf]; +%! y = [NaN, NaN, 0, 1 , Inf, NaN, NaN]; +%!assert (burrinv (x, ones (1,7), ones (1,7), ones(1,7)), y, eps) +%!assert (burrinv (x, 1, 1, 1), y, eps) +%!assert (burrinv (x, [1, 1, 1, NaN, 1, 1, 1], 1, 1), [y(1:3), NaN, y(5:7)], eps) +%!assert (burrinv (x, 1, [1, 1, 1, NaN, 1, 1, 1], 1), [y(1:3), NaN, y(5:7)], eps) +%!assert (burrinv (x, 1, 1, [1, 1, 1, NaN, 1, 1, 1]), [y(1:3), NaN, y(5:7)], eps) +%!assert (burrinv ([x, NaN], 1, 1, 1), [y, NaN], eps) + +## Test class of input preserved +%!assert (burrinv (single ([x, NaN]), 1, 1, 1), single ([y, NaN]), eps('single')) +%!assert (burrinv ([x, NaN], single (1), 1, 1), single ([y, NaN]), eps('single')) +%!assert (burrinv ([x, NaN], 1, single (1), 1), single ([y, NaN]), eps('single')) +%!assert (burrinv ([x, NaN], 1, 1, single (1)), single ([y, NaN]), eps('single')) + +## Test input validation +%!error burrinv () +%!error burrinv (1) +%!error burrinv (1,2) +%!error burrinv (1,2,3) +%!error burrinv (1,2,3,4,5) +%!error burrinv (ones (3), ones (2), ones(2), ones(2)) +%!error burrinv (ones (2), ones (3), ones(2), ones(2)) +%!error burrinv (ones (2), ones (2), ones(3), ones(2)) +%!error burrinv (ones (2), ones (2), ones(2), ones(3)) +%!error burrinv (i, 2, 2, 2) +%!error burrinv (2, i, 2, 2) +%!error burrinv (2, 2, i, 2) +%!error burrinv (2, 2, 2, i) + diff --git a/inst/burrpdf.m b/inst/burrpdf.m new file mode 100644 index 0000000..99556e1 --- /dev/null +++ b/inst/burrpdf.m @@ -0,0 +1,100 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} burrpdf (@var{x}, @var{alpha}, @var{c}, @var{k}) +## For each element of @var{x}, compute the probability density function (PDF) +## at @var{x} of the Burr distribution with scale parameter @var{alpha} and +## shape parameters @var{c} and @var{k}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: PDF of the Burr distribution + +function pdf = burrpdf (x, alpha, c, k) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (alpha) || ! isscalar (c) || ! isscalar (k) ) + [retval, x, alpha, c, k] = common_size (x, alpha, c, k); + if (retval > 0) + error ("burrpdf: X, ALPHA, C AND K must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex(alpha) || iscomplex (c) || iscomplex (k)) + error ("burrpdf: X, ALPHA, C AND K must not be complex"); + endif + + if (isa (x, "single") || isa (alpha, "single") ... + || isa (c, "single") || isa (k, "single")) + pdf = zeros (size (x), "single"); + else + pdf = zeros (size (x)); + endif + + j = isnan (x) | ! (alpha > 0) | ! (c > 0) | ! (k > 0); + pdf(j) = NaN; + + j = (x > 0) & (0 < alpha) & (alpha < Inf) & (0 < c) & (c < Inf) ... + & (0 < k) & (k < Inf); + if (isscalar (alpha) && isscalar (c) && isscalar(k)) + pdf(j) = (c * k / alpha) .* (x(j) / alpha).^(c-1) ./ ... + (1 + (x(j) / alpha).^c).^(k + 1); + else + pdf(j) = (c(j) .* k(j) ./ alpha(j) ).* x(j).^(c(j)-1) ./ ... + (1 + (x(j) ./ alpha(j) ).^c(j) ).^(k(j) + 1); + endif + +endfunction + + +%!shared x,y +%! x = [-1, 0, 1, 2, Inf]; +%! y = [0, 0, 1/4, 1/9, 0]; +%!assert (burrpdf (x, ones(1,5), ones (1,5), ones (1,5)), y) +%!assert (burrpdf (x, 1, 1, 1), y) +%!assert (burrpdf (x, [1, 1, NaN, 1, 1], 1, 1), [y(1:2), NaN, y(4:5)]) +%!assert (burrpdf (x, 1, [1, 1, NaN, 1, 1], 1), [y(1:2), NaN, y(4:5)]) +%!assert (burrpdf (x, 1, 1, [1, 1, NaN, 1, 1]), [y(1:2), NaN, y(4:5)]) +%!assert (burrpdf ([x, NaN], 1, 1, 1), [y, NaN]) + +## Test class of input preserved +%!assert (burrpdf (single ([x, NaN]), 1, 1, 1), single ([y, NaN])) +%!assert (burrpdf ([x, NaN], single (1), 1, 1), single ([y, NaN])) +%!assert (burrpdf ([x, NaN], 1, single (1), 1), single ([y, NaN])) +%!assert (burrpdf ([x, NaN], 1, 1, single (1)), single ([y, NaN])) + +## Test input validation +%!error burrpdf () +%!error burrpdf (1) +%!error burrpdf (1,2) +%!error burrpdf (1,2,3) +%!error burrpdf (1,2,3,4,5) +%!error burrpdf (ones (3), ones (2), ones(2), ones(2)) +%!error burrpdf (ones (2), ones (3), ones(2), ones(2)) +%!error burrpdf (ones (2), ones (2), ones(3), ones(2)) +%!error burrpdf (ones (2), ones (2), ones(2), ones(3)) +%!error burrpdf (i, 2, 2, 2) +%!error burrpdf (2, i, 2, 2) +%!error burrpdf (2, 2, i, 2) +%!error burrpdf (2, 2, 2, i) + diff --git a/inst/burrrnd.m b/inst/burrrnd.m new file mode 100644 index 0000000..a19bef9 --- /dev/null +++ b/inst/burrrnd.m @@ -0,0 +1,141 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} burrrnd (@var{alpha}, @var{c}, @var{k}) +## @deftypefnx {} {} burrrnd (@var{alpha}, @var{c}, @var{k}, @var{r}) +## @deftypefnx {} {} burrrnd (@var{alpha}, @var{c}, @var{k}, @var{r}, @var{c}, @dots{}) +## @deftypefnx {} {} burrrnd (@var{alpha}, @var{c}, @var{k}, [@var{sz}]) +## Return a matrix of random samples from the generalized Pareto distribution +## with scale parameter @var{alpha} and shape parameters @var{c} and @var{k}. +## +## When called with a single size argument, return a square matrix with +## the dimension specified. When called with more than one scalar argument the +## first two arguments are taken as the number of rows and columns and any +## further arguments specify additional matrix dimensions. The size may also +## be specified with a vector of dimensions @var{sz}. +## +## If no size arguments are given then the result matrix is the common size of +## @var{alpha}, @var{c} and @var{k}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Random deviates from the generalized extreme value (GEV) distribution + +function rnd = burrrnd (alpha, c, k, varargin) + + if (nargin < 3) + print_usage (); + endif + + if (! isscalar (alpha) || ! isscalar (c) || ! isscalar (k)) + [retval, alpha, c, k] = common_size (alpha, c, k); + if (retval > 0) + error ("burrrnd: ALPHA, C and K must be of common size or scalars"); + endif + endif + + if (iscomplex (alpha) || iscomplex (c) || iscomplex (k)) + error ("burrrnd: ALPHA, C and K must not be complex"); + endif + + if (nargin == 3) + sz = size (alpha); + elseif (nargin == 4) + if (isscalar (varargin{1}) && varargin{1} >= 0) + sz = [varargin{1}, varargin{1}]; + elseif (isrow (varargin{1}) && all (varargin{1} >= 0)) + sz = varargin{1}; + else + error ("burrrnd: dimension vector must be row vector of non-negative integers"); + endif + elseif (nargin > 4) + if (any (cellfun (@(x) (! isscalar (x) || x < 0), varargin))) + error ("burrrnd: dimensions must be non-negative integers"); + endif + sz = [varargin{:}]; + endif + + if (! isscalar (alpha) && ! isequal (size (c), sz) && ! isequal (size (k), sz)) + error ("burrrnd: ALPHA, C and K must be scalar or of size SZ"); + endif + + if (isa (alpha, "single") || isa (c, "single") || isa (k, "single")) + cls = "single"; + else + cls = "double"; + endif + + if (isscalar (alpha) && isscalar (c) && isscalar(k)) + if ((0 < alpha) && (alpha < Inf) && (0 < c) && (c < Inf) ... + && (0 < k) && (k < Inf)) + rnd = rand (sz, cls); + rnd(:) = ((1 - rnd(:) / alpha).^(-1 / k) - 1).^(1 / c); + else + rnd = NaN (sz, cls); + endif + else + rnd = NaN (sz, cls); + + j = (0 < alpha) && (alpha < Inf) && (0 < c) && (c < Inf) ... + && (0 < k) && (k < Inf); + rnd(k) = rand(sum(j(:)),1); + rnd(k) = ((1 - rnd(j) / alpha(j)).^(-1 ./ k(j)) - 1).^(1 ./ c(j)); + endif + +endfunction + + +%!assert (size (burrrnd (1, 1, 1)), [1 1]) +%!assert (size (burrrnd (ones (2,1), 1, 1)), [2, 1]) +%!assert (size (burrrnd (ones (2,2), 1, 1)), [2, 2]) +%!assert (size (burrrnd (1, ones (2,1), 1)), [2, 1]) +%!assert (size (burrrnd (1, ones (2,2), 1)), [2, 2]) +%!assert (size (burrrnd (1, 1, ones (2,1))), [2, 1]) +%!assert (size (burrrnd (1, 1, ones (2,2))), [2, 2]) +%!assert (size (burrrnd (1, 1, 1, 3)), [3, 3]) +%!assert (size (burrrnd (1, 1, 1, [4 1])), [4, 1]) +%!assert (size (burrrnd (1, 1, 1, 4, 1)), [4, 1]) + +## Test class of input preserved +%!assert (class (burrrnd (1,1,1)), "double") +%!assert (class (burrrnd (single (1),1,1)), "single") +%!assert (class (burrrnd (single ([1 1]),1,1)), "single") +%!assert (class (burrrnd (1,single (1),1)), "single") +%!assert (class (burrrnd (1,single ([1 1]),1)), "single") +%!assert (class (burrrnd (1,1,single (1))), "single") +%!assert (class (burrrnd (1,1,single ([1 1]))), "single") + +## Test input validation +%!error burrrnd () +%!error burrrnd (1) +%!error burrrnd (1,2) +%!error burrrnd (ones (3), ones (2), ones (2), 2) +%!error burrrnd (ones (2), ones (3), ones (2), 2) +%!error burrrnd (ones (2), ones (2), ones (3), 2) +%!error burrrnd (i, 2, 2) +%!error burrrnd (2, i, 2) +%!error burrrnd (2, 2, i) +%!error burrrnd (4,2,2, -1) +%!error burrrnd (4,2,2, ones (2)) +%!error burrrnd (4,2,2, [2 -1 2]) +%!error burrrnd (4*ones (2),2,2, 3) +%!error burrrnd (4*ones (2),2,2, [3, 2]) +%!error burrrnd (4*ones (2),2,2, 3, 2) + diff --git a/inst/canoncorr.m b/inst/canoncorr.m new file mode 100644 index 0000000..f4fb3d0 --- /dev/null +++ b/inst/canoncorr.m @@ -0,0 +1,94 @@ +function [A,B,r,U,V,stats] = canoncorr (X,Y) + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{A} @var{B} @var{r} @var{U} @var{V}] =} canoncorr (@var{X}, @var{Y}) +## Canonical correlation analysis +## +## Given @var{X} (size @var{k}*@var{m}) and @var{Y} (@var{k}*@var{n}), returns projection matrices of canonical coefficients @var{A} (size @var{m}*@var{d}, where @var{d} is the smallest of @var{m}, @var{n}, @var{d}) and @var{B} (size @var{m}*@var{d}); the canonical correlations @var{r} (1*@var{d}, arranged in decreasing order); the canonical variables @var{U}, @var{V} (both @var{k}*@var{d}, with orthonormal columns); and @var{stats}, a structure containing results from Bartlett's chi-square and Rao's F tests of significance. +## +## References: @* +## William H. Press (2011), Canonical Correlation Clarified by Singular Value Decomposition, http://numerical.recipes/whp/notes/CanonCorrBySVD.pdf @* +## Philip B. Ender, Multivariate Analysis: Canonical Correlation Analysis, http://www.philender.com/courses/multivariate/notes2/can1.html +## +## @seealso{princomp} +## @end deftypefn + +# Copyright (C) 2016-2019 by Nir Krakauer + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; If not, see . + + +k = size (X, 1); #should also be size (Y, 1) +m = size (X, 2); +n = size (Y, 2); +d = min ([k m n]); + +X = center (X); +Y = center (Y); + +[Qx Rx] = qr (X, 0); +[Qy Ry] = qr (Y, 0); + +[U S V] = svd (Qx' * Qy, "econ"); + +A = Rx \ U(:, 1:d); +B = Ry \ V(:, 1:d); + +#A, B are scaled to make the covariance matrices of the outputs U, V identity matrices +f = sqrt (k-1); +A .*= f; +B .*= f; + +if isargout(3) || isargout(6) + r = max(0, min(diag(S), 1))'; +endif +if isargout (4) + U = X * A; +endif +if isargout (5) + V = Y * B; +endif + +if isargout (6) + Wilks = fliplr(cumprod(fliplr((1 - r .^ 2)))); + chisq = - (k - 1 - (m + n + 1)/2) * log(Wilks); + df1 = (m - (1:d) + 1) .* (n - (1:d) + 1); + pChisq = 1 - chi2cdf (chisq, df1); + s = sqrt((df1.^2 - 4) ./ ((m - (1:d) + 1).^2 + (n - (1:d) + 1).^2 - 5)); + df2 = (k - 1 - (m + n + 1)/2) * s - df1/2 + 1; + ls = Wilks .^ (1 ./ s); + F = (1 ./ ls - 1) .* (df2 ./ df1); + pF = 1 - fcdf (F, df1, df2); + stats.Wilks = Wilks; + stats.df1 = df1; + stats.df2 = df2; + stats.F = F; + stats.pF = pF; + stats.chisq = chisq; + stats.pChisq = pChisq; +endif + +%!shared X,Y,A,B,r,U,V,k +%! k = 10; +%! X = [1:k; sin(1:k); cos(1:k)]'; Y = [tan(1:k); tanh((1:k)/k)]'; +%! [A,B,r,U,V,stats] = canoncorr (X,Y); +%!assert (A, [-0.329229 0.072908; 0.074870 1.389318; -0.069302 -0.024109], 1E-6); +%!assert (B, [-0.017086 -0.398402; -4.475049 -0.824538], 1E-6); +%!assert (r, [0.99590 0.26754], 1E-5); +%!assert (U, center(X) * A, 10*eps); +%!assert (V, center(Y) * B, 10*eps); +%!assert (cov(U), eye(size(U, 2)), 10*eps); +%!assert (cov(V), eye(size(V, 2)), 10*eps); +%! rand ("state", 1); [A,B,r] = canoncorr (rand(5, 10),rand(5, 20)); +%!assert (r, ones(1, 5), 10*eps); diff --git a/inst/carbig.mat b/inst/carbig.mat new file mode 100644 index 0000000..3d970e4 Binary files /dev/null and b/inst/carbig.mat differ diff --git a/inst/caseread.m b/inst/caseread.m new file mode 100644 index 0000000..cd45281 --- /dev/null +++ b/inst/caseread.m @@ -0,0 +1,62 @@ +## Copyright (C) 2008 Bill Denney +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{names} =} caseread (@var{filename}) +## Read case names from an ascii file. +## +## Essentially, this reads all lines from a file as text and returns +## them in a string matrix. +## @seealso{casewrite, tblread, tblwrite, csv2cell, cell2csv, fopen} +## @end deftypefn + +## Author: Bill Denney +## Description: Read strings from a file + +function names = caseread (f="") + + ## Check arguments + if nargin != 1 + print_usage (); + endif + if isempty (f) + ## FIXME: open a file dialog box in this case when a file dialog box + ## becomes available + error ("caseread: filename must be given") + endif + + [fid msg] = fopen (f, "rt"); + if fid < 0 || (! isempty (msg)) + error ("caseread: cannot open %s: %s", f, msg); + endif + + names = {}; + t = fgetl (fid); + while ischar (t) + names{end+1} = t; + t = fgetl (fid); + endwhile + if (fclose (fid) < 0) + error ("caseread: error closing f") + endif + names = strvcat (names); + +endfunction + +## Tests +%!shared n, casereadfile +%! n = ["a ";"bcd";"ef "]; +%! casereadfile = file_in_loadpath("test/caseread.dat"); +%!assert (caseread (casereadfile), n); diff --git a/inst/casewrite.m b/inst/casewrite.m new file mode 100644 index 0000000..200c039 --- /dev/null +++ b/inst/casewrite.m @@ -0,0 +1,70 @@ +## Copyright (C) 2008 Bill Denney +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} casewrite (@var{strmat}, @var{filename}) +## Write case names to an ascii file. +## +## Essentially, this writes all lines from @var{strmat} to +## @var{filename} (after deblanking them). +## @seealso{caseread, tblread, tblwrite, csv2cell, cell2csv, fopen} +## @end deftypefn + +## Author: Bill Denney +## Description: Write strings from a file + +function names = casewrite (s="", f="") + + ## Check arguments + if nargin != 2 + print_usage (); + endif + if isempty (f) + ## FIXME: open a file dialog box in this case when a file dialog box + ## becomes available + error ("casewrite: filename must be given") + endif + if isempty (s) + error ("casewrite: strmat must be given") + elseif ! ischar (s) + error ("casewrite: strmat must be a character matrix") + elseif ndims (s) != 2 + error ("casewrite: strmat must be two dimensional") + endif + + [fid msg] = fopen (f, "wt"); + if fid < 0 || (! isempty (msg)) + error ("casewrite: cannot open %s for writing: %s", f, msg); + endif + + for i = 1:rows (s) + status = fputs (fid, sprintf ("%s\n", deblank (s(i,:)))); + endfor + if (fclose (fid) < 0) + error ("casewrite: error closing f") + endif + +endfunction + +%!test +%! fname = [tempname() ".dat"]; +%! unwind_protect +%! s = ["a ";"bcd";"ef "]; +%! casewrite (s, fname) +%! names = caseread (fname); +%! unwind_protect_cleanup +%! unlink (fname); +%! end_unwind_protect +%! assert(names, s); diff --git a/inst/cdf.m b/inst/cdf.m new file mode 100644 index 0000000..8976151 --- /dev/null +++ b/inst/cdf.m @@ -0,0 +1,109 @@ +## Copyright (C) 2013 Pantxo Diribarne +## +## This program is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{retval} =} cdf (@var{name}, @var{X}, @dots{}) +## Return cumulative density function of @var{name} function for value +## @var{x}. +## This is a wrapper around various @var{name}cdf and @var{name}_cdf +## functions. See the individual functions help to learn the signification of +## the arguments after @var{x}. Supported functions and corresponding number of +## additional arguments are: +## +## @multitable @columnfractions 0.02 0.3 0.45 0.2 +## @headitem @tab function @tab alternative @tab args +## @item @tab "beta" @tab "beta" @tab 2 +## @item @tab "bino" @tab "binomial" @tab 2 +## @item @tab "cauchy" @tab @tab 2 +## @item @tab "chi2" @tab "chisquare" @tab 1 +## @item @tab "discrete" @tab @tab 2 +## @item @tab "exp" @tab "exponential" @tab 1 +## @item @tab "f" @tab @tab 2 +## @item @tab "gam" @tab "gamma" @tab 2 +## @item @tab "geo" @tab "geometric" @tab 1 +## @item @tab "gev" @tab "generalized extreme value" @tab 3 +## @item @tab "hyge" @tab "hypergeometric" @tab 3 +## @item @tab "kolmogorov_smirnov" @tab @tab 1 +## @item @tab "laplace" @tab @tab 0 +## @item @tab "logistic" @tab @tab 0 +## @item @tab "logn" @tab "lognormal" @tab 2 +## @item @tab "norm" @tab "normal" @tab 2 +## @item @tab "poiss" @tab "poisson" @tab 1 +## @item @tab "rayl" @tab "rayleigh" @tab 1 +## @item @tab "t" @tab @tab 1 +## @item @tab "unif" @tab "uniform" @tab 2 +## @item @tab "wbl" @tab "weibull" @tab 2 +## @end multitable +## +## @seealso{betacdf, binocdf, cauchy_cdf, chi2cdf, discrete_cdf, +## expcdf, fcdf, gamcdf, geocdf, gevcdf, hygecdf, +## kolmogorov_smirnov_cdf, laplace_cdf, logistic_cdf, logncdf, +## normcdf, poisscdf, raylcdf, tcdf, unifcdf, wblcdf} +## @end deftypefn + +function [retval] = cdf (varargin) + ## implemented functions + persistent allcdf = {{"beta", "beta"}, @betacdf, 2, ... + {"bino", "binomial"}, @binocdf, 2, ... + {"cauchy"}, @cauchy_cdf, 2, ... + {"chi2", "chisquare"}, @chi2cdf, 1, ... + {"discrete"}, @discrete_cdf, 2, ... + {"exp", "exponential"}, @expcdf, 1, ... + {"f"}, @fcdf, 2, ... + {"gam", "gamma"}, @gamcdf, 2, ... + {"geo", "geometric"}, @geocdf, 1, ... + {"gev", "generalized extreme value"}, @gevcdf, 3, ... + {"hyge", "hypergeometric"}, @hygecdf, 3, ... + {"kolmogorov_smirnov"}, @kolmogorov_smirnov_cdf, 1, ... + {"laplace"}, @laplace_cdf, 0, ... + {"logistic"}, @logistic_cdf, 0, ... # ML has 2 args here + {"logn", "lognormal"}, @logncdf, 2, ... + {"norm", "normal"}, @normcdf, 2, ... + {"poiss", "poisson"}, @poisscdf, 1, ... + {"rayl", "rayleigh"}, @raylcdf, 1, ... + {"t"}, @tcdf, 1, ... + {"unif", "uniform"}, @unifcdf, 2, ... + {"wbl", "weibull"}, @wblcdf, 2}; + + if (numel (varargin) < 2 || ! ischar (varargin{1})) + print_usage (); + endif + + name = varargin{1}; + x = varargin{2}; + + varargin(1:2) = []; + nargs = numel (varargin); + + cdfnames = allcdf(1:3:end); + cdfhdl = allcdf(2:3:end); + cdfargs = allcdf(3:3:end); + + idx = cellfun (@(x) any (strcmpi (name, x)), cdfnames); + + if (any (idx)) + if (nargs == cdfargs{idx}) + retval = feval (cdfhdl{idx}, x, varargin{:}); + else + error ("cdf: %s requires %d arguments", name, cdfargs{idx}) + endif + else + error ("cdf: %s not implemented", name); + endif + +endfunction + +%!test +%! assert(cdf ('norm', 1, 0, 1), normcdf (1, 0, 1)) \ No newline at end of file diff --git a/inst/chi2stat.m b/inst/chi2stat.m new file mode 100644 index 0000000..2648dfc --- /dev/null +++ b/inst/chi2stat.m @@ -0,0 +1,92 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} chi2stat (@var{n}) +## Compute mean and variance of the chi-square distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{n} is the parameter of the chi-square distribution. The elements +## of @var{n} must be positive +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the chi-square distribution +## +## @item +## @var{v} is the variance of the chi-square distribution +## @end itemize +## +## @subheading Example +## +## @example +## @group +## n = 1:6; +## [m, v] = chi2stat (n) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the chi-square distribution + +function [m, v] = chi2stat (n) + + # Check arguments + if (nargin != 1) + print_usage (); + endif + + if (! isempty (n) && ! ismatrix (n)) + error ("chi2stat: n must be a numeric matrix"); + endif + + # Calculate moments + m = n; + v = 2 .* n; + + # Continue argument check + k = find (! (n > 0) | ! (n < Inf)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! n = 1:6; +%! [m, v] = chi2stat (n); +%! assert (m, n); +%! assert (v, [2, 4, 6, 8, 10, 12], 0.001); diff --git a/inst/cl_multinom.m b/inst/cl_multinom.m new file mode 100644 index 0000000..bf55c56 --- /dev/null +++ b/inst/cl_multinom.m @@ -0,0 +1,124 @@ +## Copyright (C) 2009 Levente Torok +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## +## @deftypefn {Function File} {@var{CL} =} cl_multinom (@var{x}, @var{N}, @var{b}, @var{calculation_type} ) - Confidence level of multinomial portions +## Returns confidence level of multinomial parameters estimated @math{ p = x / sum(x) } with predefined confidence interval @var{b}. +## Finite population is also considered. +## +## This function calculates the level of confidence at which the samples represent the true distribution +## given that there is a predefined tolerance (confidence interval). +## This is the upside down case of the typical excercises at which we want to get the confidence interval +## given the confidence level (and the estimated parameters of the underlying distribution). +## But once we accept (lets say at elections) that we have a standard predefined +## maximal acceptable error rate (e.g. @var{b}=0.02 ) in the estimation and we just want to know that how sure we +## can be that the measured proportions are the same as in the +## entire population (ie. the expected value and mean of the samples are roghly the same) we need to use this function. +## +## @subheading Arguments +## @itemize @bullet +## @item @var{x} : int vector : sample frequencies bins +## @item @var{N} : int : Population size that was sampled by x. If N 4) + print_usage; + elseif (!ischar (calculation_type)) + error ("Argument calculation_type must be a string"); + endif + + k = rows(x); + nn = sum(x); + p = x / nn; + + if (isscalar( b )) + if (b==0) b=0.02; endif + b = ones( rows(x), 1 ) * b; + + if (b<0) b=1 ./ max( x, 1 ); endif + endif + bb = b .* b; + + if (N==nn) + CL = 1; + return; + endif + + if (N +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{T} =} cluster (@var{Z},'Cutoff', @var{C}) +## @deftypefnx {Function File} @ +## {@var{T} =} cluster (@var{Z}, 'Cutoff', @var{C}, 'Depth', @var{D}) +## @deftypefnx {Function File} @ +## {@var{T} =} cluster (@var{Z}, 'Cutoff', @var{C}, 'Criterion', @var{criterion}) +## @deftypefnx {Function File} @ +## {@var{T} =} cluster (@var{Z}, 'MaxClust', @var{N}) +## +## Define clusters from an agglomerative hierarchical cluster tree. +## +## Given a hierarchical cluster tree @var{Z} generated by the @code{linkage} +## function, @code{cluster} defines clusters, using a threshold value @var{C} to +## identify new clusters ('Cutoff') or according to a maximum number of desired +## clusters @var{N} ('MaxClust'). +## +## @var{criterion} is used to choose the criterion for defining clusters, which +## can be either "inconsistent" (default) or "distance". When using +## "inconsistent", @code{cluster} compares the threshold value @var{C} to the +## inconsistency coefficient of each link; when using "distance", @code{cluster} +## compares the threshold value @var{C} to the height of each link. +## @var{D} is the depth used to evaluate the inconsistency coefficient, its +## default value is 2. +## +## @code{cluster} uses "distance" as a criterion for defining new clusters when +## it is used the 'MaxClust' method. +## +## @end deftypefn +## +## @seealso{clusterdata,dendrogram,inconsistent,kmeans,linkage,pdist} + +## Author: Stefano Guidoni + +function T = cluster (Z, opt, varargin) + switch (lower (opt)) + ## check the input + case "cutoff" + if (nargin < 3) + print_usage (); + else + C = varargin{1}; + D = 2; + criterion = "inconsistent"; + if (nargin > 3) + pair_index = 2; + while (pair_index < (nargin - 2)) + switch (lower (varargin{pair_index})) + case "depth" + D = varargin{pair_index + 1}; + case "criterion" + criterion = varargin{pair_index + 1}; + otherwise + error ("cluster: unknown property %s", varargin{pair_index}); + endswitch + pair_index += 2; + endwhile + endif + endif + if ((! (isscalar (C) || isvector (C))) || (C < 0)) + error ... + (["cluster: C must be a positive scalar or a vector of positive"... + "numbers"]); + endif + + case "maxclust" + if (nargin != 3) + print_usage (); + else + N = varargin{1}; + C = []; + endif + if ((! (isscalar (N) || isvector (N))) || (N < 0)) + error ... + (["cluster: N must be a positive number or a vector of positive"... + "numbers"]); + endif + + otherwise + error ("cluster: unknown option %s", opt); + endswitch + + if ((columns (Z) != 3) || (! isnumeric (Z)) ... + (! (max (Z(end, 1:2)) == rows (Z) * 2))) + error ("cluster: Z must be a matrix generated by the linkage function"); + endif + + ## number of observations + n = rows (Z) + 1; + + ## vector of values used by the threshold check + vThresholds = []; + + ## starting number of clusters + nClusters = 1; + + ## the return value is the matrix T, constituted by one or more vector vT + T = []; + vT = zeros (1, n); + + ## main logic + ## a few checks and computations before launching the recursive function + switch (lower (opt)) + case "cutoff" + switch (lower (criterion)) + case "inconsistent" + vThresholds = inconsistent (Z, D)(:, 4); + case "distance" + vThresholds = Z(:, 3); + otherwise + error ("cluster: unkown criterion %s", criterion); + endswitch + case "maxclust" + ## the MaxClust case can be regarded as a Cutoff case with distance + ## criterion, where the threshold is set to the height of the highest node + ## that allows us to have N different clusters + vThresholds = Z(:, 3); + + ## let's build a vector with the apt threshold values + for k = 1:length (N); + if (N(k) > n) + C(end+1) = 0; + elseif (N(k) < 2) + C(end+1) = Z(end, 3) + 1; + else + C(end+1) = Z((end + 2 - N(k)), 3); + endif + endfor + endswitch + + for c_index = 1:length (C) + cluster_cutoff_recursive (rows (Z), nClusters, c_index); + T = [T; vT]; + endfor + + T = T'; # return value + + ## recursive function + ## for each link check if the cutoff criteria (a threshold value) are met, + ## then call recursively this function for every node below that; + ## when we find a leaf, we add the index of its cluster to the return value + function cluster_cutoff_recursive (index, cluster_number, c_index) + + vClusterNumber = [cluster_number, cluster_number]; + + ## check the threshold value + if (vThresholds(index) >= C(c_index)) + ## create a new cluster + nClusters++; + vClusterNumber(2) = nClusters; + endif; + + ## go on, down the tree + for j = 1:2 + if (Z(index,j) > n) + new_index = Z(index,j) - n; + cluster_cutoff_recursive (new_index, vClusterNumber(j), c_index); + else + ## if the next node is a leaf, add the index of its cluster to the + ## result at the correct position, i.e. the leaf number; + ## if leaf 14 belongs to cluster 3: + ## vT(14) = 3; + vT(Z(index,j)) = vClusterNumber(j); + endif + endfor + endfunction + +endfunction + + +## Test input validation +%!error cluster () +%!error cluster ([1 1], "Cutoff", 1) +%!error cluster ([1 2 1], "Bogus", 1) +%!error cluster ([1 2 1], "Cutoff", -1) +%!error cluster ([1 2 1], "Cutoff", 1, "Bogus", 1) + +## Test output +%!test +% X = [(randn (10, 2) * 0.25) + 1; (randn (10, 2) * 0.25) - 1]; +% Z = linkage(X, "ward"); +% T = [ones (10, 1); 2 * ones (10, 1)]; +% assert (cluster (Z, "MaxClust", 2), T); + diff --git a/inst/clusterdata.m b/inst/clusterdata.m new file mode 100644 index 0000000..8d6aef5 --- /dev/null +++ b/inst/clusterdata.m @@ -0,0 +1,117 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{T} =} clusterdata (@var{X}, @var{cutoff}) +## @deftypefnx {Function File} @ +## {@var{T} =} clusterdata (@var{X}, @var{Name}, @var{Value}) +## +## Wrapper function for @code{linkage} and @code{cluster}. +## +## If @var{cutoff} is used, then @code{clusterdata} calls @code{linkage} and +## @code{cluster} with default value, using @var{cutoff} as a threshold value +## for @code{cluster}. If @var{cutoff} is an integer and greater or equal to 2, +## then @var{cutoff} is interpreted as the maximum number of cluster desired +## and the 'MaxClust' option is used for @code{cluster}. +## +## If @var{cutoff} is not used, then @code{clusterdata} expects a list of pair +## arguments. Then you must specify either the 'Cutoff' or 'MaxClust' option +## for @code{cluster}. The method and metric used by @code{linkage}, are +## defined through the 'linkage' and 'distance' arguments. +## +## @end deftypefn +## +## @seealso{cluster,dendrogram,inconsistent,kmeans,linkage,pdist} + +## Author: Stefano Guidoni + +function T = clusterdata (X, varargin) + + if (nargin < 2) + print_usage (); + + else + linkage_criterion = "single"; + distance_method = "euclidean"; + savememory = "off"; + clustering_method = []; + criterion = "inconsistent"; + D = 2; + + if (isnumeric (varargin{1})) # clusterdata (X, cutoff) + if (isinteger (varargin{1}) && (varargin{1} >= 2)) + clustering_method = "MaxClust"; + else + clustering_method = "Cutoff"; + endif + C = varargin{1}; + + else # clusterdata (Name, Value) + pair_index = 1; + while (pair_index < (nargin - 1)) + switch (lower (varargin{pair_index})) + case "criterion" + criterion = varargin{pair_index + 1}; + case "cutoff" + clustering_method = "Cutoff"; + C = varargin{pair_index + 1}; + case "depth" + D = varargin{pair_index + 1}; + case "distance" + distance_method = varargin{pair_index + 1}; + case "linkage" + linkage_criterion = varargin{pair_index + 1}; + case "maxclust" + clustering_method = "MaxClust"; + C = varargin{pair_index + 1}; + case "savememory" + savememory = varargin{pair_index + 1}; + otherwise + error ("clusterdata: unknown property %s", varargin{pair_index}); + endswitch + pair_index += 2; + endwhile + endif + endif + + if (isempty (clustering_method)) + error ... + (["clusterdata: you must specify either 'MaxClust' or 'Cutoff' when" ... + "using name-value arguments"]); + endif + + ## main body + Z = linkage (X, linkage_criterion, distance_method, "savememory"); + if (strcmp (lower (clustering_method), "cutoff")) + T = cluster (Z, clustering_method, C, "Criterion", criterion, "Depth", D); + else + T = cluster (Z, clustering_method, C); + endif +endfunction + + +## Test input validation +%!error clusterdata () +%!error clusterdata (1) +%!error clusterdata ([1 1], "Bogus", 1) +%!error clusterdata ([1 1], "Depth", 1) + +## Demonstration +%!demo +%! X = [(randn (10, 2) * 0.25) + 1; (randn (20, 2) * 0.5) - 1]; +%! wnl = warning ("off", "Octave:linkage_savemem", "local"); +%! T = clusterdata (X, "linkage", "ward", "MaxClust", 2); +%! scatter (X(:,1), X(:,2), 36, T, "filled"); + diff --git a/inst/cmdscale.m b/inst/cmdscale.m new file mode 100644 index 0000000..cec91b3 --- /dev/null +++ b/inst/cmdscale.m @@ -0,0 +1,149 @@ +## Copyright (C) 2014 JD Walsh +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} @var{Y} = cmdscale (@var{D}) +## @deftypefnx{Function File} [@var{Y}, @var{e} ] = cmdscale (@var{D}) +## Classical multidimensional scaling of a matrix. +## +## Takes an @var{n} by @var{n} distance (or difference, similarity, or +## dissimilarity) matrix @var{D}. Returns @var{Y}, a matrix of @var{n} points +## with coordinates in @var{p} dimensional space which approximate those +## distances (or differences, similarities, or dissimilarities). Also returns +## the eigenvalues @var{e} of +## @code{@var{B} = -1/2 * @var{J} * (@var{D}.^2) * @var{J}}, where +## @code{J = eye(@var{n}) - ones(@var{n},@var{n})/@var{n}}. @var{p}, the number +## of columns of @var{Y}, is equal to the number of positive real eigenvalues of +## @var{B}. +## +## @var{D} can be a full or sparse matrix or a vector of length +## @code{@var{n}*(@var{n}-1)/2} containing the upper triangular elements (like +## the output of the @code{pdist} function). It must be symmetric with +## non-negative entries whose values are further restricted by the type of +## matrix being represented: +## +## * If @var{D} is either a distance, dissimilarity, or difference matrix, then +## it must have zero entries along the main diagonal. In this case the points +## @var{Y} equal or approximate the distances given by @var{D}. +## +## * If @var{D} is a similarity matrix, the elements must all be less than or +## equal to one, with ones along the the main diagonal. In this case the points +## @var{Y} equal or approximate the distances given by +## @code{@var{D} = sqrt(ones(@var{n},@var{n})-@var{D})}. +## +## @var{D} is a Euclidean matrix if and only if @var{B} is positive +## semi-definite. When this is the case, then @var{Y} is an exact representation +## of the distances given in @var{D}. If @var{D} is non-Euclidean, @var{Y} only +## approximates the distance given in @var{D}. The approximation used by +## @code{cmdscale} minimizes the statistical loss function known as +## @var{strain}. +## +## The returned @var{Y} is an @var{n} by @var{p} matrix showing possible +## coordinates of the points in @var{p} dimensional space +## (@code{@var{p} < @var{n}}). The columns are correspond to the positive +## eigenvalues of @var{B} in descending order. A translation, rotation, or +## reflection of the coordinates given by @var{Y} will satisfy the same distance +## matrix up to the limits of machine precision. +## +## For any @code{@var{k} <= @var{p}}, if the largest @var{k} positive +## eigenvalues of @var{B} are significantly greater in absolute magnitude than +## its other eigenvalues, the first @var{k} columns of @var{Y} provide a +## @var{k}-dimensional reduction of @var{Y} which approximates the distances +## given by @var{D}. The optional return @var{e} can be used to consider various +## values of @var{k}, or to evaluate the accuracy of specific dimension +## reductions (e.g., @code{@var{k} = 2}). +## +## Reference: Ingwer Borg and Patrick J.F. Groenen (2005), Modern +## Multidimensional Scaling, Second Edition, Springer, ISBN: 978-0-387-25150-9 +## (Print) 978-0-387-28981-6 (Online) +## +## @seealso{pdist} +## @end deftypefn + +## Author: JD Walsh +## Created: 2014-10-31 +## Description: Classical multidimensional scaling +## Keywords: multidimensional-scaling mds distance clustering + +## TO DO: include missing functions `mdscale' and `procrustes' in @seealso + +function [Y, e] = cmdscale (D) + + % Check for matrix input + if ((nargin ~= 1) || ... + (~any(strcmp ({'matrix' 'scalar' 'range'}, typeinfo(D))))) + usage ('cmdscale: input must be vector or matrix; see help'); + endif + + % If vector, convert to matrix; otherwise, check for square symmetric input + if (isvector (D)) + D = squareform (D); + elseif ((~issquare (D)) || (norm (D - D', 1) > 0)) + usage ('cmdscale: matrix input must be square symmetric; see help'); + endif + + n = size (D,1); + % Check for valid format (see help above); If similarity matrix, convert + if (any (any (D < 0))) + usage ('cmdscale: entries must be nonnegative; see help'); + elseif (trace (D) ~= 0) + if ((~all (diag (D) == 1)) || (~all (D <= 1))) + usage ('cmdscale: input must be distance vector or matrix; see help'); + endif + D = sqrt (ones (n,n) - D); + endif + + % Build centering matrix, perform double centering, extract eigenpairs + J = eye (n) - ones (n,n) / n; + B = -1 / 2 * J * (D .^ 2) * J; + [Q, e] = eig (B); + e = diag (e); + etmp = e; + e = sort(e, 'descend'); + + % Remove complex eigenpairs (only possible due to machine approximation) + if (iscomplex (etmp)) + for i = 1 : size (etmp,1) + cmp(i) = (isreal (etmp(i))); + endfor + etmp = etmp(cmp); + Q = Q(:,cmp); + endif + + % Order eigenpairs + [etmp, ord] = sort (etmp, 'descend'); + Q = Q(:,ord); + + % Remove negative eigenpairs + cmp = (etmp > 0); + etmp = etmp(cmp); + Q = Q(:,cmp); + + % Test for n-dimensional results + if (size(etmp,1) == n) + etmp = etmp(1:n-1); + Q = Q(:, 1:n-1); + endif + + % Build output matrix Y + Y = Q * diag (sqrt (etmp)); + +endfunction + +%!shared m, n, X, D +%! m = randi(100) + 1; n = randi(100) + 1; X = rand(m, n); D = pdist(X); +%!assert(norm(pdist(cmdscale(D))), norm(D), sqrt(eps)) +%!assert(norm(pdist(cmdscale(squareform(D)))), norm(D), sqrt(eps)) + diff --git a/inst/combnk.m b/inst/combnk.m new file mode 100644 index 0000000..b9fbeb5 --- /dev/null +++ b/inst/combnk.m @@ -0,0 +1,92 @@ +## Copyright (C) 2010 Soren Hauberg +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{c} =} combnk (@var{data}, @var{k}) +## Return all combinations of @var{k} elements in @var{data}. +## @end deftypefn + +function retval = combnk (data, k) + ## Check input + if (nargin != 2) + print_usage; + elseif (! isvector (data)) + error ("combnk: first input argument must be a vector"); + elseif (!isreal (k) || k != round (k) || k < 0) + error ("combnk: second input argument must be a non-negative integer"); + endif + + ## Simple checks + n = numel (data); + if (k == 0 || k > n) + retval = resize (data, 0, k); + elseif (k == n) + retval = data (:).'; + else + retval = __combnk__ (data, k); + endif + + ## For some odd reason Matlab seems to treat strings differently compared to other data-types... + if (ischar (data)) + retval = flipud (retval); + endif +endfunction + +function retval = __combnk__ (data, k) + ## Recursion stopping criteria + if (k == 1) + retval = data (:); + else + ## Process data + n = numel (data); + if iscell (data) + retval = {}; + else + retval = []; + endif + for j = 1:n + C = __combnk__ (data ((j+1):end), k-1); + C = cat (2, repmat (data (j), rows (C), 1), C); + if (!isempty (C)) + if (isempty (retval)) + retval = C; + else + retval = [retval; C]; + endif + endif + endfor + endif +endfunction + +%!demo +%! c = combnk (1:5, 2); +%! disp ("All pairs of integers between 1 and 5:"); +%! disp (c); + +%!test +%! c = combnk (1:3, 2); +%! assert (c, [1, 2; 1, 3; 2, 3]); + +%!test +%! c = combnk (1:3, 6); +%! assert (isempty (c)); + +%!test +%! c = combnk ({1, 2, 3}, 2); +%! assert (c, {1, 2; 1, 3; 2, 3}); + +%!test +%! c = combnk ("hello", 2); +%! assert (c, ["lo"; "lo"; "ll"; "eo"; "el"; "el"; "ho"; "hl"; "hl"; "he"]); diff --git a/inst/confusionchart.m b/inst/confusionchart.m new file mode 100644 index 0000000..0f85e02 --- /dev/null +++ b/inst/confusionchart.m @@ -0,0 +1,293 @@ +## Copyright (C) 2020-2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {} {} confusionchart (@var{trueLabels}, @var{predictedLabels}) +## @deftypefnx {} {} confusionchart (@var{m}) +## @deftypefnx {} {} confusionchart (@var{m}, @var{classLabels}) +## @deftypefnx {} {} confusionchart (@var{parent}, @dots{}) +## @deftypefnx {} {} confusionchart (@dots{}, @var{prop}, @var{val}, @dots{}) +## @deftypefnx {} {@var{cm} =} confusionchart (@dots{}) +## +## Display a chart of a confusion matrix. +## +## The two vectors of values @var{trueLabels} and @var{predictedLabels}, which +## are used to compute the confusion matrix, must be defined with the same +## format as the inputs of @code{confusionmat}. +## Otherwise a confusion matrix @var{m} as computed by @code{confusionmat} can +## be given. +## +## @var{classLabels} is an array of labels, i.e. the list of the class names. +## +## If the first argument is a handle to a @code{figure} or to a @code{uipanel}, +## then the confusion matrix chart is displayed inside that object. +## +## Optional property/value pairs are passed directly to the underlying objects, +## e.g. @qcode{"xlabel"}, @qcode{"ylabel"}, @qcode{"title"}, @qcode{"fontname"}, +## @qcode{"fontsize"} etc. +## +## The optional return value @var{cm} is a @code{ConfusionMatrixChart} object. +## Specific properties of a @code{ConfusionMatrixChart} object are: +## @itemize @bullet +## @item @qcode{"DiagonalColor"} +## The color of the patches on the diagonal, default is [0.0, 0.4471, 0.7412]. +## +## @item @qcode{"OffDiagonalColor"} +## The color of the patches off the diagonal, default is [0.851, 0.3255, 0.098]. +## +## @item @qcode{"GridVisible"} +## Available values: @qcode{on} (default), @qcode{off}. +## +## @item @qcode{"Normalization"} +## Available values: @qcode{absolute} (default), @qcode{column-normalized}, +## @qcode{row-normalized}, @qcode{total-normalized}. +## +## @item @qcode{"ColumnSummary"} +## Available values: @qcode{off} (default), @qcode{absolute}, +## @qcode{column-normalized},@qcode{total-normalized}. +## +## @item @qcode{"RowSummary"} +## Available values: @qcode{off} (default), @qcode{absolute}, +## @qcode{row-normalized}, @qcode{total-normalized}. +## @end itemize +## +## Run @code{demo confusionchart} to see some examples. +## +## @end deftypefn +## +## @seealso{confusionmat, sortClasses} + +## Author: Stefano Guidoni + +function cm = confusionchart (varargin) + + ## check the input parameters + if (nargin < 1) + print_usage (); + endif + + p_i = 1; + + if (ishghandle (varargin{p_i})) + ## parameter is a parent figure + handle_type = get (varargin{p_i}, "type"); + if (strcmp (handle_type, "figure")) + h = figure (varargin{p_i}); + hax = axes ("parent", h); + elseif (strcmp (handle_type, "uipanel")) + h = varargin{p_i}; + hax = axes ("parent", varargin{p_i}); + else + ## MATLAB compatibility: on MATLAB are also available Tab objects, + ## TiledChartLayout objects, GridLayout objects + error ("confusionchart: invalid handle to parent object"); + endif + p_i++; + else + h = figure (); + hax = axes ("parent", h); + endif + + if (ismatrix (varargin{p_i}) && rows (varargin{p_i}) == ... + columns (varargin{p_i})) + ## parameter is a confusion matrix + conmat = varargin{p_i}; + p_i++; + + if (p_i <= nargin && ((isvector (varargin{p_i}) && ... + length (varargin{p_i}) == rows (conmat)) || ... + (ischar ( varargin{p_i}) && rows (varargin{p_i}) == rows (conmat)) ... + || iscellstr (varargin{p_i}))) + ## parameter is an array of labels + labarr = varargin{p_i}; + + if (isrow (labarr)) + labarr = vec (labarr); + endif + + p_i++; + else + labarr = [1 : (rows (conmat))]'; + endif + elseif (isvector (varargin{p_i})) + ## parameter must be a group for confusionmat + [conmat, labarr] = confusionmat (varargin{p_i}, varargin{p_i + 1}); + p_i = p_i + 2; + else + close (h); + error ("confusionchart: invalid argument"); + endif + + ## remaining parameters are stored + i = p_i; + args = {}; + while (i <= nargin) + args{end + 1} = varargin{i++}; + endwhile + + ## prepare the labels + if (!iscellstr (labarr)) + if (!ischar (labarr)) + labarr = num2str (labarr); + endif + + labarr = cellstr (labarr); + endif + + ## MATLAB compatibility: labels are sorted + [labarr, I] = sort (labarr); + conmat = conmat(I, :); + conmat = conmat(:, I); + + cm = ConfusionMatrixChart (hax, conmat, labarr, args); + +endfunction + + +## Test input validation + +## Get current figure visibility so it can be restored after tests +%!shared visibility_setting +%! visibility_setting = get (0, "DefaultFigureVisible"); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ()", "Invalid call"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 1; 2 2; 3 3])", "invalid argument"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'xxx', 1)", "invalid property"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'XLabel', 1)", "XLabel .* string"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'YLabel', [1 0])", ".* YLabel .* string"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'Title', .5)", ".* Title .* string"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'FontName', [])", ".* FontName .* string"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'FontSize', 'b')", ".* FontSize .* numeric"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'DiagonalColor', 'h')", ".* DiagonalColor .* color"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'OffDiagonalColor', [])", ".* OffDiagonalColor .* color"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'Normalization', '')", ".* invalid .* Normalization"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'ColumnSummary', [])", ".* invalid .* ColumnSummary"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'RowSummary', 1)", ".* invalid .* RowSummary"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'GridVisible', .1)", ".* invalid .* GridVisible"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'HandleVisibility', .1)", ".* invalid .* HandleVisibility"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'OuterPosition', .1)", ".* invalid .* OuterPosition"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'Position', .1)", ".* invalid .* Position"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ("confusionchart ([1 2], [0 1], 'Units', .1)", ".* invalid .* Units"); +%! set (0, "DefaultFigureVisible", visibility_setting); + +## Demonstration using the confusion matrix example from +## R.Bonnin, "Machine Learning for Developers", pp. 55-56 +%!demo "Setting the chart properties" +%! Yt = [8 5 6 8 5 3 1 6 4 2 5 3 1 4]'; +%! Yp = [8 5 6 8 5 2 3 4 4 5 5 7 2 6]'; +%! confusionchart (Yt, Yp, "Title", ... +%! "Demonstration with summaries","Normalization",... +%! "absolute","ColumnSummary", "column-normalized","RowSummary",... +%! "row-normalized") + +## example: confusion matrix and class labels +%!demo "Cellstr as inputs" +%! Yt = {'Positive', 'Positive', 'Positive', 'Negative', 'Negative' }; +%! Yp = {'Positive', 'Positive', 'Negative', 'Negative', 'Negative' }; +%! m = confusionmat ( Yt, Yp ); +%! confusionchart ( m, { 'Positive', 'Negative' } ); + +## example: editing the properties of an existing ConfusionMatrixChart object +%!demo "Editing the object properties" +%! Yt = {'Positive', 'Positive', 'Positive', 'Negative', 'Negative' }; +%! Yp = {'Positive', 'Positive', 'Negative', 'Negative', 'Negative' }; +%! cm = confusionchart ( Yt, Yp ); +%! cm.Title = "This is an example with a green diagonal"; +%! cm.DiagonalColor = [0.4660, 0.6740, 0.1880]; + +## example: drawing the chart inside a uipanel +%!demo "Confusion chart in a uipanel" +%! h = uipanel (); +%! Yt = {'Positive', 'Positive', 'Positive', 'Negative', 'Negative' }; +%! Yp = {'Positive', 'Positive', 'Negative', 'Negative', 'Negative' }; +%! cm = confusionchart ( h, Yt, Yp ); + +## example: sortClasses +%!demo "Sorting classes" +%! Yt = [8 5 6 8 5 3 1 6 4 2 5 3 1 4]'; +%! Yp = [8 5 6 8 5 2 3 4 4 5 5 7 2 6]'; +%! cm = confusionchart (Yt, Yp, "Title", ... +%! "Classes are sorted according to clusters"); +%! sortClasses (cm, "cluster"); diff --git a/inst/confusionmat.m b/inst/confusionmat.m new file mode 100644 index 0000000..c6a62c3 --- /dev/null +++ b/inst/confusionmat.m @@ -0,0 +1,224 @@ +## Copyright (C) 2020 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} @ +## {@var{C} =} confusionmat (@var{group}, @var{grouphat}) +## @deftypefnx {Function File} @ +## {@var{C} =} confusionmat (@var{group}, @var{grouphat}, 'Order', @var{grouporder}) +## @deftypefnx {Function File} @ +## {[@var{C}, @var{order}] =} confusionmat (@var{group}, @var{grouphat}) +## +## Compute a confusion matrix for classification problems +## +## @code{confusionmat} returns the confusion matrix @var{C} for the group of +## actual values @var{group} and the group of predicted values @var{grouphat}. +## The row indices of the confusion matrix represent actual values, while the +## column indices represent predicted values. The indices are the same for both +## actual and predicted values, so the confusion matrix is a square matrix. +## Each element of the matrix represents the number of matches between a given +## actual value (row index) and a given predicted value (column index), hence +## correct matches lie on the main diagonal of the matrix. +## The order of the rows and columns is returned in @var{order}. +## +## @var{group} and @var{grouphat} must have the same number of observations +## and the same data type. +## Valid data types are numeric vectors, logical vectors, character arrays, +## string arrays (not implemented yet), cell arrays of strings. +## +## The order of the rows and columns can be specified by setting the +## @var{grouporder} variable. The data type of @var{grouporder} must be the +## same of @var{group} and @var{grouphat}. +## +## @end deftypefn +## +## @seealso{crosstab} + +## Author: Stefano Guidoni +## MATLAB compatibility: Octave misses string arrays, categorical vectors and +## undefined values. + +function [C, order] = confusionmat ( group, grouphat, opt = 'Order', grouporder ) + +## check the input parameters +if ( nargin < 2 ) || ( nargin > 4 ) + print_usage(); +endif + +y_true = group; +y_pred = grouphat; + +if class( y_true ) != class( y_pred ) + error( "confusionmat: group and grouphat must be of the same data type" ); +endif + +if length( y_true ) != length( y_pred ) + error( "confusionmat: group and grouphat must be of the same length" ); +endif + +if ( nargin > 3 ) && strcmp( opt, 'Order' ) + unique_tokens = grouporder; + + if class( y_true ) != class( unique_tokens ) + error( "confusionmat: group and grouporder must be of the same data type" ); + endif +endif + +if isvector( y_true ) + if isrow( y_true ) + y_true = vec( y_true ); + endif +else + error( "confusionmat: group must be a vector or array" ); +endif + +if isvector( y_pred ) + if isrow( y_pred ) + y_pred = vec( y_pred ); + endif +else + error( "confusionmat: grouphat must be a vector or array" ); +endif + +if exist( "unique_tokens", "var" ) + if isvector( unique_tokens ) + if isrow( unique_tokens ) + unique_tokens = vec( unique_tokens ); + endif + else + error( "confusionmat: grouporder must be a vector or array" ); + endif +endif + +## compute the confusion matrix +if isa( y_true, "numeric" ) || isa( y_true, "logical" ) + ## numeric or boolean vector + + ## MATLAB compatibility: + ## remove NaN values from grouphat + nan_indices = find( isnan( y_pred ) ); + y_pred(nan_indices) = []; + + ## MATLAB compatibility: + ## numeric and boolean values + ## are sorted in ascending order + if !exist( "unique_tokens", "var" ) + unique_tokens = union ( y_true, y_pred ); + endif + + y_true(nan_indices) = []; + + C_size = length ( unique_tokens ); + + C = zeros ( C_size ); + + for i = 1:length( y_true) + row_index = find( unique_tokens == y_true(i) ); + col_index = find( unique_tokens == y_pred(i) ); + + C(row_index, col_index)++; + endfor + +elseif iscellstr( y_true ) + ## string cells + + ## MATLAB compatibility: + ## remove empty values from grouphat + empty_indices = []; + for i = 1:length( y_pred ) + if isempty( y_pred{i} ) + empty_indices = [empty_indices; i]; + endif + endfor + + y_pred(empty_indices) = []; + + ## MATLAB compatibility: + ## string values are sorted according to their + ## first appearance in group and grouphat + if !exist( "unique_tokens", "var" ) + all_tokens = vertcat ( y_true, y_pred ); + unique_tokens = [all_tokens(1)]; + + for i = 2:length( all_tokens ) + if !any( strcmp( all_tokens(i), unique_tokens ) ) + unique_tokens = [unique_tokens; all_tokens(i)]; + endif + endfor + endif + + y_true(empty_indices) = []; + + C_size = length ( unique_tokens ); + + C = zeros ( C_size ); + + + for i = 1:length( y_true) + row_index = find( strcmp( y_true{i}, unique_tokens ) ); + col_index = find( strcmp( y_pred{i}, unique_tokens ) ); + + C(row_index, col_index)++; + endfor + +elseif ischar( y_true ) + ## character array + + ## MATLAB compatibility: + ## character values are sorted according to their + ## first appearance in group and grouphat + if !exist( "unique_tokens", "var" ) + all_tokens = vertcat ( y_true, y_pred ); + unique_tokens = [all_tokens(1)]; + + for i = 2:length( all_tokens ) + if !any( find( unique_tokens == all_tokens(i) ) ) + unique_tokens = [unique_tokens; all_tokens(i)]; + endif + endfor + endif + + C_size = length ( unique_tokens ); + + C = zeros ( C_size ); + + for i = 1:length( y_true) + row_index = find( unique_tokens == y_true(i) ); + col_index = find( unique_tokens == y_pred(i) ); + + C(row_index, col_index)++; + endfor + +elseif isstring( y_true ) + ## string array + ## FIXME: not implemented yet + + error( "confusionmat: string array not implemented yet" ); +else + error( "confusionmat: invalid data type" ); +endif + +order = unique_tokens; + +endfunction + +## Test the confusion matrix example from +## R.Bonnin, "Machine Learning for Developers", pp. 55-56 +%!test +%! Yt = [8 5 6 8 5 3 1 6 4 2 5 3 1 4]'; +%! Yp = [8 5 6 8 5 2 3 4 4 5 5 7 2 6]'; +%! C = [0 1 1 0 0 0 0 0; 0 0 0 0 1 0 0 0; 0 1 0 0 0 0 1 0; 0 0 0 1 0 1 0 0; ... +%! 0 0 0 0 3 0 0 0; 0 0 0 1 0 1 0 0; 0 0 0 0 0 0 0 0; 0 0 0 0 0 0 0 2]; +%! assert (confusionmat (Yt, Yp), C) diff --git a/inst/cophenet.m b/inst/cophenet.m new file mode 100644 index 0000000..87f5fff --- /dev/null +++ b/inst/cophenet.m @@ -0,0 +1,136 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{c}, @var{d}] =} cophenet (@var{Z}, @var{y}) +## +## Compute the cophenetic correlation coefficient. +## +## The cophenetic correlation coefficient @var{C} of a hierarchical cluster tree +## @var{Z} is the linear correlation coefficient between the cophenetic +## distances @var{d} and the euclidean distances @var{y}. +## @tex +## \def\frac#1#2{{\begingroup#1\endgroup\over#2}} +## $$ c = \frac {\sum _{i n) + l_v = nonzeros (N(l_n - n, :)); # the list of leaves from the left branch + else + l_v = l_n; + endif + + if (r_n > n) + r_v = nonzeros (N(r_n - n, :)); # the list of leaves from the right branch + else + r_v = r_n; + endif + + j_max = length (l_v); + k_max = length (r_v); + ## keep track of the leaves in each sub-branch, i.e. node; + ## this does not matter for the last node, which includes all leaves + if (i < m) + N(i, 1 : (j_max + k_max)) = [l_v' r_v']; + endif + + for j = 1 : j_max + for k = 1: k_max + ## d is in the same format as y + if (l_v(j) < r_v(k)) + index = (l_v(j) - 1) * m - sum (1 : (l_v(j) - 2)) + (r_v(k) - l_v(j)); + else + index = (r_v(k) - 1) * m - sum (1 : (r_v(k) - 2)) + (l_v(j) - r_v(k)); + endif + d(index) = Z(i, 3); + endfor + endfor + endfor + + ## compute the cophenetic correlation c + y_mean = mean (y); + z_mean = mean (d); + + Y_sigma = y - y_mean; + Z_sigma = d - z_mean; + + c = sum (Z_sigma .* Y_sigma) / sqrt (sum (Y_sigma .^ 2) * sum (Z_sigma .^ 2)); + +endfunction + + +## Test input validation +%!error cophenet () +%!error cophenet (ones (2,2), 1) +%!error cophenet ([1 2 1], "a") +%!error cophenet ([1 2 1], [1 2]) + +## Demonstration +%!demo "usage"; +%! X = randn (10,2); +%! y = pdist (X); +%! Z = linkage (y, "average"); +%! cophenet (Z, y) + diff --git a/inst/copulacdf.m b/inst/copulacdf.m new file mode 100644 index 0000000..32d05a4 --- /dev/null +++ b/inst/copulacdf.m @@ -0,0 +1,288 @@ +## Copyright (C) 2008 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} copulacdf (@var{family}, @var{x}, @var{theta}) +## @deftypefnx {Function File} {} copulacdf ('t', @var{x}, @var{theta}, @var{nu}) +## Compute the cumulative distribution function of a copula family. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{family} is the copula family name. Currently, @var{family} can +## be @code{'Gaussian'} for the Gaussian family, @code{'t'} for the +## Student's t family, @code{'Clayton'} for the Clayton family, +## @code{'Gumbel'} for the Gumbel-Hougaard family, @code{'Frank'} for +## the Frank family, @code{'AMH'} for the Ali-Mikhail-Haq family, or +## @code{'FGM'} for the Farlie-Gumbel-Morgenstern family. +## +## @item +## @var{x} is the support where each row corresponds to an observation. +## +## @item +## @var{theta} is the parameter of the copula. For the Gaussian and +## Student's t copula, @var{theta} must be a correlation matrix. For +## bivariate copulas @var{theta} can also be a correlation coefficient. +## For the Clayton family, the Gumbel-Hougaard family, the Frank family, +## and the Ali-Mikhail-Haq family, @var{theta} must be a vector with the +## same number of elements as observations in @var{x} or be scalar. For +## the Farlie-Gumbel-Morgenstern family, @var{theta} must be a matrix of +## coefficients for the Farlie-Gumbel-Morgenstern polynomial where each +## row corresponds to one set of coefficients for an observation in +## @var{x}. A single row is expanded. The coefficients are in binary +## order. +## +## @item +## @var{nu} is the degrees of freedom for the Student's t family. +## @var{nu} must be a vector with the same number of elements as +## observations in @var{x} or be scalar. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{p} is the cumulative distribution of the copula at each row of +## @var{x} and corresponding parameter @var{theta}. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = [0.2:0.2:0.6; 0.2:0.2:0.6]; +## theta = [1; 2]; +## p = copulacdf ("Clayton", x, theta) +## @end group +## +## @group +## x = [0.2:0.2:0.6; 0.2:0.1:0.4]; +## theta = [0.2, 0.1, 0.1, 0.05]; +## p = copulacdf ("FGM", x, theta) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Roger B. Nelsen. @cite{An Introduction to Copulas}. Springer, +## New York, second edition, 2006. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: CDF of a copula family + +function p = copulacdf (family, x, theta, nu) + + # Check arguments + if (nargin != 3 && (nargin != 4 || ! strcmpi (family, "t"))) + print_usage (); + endif + + if (! ischar (family)) + error ("copulacdf: family must be one of 'Gaussian', 't', 'Clayton', 'Gumbel', 'Frank', 'AMH', and 'FGM'"); + endif + + if (! isempty (x) && ! ismatrix (x)) + error ("copulacdf: x must be a numeric matrix"); + endif + + [n, d] = size (x); + + lower_family = lower (family); + + # Check family and copula parameters + switch (lower_family) + + case {"gaussian", "t"} + # Family with a covariance matrix + if (d == 2 && isscalar (theta)) + # Expand a scalar to a correlation matrix + theta = [1, theta; theta, 1]; + endif + if (any (size (theta) != [d, d]) || any (diag (theta) != 1) || any (any (theta != theta')) || min (eig (theta)) <= 0) + error ("copulacdf: theta must be a correlation matrix"); + endif + if (nargin == 4) + # Student's t family + if (! isscalar (nu) && (! isvector (nu) || length (nu) != n)) + error ("copulacdf: nu must be a vector with the same number of rows as x or be scalar"); + endif + nu = nu(:); + endif + + case {"clayton", "gumbel", "frank", "amh"} + # Archimedian one parameter family + if (! isvector (theta) || (! isscalar (theta) && length (theta) != n)) + error ("copulacdf: theta must be a vector with the same number of rows as x or be scalar"); + endif + theta = theta(:); + if (n > 1 && isscalar (theta)) + theta = repmat (theta, n, 1); + endif + + case {"fgm"} + # Exponential number of parameters + if (! ismatrix (theta) || size (theta, 2) != (2 .^ d - d - 1) || (size (theta, 1) != 1 && size (theta, 1) != n)) + error ("copulacdf: theta must be a row vector of length 2^d-d-1 or a matrix of size n x (2^d-d-1)"); + endif + if (n > 1 && size (theta, 1) == 1) + theta = repmat (theta, n, 1); + endif + + otherwise + error ("copulacdf: unknown copula family '%s'", family); + + endswitch + + if (n == 0) + # Input is empty + p = zeros (0, 1); + else + # Truncate input to unit hypercube + x(x < 0) = 0; + x(x > 1) = 1; + + # Compute the cumulative distribution function according to family + switch (lower_family) + + case {"gaussian"} + # The Gaussian family + p = mvncdf (norminv (x), zeros (1, d), theta); + # No parameter bounds check + k = []; + + case {"t"} + # The Student's t family + p = mvtcdf (tinv (x, nu), theta, nu); + # No parameter bounds check + k = []; + + case {"clayton"} + # The Clayton family + p = exp (-log (max (sum (x .^ (repmat (-theta, 1, d)), 2) - d + 1, 0)) ./ theta); + # Product copula at columns where theta == 0 + k = find (theta == 0); + if (any (k)) + p(k) = prod (x(k, :), 2); + endif + # Check bounds + if (d > 2) + k = find (! (theta >= 0) | ! (theta < inf)); + else + k = find (! (theta >= -1) | ! (theta < inf)); + endif + + case {"gumbel"} + # The Gumbel-Hougaard family + p = exp (-(sum ((-log (x)) .^ repmat (theta, 1, d), 2)) .^ (1 ./ theta)); + # Check bounds + k = find (! (theta >= 1) | ! (theta < inf)); + + case {"frank"} + # The Frank family + p = -log (1 + (prod (expm1 (repmat (-theta, 1, d) .* x), 2)) ./ (expm1 (-theta) .^ (d - 1))) ./ theta; + # Product copula at columns where theta == 0 + k = find (theta == 0); + if (any (k)) + p(k) = prod (x(k, :), 2); + endif + # Check bounds + if (d > 2) + k = find (! (theta > 0) | ! (theta < inf)); + else + k = find (! (theta > -inf) | ! (theta < inf)); + endif + + case {"amh"} + # The Ali-Mikhail-Haq family + p = (theta - 1) ./ (theta - prod ((1 + repmat (theta, 1, d) .* (x - 1)) ./ x, 2)); + # Check bounds + if (d > 2) + k = find (! (theta >= 0) | ! (theta < 1)); + else + k = find (! (theta >= -1) | ! (theta < 1)); + endif + + case {"fgm"} + # The Farlie-Gumbel-Morgenstern family + # All binary combinations + bcomb = logical (floor (mod (((0:(2 .^ d - 1))' * 2 .^ ((1 - d):0)), 2))); + ecomb = ones (size (bcomb)); + ecomb(bcomb) = -1; + # Summation over all combinations of order >= 2 + bcomb = bcomb(sum (bcomb, 2) >= 2, end:-1:1); + # Linear constraints matrix + ac = zeros (size (ecomb, 1), size (bcomb, 1)); + # Matrix to compute p + ap = zeros (size (x, 1), size (bcomb, 1)); + for i = 1:size (bcomb, 1) + ac(:, i) = -prod (ecomb(:, bcomb(i, :)), 2); + ap(:, i) = prod (1 - x(:, bcomb(i, :)), 2); + endfor + p = prod (x, 2) .* (1 + sum (ap .* theta, 2)); + # Check linear constraints + k = false (n, 1); + for i = 1:n + k(i) = any (ac * theta(i, :)' > 1); + endfor + + endswitch + + # Out of bounds parameters + if (any (k)) + p(k) = NaN; + endif + + endif + +endfunction + +%!test +%! x = [0.2:0.2:0.6; 0.2:0.2:0.6]; +%! theta = [1; 2]; +%! p = copulacdf ("Clayton", x, theta); +%! expected_p = [0.1395; 0.1767]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = [0.2:0.2:0.6; 0.2:0.2:0.6]; +%! p = copulacdf ("Gumbel", x, 2); +%! expected_p = [0.1464; 0.1464]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = [0.2:0.2:0.6; 0.2:0.2:0.6]; +%! theta = [1; 2]; +%! p = copulacdf ("Frank", x, theta); +%! expected_p = [0.0699; 0.0930]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = [0.2:0.2:0.6; 0.2:0.2:0.6]; +%! theta = [0.3; 0.7]; +%! p = copulacdf ("AMH", x, theta); +%! expected_p = [0.0629; 0.0959]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = [0.2:0.2:0.6; 0.2:0.1:0.4]; +%! theta = [0.2, 0.1, 0.1, 0.05]; +%! p = copulacdf ("FGM", x, theta); +%! expected_p = [0.0558; 0.0293]; +%! assert (p, expected_p, 0.001); diff --git a/inst/copulapdf.m b/inst/copulapdf.m new file mode 100644 index 0000000..a23841a --- /dev/null +++ b/inst/copulapdf.m @@ -0,0 +1,194 @@ +## Copyright (C) 2008 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} copulapdf (@var{family}, @var{x}, @var{theta}) +## Compute the probability density function of a copula family. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{family} is the copula family name. Currently, @var{family} can +## be @code{'Clayton'} for the Clayton family, @code{'Gumbel'} for the +## Gumbel-Hougaard family, @code{'Frank'} for the Frank family, or +## @code{'AMH'} for the Ali-Mikhail-Haq family. +## +## @item +## @var{x} is the support where each row corresponds to an observation. +## +## @item +## @var{theta} is the parameter of the copula. The elements of +## @var{theta} must be greater than or equal to @code{-1} for the +## Clayton family, greater than or equal to @code{1} for the +## Gumbel-Hougaard family, arbitrary for the Frank family, and greater +## than or equal to @code{-1} and lower than @code{1} for the +## Ali-Mikhail-Haq family. Moreover, @var{theta} must be non-negative +## for dimensions greater than @code{2}. @var{theta} must be a column +## vector with the same number of rows as @var{x} or be scalar. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{p} is the probability density of the copula at each row of +## @var{x} and corresponding parameter @var{theta}. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = [0.2:0.2:0.6; 0.2:0.2:0.6]; +## theta = [1; 2]; +## p = copulapdf ("Clayton", x, theta) +## @end group +## +## @group +## p = copulapdf ("Gumbel", x, 2) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Roger B. Nelsen. @cite{An Introduction to Copulas}. Springer, +## New York, second edition, 2006. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: PDF of a copula family + +function p = copulapdf (family, x, theta) + + # Check arguments + if (nargin != 3) + print_usage (); + endif + + if (! ischar (family)) + error ("copulapdf: family must be one of 'Clayton', 'Gumbel', 'Frank', and 'AMH'"); + endif + + if (! isempty (x) && ! ismatrix (x)) + error ("copulapdf: x must be a numeric matrix"); + endif + + [n, d] = size (x); + + if (! isvector (theta) || (! isscalar (theta) && size (theta, 1) != n)) + error ("copulapdf: theta must be a column vector with the same number of rows as x or be scalar"); + endif + + if (n == 0) + # Input is empty + p = zeros (0, 1); + else + if (n > 1 && isscalar (theta)) + theta = repmat (theta, n, 1); + endif + + # Truncate input to unit hypercube + x(x < 0) = 0; + x(x > 1) = 1; + + # Compute the cumulative distribution function according to family + lowerarg = lower (family); + + if (strcmp (lowerarg, "clayton")) + # The Clayton family + log_cdf = -log (max (sum (x .^ (repmat (-theta, 1, d)), 2) - d + 1, 0)) ./ theta; + p = prod (repmat (theta, 1, d) .* repmat (0:(d - 1), n, 1) + 1, 2) .* exp ((1 + theta .* d) .* log_cdf - (theta + 1) .* sum (log (x), 2)); + # Product copula at columns where theta == 0 + k = find (theta == 0); + if (any (k)) + p(k) = 1; + endif + # Check theta + if (d > 2) + k = find (! (theta >= 0) | ! (theta < inf)); + else + k = find (! (theta >= -1) | ! (theta < inf)); + endif + elseif (strcmp (lowerarg, "gumbel")) + # The Gumbel-Hougaard family + g = sum ((-log (x)) .^ repmat (theta, 1, d), 2); + c = exp (-g .^ (1 ./ theta)); + p = ((prod (-log (x), 2)) .^ (theta - 1)) ./ prod (x, 2) .* c .* (g .^ (2 ./ theta - 2) + (theta - 1) .* g .^ (1 ./ theta - 2)); + # Check theta + k = find (! (theta >= 1) | ! (theta < inf)); + elseif (strcmp (lowerarg, "frank")) + # The Frank family + if (d != 2) + error ("copulapdf: Frank copula PDF implemented as bivariate only"); + endif + p = (theta .* exp (theta .* (1 + sum (x, 2))) .* (exp (theta) - 1))./ (exp (theta) - exp (theta + theta .* x(:, 1)) + exp (theta .* sum (x, 2)) - exp (theta + theta .* x(:, 2))) .^ 2; + # Product copula at columns where theta == 0 + k = find (theta == 0); + if (any (k)) + p(k) = 1; + endif + # Check theta + k = find (! (theta > -inf) | ! (theta < inf)); + elseif (strcmp (lowerarg, "amh")) + # The Ali-Mikhail-Haq family + if (d != 2) + error ("copulapdf: Ali-Mikhail-Haq copula PDF implemented as bivariate only"); + endif + z = theta .* prod (x - 1, 2) - 1; + p = (theta .* (1 - sum (x, 2) - prod (x, 2) - z) - 1) ./ (z .^ 3); + # Check theta + k = find (! (theta >= -1) | ! (theta < 1)); + else + error ("copulapdf: unknown copula family '%s'", family); + endif + + if (any (k)) + p(k) = NaN; + endif + + endif + +endfunction + +%!test +%! x = [0.2:0.2:0.6; 0.2:0.2:0.6]; +%! theta = [1; 2]; +%! p = copulapdf ("Clayton", x, theta); +%! expected_p = [0.9872; 0.7295]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = [0.2:0.2:0.6; 0.2:0.2:0.6]; +%! p = copulapdf ("Gumbel", x, 2); +%! expected_p = [0.9468; 0.9468]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = [0.2, 0.6; 0.2, 0.6]; +%! theta = [1; 2]; +%! p = copulapdf ("Frank", x, theta); +%! expected_p = [0.9378; 0.8678]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = [0.2, 0.6; 0.2, 0.6]; +%! theta = [0.3; 0.7]; +%! p = copulapdf ("AMH", x, theta); +%! expected_p = [0.9540; 0.8577]; +%! assert (p, expected_p, 0.001); diff --git a/inst/copularnd.m b/inst/copularnd.m new file mode 100644 index 0000000..10033e0 --- /dev/null +++ b/inst/copularnd.m @@ -0,0 +1,281 @@ +## Copyright (C) 2012 Arno Onken +## +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{x} =} copularnd (@var{family}, @var{theta}, @var{n}) +## @deftypefnx {Function File} {} copularnd (@var{family}, @var{theta}, @var{n}, @var{d}) +## @deftypefnx {Function File} {} copularnd ('t', @var{theta}, @var{nu}, @var{n}) +## Generate random samples from a copula family. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{family} is the copula family name. Currently, @var{family} can be +## @code{'Gaussian'} for the Gaussian family, @code{'t'} for the Student's t +## family, or @code{'Clayton'} for the Clayton family. +## +## @item +## @var{theta} is the parameter of the copula. For the Gaussian and Student's t +## copula, @var{theta} must be a correlation matrix. For bivariate copulas +## @var{theta} can also be a correlation coefficient. For the Clayton family, +## @var{theta} must be a vector with the same number of elements as samples to +## be generated or be scalar. +## +## @item +## @var{nu} is the degrees of freedom for the Student's t family. @var{nu} must +## be a vector with the same number of elements as samples to be generated or +## be scalar. +## +## @item +## @var{n} is the number of rows of the matrix to be generated. @var{n} must be +## a non-negative integer and corresponds to the number of samples to be +## generated. +## +## @item +## @var{d} is the number of columns of the matrix to be generated. @var{d} must +## be a positive integer and corresponds to the dimension of the copula. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{x} is a matrix of random samples from the copula with @var{n} samples +## of distribution dimension @var{d}. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## theta = 0.5; +## x = copularnd ("Gaussian", theta); +## @end group +## +## @group +## theta = 0.5; +## nu = 2; +## x = copularnd ("t", theta, nu); +## @end group +## +## @group +## theta = 0.5; +## n = 2; +## x = copularnd ("Clayton", theta, n); +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Roger B. Nelsen. @cite{An Introduction to Copulas}. Springer, New York, +## second edition, 2006. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Random samples from a copula family + +function x = copularnd (family, theta, nu, n) + + # Check arguments + if (nargin < 2) + print_usage (); + endif + + if (! ischar (family)) + error ("copularnd: family must be one of 'Gaussian', 't', and 'Clayton'"); + endif + + lower_family = lower (family); + + # Check family and copula parameters + switch (lower_family) + + case {"gaussian"} + # Gaussian family + if (isscalar (theta)) + # Expand a scalar to a correlation matrix + theta = [1, theta; theta, 1]; + endif + if (! ismatrix (theta) || any (diag (theta) != 1) || any (any (theta != theta')) || min (eig (theta)) <= 0) + error ("copularnd: theta must be a correlation matrix"); + endif + if (nargin > 3) + d = n; + if (! isscalar (d) || d != size (theta, 1)) + error ("copularnd: d must correspond to dimension of theta"); + endif + else + d = size (theta, 1); + endif + if (nargin < 3) + n = 1; + else + n = nu; + if (! isscalar (n) || (n < 0) || round (n) != n) + error ("copularnd: n must be a non-negative integer"); + endif + endif + + case {"t"} + # Student's t family + if (nargin < 3) + print_usage (); + endif + if (isscalar (theta)) + # Expand a scalar to a correlation matrix + theta = [1, theta; theta, 1]; + endif + if (! ismatrix (theta) || any (diag (theta) != 1) || any (any (theta != theta')) || min (eig (theta)) <= 0) + error ("copularnd: theta must be a correlation matrix"); + endif + if (! isscalar (nu) && (! isvector (nu) || length (nu) != n)) + error ("copularnd: nu must be a vector with the same number of rows as x or be scalar"); + endif + nu = nu(:); + if (nargin < 4) + n = 1; + else + if (! isscalar (n) || (n < 0) || round (n) != n) + error ("copularnd: n must be a non-negative integer"); + endif + endif + + case {"clayton"} + # Archimedian one parameter family + if (nargin < 4) + # Default is bivariate + d = 2; + else + d = n; + if (! isscalar (d) || (d < 2) || round (d) != d) + error ("copularnd: d must be an integer greater than 1"); + endif + endif + if (nargin < 3) + # Default is one sample + n = 1; + else + n = nu; + if (! isscalar (n) || (n < 0) || round (n) != n) + error ("copularnd: n must be a non-negative integer"); + endif + endif + if (! isvector (theta) || (! isscalar (theta) && size (theta, 1) != n)) + error ("copularnd: theta must be a column vector with the number of rows equal to n or be scalar"); + endif + if (n > 1 && isscalar (theta)) + theta = repmat (theta, n, 1); + endif + + otherwise + error ("copularnd: unknown copula family '%s'", family); + + endswitch + + if (n == 0) + # Input is empty + x = zeros (0, d); + else + + # Draw random samples according to family + switch (lower_family) + + case {"gaussian"} + # The Gaussian family + x = normcdf (mvnrnd (zeros (1, d), theta, n), 0, 1); + # No parameter bounds check + k = []; + + case {"t"} + # The Student's t family + x = tcdf (mvtrnd (theta, nu, n), nu); + # No parameter bounds check + k = []; + + case {"clayton"} + # The Clayton family + u = rand (n, d); + if (d == 2) + x = zeros (n, 2); + # Conditional distribution method for the bivariate case which also + # works for theta < 0 + x(:, 1) = u(:, 1); + x(:, 2) = (1 + u(:, 1) .^ (-theta) .* (u(:, 2) .^ (-theta ./ (1 + theta)) - 1)) .^ (-1 ./ theta); + else + # Apply the algorithm by Marshall and Olkin: + # Frailty distribution for Clayton copula is gamma + y = randg (1 ./ theta, n, 1); + x = (1 - log (u) ./ repmat (y, 1, d)) .^ (-1 ./ repmat (theta, 1, d)); + endif + k = find (theta == 0); + if (any (k)) + # Produkt copula at columns k + x(k, :) = u(k, :); + endif + # Continue argument check + if (d == 2) + k = find (! (theta >= -1) | ! (theta < inf)); + else + k = find (! (theta >= 0) | ! (theta < inf)); + endif + + endswitch + + # Out of bounds parameters + if (any (k)) + x(k, :) = NaN; + endif + + endif + +endfunction + +%!test +%! theta = 0.5; +%! x = copularnd ("Gaussian", theta); +%! assert (size (x), [1, 2]); +%! assert (all ((x >= 0) & (x <= 1))); + +%!test +%! theta = 0.5; +%! nu = 2; +%! x = copularnd ("t", theta, nu); +%! assert (size (x), [1, 2]); +%! assert (all ((x >= 0) & (x <= 1))); + +%!test +%! theta = 0.5; +%! x = copularnd ("Clayton", theta); +%! assert (size (x), [1, 2]); +%! assert (all ((x >= 0) & (x <= 1))); + +%!test +%! theta = 0.5; +%! n = 2; +%! x = copularnd ("Clayton", theta, n); +%! assert (size (x), [n, 2]); +%! assert (all ((x >= 0) & (x <= 1))); + +%!test +%! theta = [1; 2]; +%! n = 2; +%! d = 3; +%! x = copularnd ("Clayton", theta, n, d); +%! assert (size (x), [n, d]); +%! assert (all ((x >= 0) & (x <= 1))); diff --git a/inst/crossval.m b/inst/crossval.m new file mode 100644 index 0000000..e06a3c7 --- /dev/null +++ b/inst/crossval.m @@ -0,0 +1,174 @@ +## Copyright (C) 2014 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{results} =} crossval (@var{f}, @var{X}, @var{y}[, @var{params}]) +## Perform cross validation on given data. +## +## @var{f} should be a function that takes 4 inputs @var{xtrain}, @var{ytrain}, +## @var{xtest}, @var{ytest}, fits a model based on @var{xtrain}, @var{ytrain}, +## applies the fitted model to @var{xtest}, and returns a goodness of fit +## measure based on comparing the predicted and actual @var{ytest}. +## @code{crossval} returns an array containing the values returned by @var{f} +## for every cross-validation fold or resampling applied to the given data. +## +## @var{X} should be an @var{n} by @var{m} matrix of predictor values +## +## @var{y} should be an @var{n} by @var{1} vector of predicand values +## +## @var{params} may include parameter-value pairs as follows: +## +## @table @asis +## @item @qcode{"KFold"} +## Divide set into @var{k} equal-size subsets, using each one successively +## for validation. +## +## @item @qcode{"HoldOut"} +## Divide set into two subsets, training and validation. If the value +## @var{k} is a fraction, that is the fraction of values put in the +## validation subset (by default @var{k}=0.1); if it is a positive integer, +## that is the number of values in the validation subset. +## +## @item @qcode{"LeaveOut"} +## Leave-one-out partition (each element is placed in its own subset). +## The value is ignored. +## +## @item @qcode{"Partition"} +## The value should be a @var{cvpartition} object. +## +## @item @qcode{"Given"} +## The value should be an @var{n} by @var{1} vector specifying in which +## partition to put each element. +## +## @item @qcode{"stratify"} +## The value should be an @var{n} by @var{1} vector containing class +## designations for the elements, in which case the @qcode{"KFold"} and +## @qcode{"HoldOut"} partitionings attempt to ensure each partition +## represents the classes proportionately. +## +## @item @qcode{"mcreps"} +## The value should be a positive integer specifying the number of times +## to resample based on different partitionings. Currently only works with +## the partition type @qcode{"HoldOut"}. +## +## @end table +## +## Only one of @qcode{"KFold"}, @qcode{"HoldOut"}, @qcode{"LeaveOut"}, +## @qcode{"Given"}, @qcode{"Partition"} should be specified. If none is +## specified, the default is @qcode{"KFold"} with @var{k} = 10. +## +## @seealso{cvpartition} +## @end deftypefn + +## Author: Nir Krakauer + +function results = crossval (f, X, y, varargin) + + [n m] = size (X); + + if numel(y) != n + error('X, y sizes incompatible') + endif + + #extract optional parameter-value argument pairs + if numel(varargin) > 1 + vargs = varargin; + nargs = numel (vargs); + values = vargs(2:2:nargs); + names = vargs(1:2:nargs)(1:numel(values)); + validnames = {'KFold', 'HoldOut', 'LeaveOut', 'Partition', 'Given', 'stratify', 'mcreps'}; + for i = 1:numel(names) + names(i) = validatestring (names(i){:}, validnames); + end + for i = 1:numel(validnames) + name = validnames(i){:}; + name_pos = strmatch (name, names); + if !isempty(name_pos) + eval([name ' = values(name_pos){:};']) + endif + endfor + endif + + #construct CV partition + if exist ("Partition", "var") + P = Partition; + elseif exist ("Given", "var") + P = cvpartition (Given, "Given"); + elseif exist ("KFold", "var") + if !exist ("stratify", "var") + stratify = n; + endif + P = cvpartition (stratify, "KFold", KFold); + elseif exist ("HoldOut", "var") + if !exist ("stratify", "var") + stratify = n; + endif + P = cvpartition (stratify, "HoldOut", HoldOut); + if !exist ("mcreps", "var") || isempty (mcreps) + mcreps = 1; + endif + elseif exist ("LeaveOut", "var") + P = cvpartition (n, "LeaveOut"); + else #KFold + if !exist ("stratify", "var") + stratify = n; + endif + P = cvpartition (stratify, "KFold"); + endif + + nr = get(P, "NumTestSets"); #number of test sets to do cross validation on + nreps = 1; + if strcmp(get(P, "Type"), 'holdout') && exist("mcreps", "var") && mcreps > 1 + nreps = mcreps; + endif + results = nan (nreps, nr); + for rep = 1:nreps + if rep > 1 + P = repartition (P); + endif + for i = 1:nr + inds_train = training (P, i); + inds_test = test (P, i); + result = f (X(inds_train, :), y(inds_train), X(inds_test, :), y(inds_test)); + results(rep, i) = result; + endfor + endfor + +endfunction + +%!test +%! load fisheriris.txt +%! y = fisheriris(:, 2); +%! X = [ones(size(y)) fisheriris(:, 3:5)]; +%! f = @(X1, y1, X2, y2) meansq (y2 - X2*regress(y1, X1)); +%! results0 = crossval (f, X, y); +%! results1 = crossval (f, X, y, 'KFold', 10); +%! folds = 5; +%! results2 = crossval (f, X, y, 'KFold', folds); +%! results3 = crossval (f, X, y, 'Partition', cvpartition (numel (y), 'KFold', folds)); +%! results4 = crossval (f, X, y, 'LeaveOut', 1); +%! mcreps = 2; n_holdout = 20; +%! results5 = crossval (f, X, y, 'HoldOut', n_holdout, 'mcreps', mcreps); +%! +%! ## ensure equal representation of iris species in the training set -- tends +%! ## to slightly reduce cross-validation mean square error +%! results6 = crossval (f, X, y, 'KFold', 5, 'stratify', fisheriris(:, 1)); +%! +%! assert (results0, results1); +%! assert (results2, results3); +%! assert (size(results4), [1 numel(y)]); +%! assert (mean(results4), 4.5304, 1E-4); +%! assert (size(results5), [mcreps 1]); + diff --git a/inst/datasample.m b/inst/datasample.m new file mode 100644 index 0000000..cc1f1f8 --- /dev/null +++ b/inst/datasample.m @@ -0,0 +1,230 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{y} =} datasample (@var{data}, @var{k}) +## @deftypefnx {Function File} @ +## {@var{y} =} datasample (@var{data}, @var{k}, @var{dim}) +## @deftypefnx {Function File} @ +## {@var{y} =} datasample (@dots{}, Name, Value) +## @deftypefnx {Function File} @ +## {[@var{y} @var{idcs}] =} datasample (@dots{}) +## +## Randomly sample data. +## +## Return @var{k} observations randomly sampled from @var{data}. @var{data} can +## be a vector or a matrix of any data. When @var{data} is a matrix or a +## n-dimensional array, the samples are the subarrays of size n - 1, taken along +## the dimension @var{dim}. The default value for @var{dim} is 1, that is the +## row vectors when sampling a matrix. +## +## Output @var{y} is the returned sampled data. Optional output @var{idcs} is +## the vector of the indices to build @var{y} from @var{data}. +## +## Additional options are set through pairs of parameter name and value. The +## known parameters are: +## +## @table @code +## @item @qcode{Replace} +## a logical value that can be @code{true} (default) or @code{false}: when set +## to @code{true}, @code{datasample} returns data sampled with replacement. +## +## @item @qcode{Weigths} +## a vector of positive numbers that sets the probability of each element. It +## must have the same size as @var{data} along dimension @var{dim}. +## +## @end table +## +## +## @end deftypefn +## +## @seealso{rand, randi, randperm, randsample} + +## MATLAB compatibility: there are no random number streams in Octave +## Author: Stefano Guidoni + +function [y, idcs] = datasample (data, k, varargin) + + ## check input + if ( nargin < 2 ) + print_usage (); + endif + + ## data: some data, any type, any format but cell + ## MATLAB compatibility: there are no "table" or "dataset array" types in + ## Octave + if (iscell (data)) + error ("datasample: data must be a vector or matrix"); + endif + + ## k, a positive integer + if ((! isnumeric (k) || ! isscalar (k)) || (! (floor (k) == k)) || (k <= 0)) + error ("datasample: k must be a positive integer scalar"); + endif + + dim = 1; + replace = true; + weights = []; + if ( nargin > 2 ) + pair_index = 1; + + if (! ischar (varargin{1})) + ## it must be dim + dim = varargin{1}; + + ## the (Name, Value) pairs start further + pair_index += 1; + + ## dim, another positive integer + if ((! isscalar (dim)) || (! (floor (dim) == dim)) || (dim <= 0)) + error ("datasample: DIM must be a positive integer scalar"); + endif + endif + + ## (Name, Value) pairs + while (pair_index < (nargin - 2)) + switch (lower (varargin{pair_index})) + case "replace" + if (! islogical (varargin{pair_index + 1})) + error ("datasample: expected a logical value for 'Replace'"); + endif + replace = varargin{pair_index + 1}; + case "weights" + if ((! isnumeric (varargin{pair_index + 1})) || + (! isvector (varargin{pair_index + 1})) || + (any (varargin{pair_index + 1} < 0))) + error (["datasample: the sampling weights must be defined as a " ... + "vector of positive values"]); + endif + weights = varargin{pair_index + 1}; + otherwise + error ("datasample: unknown property %s", varargin{pair_index}); + endswitch + pair_index += 2; + endwhile + endif + + ## get the size of the population to sample + if (isvector (data)) + imax = length (data); + else + imax = size (data, dim); + endif + + if (isempty (weights)) + ## all elements have the same probability of being chosen + ## this is easy + + ## with or without replacement + if (replace) + idcs = randi (imax, k, 1); + else + idcs = randperm (imax, k); + endif + else + ## first check if the weights vector is right + if (imax != length (weights)) + error (["datasample: the size of the vector of sampling weights must"... + " be equal to the size of the sampled data"]); + endif + + if (replace) + ## easy case: + ## normalize the weights, + weights_n = cumsum (weights ./ sum (weights)); + weights_n(end) = 1; # just to be sure + ## then choose k numbers uniformly between 0 and 1 + samples = rand (k, 1); + + ## we have subdivided the space between 0 and 1 accordingly to the + ## weights vector: we have just to map back the random numbers to the + ## indices of the orginal dataset + for iter = 1 : k + idcs(iter) = find (weights_n >= samples(iter), 1); + endfor + else + ## complex case + ## choose k numbers uniformly between 0 and 1 + samples = rand (k, 1); + + for iter = 1 : k + ## normalize the weights + weights_n = cumsum (weights ./ sum (weights)); + weights_n(end) = 1; # just to be sure + + idcs(iter) = find (weights_n >= samples(iter), 1); + + ## remove the element from the set, i. e. set its probability to zero + weights(idcs(iter)) = 0; + endfor + endif + endif + + ## let's get the actual data from the original set + if (isvector (data)) + ## data is a vector + y = data(idcs); + else + vS = size (data); + + if (length (vS) == 2) + ## data is a 2-dimensional matrix + if (dim == 1) + y = data(idcs, :); + else + y = data(:, idcs); + endif + else + ## data is an n-dimensional matrix + s = "y = data("; + for iter = 1 : length (vS) + if (iter == dim) + s = [s "idcs,"]; + else + s = [s ":,"]; + endif + endfor + s = [s ":);"]; + eval (s); + endif + endif + +endfunction + +## some tests +%!error datasample(); +%!error datasample(1); +%!error datasample({1, 2, 3}, 1); +%!error datasample([1 2], -1); +%!error datasample([1 2], 1.5); +%!error datasample([1 2], [1 1]); +%!error datasample([1 2], 'g', [1 1]); +%!error datasample([1 2], 1, -1); +%!error datasample([1 2], 1, 1.5); +%!error datasample([1 2], 1, [1 1]); +%!error datasample([1 2], 1, 1, "Replace", -2); +%!error datasample([1 2], 1, 1, "Weights", "abc"); +%!error datasample([1 2], 1, 1, "Weights", [1 -2 3]); +%!error datasample([1 2], 1, 1, "Weights", ones (2)); +%!error datasample([1 2], 1, 1, "Weights", [1 2 3]); + +%!test +%! dat = randn (10, 4); +%! assert (size (datasample (dat, 3, 1)), [3 4]); + +%!test +%! dat = randn (10, 4); +%! assert (size (datasample (dat, 3, 2)), [10 3]); + diff --git a/inst/dcov.m b/inst/dcov.m new file mode 100644 index 0000000..51cafa8 --- /dev/null +++ b/inst/dcov.m @@ -0,0 +1,142 @@ +## Copyright (C) 2014 - Maria L. Rizzo and Gabor J. Szekely +## Copyright (C) 2014 Juan Pablo Carbajal +## This work is derived from the R energy package. It was adapted +## for Octave by Juan Pablo Carbajal. +## +## This progrm is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## Author: Juan Pablo Carbajal + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{dCor}, @var{dCov}, @var{dVarX}, @var{dVarY}] =} dcov (@var{x}, @var{y}, @var{index}=1) +## Distance correlation, covariance and correlation statistics. +## +## It returns distace correlation (@var{dCor}), +## distance covariance (@var{dCov}), diatance variace on x (@var{dVarX}) and +## distance variance on y (@var{dVarY}). +## +## Reference: https://en.wikipedia.org/wiki/Distance_correlation +## +## @seealso{corr, cov} +## @end deftypefn + +function [dCor, dCov, dVarX, dVarY] = dcov (x,y,index=1.0) + %x = abs(x - x.'); + %y = abs(y - y.'); + x = abs (bsxfun (@minus, x, x.')); + y = abs (bsxfun (@minus, y, y.')); + + [n nc] = size (x); + [m mc] = size (y); + if (n != m) + error ("Octave:invalid-input-arg", "Sample sizes must agree."); + endif + + if any (isnan (x) | isnan (y)) + error ("Octave:invalid-input-arg","Data contains missing or infinite values."); + endif + + if index < 0 || index > 2 + warning ("Octave:invalid-input-arg","index must be in [0,2), using default index=1"); + index = 1.0; + endif + + A = Akl (x, index); + B = Akl (y, index); + + dCov = sqrt (mean (A(:) .* B(:))); + dVarX = sqrt (mean (A(:).^2) ); + dVarY = sqrt (mean (B(:).^2) ); + V = sqrt (dVarX .* dVarY); + + if V > 0 + dCor = dCov / V; + else + dCor = 0; + end + +endfunction + +function c = Akl (x, index) +# Double centered distance + d = x .^ index; + rm = mean (d, 2); # row mean + gm = mean (d(:)); # grand mean + c = d - bsxfun (@plus, rm, rm.') + gm; +endfunction + +%!demo +%! base=@(x) (x- min(x))./(max(x)-min(x)); +%! N = 5e2; +%! x = randn (N,1); x = base (x); +%! z = randn (N,1); z = base (z); +%! # Linear relations +%! cy = [1 0.55 0.3 0 -0.3 -0.55 -1]; +%! ly = x .* cy; +%! ly(:,[1:3 5:end]) = base (ly(:,[1:3 5:end])); +%! # Correlated Gaussian +%! cz = 1 - abs (cy); +%! gy = base ( ly + cz.*z); +%! # Shapes +%! sx = repmat (x,1,7); +%! sy = zeros (size (ly)); +%! v = 2 * rand (size(x,1),2) - 1; +%! sx(:,1) = v(:,1); sy(:,1) = cos(2*pi*sx(:,1)) + 0.5*v(:,2).*exp(-sx(:,1).^2/0.5); +%! R =@(d) [cosd(d) sind(d); -sind(d) cosd(d)]; +%! tmp = R(35) * v.'; +%! sx(:,2) = tmp(1,:); sy(:,2) = tmp(2,:); +%! tmp = R(45) * v.'; +%! sx(:,3) = tmp(1,:); sy(:,3) = tmp(2,:); +%! sx(:,4) = v(:,1); sy(:,4) = sx(:,4).^2 + 0.5*v(:,2); +%! sx(:,5) = v(:,1); sy(:,5) = 3*sign(v(:,2)).*(sx(:,5)).^2 + v(:,2); +%! sx(:,6) = cos (2*pi*v(:,1)) + 0.5*(x-0.5); +%! sy(:,6) = sin (2*pi*v(:,1)) + 0.5*(z-0.5); +%! sx(:,7) = x + sign(v(:,1)); sy(:,7) = z + sign(v(:,2)); +%! sy = base (sy); +%! sx = base (sx); +%! # scaled shape +%! sc = 1/3; +%! ssy = (sy-0.5) * sc + 0.5; +%! n = size (ly,2); +%! ym = 1.2; +%! xm = 0.5; +%! fmt={'horizontalalignment','center'}; +%! ff = "% .2f"; +%! figure (1) +%! for i=1:n +%! subplot(4,n,i); +%! plot (x, gy(:,i), '.b'); +%! axis tight +%! axis off +%! text (xm,ym,sprintf (ff, dcov (x,gy(:,i))),fmt{:}) +%! +%! subplot(4,n,i+n); +%! plot (x, ly(:,i), '.b'); +%! axis tight +%! axis off +%! text (xm,ym,sprintf (ff, dcov (x,ly(:,i))),fmt{:}) +%! +%! subplot(4,n,i+2*n); +%! plot (sx(:,i), sy(:,i), '.b'); +%! axis tight +%! axis off +%! text (xm,ym,sprintf (ff, dcov (sx(:,i),sy(:,i))),fmt{:}) +%! v = axis (); +%! +%! subplot(4,n,i+3*n); +%! plot (sx(:,i), ssy(:,i), '.b'); +%! axis (v) +%! axis off +%! text (xm,ym,sprintf (ff, dcov (sx(:,i),ssy(:,i))),fmt{:}) +%! endfor diff --git a/inst/dendrogram.m b/inst/dendrogram.m new file mode 100644 index 0000000..26c00fa --- /dev/null +++ b/inst/dendrogram.m @@ -0,0 +1,410 @@ +## Copyright (C) 2021 Stefano Guidoni +## Copyright (c) 2012 Juan Pablo Carbajal +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} dendrogram (@var{tree}) +## @deftypefnx {Function File} {} dendrogram (@var{tree}, @var{p}) +## @deftypefnx {Function File} {} dendrogram (@var{tree}, @var{prop}, @var{val}) +## @deftypefnx {Function File} @ +## {} dendrogram (@var{tree}, @var{p}, @var{prop}, @var{val} ) +## @deftypefnx {Function File} {@var{h} = } dendrogram (@dots{}) +## @deftypefnx {Function File} @ +## {[@var{h}, @var{t}, @var{perm}] = } dendrogram (@dots{}) +## +## Plot a dendrogram of a hierarchical binary cluster tree. +## +## Given @var{tree}, a hierarchical binary cluster tree as the output of +## @code{linkage}, plot a dendrogram of the tree. The number of leaves shown by +## the dendrogram plot is limited to @var{p}. The default value for @var{p} is +## 30. Set @var{p} to 0 to plot all leaves. +## +## The optional outputs are @var{h}, @var{t} and @var{perm}: +## @itemize @bullet +## @item @var{h} is a handle to the lines of the plot. +## +## @item @var{t} is the vector with the numbers assigned to each leaf. +## Each element of @var{t} is a leaf of @var{tree} and its value is the number +## shown in the plot. +## When the dendrogram plot is collapsed, that is when the number of shown +## leaves @var{p} is inferior to the total number of leaves, a single leaf of +## the plot can represent more than one leaf of @var{tree}: in that case +## multiple elements of @var{t} share the same value, that is the same leaf of +## the plot. +## When the dendrogram plot is not collapsed, each leaf of the plot is the leaf +## of @var{tree} with the same number. +## +## @item @var{perm} is the vector list of the leaves as ordered as in the plot. +## @end itemize +## +## Additional input properties can be specified by pairs of properties and +## values. Known properties are: +## @itemize @bullet +## @item @qcode{"Reorder"} +## Reorder the leaves of the dendrogram plot using a numerical vector of size n, +## the number of leaves. When @var{p} is smaller than @var{n}, the reordering +## cannot break the @var{p} groups of leaves. +## +## @item @qcode{"Orientation"} +## Change the orientation of the plot. Available values: @qcode{top} (default), +## @qcode{bottom}, @qcode{left}, @qcode{right}. +## +## @item @qcode{"CheckCrossing"} +## Check if the lines of a reordered dendrogram cross each other. Available +## values: @qcode{true} (default), @qcode{false}. +## +## @item @qcode{"ColorThreshold"} +## Not implemented. +## +## @item @qcode{"Labels"} +## Use a char, string or cellstr array of size @var{n} to set the label for each +## leaf; the label is dispayed only for nodes with just one leaf. +## @end itemize +## +## @end deftypefn +## +## @seealso{cluster, clusterdata, cophenet, inconsistent, linkage, pdist} + +function [H, T, perm] = dendrogram (tree, varargin) + + [m d] = size (tree); + if ((d != 3) || (! isnumeric (tree)) || ... + (! (max (tree(end, 1:2)) == m * 2))) + error (["dendrogram: tree must be a matrix as generated by the " ... + "linkage function"]); + end + + pair_index = 1; + + ## node count + n = m + 1; + + P = 30; # default value + vReorder = []; + csLabels = {}; + checkCrossing = 1; + orientation = "top"; + if (nargin > 1) + if (isnumeric (varargin{1}) && isscalar (varargin{1})) + ## dendrogram (tree, P) + P = varargin{1}; + pair_index++; + endif + + ## dendrogram (..., Name, Value) + while (pair_index < (nargin - 1)) + switch (lower (varargin{pair_index})) + case "reorder" + if (isvector (varargin{pair_index + 1}) && ... + isnumeric (varargin{pair_index + 1}) && ... + length (varargin{pair_index + 1}) == n ) + vReorder = varargin{pair_index + 1}; + else + error (["dendrogram: 'reorder' must be a numeric vector of size" ... + "n, the number of leaves"]); + endif + case "checkcrossing" + if (ischar (varargin{pair_index + 1})) + switch (lower (varargin{pair_index + 1})) + case "true" + checkCrossing = 1; + case "false" + checkCrossing = 0; + otherwise + error ("dendrogram: unknown value '%s' for CheckCrossing", ... + varargin{pair_index + 1}); + endswitch + elseif + error (["dendrogram: the value of property CheckCrossing must ", ... + "be either 'true' or 'false'"]); + endif + case "colorthreshold" + warning ("dendrogram: property '%s' not implemented",... + varargin{pair_index}); + case "orientation" + orientation = varargin{pair_index + 1}; # validity check below + case "labels" + if (ischar (varargin{pair_index + 1}) && ... + (isvector (varargin{pair_index + 1}) && ... + length (varargin{pair_index + 1}) == n) || ... + (ismatrix (varargin{pair_index + 1}) && ... + rows (varargin{pair_index + 1}) == n)) + csLabels = cellstr (varargin{pair_index + 1}); + elseif (iscellstr (varargin{pair_index + 1}) && + length (varargin{pair_index + 1}) == n) + csLabels = varargin{pair_index + 1}; + else + error (["dendrogram: labels must be a char or string or" ... + "cellstr array of size n"]); + endif + otherwise + error ("dendrogram: unknown property '%s'", varargin{pair_index}); + endswitch + + pair_index += 2; + endwhile + endif + + ## MATLAB compatibility: + ## P <= 0 to plot all leaves + if (P < 1) + P = n; + endif + + if (n > P) + level_0 = tree((n - P), 3); + else + P = n; + level_0 = 0; + endif + + vLeafPosition = zeros((n + m), 1); + T = (1:n)'; + nodecnt = 1; + + ## main + dendrogram_recursive (m, 0); + + ## T reordering + ## MATLAB compatibility: when n > P, each node group is renamed with a number + ## between 1 and P, according to the smallest node index of each group; + ## the group with the node 1 is always group 1, while group 2 is the group + ## with the smallest node index outside of group 1, and group 3 is the group + ## with the smallest node index outside of groups 1 and 2... + newT = 1 : (length (T)); + if (n > P) + uniqueT = unique (T); + minT = zeros (uniqueT, 1); + counter = 1; + + for i = 1 : length (uniqueT) # it should be exactly equal to P + idcs = find (T == uniqueT(i)); + minT(i) = min (idcs); + endfor + + minT = minT(find (minT > 0)); # to prevent a strange bug + [minT, minTidcs] = sort (minT); + uniqueT = uniqueT(minTidcs); + for i = 1 : length (uniqueT) + idcs = find (T == uniqueT(i)); + newT(idcs) = counter++; + endfor + endif + + ## leaf reordering + if (! isempty(vReorder)) + if (P < n) + checkT = newT(vReorder(:)); + for i = 1 : P + idcs = find (checkT == i); + if (length (idcs) > 1) + if (max (idcs) - min (idcs) >= length (idcs)) + error (["dendrogram: invalid reordering that redefines the 'P'"... + "groups of leaves"]); + endif + endif + endfor + checkT = unique (checkT, "stable"); + vNewLeafPosition = zeros (n, 1); + uT = unique (T, "stable"); + for i = 1 : P + vNewLeafPosition(uT(checkT(i))) = i; + endfor + vLeafPosition = vNewLeafPosition; + else + for i = 1 : length (vReorder) + vLeafPosition(vReorder(i)) = i; + endfor + endif + endif + + ## figure + x = []; + + hd = figure (gcf); + + ## ticks and tricks + xticks = 1:P; + perm = zeros (P, 1); + for i = 1 : length (vLeafPosition) + if (vLeafPosition(i) != 0) + idcs = find (T == i); + perm(vLeafPosition(i)) = newT(idcs(1)); + endif + endfor + T = newT; # this should be unnecessary for n <= P + + ## lines + for i = (n - P + 1) : m + vLeafPosition(n + i) = mean (vLeafPosition(tree(i, 1:2), 1)); + x(end + 1,1:4) = [vLeafPosition(tree(i, 1:2))' tree(i, [3 3])]; + for j = 1 : 2 + x0 = 0; + if (tree(i,j) > (2 * n - P)) + x0 = tree(tree(i, j) - n, 3); + endif + x(end + 1, 1:4) = [vLeafPosition(tree(i, [j j]))' x0 tree(i, 3)]; + endfor + endfor + + ## plot stuff + if (strcmp (orientation, "top")) + H = line (x(:, 1:2)', x(:, 3:4)', "color", "blue"); + set (gca, "xticklabel", perm, "xtick", xticks); + elseif (strcmp (orientation, "bottom")) + H = line (x(:, 1:2)', x(:, 3:4)', "color", "blue"); + set (gca, "xticklabel", perm, "xtick", xticks, "xaxislocation", "top"); + axis ("ij"); + elseif (strcmp (orientation, "left")) + H = line (x(:, 3:4)', x(:, 1:2)', "color", "blue"); + set (gca, "yticklabel", perm, "ytick", xticks, "xdir", "reverse",... + "yaxislocation", "right"); + elseif (strcmp (orientation, "right")) + H = line (x(:, 3:4)', x(:, 1:2)', "color", "blue"); + set (gca, "yticklabel", perm, "ytick", xticks); + else + close (hd); + error ("dendrogram: invalid orientation '%s'", orientation); + endif + + ## labels + if (! isempty (csLabels)) + csCurrent = cellstr (num2str (perm)); + + for i = 1 : n + ## when there is just one leaf, use the named label for that leaf + if (1 == length (find (T == i))) + csCurrent(find (perm == i)) = csLabels(find (T == i)); + endif + endfor + + switch (orientation) + case {"top", "bottom"} + xticklabels (csCurrent); + case {"left", "right"} + yticklabels (csCurrent); + endswitch + endif + + ## check crossings + if (checkCrossing && ! isempty(vReorder)) + for j = 1 : rows (x) + if (x(j, 3) == x(j, 4)) # an horizontal line + for i = 1 : rows (x) + if (x(i, 1) == x(i, 2) && ... # orthogonal lines + (x(i, 1) > x(j, 1) && x(i, 1) < x(j, 2)) && ... + (x(j, 3) > x(i, 3) && x(j, 3) < x(i, 4))) + warning ("dendrogram: line intersection detected"); + endif + endfor + endif + endfor + endif + + ## dendrogram_recursive + function dendrogram_recursive (k, cn) + if (tree(k, 3) > level_0) + for j = 1:2 + if (tree(k, j) > n) + dendrogram_recursive (tree(k, j) - n, 0) + else + vLeafPosition(tree(k, j)) = nodecnt++; + T(tree(k, j)) = tree(k, j); + endif + endfor + else + for j = 1:2 + if (cn == 0) + cn = n + k; + vLeafPosition(cn) = nodecnt++; + endif + + if (tree(k, j) > n) + dendrogram_recursive (tree(k, j) - n, cn) + else + T(tree(k, j)) = cn; + endif + endfor + endif + endfunction +endfunction + + +## Get current figure visibility so it can be restored after tests +%!shared visibility_setting +%! visibility_setting = get (0, "DefaultFigureVisible"); + +## Test input validation +%!error dendrogram () +%!error dendrogram (ones (2, 2), 1) +%!error dendrogram ([1 2 1], 1, "xxx", "xxx") +%!error dendrogram ([1 2 1], "Reorder", "xxx") +%!error dendrogram ([1 2 1], "Reorder", [1 2 3 4]) + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! fail ('dendrogram ([1 2 1], "Orientation", "north")', "invalid orientation .*") +%! set (0, "DefaultFigureVisible", visibility_setting); + + +## Demonstrations +## 1. +%!demo 1 +%! y = [4 5; 2 6; 3 7; 8 9; 1 10]; +%! y(:,3) = 1:5; +%! figure (gcf); clf; +%! dendrogram (y); +## 2. +%!demo 2 +%! v = 2 * rand (30, 1) - 1; +%! d = abs (bsxfun (@minus, v(:, 1), v(:, 1)')); +%! y = linkage (squareform (d, "tovector")); +%! figure (gcf); clf; +%! dendrogram (y); +## 3. collapsed tree +%!demo "collapsed tree, find all the leaves of node 5" +%! X = randn (60, 2); +%! D = pdist (X); +%! y = linkage (D, "average"); +%! figure (gcf); clf; +%! subplot (2, 1, 1); +%! title ("original tree"); +%! dendrogram (y, 0); +%! subplot (2, 1, 2); +%! title ("collapsed tree"); +%! [~, t] = dendrogram (y, 20); +%! find(t == 5) +## 4. optimal leaf order +%!demo "optimal leaf order" +%! X = randn (30, 2); +%! D = pdist (X); +%! y = linkage (D, "average"); +%! order = optimalleaforder (y, D); +%! figure (gcf); clf; +%! subplot (2, 1, 1); +%! title ("original leaf order"); +%! dendrogram (y); +%! subplot (2, 1, 2); +%! title ("optimal leaf order"); +%! dendrogram (y, "Reorder", order); +## 5. orientation +%!demo "horizontal orientation and labels" +%! X = randn (8, 2); +%! D = pdist (X); +%! L = ["Snow White"; "Doc"; "Grumpy"; "Happy"; "Sleepy"; "Bashful"; ... +%! "Sneezy"; "Dopey"]; +%! y = linkage (D, "average"); +%! dendrogram (y, "Orientation", "left", "Labels", L); + diff --git a/inst/evalclusters.m b/inst/evalclusters.m new file mode 100644 index 0000000..e79c2f4 --- /dev/null +++ b/inst/evalclusters.m @@ -0,0 +1,362 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{eva} =} evalclusters (@var{x}, @var{clust}, @var{criterion}) +## @deftypefnx {Function File} {@var{eva} =} evalclusters (@dots{}, @qcode{Name}, @qcode{Value}) +## +## Create a clustering evaluation object to find the optimal number of clusters. +## +## @code{evalclusters} creates a clustering evaluation object to evaluate the +## optimal number of clusters for data @var{x}, using criterion @var{criterion}. +## The input data @var{x} is a matrix with @code{n} observations of @code{p} +## variables. +## The evaluation criterion @var{criterion} is one of the following: +## @table @code +## @item @qcode{CalinskiHarabasz} +## to create a @code{CalinskiHarabaszEvaluation} object. +## +## @item @qcode{DaviesBouldin} +## to create a @code{DaviesBouldinEvaluation} object. +## +## @item @qcode{gap} +## to create a @code{GapEvaluation} object. +## +## @item @qcode{silhouette} +## to create a @code{SilhouetteEvaluation} object. +## +## @end table +## The clustering algorithm @var{clust} is one of the following: +## @table @code +## @item @qcode{kmeans} +## to cluster the data using @code{kmeans} with @code{EmptyAction} set to +## @code{singleton} and @code{Replicates} set to 5. +## +## @item @qcode{linkage} +## to cluster the data using @code{clusterdata} with @code{linkage} set to +## @code{Ward}. +## +## @item @qcode{gmdistribution} +## to cluster the data using @code{fitgmdist} with @code{SharedCov} set to +## @code{true} and @code{Replicates} set to 5. +## +## @end table +## If the @var{criterion} is @code{CalinskiHarabasz}, @code{DaviesBouldin}, or +## @code{silhouette}, @var{clust} can also be a function handle to a function +## of the form @code{c = clust(x, k)}, where @var{x} is the input data, +## @var{k} the number of clusters to evaluate and @var{c} the clustering result. +## The clustering result can be either an array of size @code{n} with @code{k} +## different integer values, or a matrix of size @code{n} by @code{k} with a +## likelihood value assigned to each one of the @code{n} observations for each +## one of the @var{k} clusters. In the latter case, each observation is assigned +## to the cluster with the higher value. +## If the @var{criterion} is @code{CalinskiHarabasz}, @code{DaviesBouldin}, or +## @code{silhouette}, @var{clust} can also be a matrix of size @code{n} by +## @code{k}, where @code{k} is the number of proposed clustering solutions, so +## that each column of @var{clust} is a clustering solution. +## +## In addition to the obligatory @var{x}, @var{clust} and @var{criterion} inputs +## there is a number of optional arguments, specified as pairs of @code{Name} +## and @code{Value} options. The known @code{Name} arguments are: +## @table @code +## @item @qcode{KList} +## a vector of positive integer numbers, that is the cluster sizes to evaluate. +## This option is necessary, unless @var{clust} is a matrix of proposed +## clustering solutions. +## +## @item @qcode{Distance} +## a distance metric as accepted by the chosen @var{clust}. It can be the +## name of the distance metric as a string or a function handle. When +## @var{criterion} is @code{silhouette}, it can be a vector as created by +## function @code{pdist}. Valid distance metric strings are: @code{sqEuclidean} +## (default), @code{Euclidean}, @code{cityblock}, @code{cosine}, +## @code{correlation}, @code{Hamming}, @code{Jaccard}. +## Only used by @code{silhouette} and @code{gap} evaluation. +## +## @item @qcode{ClusterPriors} +## the prior probabilities of each cluster, which can be either @code{empirical} +## (default), or @code{equal}. When @code{empirical} the silhouette value is +## the average of the silhouette values of all points; when @code{equal} the +## silhouette value is the average of the average silhouette value of each +## cluster. Only used by @code{silhouette} evaluation. +## +## @item @qcode{B} +## the number of reference datasets generated from the reference distribution. +## Only used by @code{gap} evaluation. +## +## @item @qcode{ReferenceDistribution} +## the reference distribution used to create the reference data. It can be +## @code{PCA} (default) for a distribution based on the principal components of +## @var{X}, or @code{uniform} for a uniform distribution based on the range of +## the observed data. @code{PCA} is currently not implemented. +## Only used by @code{gap} evaluation. +## +## @item @qcode{SearchMethod} +## the method for selecting the optimal value with a @code{gap} evaluation. It +## can be either @code{globalMaxSE} (default) for selecting the smallest number +## of clusters which is inside the standard error of the maximum gap value, or +## @code{firstMaxSE} for selecting the first number of clusters which is inside +## the standard error of the following cluster number. +## Only used by @code{gap} evaluation. +## +## @end table +## +## Output @var{eva} is a clustering evaluation object. +## +## @end deftypefn +## +## @seealso{CalinskiHarabaszEvaluation, DaviesBouldinEvaluation, GapEvaluation, +## SilhouetteEvaluation} + +function cc = evalclusters (x, clust, criterion, varargin) + + ## input check + if (nargin < 3) + print_usage (); + endif + + ## parsing input data + if ((! ismatrix (x)) || (! isnumeric (x))) + error ("evalclusters: 'x' must be a numeric matrix"); + endif + + ## useful values for input check + n = rows (x); + p = columns (x); + + ## parsing the clustering algorithm + if (ischar (clust)) + clust = lower (clust); + if (! any (strcmpi (clust, {"kmeans", "linkage", "gmdistribution"}))) + error ("evalclusters: unknown clustering algorithm '%s'", clust); + endif + elseif (! isscalar (clust)) + if ((! isnumeric (clust)) || (length (size (clust)) != 2) || ... + (rows (clust) != n)) + error ("evalclusters: invalid matrix of clustering solutions"); + endif + elseif (! isa (clust, "function_handle")) + error ("evalclusters: invalid argument for 'clust'"); + endif + + ## parsing the criterion parameter + ## we check the rest later, as the check depends on the chosen criterion + if (! ischar (criterion)) + error ("evalclusters: invalid criterion, it must be a string"); + else + criterion = lower (criterion); + if (! any (strcmpi (criterion, {"calinskiharabasz", "daviesbouldin", ... + "silhouette", "gap"}))) + error ("evalclusters: unknown criterion '%s'", criterion); + endif + endif + + ## some default value + klist = []; + distance = "sqeuclidean"; + clusterpriors = "empirical"; + b = 100; + referencedistribution = "pca"; + searchmethod = "globalmaxse"; + + ## parse the name/value pairs + pair_index = 1; + while (pair_index < (nargin - 3)) + ## type check + if (! ischar (varargin{pair_index})) + error ("evalclusters: invalid property, string expected"); + endif + + ## now parse the parameter + switch (lower (varargin{pair_index})) + case "klist" + ## klist must be an array of positive interger numbers; + ## there is a special case when it can be empty, but that is not the + ## suggested way to use it (it is better to omit it instead) + if (isempty (varargin{pair_index + 1})) + if (ischar (clust) || isa (clust, "function_handle")) + error (["evalclusters: 'KList' can be empty only when 'clust' "... + "is a matrix"]); + endif + elseif ((! isnumeric (varargin{pair_index + 1})) || ... + (! isvector (varargin{pair_index + 1})) || ... + any (find (varargin{pair_index + 1} < 1)) || ... + any (floor (varargin{pair_index + 1}) != varargin{pair_index + 1})) + error ("evalclusters: 'KList' must be an array of positive integers"); + endif + klist = varargin{pair_index + 1}; + + case "distance" + ## used by silhouette and gap + if (! (strcmpi (criterion, "silhouette") || strcmpi (criterion, "gap"))) + error (["evalclusters: distance metric cannot be used with '%s'"... + " criterion"], criterion); + endif + if (ischar (varargin{pair_index + 1})) + if (! any (strcmpi (varargin{pair_index + 1}, ... + {"sqeuclidean", "euclidean", "cityblock", "cosine", ... + "correlation", "hamming", "jaccard"}))) + error ("evalclusters: unknown distance criterion '%s'", ... + varargin{pair_index + 1}); + endif + elseif (! isa (varargin{pair_index + 1}, "function_handle") || + ! ((isvector (varargin{pair_index + 1}) && ... + isnumeric (varargin{pair_index + 1})))) + error ("evalclusters: invalid distance metric"); + endif + distance = varargin{pair_index + 1}; + + case "clusterpriors" + ## used by silhouette evaluation + if (! strcmpi (criterion, "silhouette")) + error (["evalclusters: cluster prior probabilities cannot be used "... + "with '%s' criterion"], criterion); + endif + if (any (strcmpi (varargin{pair_index + 1}, {"empirical", "equal"}))) + clusterpriors = lower (varargin{pair_index + 1}); + else + error ("evalclusters: invalid cluster prior probabilities value"); + endif + + case "b" + ## used by gap evaluation + if (! isnumeric (varargin{pair_index + 1}) || ... + ! isscalar (varargin{pair_index + 1}) || ... + varargin{pair_index + 1} != floor (varargin{pair_index + 1}) || ... + varargin{pair_index + 1} < 1) + error ("evalclusters: b must a be positive integer number"); + endif + b = varargin{pair_index + 1}; + + case "referencedistribution" + ## used by gap evaluation + if (! ischar (varargin{pair_index + 1}) || any (strcmpi ... + (varargin{pair_index + 1}, {"pca", "uniform"}))) + error (["evalclusters: the reference distribution must be either" ... + "'PCA' or 'uniform'"]); + endif + referencedistribution = lower (varargin{pair_index + 1}); + + case "searchmethod" + ## used by gap evaluation + if (! ischar (varargin{pair_index + 1}) || any (strcmpi ... + (varargin{pair_index + 1}, {"globalmaxse", "uniform"}))) + error (["evalclusters: the search method must be either" ... + "'globalMaxSE' or 'firstmaxse'"]); + endif + searchmethod = lower (varargin{pair_index + 1}); + + otherwise + error ("evalclusters: unknown property %s", varargin{pair_index}); + + endswitch + + pair_index += 2; + endwhile + + ## check if there are parameters without a value or a name left + if (nargin - 2 - pair_index) + if (ischar (varargin{pair_index})) + error ("evalclusters: invalid parameter '%s'", varargin{pair_index}); + else + error ("evalclusters: invalid parameter '%d'", varargin{pair_index}); + endif + endif + + ## another check on klist + if (isempty (klist) && (ischar (clust) || isa (clust, "function_handle"))) + error (["evalclusters: 'KList' can be empty only when 'clust' ", ... + "is a matrix"]); + endif + + ## main + switch (lower (criterion)) + case "calinskiharabasz" + ## further compatibility checks between the chosen parameters are + ## delegated to the class constructor + if (isempty (klist)) + klist = 1 : columns (clust); + endif + cc = CalinskiHarabaszEvaluation (x, clust, klist); + + case "daviesbouldin" + ## further compatibility checks between the chosen parameters are + ## delegated to the class constructor + if (isempty (klist)) + klist = 1 : columns (clust); + endif + cc = DaviesBouldinEvaluation (x, clust, klist); + + case "silhouette" + ## further compatibility checks between the chosen parameters are + ## delegated to the class constructor + if (isempty (klist)) + klist = 1 : columns (clust); + endif + cc = SilhouetteEvaluation (x, clust, klist, distance, clusterpriors); + + case "gap" + ## gap cannot be used with a pre-computed solution, i.e. a matrix for + ## 'clust', and klist must be specified + if (isnumeric (clust)) + error (["evalclusters: 'clust' must be a clustering algorithm when "... + "using the gap criterion"]); + endif + if (isempty (klist)) + error (["evalclusters: 'klist' cannot be empty when using the gap " ... + "criterion"]); + endif + cc = GapEvaluation (x, clust, klist, b, distance, ... + referencedistribution, searchmethod); + + otherwise + error ("evalclusters: invalid criterion '%s'", criterion); + + endswitch + +endfunction + + +## input tests +%!error evalclusters () +%!error evalclusters ([1 1;0 1]) +%!error evalclusters ([1 1;0 1], "kmeans") +%!error <'x' must be a numeric*> evalclusters ("abc", "kmeans", "gap") +%!error evalclusters ([1 1;0 1], "xxx", "gap") +%!error evalclusters ([1 1;0 1], [1 2], "gap") +%!error evalclusters ([1 1;0 1], 1.2, "gap") +%!error evalclusters ([1 1;0 1], [1; 2], 123) +%!error evalclusters ([1 1;0 1], [1; 2], "xxx") +%!error <'KList' can be empty*> evalclusters ([1 1;0 1], "kmeans", "gap") +%!error evalclusters ([1 1;0 1], [1; 2], "gap", 1) +%!error evalclusters ([1 1;0 1], [1; 2], "gap", 1, 1) +%!error evalclusters ([1 1;0 1], [1; 2], "gap", "xxx", 1) +%!error <'KList'*> evalclusters ([1 1;0 1], [1; 2], "gap", "KList", [-1 0]) +%!error <'KList'*> evalclusters ([1 1;0 1], [1; 2], "gap", "KList", [1 .5]) +%!error <'KList'*> evalclusters ([1 1;0 1], [1; 2], "gap", "KList", [1 1; 1 1]) +%!error evalclusters ([1 1;0 1], [1; 2], "gap", ... +%! "distance", "a") +%!error evalclusters ([1 1;0 1], [1; 2], "daviesbouldin", ... +%! "distance", "a") +%!error evalclusters ([1 1;0 1], [1; 2], "gap", ... +%! "clusterpriors", "equal") +%!error evalclusters ([1 1;0 1], [1; 2], ... +%! "silhouette", "clusterpriors", "xxx") +%!error <'clust' must be a clustering*> evalclusters ([1 1;0 1], [1; 2], "gap") + +## demonstration +%!demo +%! load fisheriris; +%! eva = evalclusters(meas, "kmeans", "calinskiharabasz", "KList", [1:6]) diff --git a/inst/expfit.m b/inst/expfit.m new file mode 100644 index 0000000..8cb5069 --- /dev/null +++ b/inst/expfit.m @@ -0,0 +1,267 @@ +## Copyright (C) 2021 Nicholas R. Jankowski +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{mu} =} expfit (@var{s}) +## @deftypefnx {Function File} {[@var{mu}, @var{ci}] =} expfit (@var{s}) +## @deftypefnx {Function File} {[@var{mu}, @var{ci}] =} expfit (@var{s}, @var{alpha}) +## @deftypefnx {Function File} {@dots{} =} expfit (@var{s}, @var{alpha}, @var{c}) +## @deftypefnx {Function File} {@dots{} =} expfit (@var{s}, @var{alpha}, @var{c}, @var{f}) +## +## Estimate the mean of the exponential probability distribution function from +## which sample data @var{s} has been taken. @var{s} is expected to be a +## non-negative vector. If @var{s} is an array, the mean will be computed for +## each column of @var{s}. If any elements of @var{s} are NaN, that vector's +## mean will be returned as NaN. +## +## If the optional output variable @var{ci} is requested, @code{expfit} will +## also return the confidence interval bounds for the estimate as a two element +## column vector. If @var{s} is an array, each column of data will have a +## confidence interval returned as a two row array. +## +## The optional scalar input @var{alpha} can be used to define the +## (1-@var{alpha}) confidence interval to be applied to all estimates as a +## value between 0 and 1. The default is 0.05, resulting in a 0.95 or 95% CI. +## Any invalid values for alpha will return NaN for both CI bounds. +## +## The optional input @var{c} is a logical or numeric array of zeros and ones +## the same size as @var{s}, used to right-censor individual elements of +## @var{s}. A value of 1 indicates the data should be censored from +## the mean estimation. Any nonzero values in @var{c} are treated as a 1. +## +## The optional input @var{f} is a numeric array the same size as @var{s}, used +## to specify occurrence frequencies for the elements in @var{s}. Values of +## @var{f} need not be integers. Any NaN elements in the frequency array will +## produce a NaN output for @var{mu}. +## +## Options can be skipped by using [] to revert to the default. +## +## Matlab incompatibility: Matlab's @code{expfit} produces unpredictable results +## for some cases with higher dimensions (specifically 1 x m x n x ... arrays). +## Octave's implementation allows for n-D arrays, consistently performing +## calculations on individual column vectors. Additionally, @var{c} and @var{f} +## can be used with arrays of any size, whereas Matlab only allows their use +## when @var{s} is a vector. +## +## @end deftypefn +## +## @seealso{expcdf, expinv, explike, exppdf, exprnd, expstat} +## @seealso{expstat, exprnd, expcdf, expinv} + +function [m, v] = expfit (s, alpha = 0.05, c = [], f = []) + + ## Check arguments + if (nargin ==0 || nargin > 4 || nargout > 2) + print_usage (); + endif + + if ! (isnumeric (s) || islogical (s)) + s = double(s); + endif + + ## guarantee working with column vectors + if isvector (s) + + s = s(:); + endif + + if any (s(:) < 0) + error("expfit: input data S cannot be negative"); + endif + + sz_s = size (s); + + if (isempty (alpha)) + alpha = 0.05; + elseif !(isscalar (alpha)) + error ("expfit: ALPHA must be a scalar quantity"); + endif + + if (isempty (c) && isempty (f)) + ##simple case without f or c, shortcut other validations + m = mean (s, 1); + + if (nargout == 2) + S = sum (s, 1); + v = [2*S ./ chi2inv(1 - alpha/2, 2*sz_s(1));... + 2*S ./ chi2inv(alpha/2, 2*sz_s(1))]; + endif + else + + ## input validation for c and f + + if (isempty (c)) + ##expand to full c with values that don't affect results + c = zeros (sz_s); + elseif (! (isnumeric(c) || islogical (c))) + #check for incorrect f type + error ("expfit: C must be a numeric or logical array") + elseif (isvector (c)) + ## guarantee working with a column vector + c = c(:); + endif + + if (isempty (f)) + ##expand to full c with values that don't affect results + f = ones (sz_s); + elseif (! (isnumeric(f) || islogical (f))) + #check for incorrect f type + error ("expfit: F must be a numeric or logical array") + elseif (isvector (f)) + ## guarantee working with a column vector + f = f(:); + endif + + #check that size of c and f match s + if !(isequal(size (c), sz_s)) + error("expfit: C must be the same size as S"); + elseif (! isequal(size (f), sz_s)) + error("expfit: F must be the same size as S"); + endif + + ## trivial case where c and f have no effect + if (all (c(:) == 0 & f(:) == 1)) + + m = mean (s, 1); + + if (nargout == 2) + S = sum (s, 1); + v = [2*S ./ chi2inv(1 - alpha/2, 2*sz_s(1));... + 2*S ./ chi2inv(alpha/2, 2*sz_s(1))]; + endif + + ## no censoring, just adjust sample counts for f + elseif (all (c(:) == 0)) + + S = sum (s.*f, 1); + n = sum (f, 1); + m = S ./ n; + + if (nargout == 2) + v = [2*S ./ chi2inv(1 - alpha/2, 2*n);... + 2*S ./ chi2inv(alpha/2, 2*n)]; + endif + + ## censoring, but no sample counts adjustment + elseif (all (f(:) == 1)) + + c = logical(c); ##convert any numeric c's to 0s and 1s + S = sum (s, 1); + r = sz_s(1) - sum (c, 1); + m = S ./ r; + + if (nargout == 2) + v = [2*S ./ chi2inv(1 - alpha/2, 2*r);... + 2*S ./ chi2inv(alpha/2, 2*r)]; + endif + + ## both censoring and sample count adjustment + else + + c = logical(c); ##convert any numeric c's to 0s and 1s + S = sum (s.*f , 1); + r = sum (f.*(!c), 1); + m = S ./ r; + + if (nargout == 2) + v = [2*S ./ chi2inv(1 - alpha/2, 2*r);... + 2*S ./ chi2inv(alpha/2, 2*r)]; + endif + endif + + ## compatibility check, NaN for columns where all c's or f's remove all samples + null_columns = all (c) | ! all (f); + m(null_columns) = NaN; + + if (nargout == 2) + v(:,null_columns) = NaN; + endif + endif + +endfunction + +##tests for mean +%!assert (expfit (1), 1) +%!assert (expfit (1:3), 2) +%!assert (expfit ([1:3]'), 2) +%!assert (expfit (1:3, []), 2) +%!assert (expfit (1:3, [], [], []), 2) +%!assert (expfit (magic (3)), [5 5 5]) +%!assert (expfit (cat (3, magic (3), 2*magic (3))), cat (3,[5 5 5], [10 10 10])) +%!assert (expfit (1:3, 0.1, [0 0 0], [1 1 1]), 2) +%!assert (expfit ([1:3]', 0.1, [0 0 0]', [1 1 1]'), 2) +%!assert (expfit (1:3, 0.1, [0 0 0]', [1 1 1]'), 2) +%!assert (expfit (1:3, 0.1, [1 0 0], [1 1 1]), 3) +%!assert (expfit (1:3, 0.1, [0 0 0], [4 1 1]), 1.5) +%!assert (expfit (1:3, 0.1, [1 0 0], [4 1 1]), 4.5) +%!assert (expfit (1:3, 0.1, [1 0 1], [4 1 1]), 9) +%!assert (expfit (1:3, 0.1, [], [-1 1 1]), 4) +%!assert (expfit (1:3, 0.1, [], [0.5 1 1]), 2.2) +%!assert (expfit (1:3, 0.1, [1 1 1]), NaN) +%!assert (expfit (1:3, 0.1, [], [0 0 0]), NaN) +%!assert (expfit (reshape (1:9, [3 3])), [2 5 8]) +%!assert (expfit (reshape (1:9, [3 3]), [], eye(3)), [3 7.5 12]) +%!assert (expfit (reshape (1:9, [3 3]), [], 2*eye(3)), [3 7.5 12]) +%!assert (expfit (reshape (1:9, [3 3]), [], [], [2 2 2; 1 1 1; 1 1 1]), [1.75 4.75 7.75]) +%!assert (expfit (reshape (1:9, [3 3]), [], [], [2 2 2; 1 1 1; 1 1 1]), [1.75 4.75 7.75]) +%!assert (expfit (reshape (1:9, [3 3]), [], eye(3), [2 2 2; 1 1 1; 1 1 1]), [3.5 19/3 31/3]) + +##tests for confidence intervals +%!assert ([~,v] = expfit (1:3, 0), [0; Inf]) +%!assert ([~,v] = expfit (1:3, 2), [Inf; 0]) +%!assert ([~,v] = expfit (1:3, 0.1, [1 1 1]), [NaN; NaN]) +%!assert ([~,v] = expfit (1:3, 0.1, [], [0 0 0]), [NaN; NaN]) +%!assert ([~,v] = expfit (1:3, -1), [NaN; NaN]) +%!assert ([~,v] = expfit (1:3, 5), [NaN; NaN]) +#!assert ([~,v] = expfit ([1:3;1:3], -1), NaN(2, 3)] +#!assert ([~,v] = expfit ([1:3;1:3], 5), NaN(2, 3)] +%!assert ([~,v] = expfit (1:3), [0.830485728373393; 9.698190330474096], 1000*eps) +%!assert ([~,v] = expfit (1:3, 0.1), [0.953017262058213; 7.337731146400207], 1000*eps) +%!assert ([~,v] = expfit ([1:3;2:4]), ... +%! [0.538440777613095, 0.897401296021825, 1.256361814430554; ... +%! 12.385982973214016, 20.643304955356694, 28.900626937499371], 1000*eps) +%!assert ([~,v] = expfit ([1:3;2:4], [], [1 1 1; 0 0 0]), ... +%! 100*[0.008132550920455, 0.013554251534091, 0.018975952147727; ... +%! 1.184936706156216, 1.974894510260360, 2.764852314364504], 1000*eps) +%!assert ([~,v] = expfit ([1:3;2:4], [], [], [3 3 3; 1 1 1]), ... +%! [0.570302756652583, 1.026544961974649, 1.482787167296715; ... +%! 4.587722594914109, 8.257900670845396, 11.928078746776684], 1000*eps) +%!assert ([~,v] = expfit ([1:3;2:4], [], [0 0 0; 1 1 1], [3 3 3; 1 1 1]), ... +%! [0.692071440311161, 1.245728592560089, 1.799385744809018; ... +%! 8.081825275395081, 14.547285495711145, 21.012745716027212], 1000*eps) + +%!test +%! s = reshape (1:8, [4 2]); +%! s(4) = NaN; +%! [m,v] = expfit (s); +%! assert ({m, v}, {[NaN, 6.5], [NaN, 2.965574334593430;NaN, 23.856157493553368]}, 1000*eps); + +%!test +%! s = magic (3); +%! c = [0 1 0; 0 1 0; 0 1 0]; +%! f = [1 1 0; 1 1 0; 1 1 0]; +%! [m,v] = expfit (s, [], c, f); +%! assert ({m, v}, {[5 NaN NaN], [[2.076214320933482; 24.245475826185242],NaN(2)]}, 1000*eps); + +## input validation +%!error expfit () +%!error expfit (1,2,3,4,5) +%!error [a b c] = expfit (1) +%!error expfit (1, [1 2]) +%!error expfit ([-1 2 3 4 5]) +%!error expfit ([1:5], [], "test") +%!error expfit ([1:5], [], [], "test") +%!error expfit ([1:5], [], [0 0 0 0]) +%!error expfit ([1:5], [], [], [1 1 1 1]) diff --git a/inst/explike.m b/inst/explike.m new file mode 100644 index 0000000..3885171 --- /dev/null +++ b/inst/explike.m @@ -0,0 +1,87 @@ +## Copyright (C) 2021 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{nlogL}, @var{avar} =} explike (@var{param}, @var{data}) +## Compute the negative log-likelihood of data under the exponential distribution with given parameter value. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{param} is a scalar containing the scale parameter of the exponential distribution (equal to its mean). +## @item +## @var{data} is the vector of given values. +## +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{nlogL} is the negative log-likelihood. +## @item +## @var{avar} is the inverse of the Fisher information matrix. +## (The Fisher information matrix is the second derivative of the negative log likelihood with respect to the parameter value.) +## +## @end itemize +## +## @seealso{expcdf, expfit, expinv, explike, exppdf, exprnd} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Negative log-likelihood for the exponential distribution + +function [nlogL, avar] = explike (param, data) + + # Check arguments + if (nargin != 2) + print_usage; + endif + + beta = param(1); + n = numel (data); + sx = sum (data(:)); + sxb = sx/beta; + + #calculate negative log likelihood + nlogL = sxb + n*log(beta); + + #optionally calculate the inverse (reciprocal) of the second derivative of the negative log likelihood with respect to parameter + if nargout > 1 + avar = (beta^2) ./ (2*sxb - n); + endif + +endfunction + + +%!test +%! x = 12; +%! beta = 5; +%! [L, V] = explike (beta, x); +%! expected_L = 4.0094; +%! expected_V = 6.5789; +%! assert (L, expected_L, 0.001); +%! assert (V, expected_V, 0.001); + +%!test +%! x = 1:5; +%! beta = 2; +%! [L, V] = explike (beta, x); +%! expected_L = 10.9657; +%! expected_V = 0.4; +%! assert (L, expected_L, 0.001); +%! assert (V, expected_V, 0.001); + diff --git a/inst/expstat.m b/inst/expstat.m new file mode 100644 index 0000000..6f6f3f3 --- /dev/null +++ b/inst/expstat.m @@ -0,0 +1,93 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} expstat (@var{l}) +## Compute mean and variance of the exponential distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{l} is the parameter of the exponential distribution. The +## elements of @var{l} must be positive +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the exponential distribution +## +## @item +## @var{v} is the variance of the exponential distribution +## @end itemize +## +## @subheading Example +## +## @example +## @group +## l = 1:6; +## [m, v] = expstat (l) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @seealso{expcdf, expfit, expinv, explike, exppdf, exprnd} +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the exponential distribution + +function [m, v] = expstat (l) + + # Check arguments + if (nargin != 1) + print_usage (); + endif + + if (! isempty (l) && ! ismatrix (l)) + error ("expstat: l must be a numeric matrix"); + endif + + # Calculate moments + m = l; + v = m .^ 2; + + # Continue argument check + k = find (! (l > 0) | ! (l < Inf)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! l = 1:6; +%! [m, v] = expstat (l); +%! assert (m, [1, 2, 3, 4, 5, 6], 0.001); +%! assert (v, [1, 4, 9, 16, 25, 36], 0.001); diff --git a/inst/ff2n.m b/inst/ff2n.m new file mode 100644 index 0000000..f152a0c --- /dev/null +++ b/inst/ff2n.m @@ -0,0 +1,13 @@ +## Author: Paul Kienzle +## This program is granted to the public domain. + +## -*- texinfo -*- +## @deftypefn {Function File} ff2n (@var{n}) +## Full-factor design with n binary terms. +## +## @seealso {fullfact} +## @end deftypefn + +function A=ff2n(n) + A = fullfact (2 * ones (1,n)) - 1; +endfunction diff --git a/inst/fisheriris.mat b/inst/fisheriris.mat new file mode 100644 index 0000000..f6a02b1 Binary files /dev/null and b/inst/fisheriris.mat differ diff --git a/inst/fisheriris.txt b/inst/fisheriris.txt new file mode 100644 index 0000000..372fb2d --- /dev/null +++ b/inst/fisheriris.txt @@ -0,0 +1,153 @@ +#Fisher iris data set +#cf. https://en.wikipedia.org/wiki/Iris_flower_data_set +#Type PW PL SW SL +0 2 14 33 50 +1 24 56 31 67 +1 23 51 31 69 +0 2 10 36 46 +1 20 52 30 65 +1 19 51 27 58 +2 13 45 28 57 +2 16 47 33 63 +1 17 45 25 49 +2 14 47 32 70 +0 2 16 31 48 +1 19 50 25 63 +0 1 14 36 49 +0 2 13 32 44 +2 12 40 26 58 +1 18 49 27 63 +2 10 33 23 50 +0 2 16 38 51 +0 2 16 30 50 +1 21 56 28 64 +0 4 19 38 51 +0 2 14 30 49 +2 10 41 27 58 +2 15 45 29 60 +0 2 14 36 50 +1 19 51 27 58 +0 4 15 34 54 +1 18 55 31 64 +2 10 33 24 49 +0 2 14 42 55 +1 15 50 22 60 +2 14 39 27 52 +0 2 14 29 44 +2 12 39 27 58 +1 23 57 32 69 +2 15 42 30 59 +1 20 49 28 56 +1 18 58 25 67 +2 13 44 23 63 +2 15 49 25 63 +2 11 30 25 51 +1 21 54 31 69 +1 25 61 36 72 +2 13 36 29 56 +1 21 55 30 68 +0 1 14 30 48 +0 3 17 38 57 +2 14 44 30 66 +0 4 15 37 51 +2 17 50 30 67 +1 22 56 28 64 +1 15 51 28 63 +2 15 45 22 62 +2 14 46 30 61 +2 11 39 25 56 +1 23 59 32 68 +1 23 54 34 62 +1 25 57 33 67 +0 2 13 35 55 +2 15 45 32 64 +1 18 51 30 59 +1 23 53 32 64 +2 15 45 30 54 +1 21 57 33 67 +0 2 13 30 44 +0 2 16 32 47 +1 18 60 32 72 +1 18 49 30 61 +0 2 12 32 50 +0 1 11 30 43 +2 14 44 31 67 +0 2 14 35 51 +0 4 16 34 50 +2 10 35 26 57 +1 23 61 30 77 +2 13 42 26 57 +0 1 15 41 52 +1 18 48 30 60 +2 13 42 27 56 +0 2 15 31 49 +0 4 17 39 54 +2 16 45 34 60 +2 10 35 20 50 +0 2 13 32 47 +2 13 54 29 62 +0 2 15 34 51 +2 10 50 22 60 +0 1 15 31 49 +0 2 15 37 54 +2 12 47 28 61 +2 13 41 28 57 +0 4 13 39 54 +1 20 51 32 65 +2 15 49 31 69 +2 13 40 25 55 +0 3 13 23 45 +0 3 15 38 51 +2 14 48 28 68 +0 2 15 35 52 +1 25 60 33 63 +2 15 46 28 65 +0 3 14 34 46 +2 18 48 32 59 +2 16 51 27 60 +1 18 55 30 65 +0 5 17 33 51 +1 22 67 38 77 +1 21 66 30 76 +1 13 52 30 67 +2 13 40 28 61 +2 11 38 24 55 +0 2 14 34 52 +1 20 64 38 79 +0 6 16 35 50 +1 20 67 28 77 +2 12 44 26 55 +0 3 14 30 48 +0 2 19 34 48 +1 14 56 26 61 +0 2 12 40 58 +1 18 48 28 62 +2 15 45 30 56 +0 2 14 32 46 +0 4 15 44 57 +1 24 56 34 63 +1 16 58 30 72 +1 21 59 30 71 +1 18 56 29 63 +2 12 42 30 57 +1 23 69 26 77 +2 13 56 29 66 +0 2 15 34 52 +2 10 37 24 55 +0 2 15 31 46 +1 19 61 28 74 +0 3 13 35 50 +1 18 63 29 73 +2 15 47 31 67 +2 13 41 30 56 +2 13 43 29 64 +1 22 58 30 65 +0 3 14 35 51 +2 14 47 29 61 +1 19 53 27 64 +0 2 16 34 48 +1 20 50 25 57 +2 13 40 23 55 +0 2 17 34 54 +1 24 51 28 58 +0 2 15 37 53 diff --git a/inst/fitgmdist.m b/inst/fitgmdist.m new file mode 100644 index 0000000..7440245 --- /dev/null +++ b/inst/fitgmdist.m @@ -0,0 +1,533 @@ +## Copyright (C) 2018 John Donoghue +## Copyright (C) 2015 Lachlan Andrew +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{GMdist} =} fitgmdist (@var{data}, @var{k}, @var{param1}, @var{value1}, @dots{}) +## Fit a Gaussian mixture model with @var{k} components to @var{data}. +## Each row of @var{data} is a data sample. Each column is a variable. +## +## Optional parameters are: +## @itemize +## @item 'start': initialization conditions. Possible values are: +## @itemize +## @item 'randSample' (default) takes means uniformly from rows of data +## @item 'plus' use k-means++ to initialize means +## @item 'cluster' Performs an initial clustering with 10% of the data +## @item vector A vector whose length is the number of rows in data, +## and whose values are 1 to k specify the components +## each row is initially allocated to. The mean, variance +## and weight of each component is calculated from that +## @item structure with elements mu, Sigma ComponentProportion +## @end itemize +## For 'randSample', 'plus' and 'cluster', the initial variance of each +## component is the variance of the entire data sample. +## +## @item 'Replicates' Number of random restarts to perform +## +## @item 'RegularizationValue' +## @item 'Regularize' A small number added to the diagonal entries +## of the covariance to prevent singular covariances +## +## @item 'SharedCovariance' +## @item 'SharedCov' (logical) True if all components must share the +## same variance, to reduce the number of free parameters +## +## @item 'CovarianceType' +## @item 'CovType' (string). Possible values are: +## @itemize +## @item 'full' (default) Allow arbitrary covariance matrices +## @item 'diagonal' Force covariances to be diagonal, to reduce the +## number of free parameters. +## @end itemize +## +## @item 'Option' A structure with all of the following fields: +## @itemize +## @item 'MaxIter' Maximum number of EM iterations (default 100) +## @item 'TolFun' Threshold increase in likelihood to terminate EM +## (default 1e-6) +## @item 'Display' +## @itemize +## @item 'off' (default): display nothing +## @item 'final': display the number of iterations and likelihood +## once execution completes +## @item 'iter': display the above after each iteration +## @end itemize +## @end itemize +## @item 'Weight' A column vector or n-by-2 matrix. The first column +## consists of non-negative weights given to the +## samples. +## If these are all integers, this is equivalent +## to specifying @var{weight}(i) copies of row i of +## @var{data}, but potentially faster. +## +## If a row of @var{data} is used to represent samples +## that are similar but not identical, then the second +## column of @var{weight} indicates the variance of +## those original samples. Specifically, in the EM +## algorithm, the contribution of row i towards the +## variance is set to at least @var{weight}(i,2), to +## prevent spurious components with zero variance. +## @end itemize +## +## @seealso{gmdistribution, kmeans} +## @end deftypefn + +function obj = fitgmdist(data, k, varargin) + + if nargin < 2 || mod (nargin, 2) == 1 + print_usage; + endif + + [~, prop] = parseparams (varargin); + + ## defaults for options + diagonalCovar = false; # "full". (true is "diagonal") + sharedCovar = false; + start = "randSample"; + replicates = 1; + option.MaxIter= 100; + option.TolFun = 1e-6; + option.Display= "off"; # "off" (1 is "final", 2 is "iter") + Regularizer = 0; + weights = []; # Each row i counts as "weights(i,1)" rows + + + # Remove rows containing NaN / NA + data = data(!any (isnan (data), 2),:); + + # used for getting the number of samples + nRows = rows (data); + nCols = columns (data); + + # Parse options + while (!isempty (prop)) + try + switch (lower (prop{1})) + case {"sharedcovariance",... + "sharedcov"}, sharedCovar = prop{2}; + case {"covariancetype",... + "covartype"}, diagonalCovar = prop{2}; + case {"regularizationvalue",... + "regularize"}, Regularizer = prop{2}; + case "replicates", replicates = prop{2}; + case "start", start = prop{2}; + case "weights", weights = prop{2}; + + case "option" + option.MaxIter = prop{2}.MaxIter; + option.TolFun = prop{2}.TolFun; + option.Display = prop{2}.Display; + + otherwise + error ("fitgmdist: Unknown option %s", prop{1}); + endswitch + catch ME + if (length (prop) < 2) + error ("fitgmdist: Option '%s' has no argument", prop{1}); + else + rethrow (ME) + endif + end_try_catch + prop = prop(3:end); + endwhile + + # Process options + + + # check for the "replicates" property + try + if isempty (1:replicates) + error ("fitgmdist: replicates must be positive"); + endif + catch + error ("fitgmdist: invalid number of replicates"); + end_try_catch + + # check for the "option" property + MaxIter = option.MaxIter; + TolFun = option.TolFun; + switch (lower (option.Display)) + case "off", Display = 0; + case "final", Display = 1; + case "iter", Display = 2; + case "notify", Display = 0; + otherwise, error ("fitgmdist: Unknown Display option %s", option.Display); + endswitch + + try + p = ones(1, k) / k; # Default is uniform component proportions + catch ME + if (!isscalar (k) || !isnumeric (k)) + error ("fitgmdist: The second argument must be a numeric scalar"); + else + rethrow (ME) + endif + end_try_catch + # check for the "start" property + if (ischar (start)) + start = lower (start); + switch (start) + case {"randsample", "plus", "cluster", "randsamplep", "plusp", "clusterp"} + otherwise + error ("fitgmdist: Unknown Start value %s\n", start); + endswitch + component_order_free = true; + else + component_order_free = false; + if (!ismatrix (start) || !isnumeric (start)) + try + mu = start.mu; + Sigma = start.Sigma; + if (isfield (start, 'ComponentProportion')) + p = start.ComponentProportion(:)'; + end + if (any (size (data, 2) ~= [size(mu,2), size(Sigma)]) || ... + any (k ~= [size(mu,1), size(p,2)])) + error ('fitgmdist: Start parameter has mismatched dimensions'); + endif + catch + error ("fitgmdist: invalid start parameter"); + end_try_catch + else + validIndices = 0; + mu = zeros (k, nRows); + Sigma = zeros (nRows, nRows, k); + for i = 1:k + idx = (start == i); + validIndices = validIndices + sum (idx); + mu(i,:) = mean (data(idx,:)); + Sigma(:,:,i) = cov (data(idx,:)) + Regularizer*eye (nCols); + endfor + if (validIndices < nRows) + error ("fitgmdist: Start is numeric, but is not integers between 1 and k"); + endif + endif + start = []; # so that variance isn't recalculated later + replicates = 1; # Will be the same each time anyway + endif + + # check for the "SharedCovariance" property + if (!islogical (sharedCovar)) + error ("fitgmdist: SharedCoveriance must be logical true or false"); + endif + + # check for the "CovarianceType" property + if (!islogical (diagonalCovar)) + try + if (strcmpi (diagonalCovar, "diagonal")) + diagonalCovar = true; + elseif (strcmpi (diagonalCovar, "full")) + diagonalCovar = false; + else + error ("fitgmdist: CovarianceType must be Full or Diagonal"); + endif + catch + error ("fitgmdist: CovarianceType must be 'Full' or 'Diagonal'"); + end_try_catch + endif + + # check for the "Regularizer" property + try + if (Regularizer < 0) + error ("fitgmdist: Regularizer must be non-negative"); + endif + catch ME + if (!isscalar (Regularizer) || !isnumeric (Regularizer)) + error ("fitgmdist: Regularizer must be a numeric scalar"); + else + rethrow (ME) + endif + end_try_catch + + # check for the "Weights" property and the matrix + try + if (!isempty (weights)) + if (columns (weights) > 2 || any (weights(:) < 0)) + error ("fitgmdist: weights must be a nonnegative numeric dx1 or dx2 matrix"); + endif + if (rows (weights) != nRows) + error ("fitgmdist: number of weights %d must match number of samples %d",... + rows (weights), nRows) + endif + non_zero = (weights(:,1) > 0); + weights = weights(non_zero,:); + data = data (non_zero,:); + + nRows = rows (data); + raw_samples = sum (weights(:,1)); + else + raw_samples = nRows; + endif + + # Validate the matrix + if (!isreal (data(k,1))) + error ("fitgmdist: first input argument must be a DxN real data matrix"); + endif + catch ME + if (!isnumeric (data) || !ismatrix (data) || !isreal (data)) + error ("fitgmdist: first input argument must be a DxN real data matrix"); + elseif (k > nRows || k < 0) + if (exists("non_zero", "var") && k <= length(non_zero)) + error ("fitgmdist: The number of non-zero weights (%d) must be at least the number of components (%d)", nRows, k); + else + error ("fitgmdist: The number of components (%d) must be a positive number less than the number of data rows (%d)", k, nRows); + endif + elseif (!ismatrix (weights) || !isnumeric (weights)) + error ("fitgmdist: weights must be a nonnegative numeric dx1 or dx2 matrix"); + else + rethrow (ME) + endif + end_try_catch + #if k == 1 + # replicates = 1; + #endif + + + # Done processing options + ####################################### + + # used to hold the probability of each class, given each data vector + try + p_x_l = zeros (nRows, k); # probability of observation x given class l + + best = -realmax; + best_params = []; + diag_slice = 1:(nCols+1):(nCols)^2; + + # Create index slices to calculate symmetric completion of upper triangular Mx + lower_half = zeros(nCols*(nCols-1)/2,1); + upper_half = zeros(nCols*(nCols-1)/2,1); + i = 1; + for rw = 1:nCols + for cl = rw+1:nCols + upper_half(i) = sub2ind([nCols, nCols], rw, cl); + lower_half(i) = sub2ind([nCols, nCols], cl, rw); + i = i + 1; + endfor + endfor + + for rep = 1:replicates + if (!isempty (start)) + # Initialize the means + switch (start) + case {"randsample"} + if (isempty (weights)) + idx = randperm (nRows, k); + else + idx = randsample (nRows, k, false, weights); + endif + mu = data(idx, :); + case {"plus"} # k-means++, by Arthur and Vassilios + mu(1,:) = data(randi (nRows),:); + d = inf (nRows, 1); # Distance to nearest centroid so far + for i = 2:k + d = min (d, sum (bsxfun (@minus, data, mu(i-1, :)).^2, 2)); + # pick next sample with prob. prop to dist.*weights + if (isempty (weights)) + cs = cumsum (d); + else + cs = cumsum (d .* weights(:,1)); + endif + mu(i,:) = data(find (cs > rand * cs(end), 1), :); + endfor + case {"cluster"} + subsamp = max (k, ceil (nRows/10)); + if (isempty (weights)) + idx = randperm (nRows, subsamp); + else + idx = randsample (nRows, subsamp, false, weights); + endif + [~, mu] = kmeans (data(idx), k, "start", "sample"); + endswitch + + # Initialize the variance, unless set explicitly + # + Sigma = var (data) + Regularizer; + if (!diagonalCovar) + Sigma = diag (Sigma); + endif + if (!sharedCovar) + Sigma = repmat (Sigma, [1, 1, k]); + endif + endif + + # Run the algorithm + iter = 1; + + log_likeli = -inf; + incr = 1; + + while (incr > TolFun && iter <= MaxIter) + iter = iter + 1; + ####################################### + # "E step" + # Calculate probability of class l given observations + for i = 1:k + if (sharedCovar) + sig = Sigma; + else + sig = Sigma(:,:,i); + endif + if (diagonalCovar) + sig = diag(sig); + endif + try + p_x_l (:, i) = mvnpdf (data, mu(i, :), sig); + catch ME + if (strfind (ME.message, "positive definite")) + error ("fitgmdist: Covariance is not positive definite. Increase RegularizationValue"); + else + rethrow (ME) + endif + end_try_catch + endfor + # Bayes' rule + p_x_l = bsxfun (@times, p_x_l, p); # weight by priors + p_l_x = bsxfun (@rdivide, p_x_l, sum (p_x_l, 2)); # Normalize + + ####################################### + # "M step" + # Calculate new parameters + if (!isempty (weights)) + p_l_x = bsxfun (@times, p_l_x, weights(:,1)); + endif + + sum_p_l_x = sum(p_l_x); # row vec of \sum_{data} p(class|data,params) + + p = sum_p_l_x / raw_samples; # new proportions + mu = bsxfun(@rdivide, p_l_x' * data, sum_p_l_x'); # new means + if (sharedCovar) + sumSigma = zeros (size (Sigma(:,:,1))); # diagonalCovar gives size + endif + for i = 1:k + # Sigma + deviation = bsxfun(@minus, data, mu(i,:)); + lhs = bsxfun(@times, p_l_x(:,i), deviation); + + # Calculate covariance + # Iterate either over elements of the covariance matrix, since + # there should be fewer of those than rows of data. + for rw = 1:nCols + for cl = rw:nCols + sig(rw,cl) = lhs(:,rw)' * deviation(:,cl); + endfor + endfor + sig(lower_half) = sig(upper_half); + + sig = sig/sum_p_l_x(i) + Regularizer*eye (nCols); + + if (columns (weights) > 1) # don't give "singleton" clusters low var + sig(diag_slice) = max (sig(diag_slice), weights(i,2)); + endif + + if (diagonalCovar) + sig = diag(sig)'; + endif + + if (sharedCovar) + sumSigma = sumSigma + sig * p(i); # Heuristic. Should it use + else # old p? Something else? + Sigma(:,:,i) = sig; + endif + endfor + if (sharedCovar) + Sigma = sumSigma; + endif + ####################################### + + # calculate the new (and relative change in) log-likelihood + if (isempty (weights)) + new_log_likeli = sum (log (sum (p_x_l, 2))); + else + new_log_likeli = sum (weights(:,1) .* log (sum (p_x_l, 2))); + endif + incr = (new_log_likeli - log_likeli)/max(1,abs(new_log_likeli)); + if Display == 2 + fprintf("iter %d log-likelihood %g\n", iter-1, new_log_likeli); + %disp(mu); + endif + log_likeli = new_log_likeli; + endwhile + if (log_likeli > best) + best = log_likeli; + best_params.mu = mu; + best_params.Sigma = Sigma; + best_params.p = p; + endif + endfor + catch ME + try + if (1 < MaxIter), end + catch + error ("fitgmdist: invalid MaxIter"); + end_try_catch + rethrow (ME) + end_try_catch + + # List components in descending order of proportion, + # unless the order was implicitly specified by "start" + if (component_order_free) + [~, idx] = sort (-best_params.p); + best_params.p = best_params.p (idx); + best_params.mu = best_params.mu(idx,:); + if (!sharedCovar) + best_params.Sigma = best_params.Sigma(:,:,idx); + endif + endif + + # Calculate number of parameters + if (diagonalCovar) + params = nCols; + else + params = nCols * (nCols+1) / 2; + endif + params = params*size (Sigma, 3) + 2*rows (mu) - 1; + + # This works in Octave, but not in Matlab + #obj = gmdistribution (best_params.mu, best_params.Sigma, best_params.p', extra); + obj = gmdistribution (best_params.mu, best_params.Sigma, best_params.p'); + + obj.NegativeLogLikelihood = -best; + obj.AIC = -2*(best - params); + obj.BIC = -2*best + params * log (raw_samples); + obj.Converged = (incr <= TolFun); + obj.NumIterations = iter-1; + obj.RegularizationValue = Regularizer; + + if (Display == 1) + fprintf (" %d iterations log-likelihood = %g\n", ... + obj.NumIterations, -obj.NegativeLogLikelihood); + endif +endfunction + +%!xdemo <50286> +%! ## Generate a two-cluster problem +%! C1 = randn (100, 2) + 1; +%! C2 = randn (100, 2) - 1; +%! data = [C1; C2]; +%! +%! ## Perform clustering +%! GMModel = fitgmdist (data, 2); +%! +%! ## Plot the result +%! figure +%! [heights, bins] = hist3([C1; C2]); +%! [xx, yy] = meshgrid(bins{1}, bins{2}); +%! bbins = [xx(:), yy(:)]; +%! contour (reshape (GMModel.pdf (bbins), heights)); +%! hold on +%! plot (centers (:, 1), centers (:, 2), "kv", "markersize", 10); +%! hold off diff --git a/inst/fstat.m b/inst/fstat.m new file mode 100644 index 0000000..cd30a13 --- /dev/null +++ b/inst/fstat.m @@ -0,0 +1,130 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{mn}, @var{v}] =} fstat (@var{m}, @var{n}) +## Compute mean and variance of the F distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{m} is the first parameter of the F distribution. The elements +## of @var{m} must be positive +## +## @item +## @var{n} is the second parameter of the F distribution. The +## elements of @var{n} must be positive +## @end itemize +## @var{m} and @var{n} must be of common size or one of them must be scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{mn} is the mean of the F distribution. The mean is undefined for +## @var{n} not greater than 2 +## +## @item +## @var{v} is the variance of the F distribution. The variance is undefined +## for @var{n} not greater than 4 +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## m = 1:6; +## n = 5:10; +## [mn, v] = fstat (m, n) +## @end group +## +## @group +## [mn, v] = fstat (m, 5) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the F distribution + +function [mn, v] = fstat (m, n) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (m) && ! ismatrix (m)) + error ("fstat: m must be a numeric matrix"); + endif + if (! isempty (n) && ! ismatrix (n)) + error ("fstat: n must be a numeric matrix"); + endif + + if (! isscalar (m) || ! isscalar (n)) + [retval, m, n] = common_size (m, n); + if (retval > 0) + error ("fstat: m and n must be of common size or scalar"); + endif + endif + + # Calculate moments + mn = n ./ (n - 2); + v = (2 .* (n .^ 2) .* (m + n - 2)) ./ (m .* ((n - 2) .^ 2) .* (n - 4)); + + # Continue argument check + k = find (! (m > 0) | ! (m < Inf) | ! (n > 2) | ! (n < Inf)); + if (any (k)) + mn(k) = NaN; + v(k) = NaN; + endif + + k = find (! (n > 4)); + if (any (k)) + v(k) = NaN; + endif + +endfunction + +%!test +%! m = 1:6; +%! n = 5:10; +%! [mn, v] = fstat (m, n); +%! expected_mn = [1.6667, 1.5000, 1.4000, 1.3333, 1.2857, 1.2500]; +%! expected_v = [22.2222, 6.7500, 3.4844, 2.2222, 1.5869, 1.2153]; +%! assert (mn, expected_mn, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! m = 1:6; +%! [mn, v] = fstat (m, 5); +%! expected_mn = [1.6667, 1.6667, 1.6667, 1.6667, 1.6667, 1.6667]; +%! expected_v = [22.2222, 13.8889, 11.1111, 9.7222, 8.8889, 8.3333]; +%! assert (mn, expected_mn, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/fullfact.m b/inst/fullfact.m new file mode 100644 index 0000000..31ed007 --- /dev/null +++ b/inst/fullfact.m @@ -0,0 +1,27 @@ +## Author: Paul Kienzle +## This program is granted to the public domain. + +## -*- texinfo -*- +## @deftypefn {Function File} fullfact (@var{N}) +## Full factorial design. +## +## If @var{N} is a scalar, return the full factorial design with @var{N} binary +## choices, 0 and 1. +## +## If @var{N} is a vector, return the full factorial design with choices 1 +## through @var{n_i} for each factor @var{i}. +## +## @end deftypefn + +function A = fullfact(n) + if length(n) == 1 + % combinatorial design with n either/or choices + A = fullfact(2*ones(1,n))-1; + else + % combinatorial design with n(i) choices per level + A = [1:n(end)]'; + for i=length(n)-1:-1:1 + A = [kron([1:n(i)]',ones(rows(A),1)), repmat(A,n(i),1)]; + end + end +endfunction diff --git a/inst/gamfit.m b/inst/gamfit.m new file mode 100644 index 0000000..90791e3 --- /dev/null +++ b/inst/gamfit.m @@ -0,0 +1,69 @@ + + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{MLE} =} gamfit (@var{data}) +## Calculate gamma distribution parameters. +## +## Find the maximum likelihood estimate parameters of the Gamma distribution +## of @var{data}. @var{MLE} is a two element vector with shape parameter +## @var{A} and scale @var{B}. +## +## @seealso{gampdf, gaminv, gamrnd, gamlike} +## @end deftypefn + +## This function works by minimizing the value of gamlike for the vector R. +## Just about any minimization function will work, all it has to do is +## minimize for one variable. Although the gamma distribution has two +## parameters, their product is the mean of the data. so a helper function +## for the search takes one parameter, calculates the other and then returns +## the value of gamlike. + +## Author: Martijn van Oosterhout +## This program is granted to the public domain. +# Revisions copyright (C) 2019 by Nir Krakauer under GPL (below). + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; If not, see . + + +function res = gamfit(R) + + if (nargin != 1) + print_usage; + endif + + avg = mean(R); + + # Optimize with respect to log(a), since both a and b must be positive + x = fminsearch( @(x) gamfit_search(x, avg, R), 0 ); + a = exp(x); + + b = avg/a; + + res = [a b]; +endfunction + +# Helper function so we only have to minimize for one variable. +function res = gamfit_search( x, avg, R ) + a = exp(x); + b = avg/a; + res = gamlike([a b], R); +endfunction + + +#example data from https://www.real-statistics.com/distribution-fitting/distribution-fitting-via-maximum-likelihood/fitting-gamma-parameters-mle/ +%!shared v, res +%! v = [1.2 1.6 1.7 1.8 1.9 2.0 2.2 2.6 3.0 3.5 4.0 4.8 5.6 6.6 7.6]; +%! res = gamfit(v); +%!assert (res(1), 3.425, 1E-3); +%!assert (res(2), 0.975, 1E-3); diff --git a/inst/gamlike.m b/inst/gamlike.m new file mode 100644 index 0000000..7085896 --- /dev/null +++ b/inst/gamlike.m @@ -0,0 +1,21 @@ +## Author: Martijn van Oosterhout +## This program is granted to the public domain. + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{X} =} gamlike ([@var{A} @var{B}], @var{R}) +## Calculates the negative log-likelihood function for the Gamma +## distribution over vector @var{R}, with the given parameters @var{A} and @var{B}. +## @seealso{gampdf, gaminv, gamrnd, gamfit} +## @end deftypefn + +function res = gamlike(P,K) + + if (nargin != 2) + print_usage; + endif + + a=P(1); + b=P(2); + + res = -sum( log( gampdf(K, a, b) ) ); +endfunction diff --git a/inst/gamstat.m b/inst/gamstat.m new file mode 100644 index 0000000..1f95226 --- /dev/null +++ b/inst/gamstat.m @@ -0,0 +1,123 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} gamstat (@var{a}, @var{b}) +## Compute mean and variance of the gamma distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{a} is the first parameter of the gamma distribution. @var{a} must be +## positive +## +## @item +## @var{b} is the second parameter of the gamma distribution. @var{b} must be +## positive +## @end itemize +## @var{a} and @var{b} must be of common size or one of them must be scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the gamma distribution +## +## @item +## @var{v} is the variance of the gamma distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## a = 1:6; +## b = 1:0.2:2; +## [m, v] = gamstat (a, b) +## @end group +## +## @group +## [m, v] = gamstat (a, 1.5) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the gamma distribution + +function [m, v] = gamstat (a, b) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (a) && ! ismatrix (a)) + error ("gamstat: a must be a numeric matrix"); + endif + if (! isempty (b) && ! ismatrix (b)) + error ("gamstat: b must be a numeric matrix"); + endif + + if (! isscalar (a) || ! isscalar (b)) + [retval, a, b] = common_size (a, b); + if (retval > 0) + error ("gamstat: a and b must be of common size or scalar"); + endif + endif + + # Calculate moments + m = a .* b; + v = a .* (b .^ 2); + + # Continue argument check + k = find (! (a > 0) | ! (a < Inf) | ! (b > 0) | ! (b < Inf)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! a = 1:6; +%! b = 1:0.2:2; +%! [m, v] = gamstat (a, b); +%! expected_m = [1.00, 2.40, 4.20, 6.40, 9.00, 12.00]; +%! expected_v = [1.00, 2.88, 5.88, 10.24, 16.20, 24.00]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! a = 1:6; +%! [m, v] = gamstat (a, 1.5); +%! expected_m = [1.50, 3.00, 4.50, 6.00, 7.50, 9.00]; +%! expected_v = [2.25, 4.50, 6.75, 9.00, 11.25, 13.50]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/geomean.m b/inst/geomean.m new file mode 100644 index 0000000..7ed85fc --- /dev/null +++ b/inst/geomean.m @@ -0,0 +1,34 @@ +## Copyright (C) 2001 Paul Kienzle +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} geomean (@var{x}) +## @deftypefnx{Function File} geomean (@var{x}, @var{dim}) +## Compute the geometric mean. +## +## This function does the same as @code{mean (x, "g")}. +## +## @seealso{mean} +## @end deftypefn + +function a = geomean(x, dim) + if (nargin == 1) + a = mean(x, "g"); + elseif (nargin == 2) + a = mean(x, "g", dim); + else + print_usage; + endif +endfunction diff --git a/inst/geostat.m b/inst/geostat.m new file mode 100644 index 0000000..a6c94a2 --- /dev/null +++ b/inst/geostat.m @@ -0,0 +1,93 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} geostat (@var{p}) +## Compute mean and variance of the geometric distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{p} is the rate parameter of the geometric distribution. The +## elements of @var{p} must be probabilities +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the geometric distribution +## +## @item +## @var{v} is the variance of the geometric distribution +## @end itemize +## +## @subheading Example +## +## @example +## @group +## p = 1 ./ (1:6); +## [m, v] = geostat (p) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the geometric distribution + +function [m, v] = geostat (p) + + # Check arguments + if (nargin != 1) + print_usage (); + endif + + if (! isempty (p) && ! ismatrix (p)) + error ("geostat: p must be a numeric matrix"); + endif + + # Calculate moments + q = 1 - p; + m = q ./ p; + v = q ./ (p .^ 2); + + # Continue argument check + k = find (! (p >= 0) | ! (p <= 1)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! p = 1 ./ (1:6); +%! [m, v] = geostat (p); +%! assert (m, [0, 1, 2, 3, 4, 5], 0.001); +%! assert (v, [0, 2, 6, 12, 20, 30], 0.001); diff --git a/inst/gevcdf.m b/inst/gevcdf.m new file mode 100644 index 0000000..756a7d1 --- /dev/null +++ b/inst/gevcdf.m @@ -0,0 +1,131 @@ +## Copyright (C) 2012 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} gevcdf (@var{x}, @var{k}, @var{sigma}, @var{mu}) +## Compute the cumulative distribution function of the generalized extreme value (GEV) distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is the support. +## +## @item +## @var{k} is the shape parameter of the GEV distribution. (Also denoted gamma or xi.) +## @item +## @var{sigma} is the scale parameter of the GEV distribution. The elements +## of @var{sigma} must be positive. +## @item +## @var{mu} is the location parameter of the GEV distribution. +## @end itemize +## The inputs must be of common size, or some of them must be scalar. +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{p} is the cumulative distribution of the GEV distribution at each +## element of @var{x} and corresponding parameter values. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = 0:0.5:2.5; +## sigma = 1:6; +## k = 1; +## mu = 0; +## y = gevcdf (x, k, sigma, mu) +## @end group +## +## @group +## y = gevcdf (x, k, 0.5, mu) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Rolf-Dieter Reiss and Michael Thomas. @cite{Statistical Analysis of Extreme Values with Applications to Insurance, Finance, Hydrology and Other Fields}. Chapter 1, pages 16-17, Springer, 2007. +## +## @end enumerate +## @seealso{gevfit, gevinv, gevlike, gevpdf, gevrnd, gevstat} +## @end deftypefn + +## Author: Nir Krakauer +## Description: CDF of the generalized extreme value distribution + +function p = gevcdf (x, k, sigma, mu) + + # Check arguments + if (nargin != 4) + print_usage (); + endif + + if (isempty (x) || isempty (k) || isempty (sigma) || isempty (mu) || ~ismatrix (x) || ~ismatrix (k) || ~ismatrix (sigma) || ~ismatrix (mu)) + error ("gevcdf: inputs must be a numeric matrices"); + endif + + [retval, x, k, sigma, mu] = common_size (x, k, sigma, mu); + if (retval > 0) + error ("gevcdf: inputs must be of common size or scalars"); + endif + + z = 1 + k .* (x - mu) ./ sigma; + + # Calculate pdf + p = exp(-(z .^ (-1 ./ k))); + + p(z <= 0 & x < mu) = 0; + p(z <= 0 & x > mu) = 1; + + inds = (abs (k) < (eps^0.7)); %use a different formula if k is very close to zero + if any(inds) + z = (mu(inds) - x(inds)) ./ sigma(inds); + p(inds) = exp(-exp(z)); + endif + + +endfunction + +%!test +%! x = 0:0.5:2.5; +%! sigma = 1:6; +%! k = 1; +%! mu = 0; +%! p = gevcdf (x, k, sigma, mu); +%! expected_p = [0.36788 0.44933 0.47237 0.48323 0.48954 0.49367]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = -0.5:0.5:2.5; +%! sigma = 0.5; +%! k = 1; +%! mu = 0; +%! p = gevcdf (x, k, sigma, mu); +%! expected_p = [0 0.36788 0.60653 0.71653 0.77880 0.81873 0.84648]; +%! assert (p, expected_p, 0.001); + +%!test #check for continuity for k near 0 +%! x = 1; +%! sigma = 0.5; +%! k = -0.03:0.01:0.03; +%! mu = 0; +%! p = gevcdf (x, k, sigma, mu); +%! expected_p = [0.88062 0.87820 0.87580 0.87342 0.87107 0.86874 0.86643]; +%! assert (p, expected_p, 0.001); diff --git a/inst/gevfit.m b/inst/gevfit.m new file mode 100644 index 0000000..4bdbbcc --- /dev/null +++ b/inst/gevfit.m @@ -0,0 +1,97 @@ +## Copyright (C) 2012-2021 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{paramhat}, @var{paramci} =} gevfit (@var{data}, @var{parmguess}) +## Find the maximum likelihood estimator (@var{paramhat}) of the generalized extreme value (GEV) distribution to fit @var{data}. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{data} is the vector of given values. +## @item +## @var{parmguess} is an initial guess for the maximum likelihood parameter vector. If not given, this defaults to @var{k}=0 and @var{sigma}, @var{mu} determined by matching the data mean and standard deviation to their expected values. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{parmhat} is the 3-parameter maximum-likelihood parameter vector [@var{k} @var{sigma} @var{mu}], where @var{k} is the shape parameter of the GEV distribution, @var{sigma} is the scale parameter of the GEV distribution, and @var{mu} is the location parameter of the GEV distribution. +## @item +## @var{paramci} has the approximate 95% confidence intervals of the parameter values based on the Fisher information matrix at the maximum-likelihood position. +## +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## data = 1:50; +## [pfit, pci] = gevfit (data); +## p1 = gevcdf(data,pfit(1),pfit(2),pfit(3)); +## plot(data, p1) +## @end group +## @end example +## @seealso{gevcdf, gevinv, gevlike, gevpdf, gevrnd, gevstat} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Maximum likelihood parameter estimation for the generalized extreme value distribution + +function [paramhat, paramci] = gevfit (data, paramguess) + + # Check arguments + if (nargin < 1) + print_usage; + endif + + if (nargin < 2) || isempty(paramguess) + paramguess = zeros (3, 1); + paramguess(2) = (sqrt(6)/pi) * std (data); + paramguess(3) = mean(data) - 0.5772156649*paramguess(2); #expectation involves Euler–Mascheroni constant + endif + + #cost function to minimize + f = @(p) gevlike (p, data); + + [paramhat, ~, info] = fminunc(f, paramguess, optimset("GradObj", "on")); + if info <= 0 + warning ('gevfit: optimization did not converge, results may be unreliable') + endif + paramhat = paramhat(:)'; #return a row vector for Matlab compatibility + + if nargout > 1 + [nlogL, ~, ACOV] = gevlike (paramhat, data); + param_se = sqrt(diag(inv(ACOV)))'; + if any(iscomplex(param_se)) + warning ('gevfit: Fisher information matrix not positive definite; parameter optimization likely did not converge') + paramci = nan (3, 2); + else + paramci(1, :) = paramhat - 1.96*param_se; + paramci(2, :) = paramhat + 1.96*param_se; + endif + endif + +endfunction + +%!test +%! data = 1:50; +%! [pfit, pci] = gevfit (data); +%! expected_p = [-0.44 15.19 21.53]; +%! expected_pu = [-0.13 19.31 26.49]; +%! assert (pfit, expected_p, 0.1); +%! assert (pci(2, :), expected_pu, 0.1); diff --git a/inst/gevfit_lmom.m b/inst/gevfit_lmom.m new file mode 100644 index 0000000..60482d1 --- /dev/null +++ b/inst/gevfit_lmom.m @@ -0,0 +1,113 @@ +## Copyright (C) 2012 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{paramhat}, @var{paramci} =} gevfit_lmom (@var{data}) +## Find an estimator (@var{paramhat}) of the generalized extreme value (GEV) distribution fitting @var{data} using the method of L-moments. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{data} is the vector of given values. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{parmhat} is the 3-parameter maximum-likelihood parameter vector [@var{k}; @var{sigma}; @var{mu}], where @var{k} is the shape parameter of the GEV distribution, @var{sigma} is the scale parameter of the GEV distribution, and @var{mu} is the location parameter of the GEV distribution. +## @item +## @var{paramci} has the approximate 95% confidence intervals of the parameter values (currently not implemented). +## +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## data = gevrnd (0.1, 1, 0, 100, 1); +## [pfit, pci] = gevfit_lmom (data); +## p1 = gevcdf (data,pfit(1),pfit(2),pfit(3)); +## [f, x] = ecdf (data); +## plot(data, p1, 's', x, f) +## @end group +## @end example +## @seealso{gevfit} +## @subheading References +## +## @enumerate +## @item +## Ailliot, P.; Thompson, C. & Thomson, P. Mixed methods for fitting the GEV distribution, Water Resources Research, 2011, 47, W05551 +## +## @end enumerate +## @end deftypefn + +## Author: Nir Krakauer +## Description: L-moments parameter estimation for the generalized extreme value distribution + +function [paramhat, paramci] = gevfit_lmom (data) + + # Check arguments + if (nargin < 1) + print_usage; + endif + + # find the L-moments + data = sort (data(:))'; + n = numel(data); + L1 = mean(data); + L2 = sum(data .* (2*(1:n) - n - 1)) / (2*nchoosek(n, 2)); # or mean(triu(data' - data, 1, 'pack')) / 2; + b = bincoeff((1:n) - 1, 2); + L3 = sum(data .* (b - 2 * ((1:n) - 1) .* (n - (1:n)) + fliplr(b))) / (3*nchoosek(n, 3)); + + #match the moments to the GEV distribution + #first find k based on L3/L2 + f = @(k) (L3/L2 + 3)/2 - limdiv((1 - 3^(k)), (1 - 2^(k))); + k = fzero(f, 0); + + #next find sigma and mu given k + if abs(k) < 1E-8 + sigma = L2 / log(2); + eg = 0.57721566490153286; %Euler-Mascheroni constant + mu = L1 - sigma * eg; + else + sigma = -k*L2 / (gamma(1 - k) * (1 - 2^(k))); + mu = L1 - sigma * ((gamma(1 - k) - 1) / k); + endif + + paramhat = [k; sigma; mu]; + + if nargout > 1 + paramci = NaN; + endif +endfunction + +#internal function to accurately evaluate (1 - 3^k)/(1 - 2^k) in the limit as k --> 0 +function c = limdiv(a, b) + # c = ifelse (abs(b) < 1E-8, log(3)/log(2), a ./ b); + if abs(b) < 1E-8 + c = log(3)/log(2); + else + c = a / b; + endif +endfunction + + +%!xtest <31070> +%! data = 1:50; +%! [pfit, pci] = gevfit_lmom (data); +%! expected_p = [-0.28 15.01 20.22]'; +%! assert (pfit, expected_p, 0.1); diff --git a/inst/gevinv.m b/inst/gevinv.m new file mode 100644 index 0000000..98a16c4 --- /dev/null +++ b/inst/gevinv.m @@ -0,0 +1,100 @@ +## Copyright (C) 2012 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{X} =} gevinv (@var{P}, @var{k}, @var{sigma}, @var{mu}) +## Compute a desired quantile (inverse CDF) of the generalized extreme value (GEV) distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{P} is the desired quantile of the GEV distribution. (Between 0 and 1.) +## @item +## @var{k} is the shape parameter of the GEV distribution. (Also denoted gamma or xi.) +## @item +## @var{sigma} is the scale parameter of the GEV distribution. The elements +## of @var{sigma} must be positive. +## @item +## @var{mu} is the location parameter of the GEV distribution. +## @end itemize +## The inputs must be of common size, or some of them must be scalar. +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{X} is the value corresponding to each quantile of the GEV distribution +## @end itemize +## @subheading References +## +## @enumerate +## @item +## Rolf-Dieter Reiss and Michael Thomas. @cite{Statistical Analysis of Extreme Values with Applications to Insurance, Finance, Hydrology and Other Fields}. Chapter 1, pages 16-17, Springer, 2007. +## @item +## J. R. M. Hosking (2012). @cite{L-moments}. R package, version 1.6. URL: http://CRAN.R-project.org/package=lmom. +## +## @end enumerate +## @seealso{gevcdf, gevfit, gevlike, gevpdf, gevrnd, gevstat} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Inverse CDF of the generalized extreme value distribution + +function [X] = gevinv (P, k = 0, sigma = 1, mu = 0) + + [retval, P, k, sigma, mu] = common_size (P, k, sigma, mu); + if (retval > 0) + error ("gevinv: inputs must be of common size or scalars"); + endif + + X = P; + + llP = log(-log(P)); + kllP = k .* llP; + + ii = (abs(kllP) < 1E-4); #use the Taylor series expansion of the exponential to avoid roundoff error or dividing by zero when k is small + X(ii) = mu(ii) - sigma(ii) .* llP(ii) .* (1 - kllP(ii) .* (1 - kllP(ii))); + X(~ii) = mu(~ii) + (sigma(~ii) ./ k(~ii)) .* (exp(-kllP(~ii)) - 1); + +endfunction + +%!test +%! p = 0.1:0.1:0.9; +%! k = 0; +%! sigma = 1; +%! mu = 0; +%! x = gevinv (p, k, sigma, mu); +%! c = gevcdf(x, k, sigma, mu); +%! assert (c, p, 0.001); + +%!test +%! p = 0.1:0.1:0.9; +%! k = 1; +%! sigma = 1; +%! mu = 0; +%! x = gevinv (p, k, sigma, mu); +%! c = gevcdf(x, k, sigma, mu); +%! assert (c, p, 0.001); + +%!test +%! p = 0.1:0.1:0.9; +%! k = 0.3; +%! sigma = 1; +%! mu = 0; +%! x = gevinv (p, k, sigma, mu); +%! c = gevcdf(x, k, sigma, mu); +%! assert (c, p, 0.001); + diff --git a/inst/gevlike.m b/inst/gevlike.m new file mode 100644 index 0000000..0355870 --- /dev/null +++ b/inst/gevlike.m @@ -0,0 +1,369 @@ +## Copyright (C) 2012 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{nlogL}, @var{Grad}, @var{ACOV} =} gevlike (@var{params}, @var{data}) +## Compute the negative log-likelihood of data under the generalized extreme value (GEV) distribution with given parameter values. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{params} is the 3-parameter vector [@var{k}, @var{sigma}, @var{mu}], where @var{k} is the shape parameter of the GEV distribution, @var{sigma} is the scale parameter of the GEV distribution, and @var{mu} is the location parameter of the GEV distribution. +## @item +## @var{data} is the vector of given values. +## +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{nlogL} is the negative log-likelihood. +## @item +## @var{Grad} is the 3 by 1 gradient vector (first derivative of the negative log likelihood with respect to the parameter values) +## @item +## @var{ACOV} is the 3 by 3 Fisher information matrix (second derivative of the negative log likelihood with respect to the parameter values) +## +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = -5:-1; +## k = -0.2; +## sigma = 0.3; +## mu = 0.5; +## [L, ~, C] = gevlike ([k sigma mu], x); +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Rolf-Dieter Reiss and Michael Thomas. @cite{Statistical Analysis of Extreme Values with Applications to Insurance, Finance, Hydrology and Other Fields}. Chapter 1, pages 16-17, Springer, 2007. +## +## @end enumerate +## @seealso{gevcdf, gevfit, gevinv, gevpdf, gevrnd, gevstat} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Negative log-likelihood for the generalized extreme value distribution + +function [nlogL, Grad, ACOV] = gevlike (params, data) + + # Check arguments + if (nargin != 2) + print_usage; + endif + + k = params(1); + sigma = params(2); + mu = params(3); + + #calculate negative log likelihood + [nll, k_terms] = gevnll (data, k, sigma, mu); + nlogL = sum(nll(:)); + + #optionally calculate the first and second derivatives of the negative log likelihood with respect to parameters + if nargout > 1 + [Grad, kk_terms] = gevgrad (data, k, sigma, mu, k_terms); + if nargout > 2 + ACOV = gevfim (data, k, sigma, mu, k_terms, kk_terms); + endif + endif + +endfunction + + +function [nlogL, k_terms] = gevnll (x, k, sigma, mu) +#internal function to calculate negative log likelihood for gevlike +#no input checking done + + k_terms = []; + a = (x - mu) ./ sigma; + + if all(k == 0) + nlogL = exp(-a) + a + log(sigma); + else + aa = k .* a; + if min(abs(aa)) < 1E-3 && max(abs(aa)) < 0.5 #use a series expansion to find the log likelihood more accurately when k is small + k_terms = 1; sgn = 1; i = 0; + while 1 + sgn = -sgn; i++; + newterm = (sgn / (i + 1)) * (aa .^ i); + k_terms = k_terms + newterm; + if max(abs(newterm)) <= eps + break + endif + endwhile + nlogL = exp(-a .* k_terms) + a .* (k + 1) .* k_terms + log(sigma); + else + b = 1 + aa; + nlogL = b .^ (-1 ./ k) + (1 + 1 ./ k) .* log(b) + log(sigma); + nlogL(b <= 0) = Inf; + endif + endif + +endfunction + +function [G, kk_terms] = gevgrad (x, k, sigma, mu, k_terms) +#calculate the gradient of the negative log likelihood of data x with respect to the parameters of the generalized extreme value distribution for gevlike +#no input checking done + +kk_terms = []; + +G = ones(3, 1); + +if k == 0 ##use the expressions for first derivatives that are the limits as k --> 0 + a = (x - mu) ./ sigma; + f = exp(-a) - 1; + #k + #g = -(2 * x .* (mu .* (1 - f) - sigma .* f) + 2 .* sigma .* mu .* f + (x.^2 + mu.^2).*(f - 1)) ./ (2 * f .* sigma .^ 2); + g = a .* (1 + a .* f / 2); + + G(1) = sum(g(:)); + + #sigma + g = (a .* f + 1) ./ sigma; + G(2) = sum(g(:)); + + #mu + g = f ./ sigma; + G(3) = sum(g(:)); + + return +endif + +a = (x - mu) ./ sigma; +b = 1 + k .* a; +if any (b <= 0) + G(:) = 0; #negative log likelihood is locally infinite + return +endif + +#k +c = log(b); +d = 1 ./ k + 1; +if nargin > 4 && ~isempty(k_terms) #use a series expansion to find the gradient more accurately when k is small + aa = k .* a; + f = exp(-a .* k_terms); + kk_terms = 0.5; sgn = 1; i = 0; + while 1 + sgn = -sgn; i++; + newterm = (sgn * (i + 1) / (i + 2)) * (aa .^ i); + kk_terms = kk_terms + newterm; + if max(abs(newterm)) <= eps + break + endif + endwhile + g = a .* ((a .* kk_terms) .* (f - 1 - k) + k_terms); +else + g = (c ./ k - a ./ b) ./ (k .* b .^ (1/k)) - c ./ (k .^ 2) + a .* d ./ b; +endif +%keyboard +G(1) = sum(g(:)); + +#sigma +if nargin > 4 && ~isempty(k_terms) #use a series expansion to find the gradient more accurately when k is small + g = (1 - a .* (a .* k .* kk_terms - k_terms) .* (f - k - 1)) ./ sigma; +else + #g = (a .* b .^ (-d) - d .* k .* a ./ b + 1) ./ sigma; + g = (a .* b .^ (-d) - (k + 1) .* a ./ b + 1) ./ sigma; +endif +G(2) = sum(g(:)); + +#mu +if nargin > 4 && ~isempty(k_terms) #use a series expansion to find the gradient more accurately when k is small + g = -(a .* k .* kk_terms - k_terms) .* (f - k - 1) ./ sigma; +else + #g = (b .^ (-d) - d .* k ./ b) ./ sigma; + g = (b .^ (-d) - (k + 1) ./ b) ./ sigma; +end +G(3) = sum(g(:)); + +endfunction + +function ACOV = gevfim (x, k, sigma, mu, k_terms, kk_terms) +#internal function to calculate the Fisher information matrix for gevlike +#no input checking done + +#find the various second derivatives (used Maxima to help find the expressions) + +ACOV = ones(3); + +if k == 0 ##use the expressions for second derivatives that are the limits as k --> 0 + #k, k + a = (x - mu) ./ sigma; + f = exp(-a); + #der = (x .* (24 * mu .^ 2 .* sigma .* (f - 1) + 24 * mu .* sigma .^ 2 .* f - 12 * mu .^ 3) + x .^ 3 .* (8 * sigma .* (f - 1) - 12*mu) + x .^ 2 .* (-12 * sigma .^ 2 .* f + 24 * mu .* sigma .* (1 - f) + 18 * mu .^ 2) - 12 * mu .^ 2 .* sigma .^ 2 .* f + 8 * mu .^ 3 .* sigma .* (1 - f) + 3 * (x .^ 4 + mu .^ 4)) ./ (12 .* f .* sigma .^ 4); + der = (a .^ 2) .* (a .* (a/4 - 2/3) .* f + 2/3 * a - 1); + ACOV(1, 1) = sum(der(:)); + + #sigma, sigma + der = (sigma .^ -2) .* (a .* ((a - 2) .* f + 2) - 1); + ACOV(2, 2) = sum(der(:)); + + #mu, mu + der = (sigma .^ -2) .* f; + ACOV(3, 3) = sum(der(:)); + + #k, sigma + #der = (x .^2 .* (2*sigma .* (f - 1) - 3*mu) + x .* (-2 * sigma .^ 2 .* f + 4 * mu .* sigma .* (1 - f) + 3 .* mu .^ 2) + 2 * mu .^ 2 .* sigma .* (f - 1) + 2 * mu * sigma .^ 2 * f + x .^ 3 - mu .^ 3) ./ (2 .* f .* sigma .^ 4); + der = (-a ./ sigma) .* (a .* (1 - a/2) .* f - a + 1); + ACOV(1, 2) = ACOV(2, 1) = sum(der(:)); + + #k, mu + #der = (x .* (2*sigma .* (f - 1) - 2*mu) - 2 * f .* sigma .^ 2 + 2 .* mu .* sigma .* (1 - f) + x .^ 2 + mu .^ 2)./ (2 .* f .* sigma .^ 3); + der = (-1 ./ sigma) .* (a .* (1 - a/2) .* f - a + 1); + ACOV(1, 3) = ACOV(3, 1) = sum(der(:)); + + #sigma, mu + der = (1 + (a - 1) .* f) ./ (sigma .^ 2); + ACOV(2, 3) = ACOV(3, 2) = sum(der(:)); + + return +endif + +#general case + +z = 1 + k .* (x - mu) ./ sigma; + +#k, k +a = (x - mu) ./ sigma; +b = k .* a + 1; +c = log(b); +d = 1 ./ k + 1; +if nargin > 5 && ~isempty(kk_terms) #use a series expansion to find the derivatives more accurately when k is small + aa = k .* a; + f = exp(-a .* k_terms); + kkk_terms = 2/3; sgn = 1; i = 0; + while 1 + sgn = -sgn; i++; + newterm = (sgn * (i + 1) * (i + 2) / (i + 3)) * (aa .^ i); + kkk_terms = kkk_terms + newterm; + if max(abs(newterm)) <= eps + break + endif + endwhile + der = (a .^ 2) .* (a .* (a .* kk_terms .^ 2 - kkk_terms) .* f + a .* (1 + k) .* kkk_terms - 2 * kk_terms); +else + der = ((((c ./ k.^2) - (a ./ (k .* b))) .^ 2) ./ (b .^ (1 ./ k))) + ... + ((-2*c ./ k.^3) + (2*a ./ (k.^2 .* b)) + ((a ./ b) .^ 2 ./ k)) ./ (b .^ (1 ./ k)) + ... + 2*c ./ k.^3 - ... + (2*a ./ (k.^2 .* b)) - (d .* (a ./ b) .^ 2); +endif +der(z <= 0) = 0; %no probability mass in this region +ACOV(1, 1) = sum(der(:)); + +#sigma, sigma +if nargin > 5 && ~isempty(kk_terms) #use a series expansion to find the derivatives more accurately when k is small + der = ((-2*a .* k_terms + 4 * a .^ 2 .* k .* kk_terms - a .^ 3 .* (k .^ 2) .* kkk_terms) .* (f - k - 1) + f .* ((a .* (k_terms - a .* k .* kk_terms)) .^ 2) - 1) ./ (sigma .^ 2); +else + der = (sigma .^ -2) .* (... + -2*a .* b .^ (-d) + ... + d .* k .* a .^ 2 .* (b .^ (-d-1)) + ... + 2 .* d .* k .* a ./ b - ... + d .* (k .* a ./ b) .^ 2 - 1); +end +der(z <= 0) = 0; %no probability mass in this region +ACOV(2, 2) = sum(der(:)); + +#mu, mu +if nargin > 5 && ~isempty(kk_terms) #use a series expansion to find the derivatives involving k more accurately when k is small + der = (f .* (a .* k .* kk_terms - k_terms) .^ 2 - a .* k .^ 2 .* kkk_terms .* (f - k - 1)) ./ (sigma .^ 2); +else + der = (d .* (sigma .^ -2)) .* (... + k .* (b .^ (-d-1)) - ... + (k ./ b) .^ 2); +endif +der(z <= 0) = 0; %no probability mass in this region +ACOV(3, 3) = sum(der(:)); + + +#k, mu +if nargin > 5 && ~isempty(kk_terms) #use a series expansion to find the derivatives involving k more accurately when k is small + der = 2 * a .* kk_terms .* (f - 1 - k) - a .^ 2 .* k_terms .* kk_terms .* f + k_terms; #k, a second derivative + der = -der ./ sigma; +else + der = ( (b .^ (-d)) .* (c ./ k - a ./ b) ./ k - ... +a .* (b .^ (-d-1)) + ... +((1 ./ k) - d) ./ b + +a .* k .* d ./ (b .^ 2)) ./ sigma; +endif +der(z <= 0) = 0; %no probability mass in this region +ACOV(1, 3) = ACOV(3, 1) = sum(der(:)); + +#k, sigma +der = a .* der; +der(z <= 0) = 0; %no probability mass in this region +ACOV(1, 2) = ACOV(2, 1) = sum(der(:)); + +#sigma, mu +if nargin > 5 && ~isempty(kk_terms) #use a series expansion to find the derivatives involving k more accurately when k is small + der = ((-k_terms + 3 * a .* k .* kk_terms - (a .* k) .^ 2 .* kkk_terms) .* (f - k - 1) + a .* (k_terms - a .* k .* kk_terms) .^ 2 .* f) ./ (sigma .^ 2); +else + der = ( -(b .^ (-d)) + ... +a .* k .* d .* (b .^ (-d-1)) + ... +(d .* k ./ b) - a .* (k./b).^2 .* d) ./ (sigma .^ 2); +end +der(z <= 0) = 0; %no probability mass in this region +ACOV(2, 3) = ACOV(3, 2) = sum(der(:)); + +endfunction + + + + +%!test +%! x = 1; +%! k = 0.2; +%! sigma = 0.3; +%! mu = 0.5; +%! [L, D, C] = gevlike ([k sigma mu], x); +%! expected_L = 0.75942; +%! expected_D = [0.53150; -0.67790; -2.40674]; +%! expected_C = [-0.12547 1.77884 1.06731; 1.77884 16.40761 8.48877; 1.06731 8.48877 0.27979]; +%! assert (L, expected_L, 0.001); +%! assert (D, expected_D, 0.001); +%! assert (C, expected_C, 0.001); + +%!test +%! x = 1; +%! k = 0; +%! sigma = 0.3; +%! mu = 0.5; +%! [L, D, C] = gevlike ([k sigma mu], x); +%! expected_L = 0.65157; +%! expected_D = [0.54011; -1.17291; -2.70375]; +%! expected_C = [0.090036 3.41229 2.047337; 3.412229 24.760027 12.510190; 2.047337 12.510190 2.098618]; +%! assert (L, expected_L, 0.001); +%! assert (D, expected_D, 0.001); +%! assert (C, expected_C, 0.001); + +%!test +%! x = -5:-1; +%! k = -0.2; +%! sigma = 0.3; +%! mu = 0.5; +%! [L, D, C] = gevlike ([k sigma mu], x); +%! expected_L = 3786.4; +%! expected_D = [6.4511e+04; -4.8194e+04; 3.0633e+03]; +%! expected_C = -[-1.4937e+06 1.0083e+06 -6.1837e+04; 1.0083e+06 -8.1138e+05 4.0917e+04; -6.1837e+04 4.0917e+04 -2.0422e+03]; +%! assert (L, expected_L, -0.001); +%! assert (D, expected_D, -0.001); +%! assert (C, expected_C, -0.001); diff --git a/inst/gevpdf.m b/inst/gevpdf.m new file mode 100644 index 0000000..ac86fcf --- /dev/null +++ b/inst/gevpdf.m @@ -0,0 +1,130 @@ +## Copyright (C) 2012 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{y} =} gevpdf (@var{x}, @var{k}, @var{sigma}, @var{mu}) +## Compute the probability density function of the generalized extreme value (GEV) distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is the support. +## +## @item +## @var{k} is the shape parameter of the GEV distribution. (Also denoted gamma or xi.) +## @item +## @var{sigma} is the scale parameter of the GEV distribution. The elements +## of @var{sigma} must be positive. +## @item +## @var{mu} is the location parameter of the GEV distribution. +## @end itemize +## The inputs must be of common size, or some of them must be scalar. +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{y} is the probability density of the GEV distribution at each +## element of @var{x} and corresponding parameter values. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = 0:0.5:2.5; +## sigma = 1:6; +## k = 1; +## mu = 0; +## y = gevpdf (x, k, sigma, mu) +## @end group +## +## @group +## y = gevpdf (x, k, 0.5, mu) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Rolf-Dieter Reiss and Michael Thomas. @cite{Statistical Analysis of Extreme Values with Applications to Insurance, Finance, Hydrology and Other Fields}. Chapter 1, pages 16-17, Springer, 2007. +## +## @end enumerate +## @seealso{gevcdf, gevfit, gevinv, gevlike, gevrnd, gevstat} +## @end deftypefn + +## Author: Nir Krakauer +## Description: PDF of the generalized extreme value distribution + +function y = gevpdf (x, k, sigma, mu) + + # Check arguments + if (nargin != 4) + print_usage (); + endif + + if (isempty (x) || isempty (k) || isempty (sigma) || isempty (mu) || ~ismatrix (x) || ~ismatrix (k) || ~ismatrix (sigma) || ~ismatrix (mu)) + error ("gevpdf: inputs must be numeric matrices"); + endif + + [retval, x, k, sigma, mu] = common_size (x, k, sigma, mu); + if (retval > 0) + error ("gevpdf: inputs must be of common size or scalars"); + endif + + z = 1 + k .* (x - mu) ./ sigma; + + # Calculate pdf + y = exp(-(z .^ (-1 ./ k))) .* (z .^ (-1 - 1 ./ k)) ./ sigma; + + y(z <= 0) = 0; + + inds = (abs (k) < (eps^0.7)); %use a different formula if k is very close to zero + if any(inds) + z = (mu(inds) - x(inds)) ./ sigma(inds); + y(inds) = exp(z-exp(z)) ./ sigma(inds); + endif + + +endfunction + +%!test +%! x = 0:0.5:2.5; +%! sigma = 1:6; +%! k = 1; +%! mu = 0; +%! y = gevpdf (x, k, sigma, mu); +%! expected_y = [0.367879 0.143785 0.088569 0.063898 0.049953 0.040997]; +%! assert (y, expected_y, 0.001); + +%!test +%! x = -0.5:0.5:2.5; +%! sigma = 0.5; +%! k = 1; +%! mu = 0; +%! y = gevpdf (x, k, sigma, mu); +%! expected_y = [0 0.735759 0.303265 0.159229 0.097350 0.065498 0.047027]; +%! assert (y, expected_y, 0.001); + +%!test #check for continuity for k near 0 +%! x = 1; +%! sigma = 0.5; +%! k = -0.03:0.01:0.03; +%! mu = 0; +%! y = gevpdf (x, k, sigma, mu); +%! expected_y = [0.23820 0.23764 0.23704 0.23641 0.23576 0.23508 0.23438]; +%! assert (y, expected_y, 0.001); diff --git a/inst/gevrnd.m b/inst/gevrnd.m new file mode 100644 index 0000000..583c5e9 --- /dev/null +++ b/inst/gevrnd.m @@ -0,0 +1,121 @@ +## Copyright (C) 2012 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {Function File} {} gevrnd (@var{k}, @var{sigma}, @var{mu}) +## @deftypefnx {Function File} {} gevrnd (@var{k}, @var{sigma}, @var{mu}, @var{r}) +## @deftypefnx {Function File} {} gevrnd (@var{k}, @var{sigma}, @var{mu}, @var{r}, @var{c}, @dots{}) +## @deftypefnx {Function File} {} gevrnd (@var{k}, @var{sigma}, @var{mu}, [@var{sz}]) +## Return a matrix of random samples from the generalized extreme value (GEV) distribution with parameters +## @var{k}, @var{sigma}, @var{mu}. +## +## When called with a single size argument, returns a square matrix with +## the dimension specified. When called with more than one scalar argument the +## first two arguments are taken as the number of rows and columns and any +## further arguments specify additional matrix dimensions. The size may also +## be specified with a vector @var{sz} of dimensions. +## +## If no size arguments are given, then the result matrix is the common size of +## the input parameters. +## @seealso{gevcdf, gevfit, gevinv, gevlike, gevpdf, gevstat} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Random deviates from the generalized extreme value distribution + +function rnd = gevrnd (k, sigma, mu, varargin) + + if (nargin < 3) + print_usage (); + endif + + if any (sigma <= 0) + error ("gevrnd: sigma must be positive"); + endif + + if (!isscalar (k) || !isscalar (sigma) || !isscalar (mu)) + [retval, k, sigma, mu] = common_size (k, sigma, mu); + if (retval > 0) + error ("gevrnd: k, sigma, mu must be of common size or scalars"); + endif + endif + + if (iscomplex (k) || iscomplex (sigma) || iscomplex (mu)) + error ("gevrnd: k, sigma, mu must not be complex"); + endif + + if (nargin == 3) + sz = size (k); + elseif (nargin == 4) + if (isscalar (varargin{1}) && varargin{1} >= 0) + sz = [varargin{1}, varargin{1}]; + elseif (isrow (varargin{1}) && all (varargin{1} >= 0)) + sz = varargin{1}; + else + error ("gevrnd: dimension vector must be row vector of non-negative integers"); + endif + elseif (nargin > 4) + if (any (cellfun (@(x) (!isscalar (x) || x < 0), varargin))) + error ("gevrnd: dimensions must be non-negative integers"); + endif + sz = [varargin{:}]; + endif + + if (!isscalar (k) && !isequal (size (k), sz)) + error ("gevrnd: k, sigma, mu must be scalar or of size SZ"); + endif + + if (isa (k, "single") || isa (sigma, "single") || isa (mu, "single")) + cls = "single"; + else + cls = "double"; + endif + + rnd = gevinv (rand(sz), k, sigma, mu); + + if (strcmp (cls, "single")) + rnd = single (rnd); + endif + +endfunction + + +%!assert(size (gevrnd (1,2,1)), [1, 1]); +%!assert(size (gevrnd (ones(2,1), 2, 1)), [2, 1]); +%!assert(size (gevrnd (ones(2,2), 2, 1)), [2, 2]); +%!assert(size (gevrnd (1, 2*ones(2,1), 1)), [2, 1]); +%!assert(size (gevrnd (1, 2*ones(2,2), 1)), [2, 2]); +%!assert(size (gevrnd (1, 2, 1, 3)), [3, 3]); +%!assert(size (gevrnd (1, 2, 1, [4 1])), [4, 1]); +%!assert(size (gevrnd (1, 2, 1, 4, 1)), [4, 1]); + +%% Test input validation +%!error gevrnd () +%!error gevrnd (1, 2) +%!error gevrnd (ones(3),ones(2),1) +%!error gevrnd (ones(2),ones(3),1) +%!error gevrnd (i, 2, 1) +%!error gevrnd (2, i, 1) +%!error gevrnd (2, 0, 1) +%!error gevrnd (1,2, 1, -1) +%!error gevrnd (1,2, 1, ones(2)) +%!error gevrnd (1,2, 1, [2 -1 2]) +%!error gevrnd (1,2, 1, 1, ones(2)) +%!error gevrnd (1,2, 1, 1, -1) +%!error gevrnd (ones(2,2), 2, 1, 3) +%!error gevrnd (ones(2,2), 2, 1, [3, 2]) +%!error gevrnd (ones(2,2), 2, 1, 2, 3) + diff --git a/inst/gevstat.m b/inst/gevstat.m new file mode 100644 index 0000000..ccf2b36 --- /dev/null +++ b/inst/gevstat.m @@ -0,0 +1,90 @@ +## Copyright (C) 2012 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} gevstat (@var{k}, @var{sigma}, @var{mu}) +## Compute the mean and variance of the generalized extreme value (GEV) distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{k} is the shape parameter of the GEV distribution. (Also denoted gamma or xi.) +## @item +## @var{sigma} is the scale parameter of the GEV distribution. The elements +## of @var{sigma} must be positive. +## @item +## @var{mu} is the location parameter of the GEV distribution. +## @end itemize +## The inputs must be of common size, or some of them must be scalar. +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the GEV distribution +## +## @item +## @var{v} is the variance of the GEV distribution +## @end itemize +## @seealso{gevcdf, gevfit, gevinv, gevlike, gevpdf, gevrnd} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Moments of the generalized extreme value distribution + +function [m, v] = gevstat (k, sigma, mu) + + # Check arguments + if (nargin < 3) + print_usage (); + endif + + if (isempty (k) || isempty (sigma) || isempty (mu) || ~ismatrix (k) || ~ismatrix (sigma) || ~ismatrix (mu)) + error ("gevstat: inputs must be numeric matrices"); + endif + + [retval, k, sigma, mu] = common_size (k, sigma, mu); + if (retval > 0) + error ("gevstat: inputs must be of common size or scalars"); + endif + + eg = 0.57721566490153286; %Euler-Mascheroni constant + + m = v = k; + + #find the mean + m(k >= 1) = Inf; + m(k == 0) = mu(k == 0) + eg*sigma(k == 0); + m(k < 1 & k ~= 0) = mu(k < 1 & k ~= 0) + sigma(k < 1 & k ~= 0) .* (gamma(1-k(k < 1 & k ~= 0)) - 1) ./ k(k < 1 & k ~= 0); + + #find the variance + v(k >= 0.5) = Inf; + v(k == 0) = (pi^2 / 6) * sigma(k == 0) .^ 2; + v(k < 0.5 & k ~= 0) = (gamma(1-2*k(k < 0.5 & k ~= 0)) - gamma(1-k(k < 0.5 & k ~= 0)).^2) .* (sigma(k < 0.5 & k ~= 0) ./ k(k < 0.5 & k ~= 0)) .^ 2; + + + +endfunction + +%!test +%! k = [-1 -0.5 0 0.2 0.4 0.5 1]; +%! sigma = 2; +%! mu = 1; +%! [m, v] = gevstat (k, sigma, mu); +%! expected_m = [1 1.4551 2.1544 2.6423 3.4460 4.0898 Inf]; +%! expected_v = [4 3.4336 6.5797 13.3761 59.3288 Inf Inf]; +%! assert (m, expected_m, -0.001); +%! assert (v, expected_v, -0.001); diff --git a/inst/gmdistribution.m b/inst/gmdistribution.m new file mode 100644 index 0000000..ba1ad78 --- /dev/null +++ b/inst/gmdistribution.m @@ -0,0 +1,349 @@ +## Copyright (C) 2015 Lachlan Andrew +## Copyright (C) 2018-2020 John Donoghue +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef gmdistribution + ## -*- texinfo -*- + ## @deftypefn {Function File} {@var{GMdist} =} gmdistribution (@var{mu}, @var{Sigma}) + ## @deftypefnx {Function File} {@var{GMdist} =} gmdistribution (@var{mu}, @var{Sigma}, @var{p}) + ## @deftypefnx {Function File} {@var{GMdist} =} gmdistribution (@var{mu}, @var{Sigma}, @var{p}, @var{extra}) + ## Create an object of the gmdistribution class which represents a Gaussian + ## mixture model with k components of n-dimensional Gaussians. + ## + ## Input @var{mu} is a k-by-n matrix specifying the n-dimensional mean of each + ## of the k components of the distribution. + ## + ## Input @var{Sigma} is an array that specifies the variances of the + ## distributions, in one of four forms depending on its dimension. + ## @itemize + ## @item n-by-n-by-k: Slice @var{Sigma}(:,:,i) is the variance of the + ## i'th component + ## @item 1-by-n-by-k: Slice diag(@var{Sigma}(1,:,i)) is the variance of the + ## i'th component + ## @item n-by-n: @var{Sigma} is the variance of every component + ## @item 1-by-n-by-k: Slice diag(@var{Sigma}) is the variance of every + ## component + ## @end itemize + ## + ## If @var{p} is specified, it is a vector of length k specifying the proportion + ## of each component. If it is omitted or empty, each component has an equal + ## proportion. + ## + ## Input @var{extra} is used by fitgmdist to indicate the parameters of the + ## fitting process. + ## @seealso{fitgmdist} + ## @end deftypefn + + properties + mu ## means + Sigma ## covariances + ComponentProportion ## mixing proportions + DistributionName ## "gaussian mixture distribution" + NumComponents ## Number of mixture components + NumVariables ## Dimension d of each Gaussian component + + CovarianceType ## 'diagonal' if DiagonalCovariance, 'full' othw + SharedCovariance ## true if all components have equal covariance + + ## Set by a call to gmdistribution.fit or fitgmdist + AIC ## Akaike Information Criterion + BIC ## Bayes Information Criterion + Converged ## true if algorithm converged by MaxIter + NegativeLogLikelihood ## Negative of log-likelihood + NlogL ## Negative of log-likelihood + NumIterations ## Number of iterations + RegularizationValue ## const added to diag of cov to make +ve def + endproperties + + properties (Access = private) + DiagonalCovariance ## bool summary of "CovarianceType" + endproperties + + methods + ######################################## + ## Constructor + function obj = gmdistribution (mu,sigma,p = [],extra = []) + obj.DistributionName = "gaussian mixture distribution"; + obj.mu = mu; + obj.Sigma = sigma; + obj.NumComponents = rows (mu); + obj.NumVariables = columns (mu); + if (isempty (p)) + obj.ComponentProportion = ones (1,obj.NumComponents) / obj.NumComponents; + else + if any (p < 0) + error ("gmmdistribution: component weights must be non-negative"); + endif + s = sum(p); + if (s == 0) + error ("gmmdistribution: component weights must not be all zero"); + elseif (s != 1) + p = p / s; + endif + obj.ComponentProportion = p(:)'; + endif + if (length (size (sigma)) == 3) + obj.SharedCovariance = false; + else + obj.SharedCovariance = true; + endif + if (rows (sigma) == 1 && columns (mu) > 1) + obj.DiagonalCovariance = true; + obj.CovarianceType = 'diagonal'; + else + obj.DiagonalCovariance = false; ## full + obj.CovarianceType = 'full'; + endif + + if (!isempty (extra)) + obj.AIC = extra.AIC; + obj.BIC = extra.BIC; + obj.Converged = extra.Converged; + obj.NegativeLogLikelihood = extra.NegativeLogLikelihood; + obj.NlogL = extra.NegativeLogLikelihood; + obj.NumIterations = extra.NumIterations; + obj.RegularizationValue = extra.RegularizationValue; + endif + endfunction + + ######################################## + ## Cumulative distribution function for Gaussian mixture distribution + function c = cdf (obj, X) + X = checkX (obj, X, "cdf"); + p_x_l = zeros (rows (X), obj.NumComponents); + if (obj.SharedCovariance) + if (obj.DiagonalCovariance) + sig = diag (obj.Sigma); + else + sig = obj.Sigma; + endif + endif + for i = 1:obj.NumComponents + if (!obj.SharedCovariance) + if (obj.DiagonalCovariance) + sig = diag (obj.Sigma(:,:,i)); + else + sig = obj.Sigma(:,:,i); + endif + endif + p_x_l(:,i) = mvncdf (X,obj.mu(i,:),sig)*obj.ComponentProportion(i); + endfor + c = sum (p_x_l, 2); + endfunction + + ######################################## + ## Construct clusters from Gaussian mixture distribution + ## + function [idx,nlogl,P,logpdf,M] = cluster (obj,X) + X = checkX (obj, X, "cluster"); + [p_x_l, M] = componentProb (obj, X); + [~, idx] = max (p_x_l, [], 2); + if (nargout >= 2) + PDF = sum (p_x_l, 2); + logpdf = log (PDF); + nlogl = -sum (logpdf); + if (nargout >= 3) + P = bsxfun (@rdivide, p_x_l, PDF); + endif + endif + endfunction + + ######################################## + ## Display Gaussian mixture distribution object + function c = disp (obj) + fprintf("Gaussian mixture distribution with %d components in %d dimension(s)\n", obj.NumComponents, columns (obj.mu)); + for i = 1:obj.NumComponents + fprintf("Clust %d: weight %d\n\tMean: ", i, obj.ComponentProportion(i)); + fprintf("%g ", obj.mu(i,:)); + fprintf("\n"); + if (!obj.SharedCovariance) + fprintf("\tVariance:"); + if (!obj.DiagonalCovariance) + if columns (obj.mu) > 1 + fprintf("\n"); + endif + disp(squeeze(obj.Sigma(:,:,i))) + else + fprintf(" diag("); + fprintf("%g ", obj.Sigma(:,:,i)); + fprintf(")\n"); + endif + endif + endfor + if (obj.SharedCovariance) + fprintf("Shared variance\n"); + if (!obj.DiagonalCovariance) + obj.Sigma + else + fprintf(" diag("); + fprintf("%g ", obj.Sigma); + fprintf(")\n"); + endif + endif + if (!isempty (obj.AIC)) + fprintf("AIC=%g BIC=%g NLogL=%g Iter=%d Cged=%d Reg=%g\n", ... + obj.AIC, obj.BIC, obj.NegativeLogLikelihood, ... + obj.NumIterations, obj.Converged, obj.RegularizationValue); + endif + endfunction + + ######################################## + ## Display Gaussian mixture distribution object + function c = display (obj) + disp(obj); + endfunction + + ######################################## + ## Mahalanobis distance to component means + function D = mahal (obj,X) + X = checkX (obj, X, "mahal"); + [~, D] = componentProb (obj,X); + endfunction + + ######################################## + ## Probability density function for Gaussian mixture distribution + function c = pdf (obj,X) + X = checkX (obj, X, "pdf"); + p_x_l = componentProb (obj, X); + c = sum (p_x_l, 2); + endfunction + + ######################################## + ## Posterior probabilities of components + function c = posterior (obj,X) + X = checkX (obj, X, "posterior"); + p_x_l = componentProb (obj, X); + c = bsxfun(@rdivide, p_x_l, sum (p_x_l, 2)); + endfunction + + ######################################## + ## Random numbers from Gaussian mixture distribution + function c = random (obj,n) + if nargin == 1 + n = 1; + endif + c = zeros (n, obj.NumVariables); + classes = randsample (obj.NumComponents, n, true, obj.ComponentProportion); + if (obj.SharedCovariance) + if (obj.DiagonalCovariance) + sig = diag (obj.Sigma); + else + sig = obj.Sigma; + endif + endif + for i = 1:obj.NumComponents + idx = (classes == i); + k = sum(idx); + if k > 0 + if (!obj.SharedCovariance) + if (obj.DiagonalCovariance) + sig = diag (obj.Sigma(:,:,i)); + else + sig = obj.Sigma(:,:,i); + endif + endif + # [sig] forces [sig] not to have class "diagonal", + # since mvnrnd uses automatic broadcast, + # which fails on structured matrices + c(idx,:) = mvnrnd ([obj.mu(i,:)], [sig], k); + endif + endfor + endfunction + endmethods + + ######################################## + methods (Static) + ## Gaussian mixture parameter estimates + function c = fit (X,k,varargin) + c = fitgmdist (X,k,varargin{:}); + endfunction + endmethods + + ######################################## + methods (Access = private) + ## Probability density of (row of) X *and* component l + ## Second argument is an array of the Mahalonis distances + function [p_x_l, M] = componentProb (obj, X) + M = zeros (rows (X), obj.NumComponents); + dets = zeros (1, obj.NumComponents); % sqrt(determinant) + if (obj.SharedCovariance) + if (obj.DiagonalCovariance) + r = diag (sqrt(obj.Sigma)); + else + r = chol (obj.Sigma); + endif + endif + for i = 1:obj.NumComponents + dev = bsxfun (@minus, X, obj.mu(i,:)); + if (!obj.SharedCovariance) + if (obj.DiagonalCovariance) + r = diag (sqrt (obj.Sigma(:,:,i))); + else + r = chol (obj.Sigma(:,:,i)); + endif + endif + M(:,i) = sumsq (dev / r, 2); + dets(i) = prod (diag (r)); + endfor + p_x_l = exp (-M/2); + coeff = obj.ComponentProportion ./ ((2*pi)^(obj.NumVariables/2).*dets); + p_x_l = bsxfun (@times, p_x_l, coeff); + endfunction + + ######################################## + ## Check format of argument X + function X = checkX (obj, X, name) + if (columns (X) != obj.NumVariables) + if (columns (X) == 1 && rows (X) == obj.NumVariables) + X = X'; + else + error ("gmdistribution.%s: X has %d columns instead of %d\n", ... + name, columns (X), obj.NumVariables); + end + endif + endfunction + endmethods +endclassdef + +%!test +%! mu = eye(2); +%! Sigma = eye(2); +%! GM = gmdistribution (mu, Sigma); +%! density = GM.pdf ([0 0; 1 1]); +%! assert (density(1) - density(2), 0, 1e-6); +%! +%! [idx, nlogl, P, logpdf,M] = cluster (GM, eye(2)); +%! assert (idx, [1; 2]); +%! [idx2,nlogl2,P2,logpdf2] = GM.cluster (eye(2)); +%! assert (nlogl - nlogl2, 0, 1e-6); +%! [idx3,nlogl3,P3] = cluster (GM, eye(2)); +%! assert (P - P3, zeros (2), 1e-6); +%! [idx4,nlogl4] = cluster (GM, eye(2)); +%! assert (size (nlogl4), [1 1]); +%! idx5 = cluster (GM, eye(2)); +%! assert (idx - idx5, zeros (2,1)); +%! +%! D = GM.mahal ([1;0]); +%! assert (D - M(1,:), zeros (1,2), 1e-6); +%! +%! P = GM.posterior ([0 1]); +%! assert (P - P2(2,:), zeros (1,2), 1e-6); +%! +%! R = GM.random(20); +%! assert (size(R), [20, 2]); +%! +%! R = GM.random(); +%! assert (size(R), [1, 2]); + diff --git a/inst/gpcdf.m b/inst/gpcdf.m new file mode 100644 index 0000000..060c12d --- /dev/null +++ b/inst/gpcdf.m @@ -0,0 +1,176 @@ +## Copyright (C) 2018 John Donoghue +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1997-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} gpcdf (@var{x}, @var{shape}, @var{scale}, @var{location}) +## Compute the cumulative distribution function (CDF) at @var{x} of the +## generalized Pareto distribution with parameters @var{location}, @var{scale}, +## and @var{shape}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: PDF of the generalized Pareto distribution + +function cdf = gpcdf (x, shape, scale, location) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (location) || ! isscalar (scale) || ! isscalar (shape)) + [retval, x, location, scale, shape] = ... + common_size (x, location, scale, shape); + if (retval > 0) + error ("gpcdf: X, LOCATION, SCALE and SHAPE must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (location) || iscomplex (scale) ... + || iscomplex (shape)) + error ("gpcdf: X, LOCATION, SCALE and SHAPE must not be complex"); + endif + + if (isa (x, "single") || isa (location, "single") || isa (scale, "single") ... + || isa (shape, "single")) + cdf = zeros (size (x), "single"); + else + cdf = zeros (size (x)); + endif + + k = isnan (x) | ! (-Inf < location) | ! (location < Inf) | ! (scale > 0) ... + | ! (-Inf < shape) | ! (shape < Inf); + cdf(k) = NaN; + + k = (x == Inf) & (-Inf < location) & (location < Inf) & (scale > 0) ... + & (-Inf < shape) & (shape < Inf); + cdf(k) = 1; + + k = (-Inf < x) & (x < Inf) & (-Inf < location) & (location < Inf) ... + & (scale > 0) & (scale < Inf) & (-Inf < shape) & (shape < Inf); + if (isscalar (location) && isscalar (scale) && isscalar (shape)) + z = (x - location) / scale; + + j = k & (shape == 0) & (z >= 0); + if (any (j)) + cdf(j) = 1 - exp (-z(j)); + endif + + j = k & (shape > 0) & (z >= 0); + if (any (j)) + cdf(j) = 1 - (shape * z(j) + 1).^(-1 / shape); + endif + + if (shape < 0) + j = k & (shape < 0) & (0 <= z) & (z <= -1 ./ shape); + if (any (j)) + cdf(j) = 1 - (shape * z(j) + 1).^(-1 / shape); + endif + endif + else + z = (x - location) ./ scale; + + j = k & (shape == 0) & (z >= 0); + if (any (j)) + cdf(j) = 1 - exp (-z(j)); + endif + + j = k & (shape > 0) & (z >= 0); + if (any (j)) + cdf(j) = 1 - (shape(j) .* z(j) + 1).^(-1 ./ shape(j)); + endif + + if (any (shape < 0)) + j = k & (shape < 0) & (0 <= z) & (z <= -1 ./ shape); + if (any (j)) + cdf(j) = 1 - (shape(j) .* z(j) + 1).^(-1 ./ shape(j)); + endif + endif + endif + +endfunction + + +%!shared x,y1,y2,y3 +%! x = [-Inf, -1, 0, 1/2, 1, Inf]; +%! y1 = [0, 0, 0, 0.3934693402873666, 0.6321205588285577, 1]; +%! y2 = [0, 0, 0, 1/3, 1/2, 1]; +%! y3 = [0, 0, 0, 1/2, 1, 1]; +%! seps = eps('single')*5; +%!assert (gpcdf (x, zeros (1,6), ones (1,6), zeros (1,6)), y1, eps) +%!assert (gpcdf (x, 0, 1, zeros (1,6)), y1, eps) +%!assert (gpcdf (x, 0, ones (1,6), 0), y1, eps) +%!assert (gpcdf (x, zeros (1,6), 1, 0), y1, eps) +%!assert (gpcdf (x, 0, 1, 0), y1, eps) +%!assert (gpcdf (x, 0, 1, [0, 0, 0, NaN, 0, 0]), [y1(1:3), NaN, y1(5:6)], eps) +%!assert (gpcdf (x, 0, [1, 1, 1, NaN, 1, 1], 0), [y1(1:3), NaN, y1(5:6)], eps) +%!assert (gpcdf (x, [0, 0, 0, NaN, 0, 0], 1, 0), [y1(1:3), NaN, y1(5:6)], eps) +%!assert (gpcdf ([x(1:3), NaN, x(5:6)], 0, 1, 0), [y1(1:3), NaN, y1(5:6)], eps) + +%!assert (gpcdf (x, ones (1,6), ones (1,6), zeros (1,6)), y2, eps) +%!assert (gpcdf (x, 1, 1, zeros (1,6)), y2, eps) +%!assert (gpcdf (x, 1, ones (1,6), 0), y2, eps) +%!assert (gpcdf (x, ones (1,6), 1, 0), y2, eps) +%!assert (gpcdf (x, 1, 1, 0), y2, eps) +%!assert (gpcdf (x, 1, 1, [0, 0, 0, NaN, 0, 0]), [y2(1:3), NaN, y2(5:6)], eps) +%!assert (gpcdf (x, 1, [1, 1, 1, NaN, 1, 1], 0), [y2(1:3), NaN, y2(5:6)], eps) +%!assert (gpcdf (x, [1, 1, 1, NaN, 1, 1], 1, 0), [y2(1:3), NaN, y2(5:6)], eps) +%!assert (gpcdf ([x(1:3), NaN, x(5:6)], 1, 1, 0), [y2(1:3), NaN, y2(5:6)], eps) + +%!assert (gpcdf (x, -ones (1,6), ones (1,6), zeros (1,6)), y3, eps) +%!assert (gpcdf (x, -1, 1, zeros (1,6)), y3, eps) +%!assert (gpcdf (x, -1, ones (1,6), 0), y3, eps) +%!assert (gpcdf (x, -ones (1,6), 1, 0), y3, eps) +%!assert (gpcdf (x, -1, 1, 0), y3, eps) +%!assert (gpcdf (x, -1, 1, [0, 0, 0, NaN, 0, 0]), [y1(1:3), NaN, y3(5:6)], eps) +%!assert (gpcdf (x, -1, [1, 1, 1, NaN, 1, 1], 0), [y1(1:3), NaN, y3(5:6)], eps) +%!assert (gpcdf (x, [-1, -1, -1, NaN, -1, -1], 1, 0), [y1(1:3), NaN, y3(5:6)], eps) +%!assert (gpcdf ([x(1:3), NaN, x(5:6)], -1, 1, 0), [y1(1:3), NaN, y3(5:6)], eps) + +## Test class of input preserved +%!assert (gpcdf (single ([x, NaN]), 0, 1, 0), single ([y1, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], 0, 1, single (0)), single ([y1, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], 0, single (1), 0), single ([y1, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], single (0), 1, 0), single ([y1, NaN]), eps('single')) + +%!assert (gpcdf (single ([x, NaN]), 1, 1, 0), single ([y2, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], 1, 1, single (0)), single ([y2, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], 1, single (1), 0), single ([y2, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], single (1), 1, 0), single ([y2, NaN]), eps('single')) + +%!assert (gpcdf (single ([x, NaN]), -1, 1, 0), single ([y3, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], -1, 1, single (0)), single ([y3, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], -1, single (1), 0), single ([y3, NaN]), eps('single')) +%!assert (gpcdf ([x, NaN], single (-1), 1, 0), single ([y3, NaN]), eps('single')) + +## Test input validation +%!error gpcdf () +%!error gpcdf (1) +%!error gpcdf (1,2) +%!error gpcdf (1,2,3) +%!error gpcdf (1,2,3,4,5) +%!error gpcdf (ones (3), ones (2), ones (2), ones (2)) +%!error gpcdf (ones (2), ones (2), ones (2), ones (3)) +%!error gpcdf (ones (2), ones (2), ones (3), ones (2)) +%!error gpcdf (ones (2), ones (3), ones (2), ones (2)) +%!error gpcdf (i, 2, 2, 2) +%!error gpcdf (2, i, 2, 2) +%!error gpcdf (2, 2, i, 2) +%!error gpcdf (2, 2, 2, i) + diff --git a/inst/gpinv.m b/inst/gpinv.m new file mode 100644 index 0000000..7650f86 --- /dev/null +++ b/inst/gpinv.m @@ -0,0 +1,162 @@ +## Copyright (C) 2018 John Donoghue +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1997-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} gpinv (@var{x}, @var{shape}, @var{scale}, @var{location}) +## For each element of @var{x}, compute the quantile (the inverse of the CDF) +## at @var{x} of the generalized Pareto distribution with parameters +## @var{location}, @var{scale}, and @var{shape}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Quantile function of the generalized Pareto distribution + +function inv = gpinv (x, shape, scale, location) + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (location) || ! isscalar (scale) || ! isscalar (shape)) + [retval, x, location, scale, shape] = ... + common_size (x, location, scale, shape); + if (retval > 0) + error ("gpinv: X, LOCATION, SCALE and SHAPE must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (location) ... + || iscomplex (scale) || iscomplex (shape)) + error ("gpinv: X, LOCATION, SCALE and SHAPE must not be complex"); + endif + + if (isa (x, "single") || isa (location, "single") ... + || isa (scale, "single") || isa (shape, "single")) + inv = zeros (size (x), "single"); + else + inv = zeros (size (x)); + endif + + k = isnan (x) | ! (0 <= x) | ! (x <= 1) ... + | ! (-Inf < location) | ! (location < Inf) ... + | ! (scale > 0) | ! (scale < Inf) ... + | ! (-Inf < shape) | ! (shape < Inf); + inv(k) = NaN; + + k = (0 <= x) & (x <= 1) & (-Inf < location) & (location < Inf) ... + & (scale > 0) & (scale < Inf) & (-Inf < shape) & (shape < Inf); + if (isscalar (location) && isscalar (scale) && isscalar (shape)) + if (shape == 0) + inv(k) = -log(1 - x(k)); + inv(k) = scale * inv(k) + location; + elseif (shape > 0) + inv(k) = (1 - x(k)).^(-shape) - 1; + inv(k) = (scale / shape) * inv(k) + location; + elseif (shape < 0) + inv(k) = (1 - x(k)).^(-shape) - 1; + inv(k) = (scale / shape) * inv(k) + location; + end + else + j = k & (shape == 0); + if (any (j)) + inv(j) = -log (1 - x(j)); + inv(j) = scale(j) .* inv(j) + location(j); + endif + + j = k & (shape > 0); + if (any (j)) + inv(j) = (1 - x(j)).^(-shape(j)) - 1; + inv(j) = (scale(j) ./ shape(j)) .* inv(j) + location(j); + endif + + j = k & (shape < 0); + if (any (j)) + inv(j) = (1 - x(j)).^(-shape(j)) - 1; + inv(j) = (scale(j) ./ shape(j)) .* inv(j) + location(j); + endif + endif +endfunction + + +%!shared x,y1,y2,y3 +%! x = [-1, 0, 1/2, 1, 2]; +%! y1 = [NaN, 0, 0.6931471805599453, Inf, NaN]; +%! y2 = [NaN, 0, 1, Inf, NaN]; +%! y3 = [NaN, 0, 1/2, 1, NaN]; +%!assert (gpinv (x, zeros (1,5), ones (1,5), zeros (1,5)), y1) +%!assert (gpinv (x, 0, 1, zeros (1,5)), y1) +%!assert (gpinv (x, 0, ones (1,5), 0), y1) +%!assert (gpinv (x, zeros (1,5), 1, 0), y1) +%!assert (gpinv (x, 0, 1, 0), y1) +%!assert (gpinv (x, 0, 1, [0, 0, NaN, 0, 0]), [y1(1:2), NaN, y1(4:5)]) +%!assert (gpinv (x, 0, [1, 1, NaN, 1, 1], 0), [y1(1:2), NaN, y1(4:5)]) +%!assert (gpinv (x, [0, 0, NaN, 0, 0], 1, 0), [y1(1:2), NaN, y1(4:5)]) +%!assert (gpinv ([x(1:2), NaN, x(4:5)], 0, 1, 0), [y1(1:2), NaN, y1(4:5)]) + +%!assert (gpinv (x, ones (1,5), ones (1,5), zeros (1,5)), y2) +%!assert (gpinv (x, 1, 1, zeros (1,5)), y2) +%!assert (gpinv (x, 1, ones (1,5), 0), y2) +%!assert (gpinv (x, ones (1,5), 1, 0), y2) +%!assert (gpinv (x, 1, 1, 0), y2) +%!assert (gpinv (x, 1, 1, [0, 0, NaN, 0, 0]), [y2(1:2), NaN, y2(4:5)]) +%!assert (gpinv (x, 1, [1, 1, NaN, 1, 1], 0), [y2(1:2), NaN, y2(4:5)]) +%!assert (gpinv (x, [1, 1, NaN, 1, 1], 1, 0), [y2(1:2), NaN, y2(4:5)]) +%!assert (gpinv ([x(1:2), NaN, x(4:5)], 1, 1, 0), [y2(1:2), NaN, y2(4:5)]) + +%!assert (gpinv (x, -ones (1,5), ones (1,5), zeros (1,5)), y3) +%!assert (gpinv (x, -1, 1, zeros (1,5)), y3) +%!assert (gpinv (x, -1, ones (1,5), 0), y3) +%!assert (gpinv (x, -ones (1,5), 1, 0), y3) +%!assert (gpinv (x, -1, 1, 0), y3) +%!assert (gpinv (x, -1, 1, [0, 0, NaN, 0, 0]), [y3(1:2), NaN, y3(4:5)]) +%!assert (gpinv (x, -1, [1, 1, NaN, 1, 1], 0), [y3(1:2), NaN, y3(4:5)]) +%!assert (gpinv (x, -[1, 1, NaN, 1, 1], 1, 0), [y3(1:2), NaN, y3(4:5)]) +%!assert (gpinv ([x(1:2), NaN, x(4:5)], -1, 1, 0), [y3(1:2), NaN, y3(4:5)]) + +## Test class of input preserved +%!assert (gpinv (single ([x, NaN]), 0, 1, 0), single ([y1, NaN])) +%!assert (gpinv ([x, NaN], 0, 1, single (0)), single ([y1, NaN])) +%!assert (gpinv ([x, NaN], 0, single (1), 0), single ([y1, NaN])) +%!assert (gpinv ([x, NaN], single (0), 1, 0), single ([y1, NaN])) + +%!assert (gpinv (single ([x, NaN]), 1, 1, 0), single ([y2, NaN])) +%!assert (gpinv ([x, NaN], 1, 1, single (0)), single ([y2, NaN])) +%!assert (gpinv ([x, NaN], 1, single (1), 0), single ([y2, NaN])) +%!assert (gpinv ([x, NaN], single (1), 1, 0), single ([y2, NaN])) + +%!assert (gpinv (single ([x, NaN]), -1, 1, 0), single ([y3, NaN])) +%!assert (gpinv ([x, NaN], -1, 1, single (0)), single ([y3, NaN])) +%!assert (gpinv ([x, NaN], -1, single (1), 0), single ([y3, NaN])) +%!assert (gpinv ([x, NaN], single (-1), 1, 0), single ([y3, NaN])) + +## Test input validation +%!error gpinv () +%!error gpinv (1) +%!error gpinv (1,2) +%!error gpinv (1,2,3) +%!error gpinv (1,2,3,4,5) +%!error gpinv (ones (3), ones (2), ones (2), ones (2)) +%!error gpinv (ones (2), ones (3), ones (2), ones (2)) +%!error gpinv (ones (2), ones (2), ones (3), ones (2)) +%!error gpinv (ones (2), ones (2), ones (2), ones (3)) +%!error gpinv (i, 2, 2, 2) +%!error gpinv (2, i, 2, 2) +%!error gpinv (2, 2, i, 2) +%!error gpinv (2, 2, 2, i) + diff --git a/inst/gppdf.m b/inst/gppdf.m new file mode 100644 index 0000000..d85869a --- /dev/null +++ b/inst/gppdf.m @@ -0,0 +1,170 @@ +## Copyright (C) 2018 John Donoghue +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1997-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} gppdf (@var{x}, @var{shape}, @var{scale}, @var{location}) +## Compute the probability density function (PDF) at @var{x} of the +## generalized Pareto distribution with parameters @var{location}, @var{scale}, +## and @var{shape}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: PDF of the generalized Pareto distribution + +function pdf = gppdf (x, shape, scale, location) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (location) || ! isscalar (scale) || ! isscalar (shape)) + [retval, x, location, scale, shape] = ... + common_size (x, location, scale, shape); + if (retval > 0) + error ("gppdf: X, LOCATION, SCALE and SHAPE must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (location) || iscomplex (scale) ... + || iscomplex (shape)) + error ("gppdf: X, LOCATION, SCALE and SHAPE must not be complex"); + endif + + if (isa (x, "single") || isa (location, "single") || isa (scale, "single") ... + || isa (shape, "single")) + pdf = zeros (size (x), "single"); + else + pdf = zeros (size (x)); + endif + + k = isnan (x) | ! (-Inf < location) | ! (location < Inf) | ... + ! (scale > 0) | ! (scale < Inf) | ! (-Inf < shape) | ! (shape < Inf); + pdf(k) = NaN; + + k = (-Inf < x) & (x < Inf) & (-Inf < location) & (location < Inf) & ... + (scale > 0) & (scale < Inf) & (-Inf < shape) & (shape < Inf); + if (isscalar (location) && isscalar (scale) && isscalar (shape)) + z = (x - location) / scale; + + j = k & (shape == 0) & (z >= 0); + if (any (j)) + pdf(j) = exp (-z(j)); + endif + + j = k & (shape > 0) & (z >= 0); + if (any (j)) + pdf(j) = (shape * z(j) + 1).^(-(shape + 1) / shape) ./ scale; + endif + + if (shape < 0) + j = k & (shape < 0) & (0 <= z) & (z <= -1. / shape); + if (any (j)) + pdf(j) = (shape * z(j) + 1).^(-(shape + 1) / shape) ./ scale; + endif + endif + else + z = (x - location) ./ scale; + + j = k & (shape == 0) & (z >= 0); + if (any (j)) + pdf(j) = exp( -z(j)); + endif + + j = k & (shape > 0) & (z >= 0); + if (any (j)) + pdf(j) = (shape(j) .* z(j) + 1).^(-(shape(j) + 1) ./ shape(j)) ./ scale(j); + endif + + if (any (shape < 0)) + j = k & (shape < 0) & (0 <= z) & (z <= -1 ./ shape); + if (any (j)) + pdf(j) = (shape(j) .* z(j) + 1).^(-(shape(j) + 1) ./ shape(j)) ./ scale(j); + endif + endif + endif + +endfunction + + +%!shared x,y1,y2,y3 +%! x = [-Inf, -1, 0, 1/2, 1, Inf]; +%! y1 = [0, 0, 1, 0.6065306597126334, 0.36787944117144233, 0]; +%! y2 = [0, 0, 1, 4/9, 1/4, 0]; +%! y3 = [0, 0, 1, 1, 1, 0]; +%!assert (gppdf (x, zeros (1,6), ones (1,6), zeros (1,6)), y1, eps) +%!assert (gppdf (x, 0, 1, zeros (1,6)), y1, eps) +%!assert (gppdf (x, 0, ones (1,6), 0), y1, eps) +%!assert (gppdf (x, zeros (1,6), 1, 0), y1, eps) +%!assert (gppdf (x, 0, 1, 0), y1, eps) +%!assert (gppdf (x, 0, 1, [0, 0, 0, NaN, 0, 0]), [y1(1:3), NaN, y1(5:6)]) +%!assert (gppdf (x, 0, [1, 1, 1, NaN, 1, 1], 0), [y1(1:3), NaN, y1(5:6)]) +%!assert (gppdf (x, [0, 0, 0, NaN, 0, 0], 1, 0), [y1(1:3), NaN, y1(5:6)]) +%!assert (gppdf ([x(1:3), NaN, x(5:6)], 0, 1, 0), [y1(1:3), NaN, y1(5:6)]) + +%!assert (gppdf (x, ones (1,6), ones (1,6), zeros (1,6)), y2, eps) +%!assert (gppdf (x, 1, 1, zeros (1,6)), y2, eps) +%!assert (gppdf (x, 1, ones (1,6), 0), y2, eps) +%!assert (gppdf (x, ones (1,6), 1, 0), y2, eps) +%!assert (gppdf (x, 1, 1, 0), y2, eps) +%!assert (gppdf (x, 1, 1, [0, 0, 0, NaN, 0, 0]), [y2(1:3), NaN, y2(5:6)]) +%!assert (gppdf (x, 1, [1, 1, 1, NaN, 1, 1], 0), [y2(1:3), NaN, y2(5:6)]) +%!assert (gppdf (x, [1, 1, 1, NaN, 1, 1], 1, 0), [y2(1:3), NaN, y2(5:6)]) +%!assert (gppdf ([x(1:3), NaN, x(5:6)], 1, 1, 0), [y2(1:3), NaN, y2(5:6)]) + +%!assert (gppdf (x, -ones (1,6), ones (1,6), zeros (1,6)), y3, eps) +%!assert (gppdf (x, -1, 1, zeros (1,6)), y3, eps) +%!assert (gppdf (x, -1, ones (1,6), 0), y3, eps) +%!assert (gppdf (x, -ones (1,6), 1, 0), y3, eps) +%!assert (gppdf (x, -1, 1, 0), y3, eps) +%!assert (gppdf (x, -1, 1, [0, 0, 0, NaN, 0, 0]), [y3(1:3), NaN, y3(5:6)]) +%!assert (gppdf (x, -1, [1, 1, 1, NaN, 1, 1], 0), [y3(1:3), NaN, y3(5:6)]) +%!assert (gppdf (x, [-1, -1, -1, NaN, -1, -1], 1, 0), [y3(1:3), NaN, y3(5:6)]) +%!assert (gppdf ([x(1:3), NaN, x(5:6)], -1, 1, 0), [y3(1:3), NaN, y3(5:6)]) + +## Test class of input preserved +%!assert (gppdf (single ([x, NaN]), 0, 1, 0), single ([y1, NaN])) +%!assert (gppdf ([x, NaN], 0, 1, single (0)), single ([y1, NaN])) +%!assert (gppdf ([x, NaN], 0, single (1), 0), single ([y1, NaN])) +%!assert (gppdf ([x, NaN], single (0), 1, 0), single ([y1, NaN])) + +%!assert (gppdf (single ([x, NaN]), 1, 1, 0), single ([y2, NaN])) +%!assert (gppdf ([x, NaN], 1, 1, single (0)), single ([y2, NaN])) +%!assert (gppdf ([x, NaN], 1, single (1), 0), single ([y2, NaN])) +%!assert (gppdf ([x, NaN], single (1), 1, 0), single ([y2, NaN])) + +%!assert (gppdf (single ([x, NaN]), -1, 1, 0), single ([y3, NaN])) +%!assert (gppdf ([x, NaN], -1, 1, single (0)), single ([y3, NaN])) +%!assert (gppdf ([x, NaN], -1, single (1), 0), single ([y3, NaN])) +%!assert (gppdf ([x, NaN], single (-1), 1, 0), single ([y3, NaN])) + +## Test input validation +%!error gppdf () +%!error gppdf (1) +%!error gppdf (1,2) +%!error gppdf (1,2,3) +%!error gppdf (1,2,3,4,5) +%!error gppdf (1, ones (2), ones (2), ones (3)) +%!error gppdf (1, ones (2), ones (3), ones (2)) +%!error gppdf (1, ones (3), ones (2), ones (2)) +%!error gppdf (i, 2, 2, 2) +%!error gppdf (2, i, 2, 2) +%!error gppdf (2, 2, i, 2) +%!error gppdf (2, 2, 2, i) + diff --git a/inst/gprnd.m b/inst/gprnd.m new file mode 100644 index 0000000..5511db8 --- /dev/null +++ b/inst/gprnd.m @@ -0,0 +1,174 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} gprnd (@var{shape}, @var{scale}, @var{location}) +## @deftypefnx {} {} gprnd (@var{shape}, @var{scale}, @var{location}, @var{r}) +## @deftypefnx {} {} gprnd (@var{shape}, @var{scale}, @var{location}, @var{r}, @var{c}, @dots{}) +## @deftypefnx {} {} gprnd (@var{shape}, @var{scale}, @var{location}, [@var{sz}]) +## Return a matrix of random samples from the generalized Pareto distribution +## with parameters @var{location}, @var{scale} and @var{shape}. +## +## When called with a single size argument, return a square matrix with +## the dimension specified. When called with more than one scalar argument the +## first two arguments are taken as the number of rows and columns and any +## further arguments specify additional matrix dimensions. The size may also +## be specified with a vector of dimensions @var{sz}. +## +## If no size arguments are given then the result matrix is the common size of +## @var{location}, @var{scale} and @var{shape}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Random deviates from the generalized Pareto distribution + +function rnd = gprnd (shape, scale, location, varargin) + + if (nargin < 3) + print_usage (); + endif + + if (! isscalar (location) || ! isscalar (scale) || ! isscalar (shape)) + [retval, location, scale, shape] = common_size (location, scale, shape); + if (retval > 0) + error ("gpgrnd: LOCATION, SCALE and SHAPE must be of common size or scalars"); + endif + endif + + if (iscomplex (location) || iscomplex (scale) || iscomplex (shape)) + error ("gprnd: LOCATION, SCALE and SHAPE must not be complex"); + endif + + if (nargin == 3) + sz = size (location); + elseif (nargin == 4) + if (isscalar (varargin{1}) && varargin{1} >= 0) + sz = [varargin{1}, varargin{1}]; + elseif (isrow (varargin{1}) && all (varargin{1} >= 0)) + sz = varargin{1}; + else + error ("gprnd: dimension vector must be row vector of non-negative integers"); + endif + elseif (nargin > 4) + if (any (cellfun (@(x) (! isscalar (x) || x < 0), varargin))) + error ("gprnd: dimensions must be non-negative integers"); + endif + sz = [varargin{:}]; + endif + + if (! isscalar (location) && ! isequal (size (location), sz)) + error ("gprnd: LOCATION, SCALE and SHAPE must be scalar or of size SZ"); + endif + + if (isa (location, "single") || isa (scale, "single") || isa (shape, "single")) + cls = "single"; + else + cls = "double"; + endif + + if (isscalar (location) && isscalar (scale) && isscalar (shape)) + if ((-Inf < location) && (location < Inf) && (0 < scale) && (scale < Inf) ... + && (-Inf < shape) && (shape < Inf)) + rnd = rand(sz,cls); + if (shape == 0) + rnd = -log(1 - rnd); + rnd = scale * rnd + location; + elseif ((shape < 0) || (shape > 0)) + rnd = (1 - rnd).^(-shape) - 1; + rnd = (scale / shape) * rnd + location; + end + else + rnd = NaN (sz, cls); + endif + else + rnd = NaN (sz, cls); + + k = (-Inf < location) & (location < Inf) & (scale > 0) ... + & (-Inf < shape) & (shape < Inf); + rnd(k(:)) = rand (1, sum(k(:)), cls); + if (any (shape == 0)) + rnd(k) = -log(1 - rnd(k)); + rnd(k) = scale(k) .* rnd(k) + location(k); + elseif (any (shape < 0 | shape > 0)) + rnd(k) = (1 - rnd(k)).^(-shape(k)) - 1; + rnd(k) = (scale(k) ./ shape(k)) .* rnd(k) + location(k); + end + endif +endfunction + + +%!assert (size (gprnd (0,1,0)), [1, 1]) +%!assert (size (gprnd (0, 1, zeros (2,1))), [2, 1]) +%!assert (size (gprnd (0, 1, zeros (2,2))), [2, 2]) +%!assert (size (gprnd (0, ones (2,1), 0)), [2, 1]) +%!assert (size (gprnd (0, ones (2,2), 0)), [2, 2]) +%!assert (size (gprnd (zeros (2,1), 1, 0)), [2, 1]) +%!assert (size (gprnd (zeros (2,2), 1, 0)), [2, 2]) +%!assert (size (gprnd (0, 1, 0, 3)), [3, 3]) +%!assert (size (gprnd (0, 1, 0, [4 1])), [4, 1]) +%!assert (size (gprnd (0, 1, 0, 4, 1)), [4, 1]) + +%!assert (size (gprnd (1,1,0)), [1, 1]) +%!assert (size (gprnd (1, 1, zeros (2,1))), [2, 1]) +%!assert (size (gprnd (1, 1, zeros (2,2))), [2, 2]) +%!assert (size (gprnd (1, ones (2,1), 0)), [2, 1]) +%!assert (size (gprnd (1, ones (2,2), 0)), [2, 2]) +%!assert (size (gprnd (ones (2,1), 1, 0)), [2, 1]) +%!assert (size (gprnd (ones (2,2), 1, 0)), [2, 2]) +%!assert (size (gprnd (1, 1, 0, 3)), [3, 3]) +%!assert (size (gprnd (1, 1, 0, [4 1])), [4, 1]) +%!assert (size (gprnd (1, 1, 0, 4, 1)), [4, 1]) + +%!assert (size (gprnd (-1, 1, 0)), [1, 1]) +%!assert (size (gprnd (-1, 1, zeros (2,1))), [2, 1]) +%!assert (size (gprnd (1, -1, zeros (2,2))), [2, 2]) +%!assert (size (gprnd (-1, ones (2,1), 0)), [2, 1]) +%!assert (size (gprnd (-1, ones (2,2), 0)), [2, 2]) +%!assert (size (gprnd (-ones (2,1), 1, 0)), [2, 1]) +%!assert (size (gprnd (-ones (2,2), 1, 0)), [2, 2]) +%!assert (size (gprnd (-1, 1, 0, 3)), [3, 3]) +%!assert (size (gprnd (-1, 1, 0, [4 1])), [4, 1]) +%!assert (size (gprnd (-1, 1, 0, 4, 1)), [4, 1]) + +## Test class of input preserved +%!assert (class (gprnd (0,1,0)), "double") +%!assert (class (gprnd (0, 1, single (0))), "single") +%!assert (class (gprnd (0, 1, single ([0 0]))), "single") +%!assert (class (gprnd (0,single (1),0)), "single") +%!assert (class (gprnd (0,single ([1 1]),0)), "single") +%!assert (class (gprnd (single (0), 1, 0)), "single") +%!assert (class (gprnd (single ([0 0]), 1, 0)), "single") + +## Test input validation +%!error gprnd () +%!error gprnd (1) +%!error gprnd (1,2) +%!error gprnd (zeros (2), ones (2), zeros (3)) +%!error gprnd (zeros (2), ones (3), zeros (2)) +%!error gprnd (zeros (3), ones (2), zeros (2)) +%!error gprnd (i, 1, 0) +%!error gprnd (0, i, 0) +%!error gprnd (0, 1, i) +%!error gprnd (0,1,0, -1) +%!error gprnd (0,1,0, ones (2)) +%!error gprnd (0,1,0, [2 -1 2]) +%!error gprnd (0,1, zeros (2), 3) +%!error gprnd (0,1, zeros (2), [3, 2]) +%!error gprnd (0,1, zeros (2), 3, 2) + diff --git a/inst/grp2idx.m b/inst/grp2idx.m new file mode 100644 index 0000000..a84a267 --- /dev/null +++ b/inst/grp2idx.m @@ -0,0 +1,131 @@ +## Copyright (C) 2015 Carnë Draug +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, see +## . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{g}, @var{gn}, @var{gl}] =} grp2idx (@var{s}) +## Get index for group variables. +## +## For variable @var{s}, returns the indices @var{g}, into the variable +## groups @var{gn} and @var{gl}. The first has a string representation of +## the groups while the later has its actual values. +## +## NaNs and empty strings in @var{s} appear as NaN in @var{g} and are +## not present on either @var{gn} and @var{gl}. +## +## @seealso{cellstr, num2str, unique} +## @end deftypefn + +function [g, gn, gl] = grp2idx (s) + if (nargin != 1) + print_usage (); + endif + + s_was_char = false; + if (ischar (s)) + s_was_char = true; + s = cellstr (s); + elseif (! isvector (s)) + error ("grp2idx: S must be a vector, cell array of strings, or char matrix"); + endif + + ## FIXME once Octave core implements "sorted" and "stable" argument to + ## unique(), we can use the following snippet so that we are fully + ## Matlab compatible. +# set_order = "sorted"; +# if (iscellstr (s)) +# set_order = "stable"; +# endif +# [gl, ~, g] = unique (s(:), set_order); + + [gl, ~, g] = unique (s(:)); + + ## handle NaNs and empty strings + if (iscellstr (s)) + ## FIXME empty strings appear at the front because unique is sorting + ## them, so we only need to subtract one. However, when fix the + ## order for strings (when core's unique has the stable option), + ## then we'll have to come up with something clever. + empties = cellfun (@isempty, s); + if (any (empties)) + g(empties) = NaN; + g--; + gl(1) = []; + endif + else + ## This works fine because NaN come at the end after sorting, we don't + ## have to worry about change on the indices. + g(isnan (s)) = NaN; + gl(isnan (gl)) = []; + endif + + if (isargout (2)) + if (iscellstr (gl)) + gn = gl; + elseif (iscell (gl)) + gn = cellfun (@num2str, gl, "UniformOutput", false); + else + gn = arrayfun (@num2str, gl, "UniformOutput", false); + endif + endif + + if (isargout (3) && s_was_char) + gl = char (gl); + endif + +endfunction + +## test boolean input and note that row or column vector makes no difference +%!test +%! in = [true false false true]; +%! out = {[2; 1; 1; 2] {"0"; "1"} [false; true]}; +%! assert (nthargout (1:3, @grp2idx, in), out) +%! assert (nthargout (1:3, @grp2idx, in), nthargout (1:3, @grp2idx, in')) + +## test that groups are ordered in boolean +%!test +%! assert (nthargout (1:3, @grp2idx, [false true]), +%! {[1; 2] {"0"; "1"} [false; true]}); +%! assert (nthargout (1:3, @grp2idx, [true false]), +%! {[2; 1] {"0"; "1"} [false; true]}); + +## test char matrix and cell array of strings +%!assert (nthargout (1:3, @grp2idx, ["oct"; "sci"; "oct"; "oct"; "sci"]), +%! {[1; 2; 1; 1; 2] {"oct"; "sci"} ["oct"; "sci"]}); +## and cell array of strings +%!assert (nthargout (1:3, @grp2idx, {"oct"; "sci"; "oct"; "oct"; "sci"}), +%! {[1; 2; 1; 1; 2] {"oct"; "sci"} {"oct"; "sci"}}); + +## test numeric arrays +%!assert (nthargout (1:3, @grp2idx, [ 1 -3 -2 -3 -3 2 1 -1 3 -3]), +%! {[4; 1; 2; 1; 1; 5; 4; 3; 6; 1] {"-3"; "-2"; "-1"; "1"; "2"; "3"} ... +%! [-3; -2; -1; 1; 2; 3]}); + +## test for NaN and empty strings +%!assert (nthargout (1:3, @grp2idx, [2 2 3 NaN 2 3]), +%! {[1; 1; 2; NaN; 1; 2] {"2"; "3"} [2; 3]}) +%!assert (nthargout (1:3, @grp2idx, {"et" "sa" "sa" "" "et"}), +%! {[1; 2; 2; NaN; 1] {"et"; "sa"} {"et"; "sa"}}) + +## FIXME this fails because unique() in core does not yet have set_order +## option implemented. See code for code to uncomment once it is +## implemented in core. +## Test that order when handling strings is by order of appearance +%!xtest <51928> assert (nthargout (1:3, @grp2idx, ["sci"; "oct"; "sci"; "oct"; "oct"]), +%! {[1; 2; 1; 2; 2] {"sci"; "oct"} ["sci"; "oct"]}); +%!xtest <51928> assert (nthargout (1:3, @grp2idx, {"sci"; "oct"; "sci"; "oct"; "oct"}), +%! {[1; 2; 1; 2; 2] {"sci"; "oct"} {"sci"; "oct"}}); +%!xtest <51928> assert (nthargout (1:3, @grp2idx, {"sa" "et" "et" "" "sa"}), +%! {[1; 2; 2; NaN; 1] {"sa"; "et"} {"sa"; "et"}}) diff --git a/inst/gscatter.m b/inst/gscatter.m new file mode 100644 index 0000000..27452bc --- /dev/null +++ b/inst/gscatter.m @@ -0,0 +1,247 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} gscatter (@var{x}, @var{y}, @var{g}) +## @deftypefnx {Function File} @ +## {} gscatter (@var{x}, @var{y}, @var{g}, @var{clr}, @var{sym}, @var{siz}) +## @deftypefnx {Function File} @ +## {} gscatter (@dots{}, @var{doleg}, @var{xnam}, @var{ynam}) +## @deftypefnx {Function File} {@var{h} =} gscatter (@dots{}) +## +## Draw a scatter plot with grouped data. +## +## @code{gscatter} is a utility function to draw a scatter plot of @var{x} and +## @var{y}, according to the groups defined by @var{g}. Input @var{x} and +## @var{y} are numeric vectors of the same size, while @var{g} is either a +## vector of the same size as @var{x} or a character matrix with the same number +## of rows as the size of @var{x}. As a vector @var{g} can be numeric, logical, +## a character array, a string array (not implemented), a cell string or cell +## array. +## +## A number of optional inputs change the appearance of the plot: +## @itemize @bullet +## @item @var{"clr"} +## defines the color for each group; if not enough colors are defined by +## @var{"clr"}, @code{gscatter} cycles through the specified colors. Colors can +## be defined as named colors, as rgb triplets or as indices for the current +## @code{colormap}. The default value is a different color for each group, +## according to the current @code{colormap}. +## +## @item @var{"sym"} +## is a char array of symbols for each group; if not enough symbols are defined +## by @var{"sym"}, @code{gscatter} cycles through the specified symbols. +## +## @item @var{"siz"} +## is a numeric array of sizes for each group; if not enough sizes are defined +## by @var{"siz"}, @code{gscatter} cycles through the specified sizes. +## +## @item @var{"doleg"} +## is a boolean value to show the legend; it can be either @qcode{on} (default) +## or @qcode{off}. +## +## @item @var{"xnam"} +## is a character array, the name for the x axis. +## +## @item @var{"ynam"} +## is a character array, the name for the y axis. +## @end itemize +## +## Output @var{h} is an array of graphics handles to the @code{line} object of +## each group. +## +## @end deftypefn +## +## @seealso{scatter} + +function h = gscatter (varargin) + ## optional axes handle + if (isaxes (varargin{1})) + ## parameter is an axes handle + hax = varargin{1}; + varargin = varargin(2:end); + nargin--; + endif + + ## check the input parameters + if (nargin < 3) + print_usage (); + endif + + ## + ## necessary parameters + ## + + ## x coordinates + if (isvector (varargin{1}) && + isnumeric (varargin{1})) + x = varargin{1}; + n = numel (x); + else + error ("gscatter: x must be a numeric vector"); + endif + + ## y coordinates + if (isvector (varargin{2}) && + isnumeric (varargin{2})) + if (numel (varargin{2}) == n) + y = varargin{2}; + else + error ("gscatter: x and y must have the same size"); + endif + else + error ("gscatter: y must be a numeric vector"); + endif + + ## groups + if (isrow (varargin{3})) + varargin{3} = transpose (varargin{3}); + endif + if (ismatrix (varargin{3}) && ischar (varargin{3})) + varargin{3} = cellstr (varargin{3}); # char matrix to cellstr + elseif (iscell (varargin{3}) && ! iscellstr (varargin{3})) + varargin{3} = cell2mat (varargin{3}); # numeric cell to vector + endif + if (isvector (varargin{3})) # only numeric vectors or cellstr + if (rows (varargin{3}) == n) + gv = varargin{3}; + g_names = unique (gv, "rows"); + g_len = numel (g_names); + if (iscellstr (g_names)) + for i = 1 : g_len + g(find (strcmp(gv, g_names{i}))) = i; + endfor + else + for i = 1 : g_len + g(find (gv == g_names(i))) = i; + endfor + endif + else + error ("gscatter: g must have the same size as x and y"); + endif + else + error (["gscatter: g must be a numeric or logical or char vector, "... + "or a cell or cellstr array, or a char matrix"]); + endif + + ## + ## optional parameters + ## + + ## Note: this parameters are passed as they are to 'line', + ## the validity check is delegated to 'line' + g_col = lines (g_len); + g_size = 6 * ones (g_len, 1); + g_sym = repmat ('o', 1, g_len); + + ## optional parameters for legend and axes labels + do_legend = 1; # legend shown by default + ## MATLAB compatibility: by default MATLAB uses the variable name as + ## label for either axis + mygetname = @(x) inputname(1); # to retrieve the name of a variable + x_nam = mygetname (varargin{1}); # this should retrieve the name of the var, + y_nam = mygetname (varargin{2}); # but it does not work + + ## parameters are all in fixed positions + for i = 4 : nargin + switch (i) + case 4 + ## colours + c_list = varargin{4}; + if (isrow (c_list)) + c_list = transpose (c_list); + endif + c_list_len = rows (c_list); + + g_col = repmat (c_list, ceil (g_len / c_list_len)); + case {5, 6} + ## size and symbols + s_list = varargin{i}; + s_list_len = length (s_list); + + g_tmp = repmat (s_list, ceil (g_len / s_list_len)); + if (i == 6) + g_size = g_tmp; + else + g_sym = g_tmp; + endif + case 7 + ## legend + switch (lower (varargin{7})) + case "on" + do_legend = 1; + case "off" + do_legend = 0; + otherwise + error ("gscatter: invalid dolegend parameter '%s'", varargin{7}); + endswitch + case {8, 9} + ## x and y label + if (! ischar (varargin{i}) && ! isvector (varargin{i})) + error ("gscatter: xnam and ynam must be strings"); + endif + if (i == 8) + x_nam = varargin{8}; + else + y_nam = varargin{9}; + endif + endswitch + endfor + + + ## scatter plot with grouping + if (! exist ("hax", "var")) + hax = gca (); + endif + + ## return value + h = []; + + hold on; + for i = 1 : g_len + idcs = find (g == i); + h(i) = line (hax, x(idcs), y(idcs), "linestyle", "none", ... + "markersize", g_size(i), "color", g_col(i,:), "marker", g_sym(i)); + endfor + if (do_legend) + if (isnumeric (g_names)) + g_names = num2str (g_names); + endif + warning ("off", "Octave:legend:unimplemented-location", "local"); + legend (hax, g_names, "location", "best"); + endif + xlabel (hax, x_nam); + ylabel (hax, y_nam); + hold off; +endfunction + +## input tests +%!error gscatter(); +%!error gscatter([1]); +%!error gscatter([1], [2]); +%!error gscatter('abc', [1 2 3], [1]); +%!error gscatter([1 2 3], [1 2], [1]); +%!error gscatter([1 2 3], 'abc', [1]); +%!error gscatter([1 2], [1 2], [1]); +%!error gscatter([1 2], [1 2], [1 2], 'rb', 'so', 12, 'xxx'); + +## demonstration +%!demo +%! load fisheriris; +%! X = meas(:,3:4); +%! cidcs = kmeans (X, 3, "Replicates", 5); +%! gscatter (X(:,1), X(:,2), cidcs, [.75 .75 0; 0 .75 .75; .75 0 .75], "os^"); +%! title ("Fisher's iris data"); + diff --git a/inst/harmmean.m b/inst/harmmean.m new file mode 100644 index 0000000..fcbd45b --- /dev/null +++ b/inst/harmmean.m @@ -0,0 +1,34 @@ +## Copyright (C) 2001 Paul Kienzle +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} harmmean (@var{x}) +## @deftypefnx{Function File} harmmean (@var{x}, @var{dim}) +## Compute the harmonic mean. +## +## This function does the same as @code{mean (x, "h")}. +## +## @seealso{mean} +## @end deftypefn + +function a = harmmean(x, dim) + if (nargin == 1) + a = mean(x, "h"); + elseif (nargin == 2) + a = mean(x, "h", dim); + else + print_usage; + endif +endfunction diff --git a/inst/hist3.m b/inst/hist3.m new file mode 100644 index 0000000..da987db --- /dev/null +++ b/inst/hist3.m @@ -0,0 +1,419 @@ +## Copyright (C) 2015 Carnë Draug +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, see +## . + +## -*- texinfo -*- +## @deftypefn {Function File} {} hist3 (@var{X}) +## @deftypefnx {Function File} {} hist3 (@var{X}, @var{nbins}) +## @deftypefnx {Function File} {} hist3 (@var{X}, @qcode{"Nbins"}, @var{nbins}) +## @deftypefnx {Function File} {} hist3 (@var{X}, @var{centers}) +## @deftypefnx {Function File} {} hist3 (@var{X}, @qcode{"Ctrs"}, @var{centers}) +## @deftypefnx {Function File} {} hist3 (@var{X}, @qcode{"Edges"}, @var{edges}) +## @deftypefnx {Function File} {[@var{N}, @var{C}] =} hist3 (@dots{}) +## @deftypefnx {Function File} {} hist3 (@dots{}, @var{prop}, @var{val}, @dots{}) +## @deftypefnx {Function File} {} hist3 (@var{hax}, @dots{}) +## Produce bivariate (2D) histogram counts or plots. +## +## The elements to produce the histogram are taken from the Nx2 matrix +## @var{X}. Any row with NaN values are ignored. The actual bins can +## be configured in 3 different: number, centers, or edges of the bins: +## +## @table @asis +## @item Number of bins (default) +## Produces equally spaced bins between the minimum and maximum values +## of @var{X}. Defined as a 2 element vector, @var{nbins}, one for each +## dimension. Defaults to @code{[10 10]}. +## +## @item Center of bins +## Defined as a cell array of 2 monotonically increasing vectors, +## @var{centers}. The width of each bin is determined from the adjacent +## values in the vector with the initial and final bin, extending to Infinity. +## +## @item Edge of bins +## Defined as a cell array of 2 monotonically increasing vectors, +## @var{edges}. @code{@var{N}(i,j)} contains the number of elements +## in @var{X} for which: +## +## @itemize @w{} +## @item +## @var{edges}@{1@}(i) <= @var{X}(:,1) < @var{edges}@{1@}(i+1) +## @item +## @var{edges}@{2@}(j) <= @var{X}(:,2) < @var{edges}@{2@}(j+1) +## @end itemize +## +## The consequence of this definition is that values outside the initial +## and final edge values are ignored, and that the final bin only contains +## the number of elements exactly equal to the final edge. +## +## @end table +## +## The return values, @var{N} and @var{C}, are the bin counts and centers +## respectively. These are specially useful to produce intensity maps: +## +## @example +## [counts, centers] = hist3 (data); +## imagesc (centers@{1@}, centers@{2@}, counts) +## @end example +## +## If there is no output argument, or if the axes graphics handle +## @var{hax} is defined, the function will plot a 3 dimensional bar +## graph. Any extra property/value pairs are passed directly to the +## underlying surface object. +## +## @seealso{hist, histc, lookup, mesh} +## @end deftypefn + +function [N, C] = hist3 (X, varargin) + if (nargin < 1) + print_usage (); + endif + + next_argin = 1; + should_draw = true; + if (isaxes (X)) + hax = X; + X = varargin{next_argin++}; + elseif (nargout == 0) + hax = gca (); + else + should_draw = false; + endif + + if (! ismatrix (X) || columns (X) != 2) + error ("hist3: X must be a 2 columns matrix"); + endif + + method = "nbins"; + val = [10 10]; + if (numel (varargin) >= next_argin) + this_arg = varargin{next_argin++}; + if (isnumeric (this_arg)) + method = "nbins"; + val = this_arg; + elseif (iscell (this_arg)) + method = "ctrs"; + val = this_arg; + elseif (numel (varargin) >= next_argin + && any (strcmpi ({"nbins", "ctrs", "edges"}, this_arg))) + method = tolower (this_arg); + val = varargin{next_argin++}; + else + next_argin--; + endif + endif + + have_centers = false; + switch (tolower (method)) + case "nbins" + [r_edges, c_edges] = edges_from_nbins (X, val); + case "ctrs" + have_centers = true; + centers = val; + [r_edges, c_edges] = edges_from_centers (val); + case "centers" + ## This was supported until 1.2.4 when the Matlab compatible option + ## 'Ctrs' was added. + persistent warned = false; + if (! warned) + warning ("hist3: option `centers' is deprecated. Use `ctrs'"); + endif + have_centers = true; + centers = val; + [r_edges, c_edges] = edges_from_centers (val); + case "edges" + if (! iscell (val) || numel (val) != 2 + || ! all (cellfun (@isvector, val))) + error ("hist3: EDGES must be a cell array with 2 vectors"); + endif + [r_edges] = vec (val{1}, 2); + [c_edges] = vec (val{2}, 2); + out_rows = any (X < [r_edges(1) c_edges(1)] + | X > [r_edges(end) c_edges(end)], 2); + X(out_rows,:) = []; + otherwise + ## we should never get here... + error ("hist3: invalid binning method `%s'", method); + endswitch + + ## We only remove the NaN now, after having computed the bin edges, + ## because the extremes from each column that define the edges may + ## be paired with a NaN. While such values do not appear on the + ## histogram, they must still be used to compute the histogram + ## edges. + X(any (isnan (X), 2), :) = []; + + r_idx = lookup (r_edges, X(:,1), "l"); + c_idx = lookup (c_edges, X(:,2), "l"); + + counts_size = [numel(r_edges) numel(c_edges)]; + counts = accumarray ([r_idx, c_idx], 1, counts_size); + + if (should_draw) + counts = counts.'; + z = zeros ((size (counts) +1) *2); + z(2:end-1,2:end-1) = kron (counts, ones (2, 2)); + ## Setting the values for the end of the histogram bin like this + ## seems straight wrong but that's hwo Matlab plots look. + y = [kron(c_edges, ones (1, 2)) (c_edges(end)*2-c_edges(end-1))([1 1])]; + x = [kron(r_edges, ones (1, 2)) (r_edges(end)*2-r_edges(end-1))([1 1])]; + mesh (hax, x, y, z, "facecolor", [.75 .85 .95], varargin{next_argin:end}); + else + N = counts; + if (isargout (2)) + if (! have_centers) + C = {(r_edges + [diff(r_edges)([1:end end])]/ 2) ... + (c_edges + [diff(c_edges)([1:end end])]/ 2)}; + else + C = centers(:)'; + C{1} = vec (C{1}, 2); + C{2} = vec (C{2}, 2); + endif + endif + endif + +endfunction + +function [r_edges, c_edges] = edges_from_nbins (X, nbins) + if (! isnumeric (nbins) || numel (nbins) != 2) + error ("hist3: NBINS must be a 2 element vector"); + endif + inits = min (X, [], 1); + ends = max (X, [], 1); + ends -= (ends - inits) ./ vec (nbins, 2); + + ## If any histogram side has an empty range, then still make NBINS + ## but then place that value at the centre of the centre bin so that + ## they appear in the centre in the plot. + single_bins = inits == ends; + if (any (single_bins)) + inits(single_bins) -= (floor (nbins(single_bins) ./2)) + 0.5; + ends(single_bins) = inits(single_bins) + nbins(single_bins) -1; + endif + + r_edges = linspace (inits(1), ends(1), nbins(1)); + c_edges = linspace (inits(2), ends(2), nbins(2)); +endfunction + +function [r_edges, c_edges] = edges_from_centers (ctrs) + if (! iscell (ctrs) || numel (ctrs) != 2 || ! all (cellfun (@isvector, ctrs))) + error ("hist3: CTRS must be a cell array with 2 vectors"); + endif + r_edges = vec (ctrs{1}, 2); + c_edges = vec (ctrs{2}, 2); + r_edges(2:end) -= diff (r_edges) / 2; + c_edges(2:end) -= diff (c_edges) / 2; +endfunction + +%!demo +%! X = [ +%! 1 1 +%! 1 1 +%! 1 10 +%! 1 10 +%! 5 5 +%! 5 5 +%! 5 5 +%! 5 5 +%! 5 5 +%! 7 3 +%! 7 3 +%! 7 3 +%! 10 10 +%! 10 10]; +%! hist3 (X) + +%!test +%! N_exp = [ 0 0 0 5 20 +%! 0 0 10 15 0 +%! 0 15 10 0 0 +%! 20 5 0 0 0]; +%! +%! n = 100; +%! x = [1:n]'; +%! y = [n:-1:1]'; +%! D = [x y]; +%! N = hist3 (D, [4 5]); +%! assert (N, N_exp); + +%!test +%! N_exp = [0 0 0 0 1 +%! 0 0 0 0 1 +%! 0 0 0 0 1 +%! 1 1 1 1 93]; +%! +%! n = 100; +%! x = [1:n]'; +%! y = [n:-1:1]'; +%! D = [x y]; +%! C{1} = [1 1.7 3 4]; +%! C{2} = [1:5]; +%! N = hist3 (D, C); +%! assert (N, N_exp); + +## bug 44987 +%!test +%! D = [1 1; 3 1; 3 3; 3 1]; +%! [c, nn] = hist3 (D, {0:4, 0:4}); +%! exp_c = zeros (5); +%! exp_c([7 9 19]) = [1 2 1]; +%! assert (c, exp_c); +%! assert (nn, {0:4, 0:4}); + +%!test +%! for i = 10 +%! assert (size (hist3 (rand (9, 2), "Edges", {[0:.2:1]; [0:.2:1]})), [6 6]) +%! endfor + +%!test +%! edge_1 = linspace (0, 10, 10); +%! edge_2 = linspace (0, 50, 10); +%! [c, nn] = hist3 ([1:10; 1:5:50]', "Edges", {edge_1, edge_2}); +%! exp_c = zeros (10, 10); +%! exp_c([1 12 13 24 35 46 57 68 79 90]) = 1; +%! assert (c, exp_c); +%! +%! assert (nn{1}, edge_1 + edge_1(2)/2, eps*10^4) +%! assert (nn{2}, edge_2 + edge_2(2)/2, eps*10^4) + +%!shared X +%! X = [ +%! 5 2 +%! 5 3 +%! 1 4 +%! 5 3 +%! 4 4 +%! 1 2 +%! 2 3 +%! 3 3 +%! 5 4 +%! 5 3]; + +%!test +%! N = zeros (10); +%! N([1 10 53 56 60 91 98 100]) = [1 1 1 1 3 1 1 1]; +%! C = {(1.2:0.4:4.8), (2.1:0.2:3.9)}; +%! assert (nthargout ([1 2], @hist3, X), {N C}, eps*10^3) + +%!test +%! N = zeros (5, 7); +%! N([1 5 17 18 20 31 34 35]) = [1 1 1 1 3 1 1 1]; +%! C = {(1.4:0.8:4.6), ((2+(1/7)):(2/7):(4-(1/7)))}; +%! assert (nthargout ([1 2], @hist3, X, [5 7]), {N C}, eps*10^3) +%! assert (nthargout ([1 2], @hist3, X, "Nbins", [5 7]), {N C}, eps*10^3) + +%!test +%! N = [0 1 0; 0 1 0; 0 0 1; 0 0 0]; +%! C = {(2:5), (2.5:1:4.5)}; +%! assert (nthargout ([1 2], @hist3, X, "Edges", {(1.5:4.5), (2:4)}), {N C}) + +%!test +%! N = [0 0 1 0 1 0; 0 0 0 1 0 0; 0 0 1 4 2 0]; +%! C = {(1.2:3.2), (0:5)}; +%! assert (nthargout ([1 2], @hist3, X, "Ctrs", C), {N C}) +%! assert (nthargout ([1 2], @hist3, X, C), {N C}) + +%!test +%! [~, C] = hist3 (rand (10, 2), "Edges", {[0 .05 .15 .35 .55 .95], +%! [-1 .05 .07 .2 .3 .5 .89 1.2]}); +%! C_exp = {[ 0.025 0.1 0.25 0.45 0.75 1.15], ... +%! [-0.475 0.06 0.135 0.25 0.4 0.695 1.045 1.355]}; +%! assert (C, C_exp, eps*10^2) + +## Test how handling of out of borders is different whether we are +## defining Centers or Edges. +%!test +%! Xv = repmat ([1:10]', [1 2]); +%! +%! ## Test Centers +%! assert (hist3 (Xv, "Ctrs", {1:10, 1:10}), eye (10)) +%! +%! N_exp = eye (6); +%! N_exp([1 end]) = 3; +%! assert (hist3 (Xv, "Ctrs", {3:8, 3:8}), N_exp) +%! +%! N_exp = zeros (8, 6); +%! N_exp([1 2 11 20 29 38 47 48]) = [2 1 1 1 1 1 1 2]; +%! assert (hist3 (Xv, "Ctrs", {2:9, 3:8}), N_exp) +%! +%! ## Test Edges +%! assert (hist3 (Xv, "Edges", {1:10, 1:10}), eye (10)) +%! assert (hist3 (Xv, "Edges", {3:8, 3:8}), eye (6)) +%! assert (hist3 (Xv, "Edges", {2:9, 3:8}), [zeros(1, 6); eye(6); zeros(1, 6)]) +%! +%! N_exp = zeros (14); +%! N_exp(3:12, 3:12) = eye (10); +%! assert (hist3 (Xv, "Edges", {-1:12, -1:12}), N_exp) +%! +%! ## Test for Nbins +%! assert (hist3 (Xv), eye (10)) +%! assert (hist3 (Xv, [10 10]), eye (10)) +%! assert (hist3 (Xv, "nbins", [10 10]), eye (10)) +%! assert (hist3 (Xv, [5 5]), eye (5) * 2) +%! +%! N_exp = zeros (7, 5); +%! N_exp([1 9 10 18 26 27 35]) = [2 1 1 2 1 1 2]; +%! assert (hist3 (Xv, [7 5]), N_exp) + +%!test # bug #51059 +%! D = [1 1; NaN 2; 3 1; 3 3; 1 NaN; 3 1]; +%! [c, nn] = hist3 (D, {0:4, 0:4}); +%! exp_c = zeros (5); +%! exp_c([7 9 19]) = [1 2 1]; +%! assert (c, exp_c) +%! assert (nn, {0:4, 0:4}) + +## Single row of data or cases where all elements have the same value +## on one side of the histogram. +%!test +%! [c, nn] = hist3 ([1 8]); +%! exp_c = zeros (10, 10); +%! exp_c(6, 6) = 1; +%! exp_nn = {-4:5, 3:12}; +%! assert (c, exp_c) +%! assert (nn, exp_nn, eps) +%! +%! [c, nn] = hist3 ([1 8], [10 11]); +%! exp_c = zeros (10, 11); +%! exp_c(6, 6) = 1; +%! exp_nn = {-4:5, 3:13}; +%! assert (c, exp_c) +%! assert (nn, exp_nn, eps) + +## NaNs paired with values defining the histogram edges. +%!test +%! [c, nn] = hist3 ([1 NaN; 2 3; 6 9; 8 NaN]); +%! exp_c = zeros (10, 10); +%! exp_c(2, 1) = 1; +%! exp_c(8, 10) = 1; +%! exp_nn = {linspace(1.35, 7.65, 10) linspace(3.3, 8.7, 10)}; +%! assert (c, exp_c) +%! assert (nn, exp_nn, eps*100) + +## Columns full of NaNs (recent Matlab versions seem to throw an error +## but this did work like this on R2010b at least). +%!test +%! [c, nn] = hist3 ([1 NaN; 2 NaN; 6 NaN; 8 NaN]); +%! exp_c = zeros (10, 10); +%! exp_nn = {linspace(1.35, 7.65, 10) NaN(1, 10)}; +%! assert (c, exp_c) +%! assert (nn, exp_nn, eps*100) + +## Behaviour of an empty X after removal of rows with NaN. +%!test +%! [c, nn] = hist3 ([1 NaN; NaN 3; NaN 9; 8 NaN]); +%! exp_c = zeros (10, 10); +%! exp_nn = {linspace(1.35, 7.65, 10) linspace(3.3, 8.7, 10)}; +%! assert (c, exp_c) +%! assert (nn, exp_nn, eps*100) diff --git a/inst/histfit.m b/inst/histfit.m new file mode 100644 index 0000000..2c32dc6 --- /dev/null +++ b/inst/histfit.m @@ -0,0 +1,69 @@ +## Copyright (C) 2003 Alberto Terruzzi +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} histfit (@var{data}, @var{nbins}) +## +## Plot histogram with superimposed fitted normal density. +## +## @code{histfit (@var{data}, @var{nbins})} plots a histogram of the values in +## the vector @var{data} using @var{nbins} bars in the histogram. With one input +## argument, @var{nbins} is set to the square root of the number of elements in +## data. +## +## Example +## +## @example +## histfit (randn (100, 1)) +## @end example +## +## @seealso{bar,hist, pareto} +## @end deftypefn + +## Author: Alberto Terruzzi +## Version: 1.0 +## Created: 3 March 2004 + +function histfit (data,nbins) + + if nargin < 1 || nargin > 2 + print_usage; + endif + + if isvector (data) != 1 + error ("data must be a vector."); + endif + + row = sum(~isnan(data)); + + if nargin < 2 + nbins = ceil(sqrt(row)); + endif + + [n,xbin]=hist(data,nbins); + if any(abs(diff(xbin,2)) > 10*max(abs(xbin))*eps) + error("histfit bins must be uniform width"); + endif + + mr = nanmean(data); ## Estimates the parameter, MU, of the normal distribution. + sr = nanstd(data); ## Estimates the parameter, SIGMA, of the normal distribution. + x=(-3*sr+mr:0.1*sr:3*sr+mr)';## Evenly spaced samples of the expected data range. + [xb,yb] = bar(xbin,n); + y = normpdf(x,mr,sr); + binwidth = xbin(2)-xbin(1); + y = row*y*binwidth; ## Normalization necessary to overplot the histogram. + plot(xb,yb,";;b",x,y,";;r-"); ## Plots density line over histogram. + +endfunction diff --git a/inst/hmmestimate.m b/inst/hmmestimate.m new file mode 100644 index 0000000..dbbff5b --- /dev/null +++ b/inst/hmmestimate.m @@ -0,0 +1,338 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{transprobest}, @var{outprobest}] =} hmmestimate (@var{sequence}, @var{states}) +## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'statenames', @var{statenames}) +## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'symbols', @var{symbols}) +## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'pseudotransitions', @var{pseudotransitions}) +## @deftypefnx {Function File} {} hmmestimate (@dots{}, 'pseudoemissions', @var{pseudoemissions}) +## Estimate the matrix of transition probabilities and the matrix of output +## probabilities of a given sequence of outputs and states generated by a +## hidden Markov model. The model assumes that the generation starts in +## state @code{1} at step @code{0} but does not include step @code{0} in the +## generated states and sequence. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{sequence} is a vector of a sequence of given outputs. The outputs +## must be integers ranging from @code{1} to the number of outputs of the +## hidden Markov model. +## +## @item +## @var{states} is a vector of the same length as @var{sequence} of given +## states. The states must be integers ranging from @code{1} to the number +## of states of the hidden Markov model. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{transprobest} is the matrix of the estimated transition +## probabilities of the states. @code{transprobest(i, j)} is the estimated +## probability of a transition to state @code{j} given state @code{i}. +## +## @item +## @var{outprobest} is the matrix of the estimated output probabilities. +## @code{outprobest(i, j)} is the estimated probability of generating +## output @code{j} given state @code{i}. +## @end itemize +## +## If @code{'symbols'} is specified, then @var{sequence} is expected to be a +## sequence of the elements of @var{symbols} instead of integers. +## @var{symbols} can be a cell array. +## +## If @code{'statenames'} is specified, then @var{states} is expected to be +## a sequence of the elements of @var{statenames} instead of integers. +## @var{statenames} can be a cell array. +## +## If @code{'pseudotransitions'} is specified then the integer matrix +## @var{pseudotransitions} is used as an initial number of counted +## transitions. @code{pseudotransitions(i, j)} is the initial number of +## counted transitions from state @code{i} to state @code{j}. +## @var{transprobest} will have the same size as @var{pseudotransitions}. +## Use this if you have transitions that are very unlikely to occur. +## +## If @code{'pseudoemissions'} is specified then the integer matrix +## @var{pseudoemissions} is used as an initial number of counted outputs. +## @code{pseudoemissions(i, j)} is the initial number of counted outputs +## @code{j} given state @code{i}. If @code{'pseudoemissions'} is also +## specified then the number of rows of @var{pseudoemissions} must be the +## same as the number of rows of @var{pseudotransitions}. @var{outprobest} +## will have the same size as @var{pseudoemissions}. Use this if you have +## outputs or states that are very unlikely to occur. +## +## @subheading Examples +## +## @example +## @group +## transprob = [0.8, 0.2; 0.4, 0.6]; +## outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1]; +## [sequence, states] = hmmgenerate (25, transprob, outprob); +## [transprobest, outprobest] = hmmestimate (sequence, states) +## @end group +## +## @group +## symbols = @{'A', 'B', 'C'@}; +## statenames = @{'One', 'Two'@}; +## [sequence, states] = hmmgenerate (25, transprob, outprob, +## 'symbols', symbols, 'statenames', statenames); +## [transprobest, outprobest] = hmmestimate (sequence, states, +## 'symbols', symbols, +## 'statenames', statenames) +## @end group +## +## @group +## pseudotransitions = [8, 2; 4, 6]; +## pseudoemissions = [2, 4, 4; 7, 2, 1]; +## [sequence, states] = hmmgenerate (25, transprob, outprob); +## [transprobest, outprobest] = hmmestimate (sequence, states, 'pseudotransitions', pseudotransitions, 'pseudoemissions', pseudoemissions) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Lawrence R. Rabiner. A Tutorial on Hidden Markov Models and Selected +## Applications in Speech Recognition. @cite{Proceedings of the IEEE}, +## 77(2), pages 257-286, February 1989. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Estimation of a hidden Markov model for a given sequence + +function [transprobest, outprobest] = hmmestimate (sequence, states, varargin) + + # Check arguments + if (nargin < 2 || mod (length (varargin), 2) != 0) + print_usage (); + endif + + len = length (sequence); + if (length (states) != len) + error ("hmmestimate: sequence and states must have equal length"); + endif + + # Flag for symbols + usesym = false; + # Flag for statenames + usesn = false; + + # Variables for return values + transprobest = []; + outprobest = []; + + # Process varargin + for i = 1:2:length (varargin) + # There must be an identifier: 'symbols', 'statenames', + # 'pseudotransitions' or 'pseudoemissions' + if (! ischar (varargin{i})) + print_usage (); + endif + # Upper case is also fine + lowerarg = lower (varargin{i}); + if (strcmp (lowerarg, 'symbols')) + usesym = true; + # Use the following argument as symbols + symbols = varargin{i + 1}; + # The same for statenames + elseif (strcmp (lowerarg, 'statenames')) + usesn = true; + # Use the following argument as statenames + statenames = varargin{i + 1}; + elseif (strcmp (lowerarg, 'pseudotransitions')) + # Use the following argument as an initial count for transitions + transprobest = varargin{i + 1}; + if (! ismatrix (transprobest)) + error ("hmmestimate: pseudotransitions must be a non-empty numeric matrix"); + endif + if (rows (transprobest) != columns (transprobest)) + error ("hmmestimate: pseudotransitions must be a square matrix"); + endif + elseif (strcmp (lowerarg, 'pseudoemissions')) + # Use the following argument as an initial count for outputs + outprobest = varargin{i + 1}; + if (! ismatrix (outprobest)) + error ("hmmestimate: pseudoemissions must be a non-empty numeric matrix"); + endif + else + error ("hmmestimate: expected 'symbols', 'statenames', 'pseudotransitions' or 'pseudoemissions' but found '%s'", varargin{i}); + endif + endfor + + # Transform sequence from symbols to integers if necessary + if (usesym) + # sequenceint is used to build the transformed sequence + sequenceint = zeros (1, len); + for i = 1:length (symbols) + # Search for symbols(i) in the sequence, isequal will have 1 at + # corresponding indices; i is the right integer for that symbol + isequal = ismember (sequence, symbols(i)); + # We do not want to change sequenceint if the symbol appears a second + # time in symbols + if (any ((sequenceint == 0) & (isequal == 1))) + isequal *= i; + sequenceint += isequal; + endif + endfor + if (! all (sequenceint)) + index = max ((sequenceint == 0) .* (1:len)); + error (["hmmestimate: sequence(" int2str (index) ") not in symbols"]); + endif + sequence = sequenceint; + else + if (! isvector (sequence)) + error ("hmmestimate: sequence must be a non-empty vector"); + endif + if (! all (ismember (sequence, 1:max (sequence)))) + index = max ((ismember (sequence, 1:max (sequence)) == 0) .* (1:len)); + error (["hmmestimate: sequence(" int2str (index) ") not feasible"]); + endif + endif + + # Transform states from statenames to integers if necessary + if (usesn) + # statesint is used to build the transformed states + statesint = zeros (1, len); + for i = 1:length (statenames) + # Search for statenames(i) in states, isequal will have 1 at + # corresponding indices; i is the right integer for that statename + isequal = ismember (states, statenames(i)); + # We do not want to change statesint if the statename appears a second + # time in statenames + if (any ((statesint == 0) & (isequal == 1))) + isequal *= i; + statesint += isequal; + endif + endfor + if (! all (statesint)) + index = max ((statesint == 0) .* (1:len)); + error (["hmmestimate: states(" int2str (index) ") not in statenames"]); + endif + states = statesint; + else + if (! isvector (states)) + error ("hmmestimate: states must be a non-empty vector"); + endif + if (! all (ismember (states, 1:max (states)))) + index = max ((ismember (states, 1:max (states)) == 0) .* (1:len)); + error (["hmmestimate: states(" int2str (index) ") not feasible"]); + endif + endif + + # Estimate the number of different states as the max of states + nstate = max (states); + # Estimate the number of different outputs as the max of sequence + noutput = max (sequence); + + # transprobest is empty if pseudotransitions is not specified + if (isempty (transprobest)) + # outprobest is not empty if pseudoemissions is specified + if (! isempty (outprobest)) + if (nstate > rows (outprobest)) + error ("hmmestimate: not enough rows in pseudoemissions"); + endif + # The number of states is specified by pseudoemissions + nstate = rows (outprobest); + endif + transprobest = zeros (nstate, nstate); + else + if (nstate > rows (transprobest)) + error ("hmmestimate: not enough rows in pseudotransitions"); + endif + # The number of states is given by pseudotransitions + nstate = rows (transprobest); + endif + + # outprobest is empty if pseudoemissions is not specified + if (isempty (outprobest)) + outprobest = zeros (nstate, noutput); + else + if (noutput > columns (outprobest)) + error ("hmmestimate: not enough columns in pseudoemissions"); + endif + # Number of outputs is specified by pseudoemissions + noutput = columns (outprobest); + if (rows (outprobest) != nstate) + error ("hmmestimate: pseudoemissions must have the same number of rows as pseudotransitions"); + endif + endif + + # Assume that the model started in state 1 + cstate = 1; + for i = 1:len + # Count the number of transitions for each state pair + transprobest(cstate, states(i)) ++; + cstate = states (i); + # Count the number of outputs for each state output pair + outprobest(cstate, sequence(i)) ++; + endfor + + # transprobest and outprobest contain counted numbers + # Each row in transprobest and outprobest should contain estimated + # probabilities + # => scale so that the sum is 1 + # A zero row remains zero + # - for transprobest + s = sum (transprobest, 2); + s(s == 0) = 1; + transprobest = transprobest ./ (s * ones (1, nstate)); + # - for outprobest + s = sum (outprobest, 2); + s(s == 0) = 1; + outprobest = outprobest ./ (s * ones (1, noutput)); + +endfunction + +%!test +%! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3]; +%! states = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1]; +%! [transprobest, outprobest] = hmmestimate (sequence, states); +%! expectedtransprob = [0.88889, 0.11111; 0.28571, 0.71429]; +%! expectedoutprob = [0.16667, 0.33333, 0.50000; 1.00000, 0.00000, 0.00000]; +%! assert (transprobest, expectedtransprob, 0.001); +%! assert (outprobest, expectedoutprob, 0.001); + +%!test +%! sequence = {'A', 'B', 'A', 'A', 'A', 'B', 'B', 'A', 'B', 'C', 'C', 'C', 'C', 'B', 'C', 'A', 'A', 'A', 'A', 'C', 'C', 'B', 'C', 'A', 'C'}; +%! states = {'One', 'One', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'Two', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One'}; +%! symbols = {'A', 'B', 'C'}; +%! statenames = {'One', 'Two'}; +%! [transprobest, outprobest] = hmmestimate (sequence, states, 'symbols', symbols, 'statenames', statenames); +%! expectedtransprob = [0.88889, 0.11111; 0.28571, 0.71429]; +%! expectedoutprob = [0.16667, 0.33333, 0.50000; 1.00000, 0.00000, 0.00000]; +%! assert (transprobest, expectedtransprob, 0.001); +%! assert (outprobest, expectedoutprob, 0.001); + +%!test +%! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3]; +%! states = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1]; +%! pseudotransitions = [8, 2; 4, 6]; +%! pseudoemissions = [2, 4, 4; 7, 2, 1]; +%! [transprobest, outprobest] = hmmestimate (sequence, states, 'pseudotransitions', pseudotransitions, 'pseudoemissions', pseudoemissions); +%! expectedtransprob = [0.85714, 0.14286; 0.35294, 0.64706]; +%! expectedoutprob = [0.178571, 0.357143, 0.464286; 0.823529, 0.117647, 0.058824]; +%! assert (transprobest, expectedtransprob, 0.001); +%! assert (outprobest, expectedoutprob, 0.001); diff --git a/inst/hmmgenerate.m b/inst/hmmgenerate.m new file mode 100644 index 0000000..0af82b0 --- /dev/null +++ b/inst/hmmgenerate.m @@ -0,0 +1,251 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{sequence}, @var{states}] =} hmmgenerate (@var{len}, @var{transprob}, @var{outprob}) +## @deftypefnx {Function File} {} hmmgenerate (@dots{}, 'symbols', @var{symbols}) +## @deftypefnx {Function File} {} hmmgenerate (@dots{}, 'statenames', @var{statenames}) +## Generate an output sequence and hidden states of a hidden Markov model. +## The model starts in state @code{1} at step @code{0} but will not include +## step @code{0} in the generated states and sequence. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{len} is the number of steps to generate. @var{sequence} and +## @var{states} will have @var{len} entries each. +## +## @item +## @var{transprob} is the matrix of transition probabilities of the states. +## @code{transprob(i, j)} is the probability of a transition to state +## @code{j} given state @code{i}. +## +## @item +## @var{outprob} is the matrix of output probabilities. +## @code{outprob(i, j)} is the probability of generating output @code{j} +## given state @code{i}. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{sequence} is a vector of length @var{len} of the generated +## outputs. The outputs are integers ranging from @code{1} to +## @code{columns (outprob)}. +## +## @item +## @var{states} is a vector of length @var{len} of the generated hidden +## states. The states are integers ranging from @code{1} to +## @code{columns (transprob)}. +## @end itemize +## +## If @code{'symbols'} is specified, then the elements of @var{symbols} are +## used for the output sequence instead of integers ranging from @code{1} to +## @code{columns (outprob)}. @var{symbols} can be a cell array. +## +## If @code{'statenames'} is specified, then the elements of +## @var{statenames} are used for the states instead of integers ranging from +## @code{1} to @code{columns (transprob)}. @var{statenames} can be a cell +## array. +## +## @subheading Examples +## +## @example +## @group +## transprob = [0.8, 0.2; 0.4, 0.6]; +## outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1]; +## [sequence, states] = hmmgenerate (25, transprob, outprob) +## @end group +## +## @group +## symbols = @{'A', 'B', 'C'@}; +## statenames = @{'One', 'Two'@}; +## [sequence, states] = hmmgenerate (25, transprob, outprob, +## 'symbols', symbols, 'statenames', statenames) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Lawrence R. Rabiner. A Tutorial on Hidden Markov Models and Selected +## Applications in Speech Recognition. @cite{Proceedings of the IEEE}, +## 77(2), pages 257-286, February 1989. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Output sequence and hidden states of a hidden Markov model + +function [sequence, states] = hmmgenerate (len, transprob, outprob, varargin) + + # Check arguments + if (nargin < 3 || mod (length (varargin), 2) != 0) + print_usage (); + endif + + if (! isscalar (len) || len < 0 || round (len) != len) + error ("hmmgenerate: len must be a non-negative scalar integer") + endif + + if (! ismatrix (transprob)) + error ("hmmgenerate: transprob must be a non-empty numeric matrix"); + endif + if (! ismatrix (outprob)) + error ("hmmgenerate: outprob must be a non-empty numeric matrix"); + endif + + # nstate is the number of states of the hidden Markov model + nstate = rows (transprob); + # noutput is the number of different outputs that the hidden Markov model + # can generate + noutput = columns (outprob); + + # Check whether transprob and outprob are feasible for a hidden Markov + # model + if (columns (transprob) != nstate) + error ("hmmgenerate: transprob must be a square matrix"); + endif + if (rows (outprob) != nstate) + error ("hmmgenerate: outprob must have the same number of rows as transprob"); + endif + + # Flag for symbols + usesym = false; + # Flag for statenames + usesn = false; + + # Process varargin + for i = 1:2:length (varargin) + # There must be an identifier: 'symbols' or 'statenames' + if (! ischar (varargin{i})) + print_usage (); + endif + # Upper case is also fine + lowerarg = lower (varargin{i}); + if (strcmp (lowerarg, 'symbols')) + if (length (varargin{i + 1}) != noutput) + error ("hmmgenerate: number of symbols does not match number of possible outputs"); + endif + usesym = true; + # Use the following argument as symbols + symbols = varargin{i + 1}; + # The same for statenames + elseif (strcmp (lowerarg, 'statenames')) + if (length (varargin{i + 1}) != nstate) + error ("hmmgenerate: number of statenames does not match number of states"); + endif + usesn = true; + # Use the following argument as statenames + statenames = varargin{i + 1}; + else + error ("hmmgenerate: expected 'symbols' or 'statenames' but found '%s'", varargin{i}); + endif + endfor + + # Each row in transprob and outprob should contain probabilities + # => scale so that the sum is 1 + # A zero row remains zero + # - for transprob + s = sum (transprob, 2); + s(s == 0) = 1; + transprob = transprob ./ repmat (s, 1, nstate); + # - for outprob + s = sum (outprob, 2); + s(s == 0) = 1; + outprob = outprob ./ repmat (s, 1, noutput); + + # Generate sequences of uniformly distributed random numbers between 0 and + # 1 + # - for the state transitions + transdraw = rand (1, len); + # - for the outputs + outdraw = rand (1, len); + + # Generate the return vectors + # They remain unchanged if the according probability row of transprob + # and outprob contain, respectively, only zeros + sequence = ones (1, len); + states = ones (1, len); + + if (len > 0) + # Calculate cumulated probabilities backwards for easy comparison with + # the generated random numbers + # Cumulated probability in first column must always be 1 + # We might have a zero row + # - for transprob + transprob(:, end:-1:1) = cumsum (transprob(:, end:-1:1), 2); + transprob(:, 1) = 1; + # - for outprob + outprob(:, end:-1:1) = cumsum (outprob(:, end:-1:1), 2); + outprob(:, 1) = 1; + + # cstate is the current state + # Start in state 1 but do not include it in the states vector + cstate = 1; + for i = 1:len + # Compare the randon number i of transdraw to the cumulated + # probability of the state transition and set the transition + # accordingly + states(i) = sum (transdraw(i) <= transprob(cstate, :)); + cstate = states(i); + endfor + + # Compare the random numbers of outdraw to the cumulated probabilities + # of the outputs and set the sequence vector accordingly + sequence = sum (repmat (outdraw, noutput, 1) <= outprob(states, :)', 1); + + # Transform default matrices into symbols/statenames if requested + if (usesym) + sequence = reshape (symbols(sequence), 1, len); + endif + if (usesn) + states = reshape (statenames(states), 1, len); + endif + endif + +endfunction + +%!test +%! len = 25; +%! transprob = [0.8, 0.2; 0.4, 0.6]; +%! outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1]; +%! [sequence, states] = hmmgenerate (len, transprob, outprob); +%! assert (length (sequence), len); +%! assert (length (states), len); +%! assert (min (sequence) >= 1); +%! assert (max (sequence) <= columns (outprob)); +%! assert (min (states) >= 1); +%! assert (max (states) <= rows (transprob)); + +%!test +%! len = 25; +%! transprob = [0.8, 0.2; 0.4, 0.6]; +%! outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1]; +%! symbols = {'A', 'B', 'C'}; +%! statenames = {'One', 'Two'}; +%! [sequence, states] = hmmgenerate (len, transprob, outprob, 'symbols', symbols, 'statenames', statenames); +%! assert (length (sequence), len); +%! assert (length (states), len); +%! assert (strcmp (sequence, 'A') + strcmp (sequence, 'B') + strcmp (sequence, 'C') == ones (1, len)); +%! assert (strcmp (states, 'One') + strcmp (states, 'Two') == ones (1, len)); diff --git a/inst/hmmviterbi.m b/inst/hmmviterbi.m new file mode 100644 index 0000000..14cfc9e --- /dev/null +++ b/inst/hmmviterbi.m @@ -0,0 +1,249 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{vpath} =} hmmviterbi (@var{sequence}, @var{transprob}, @var{outprob}) +## @deftypefnx {Function File} {} hmmviterbi (@dots{}, 'symbols', @var{symbols}) +## @deftypefnx {Function File} {} hmmviterbi (@dots{}, 'statenames', @var{statenames}) +## Use the Viterbi algorithm to find the Viterbi path of a hidden Markov +## model given a sequence of outputs. The model assumes that the generation +## starts in state @code{1} at step @code{0} but does not include step +## @code{0} in the generated states and sequence. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{sequence} is the vector of length @var{len} of given outputs. The +## outputs must be integers ranging from @code{1} to +## @code{columns (outprob)}. +## +## @item +## @var{transprob} is the matrix of transition probabilities of the states. +## @code{transprob(i, j)} is the probability of a transition to state +## @code{j} given state @code{i}. +## +## @item +## @var{outprob} is the matrix of output probabilities. +## @code{outprob(i, j)} is the probability of generating output @code{j} +## given state @code{i}. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{vpath} is the vector of the same length as @var{sequence} of the +## estimated hidden states. The states are integers ranging from @code{1} to +## @code{columns (transprob)}. +## @end itemize +## +## If @code{'symbols'} is specified, then @var{sequence} is expected to be a +## sequence of the elements of @var{symbols} instead of integers ranging +## from @code{1} to @code{columns (outprob)}. @var{symbols} can be a cell array. +## +## If @code{'statenames'} is specified, then the elements of +## @var{statenames} are used for the states in @var{vpath} instead of +## integers ranging from @code{1} to @code{columns (transprob)}. +## @var{statenames} can be a cell array. +## +## @subheading Examples +## +## @example +## @group +## transprob = [0.8, 0.2; 0.4, 0.6]; +## outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1]; +## [sequence, states] = hmmgenerate (25, transprob, outprob) +## vpath = hmmviterbi (sequence, transprob, outprob) +## @end group +## +## @group +## symbols = @{'A', 'B', 'C'@}; +## statenames = @{'One', 'Two'@}; +## [sequence, states] = hmmgenerate (25, transprob, outprob, +## 'symbols', symbols, 'statenames', statenames) +## vpath = hmmviterbi (sequence, transprob, outprob, +## 'symbols', symbols, 'statenames', statenames) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Lawrence R. Rabiner. A Tutorial on Hidden Markov Models and Selected +## Applications in Speech Recognition. @cite{Proceedings of the IEEE}, +## 77(2), pages 257-286, February 1989. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Viterbi path of a hidden Markov model + +function vpath = hmmviterbi (sequence, transprob, outprob, varargin) + + # Check arguments + if (nargin < 3 || mod (length (varargin), 2) != 0) + print_usage (); + endif + + if (! ismatrix (transprob)) + error ("hmmviterbi: transprob must be a non-empty numeric matrix"); + endif + if (! ismatrix (outprob)) + error ("hmmviterbi: outprob must be a non-empty numeric matrix"); + endif + + len = length (sequence); + # nstate is the number of states of the hidden Markov model + nstate = rows (transprob); + # noutput is the number of different outputs that the hidden Markov model + # can generate + noutput = columns (outprob); + + # Check whether transprob and outprob are feasible for a hidden Markov model + if (columns (transprob) != nstate) + error ("hmmviterbi: transprob must be a square matrix"); + endif + if (rows (outprob) != nstate) + error ("hmmviterbi: outprob must have the same number of rows as transprob"); + endif + + # Flag for symbols + usesym = false; + # Flag for statenames + usesn = false; + + # Process varargin + for i = 1:2:length (varargin) + # There must be an identifier: 'symbols' or 'statenames' + if (! ischar (varargin{i})) + print_usage (); + endif + # Upper case is also fine + lowerarg = lower (varargin{i}); + if (strcmp (lowerarg, 'symbols')) + if (length (varargin{i + 1}) != noutput) + error ("hmmviterbi: number of symbols does not match number of possible outputs"); + endif + usesym = true; + # Use the following argument as symbols + symbols = varargin{i + 1}; + # The same for statenames + elseif (strcmp (lowerarg, 'statenames')) + if (length (varargin{i + 1}) != nstate) + error ("hmmviterbi: number of statenames does not match number of states"); + endif + usesn = true; + # Use the following argument as statenames + statenames = varargin{i + 1}; + else + error ("hmmviterbi: expected 'symbols' or 'statenames' but found '%s'", varargin{i}); + endif + endfor + + # Transform sequence from symbols to integers if necessary + if (usesym) + # sequenceint is used to build the transformed sequence + sequenceint = zeros (1, len); + for i = 1:noutput + # Search for symbols(i) in the sequence, isequal will have 1 at + # corresponding indices; i is the right integer for that symbol + isequal = ismember (sequence, symbols(i)); + # We do not want to change sequenceint if the symbol appears a second + # time in symbols + if (any ((sequenceint == 0) & (isequal == 1))) + isequal *= i; + sequenceint += isequal; + endif + endfor + if (! all (sequenceint)) + index = max ((sequenceint == 0) .* (1:len)); + error (["hmmviterbi: sequence(" int2str (index) ") not in symbols"]); + endif + sequence = sequenceint; + else + if (! isvector (sequence) && ! isempty (sequence)) + error ("hmmviterbi: sequence must be a vector"); + endif + if (! all (ismember (sequence, 1:noutput))) + index = max ((ismember (sequence, 1:noutput) == 0) .* (1:len)); + error (["hmmviterbi: sequence(" int2str (index) ") out of range"]); + endif + endif + + # Each row in transprob and outprob should contain log probabilities + # => scale so that the sum is 1 and convert to log space + # - for transprob + s = sum (transprob, 2); + s(s == 0) = 1; + transprob = log (transprob ./ (s * ones (1, columns (transprob)))); + # - for outprob + s = sum (outprob, 2); + s(s == 0) = 1; + outprob = log (outprob ./ (s * ones (1, columns (outprob)))); + + # Store the path starting from i in spath(i, :) + spath = ones (nstate, len + 1); + # Set the first state for each path + spath(:, 1) = (1:nstate)'; + # Store the probability of path i in spathprob(i) + spathprob = transprob(1, :); + + # Find the most likely paths for the given output sequence + for i = 1:len + # Calculate the new probabilities of the continuation with each state + nextpathprob = ((spathprob' + outprob(:, sequence(i))) * ones (1, nstate)) + transprob; + # Find the paths with the highest probabilities + [spathprob, mindex] = max (nextpathprob); + # Update spath and spathprob with the new paths + spath = spath(mindex, :); + spath(:, i + 1) = (1:nstate)'; + endfor + + # Set vpath to the most likely path + # We do not want the last state because we do not have an output for it + [m, mindex] = max (spathprob); + vpath = spath(mindex, 1:len); + + # Transform vpath into statenames if requested + if (usesn) + vpath = reshape (statenames(vpath), 1, len); + endif + +endfunction + +%!test +%! sequence = [1, 2, 1, 1, 1, 2, 2, 1, 2, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 3, 3, 2, 3, 1, 3]; +%! transprob = [0.8, 0.2; 0.4, 0.6]; +%! outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1]; +%! vpath = hmmviterbi (sequence, transprob, outprob); +%! expected = [1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1]; +%! assert (vpath, expected); + +%!test +%! sequence = {'A', 'B', 'A', 'A', 'A', 'B', 'B', 'A', 'B', 'C', 'C', 'C', 'C', 'B', 'C', 'A', 'A', 'A', 'A', 'C', 'C', 'B', 'C', 'A', 'C'}; +%! transprob = [0.8, 0.2; 0.4, 0.6]; +%! outprob = [0.2, 0.4, 0.4; 0.7, 0.2, 0.1]; +%! symbols = {'A', 'B', 'C'}; +%! statenames = {'One', 'Two'}; +%! vpath = hmmviterbi (sequence, transprob, outprob, 'symbols', symbols, 'statenames', statenames); +%! expected = {'One', 'One', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'One', 'Two', 'Two', 'Two', 'Two', 'One', 'One', 'One', 'One', 'One', 'One'}; +%! assert (vpath, expected); diff --git a/inst/hygestat.m b/inst/hygestat.m new file mode 100644 index 0000000..54d3caa --- /dev/null +++ b/inst/hygestat.m @@ -0,0 +1,133 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{mn}, @var{v}] =} hygestat (@var{t}, @var{m}, @var{n}) +## Compute mean and variance of the hypergeometric distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{t} is the total size of the population of the hypergeometric +## distribution. The elements of @var{t} must be positive natural numbers +## +## @item +## @var{m} is the number of marked items of the hypergeometric distribution. +## The elements of @var{m} must be natural numbers +## +## @item +## @var{n} is the size of the drawn sample of the hypergeometric +## distribution. The elements of @var{n} must be positive natural numbers +## @end itemize +## @var{t}, @var{m}, and @var{n} must be of common size or scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{mn} is the mean of the hypergeometric distribution +## +## @item +## @var{v} is the variance of the hypergeometric distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## t = 4:9; +## m = 0:5; +## n = 1:6; +## [mn, v] = hygestat (t, m, n) +## @end group +## +## @group +## [mn, v] = hygestat (t, m, 2) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the hypergeometric distribution + +function [mn, v] = hygestat (t, m, n) + + # Check arguments + if (nargin != 3) + print_usage (); + endif + + if (! isempty (t) && ! ismatrix (t)) + error ("hygestat: t must be a numeric matrix"); + endif + if (! isempty (m) && ! ismatrix (m)) + error ("hygestat: m must be a numeric matrix"); + endif + if (! isempty (n) && ! ismatrix (n)) + error ("hygestat: n must be a numeric matrix"); + endif + + if (! isscalar (t) || ! isscalar (m) || ! isscalar (n)) + [retval, t, m, n] = common_size (t, m, n); + if (retval > 0) + error ("hygestat: t, m and n must be of common size or scalar"); + endif + endif + + # Calculate moments + mn = (n .* m) ./ t; + v = (n .* (m ./ t) .* (1 - m ./ t) .* (t - n)) ./ (t - 1); + + # Continue argument check + k = find (! (t >= 0) | ! (m >= 0) | ! (n > 0) | ! (t == round (t)) | ! (m == round (m)) | ! (n == round (n)) | ! (m <= t) | ! (n <= t)); + if (any (k)) + mn(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! t = 4:9; +%! m = 0:5; +%! n = 1:6; +%! [mn, v] = hygestat (t, m, n); +%! expected_mn = [0.0000, 0.4000, 1.0000, 1.7143, 2.5000, 3.3333]; +%! expected_v = [0.0000, 0.2400, 0.4000, 0.4898, 0.5357, 0.5556]; +%! assert (mn, expected_mn, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! t = 4:9; +%! m = 0:5; +%! [mn, v] = hygestat (t, m, 2); +%! expected_mn = [0.0000, 0.4000, 0.6667, 0.8571, 1.0000, 1.1111]; +%! expected_v = [0.0000, 0.2400, 0.3556, 0.4082, 0.4286, 0.4321]; +%! assert (mn, expected_mn, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/inconsistent.m b/inst/inconsistent.m new file mode 100644 index 0000000..a385e36 --- /dev/null +++ b/inst/inconsistent.m @@ -0,0 +1,118 @@ +## Copyright (C) 2020-2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{Y} =} inconsistent (@var{Z}) +## @deftypefnx {Function File} {@var{Y} =} inconsistent (@var{Z}, @var{d}) +## +## Compute the inconsistency coefficient for each link of a hierarchical cluster +## tree. +## +## Given a hierarchical cluster tree @var{Z} generated by the @code{linkage} +## function, @code{inconsistent} computes the inconsistency coefficient for each +## link of the tree, using all the links down to the @var{d}-th level below that +## link. +## +## The default depth @var{d} is 2, which means that only two levels are +## considered: the level of the computed link and the level below that. +## +## Each row of @var{Y} corresponds to the row of same index of @var{Z}. +## The columns of @var{Y} are respectively: the mean of the heights of the links +## used for the calculation, the standard deviation of the heights of those +## links, the number of links used, the inconsistency coefficient. +## +## @strong{Reference} +## Jain, A., and R. Dubes. Algorithms for Clustering Data. +## Upper Saddle River, NJ: Prentice-Hall, 1988. +## @end deftypefn +## +## @seealso{cluster, clusterdata, dendrogram, linkage, pdist, squareform} + +## Author: Stefano Guidoni + +function Y = inconsistent (Z, d = 2) + + ## check the input + if (nargin < 1) || (nargin > 2) + print_usage (); + endif + + ## MATLAB compatibility: + ## when d = 0, which does not make sense, the result of inconsistent is the + ## same as d = 1, which is... inconsistent + if ((d < 0) || (! isscalar (d)) || (mod (d, 1))) + error ("inconsistent: d must be a positive integer scalar"); + endif + + if ((columns (Z) != 3) || (! isnumeric (Z)) || ... + (! (max (Z(end, 1:2)) == rows (Z) * 2))) + error (["inconsistent: Z must be a matrix generated by the linkage " ... + "function"]); + endif + + ## number of observations + n = rows (Z) + 1; + + ## compute the inconsistency coefficient for every link + for i = 1:rows (Z) + v = inconsistent_recursion (i, d); # nested recursive function - see below + + Y(i, 1) = mean (v); + Y(i, 2) = std (v); + Y(i, 3) = length (v); + ## the inconsistency coefficient is (current_link_height - mean) / std; + ## if the standard deviation is zero, it is zero by definition + if (Y(i, 2) != 0) + Y(i, 4) = (v(end) - Y(i, 1)) / Y(i, 2); + else + Y(i, 4) = 0; + endif + endfor + + ## recursive function + ## while depth > 1 search the links (columns 1 and 2 of Z) below the current + ## link and then append the height of the current link to the vector v. + ## The height of the starting link should be the last one of the vector. + function v = inconsistent_recursion (index, depth) + v = []; + if (depth > 1) + for j = 1:2 + if (Z(index, j) > n) + new_index = Z(index, j) - n; + v = [v (inconsistent_recursion (new_index, depth - 1))]; + endif + endfor + endif + v(end+1) = Z(index, 3); + endfunction + +endfunction + + +## Test input validation +%!error inconsistent () +%!error inconsistent ([1 2 1], 2, 3) +%!error inconsistent (ones (2, 2)) +%!error inconsistent ([1 2 1], -1) +%!error inconsistent ([1 2 1], 1.3) +%!error inconsistent ([1 2 1], [1 1]) +%!error inconsistent (ones (2, 3)) + +## Test output +%!test +%! load fisheriris; +%! Z = linkage(meas, 'average', 'chebychev'); +%! assert (cond (inconsistent (Z)), 39.9, 1e-3); + diff --git a/inst/iwishpdf.m b/inst/iwishpdf.m new file mode 100644 index 0000000..8b64bfb --- /dev/null +++ b/inst/iwishpdf.m @@ -0,0 +1,67 @@ +## Copyright (C) 2013 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with Octave; see the file COPYING. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} @var{y} = iwishpdf (@var{W}, @var{Tau}, @var{df}, @var{log_y}=false) +## Compute the probability density function of the Wishart distribution +## +## Inputs: A @var{p} x @var{p} matrix @var{W} where to find the PDF and the @var{p} x @var{p} positive definite scale matrix @var{Tau} and scalar degrees of freedom parameter @var{df} characterizing the inverse Wishart distribution. (For the density to be finite, need @var{df} > (@var{p} - 1).) +## If the flag @var{log_y} is set, return the log probability density -- this helps avoid underflow when the numerical value of the density is very small +## +## Output: @var{y} is the probability density of Wishart(@var{Sigma}, @var{df}) at @var{W}. +## +## @seealso{iwishrnd, wishpdf} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Compute the probability density function of the inverse Wishart distribution + +function [y] = iwishpdf(W, Tau, df, log_y=false) + +if (nargin < 3) + print_usage (); +endif + +p = size(Tau, 1); + +if (df <= (p - 1)) + error('df too small, no finite densities exist') +endif + +#calculate the logarithm of G_d(df/2), the multivariate gamma function +g = (p * (p-1) / 4) * log(pi); +for i = 1:p + g = g + log(gamma((df + (1 - i))/2)); #using lngamma_gsl(.) from the gsl package instead of log(gamma(.)) might help avoid underflow/overflow +endfor + +C = chol(W); + +#use formulas for determinant of positive definite matrix for better efficiency and numerical accuracy +logdet_W = 2*sum(log(diag(C))); +logdet_Tau = 2*sum(log(diag(chol(Tau)))); + +y = -(df*p)/2 * log(2) + (df/2)*logdet_Tau - g - ((df + p + 1)/2)*logdet_W - trace(Tau*chol2inv(C))/2; + +if ~log_y + y = exp(y); +endif + + +endfunction + +##test results cross-checked against diwish function in R MCMCpack library +%!assert(iwishpdf(4, 3, 3.1), 0.04226595, 1E-7); +%!assert(iwishpdf([2 -0.3;-0.3 4], [1 0.3;0.3 1], 4), 1.60166e-05, 1E-10); +%!assert(iwishpdf([6 2 5; 2 10 -5; 5 -5 25], [9 5 5; 5 10 -8; 5 -8 22], 5.1), 4.946831e-12, 1E-17); + +%% Test input validation +%!error iwishpdf () +%!error iwishpdf (1, 2) +%!error iwishpdf (1, 2, 0) + +%!error wishpdf (1, 2) diff --git a/inst/iwishrnd.m b/inst/iwishrnd.m new file mode 100644 index 0000000..c4ce34d --- /dev/null +++ b/inst/iwishrnd.m @@ -0,0 +1,69 @@ +## Copyright (C) 2013 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with Octave; see the file COPYING. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} [@var{W}[, @var{DI}]] = iwishrnd (@var{Psi}, @var{df}[, @var{DI}][, @var{n}=1]) +## Return a random matrix sampled from the inverse Wishart distribution with given parameters +## +## Inputs: the @var{p} x @var{p} positive definite matrix @var{Tau} and scalar degrees of freedom parameter @var{df} (and optionally the transposed Cholesky factor @var{DI} of @var{Sigma} = @code{inv(Tau)}). +## @var{df} can be non-integer as long as @var{df} > @var{d} +## +## Output: a random @var{p} x @var{p} matrix @var{W} from the inverse Wishart(@var{Tau}, @var{df}) distribution. (@code{inv(W)} is from the Wishart(@code{inv(Tau)}, @var{df}) distribution.) If @var{n} > 1, then @var{W} is @var{p} x @var{p} x @var{n} and holds @var{n} such random matrices. (Optionally, the transposed Cholesky factor @var{DI} of @var{Sigma} is also returned.) +## +## Averaged across many samples, the mean of @var{W} should approach @var{Tau} / (@var{df} - @var{p} - 1). +## +## Reference: Yu-Cheng Ku and Peter Bloomfield (2010), Generating Random Wishart Matrices with Fractional Degrees of Freedom in OX, http://www.gwu.edu/~forcpgm/YuChengKu-030510final-WishartYu-ChengKu.pdf +## +## @seealso{wishrnd, iwishpdf} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Random matrices from the inverse Wishart distribution + +function [W, DI] = iwishrnd(Tau, df, DI, n = 1) + +if (nargin < 2) + print_usage (); +endif + +if nargin < 3 || isempty(DI) + try + D = chol(inv(Tau)); + catch + error('Cholesky decomposition failed; Tau probably not positive definite') + end_try_catch + DI = D'; +else + D = DI'; +endif + +w = wishrnd([], df, D, n); + +if n > 1 + p = size(D, 1); + W = nan(p, p, n); +endif + +for i = 1:n + W(:, :, i) = inv(w(:, :, i)); +endfor + +endfunction + + + +%!assert(size (iwishrnd (1,2,1)), [1, 1]); +%!assert(size (iwishrnd ([],2,1)), [1, 1]); +%!assert(size (iwishrnd ([3 1; 1 3], 2.00001, [], 1)), [2, 2]); +%!assert(size (iwishrnd (eye(2), 2, [], 3)), [2, 2, 3]); + +%% Test input validation +%!error iwishrnd () +%!error iwishrnd (1) +%!error iwishrnd ([-3 1; 1 3],1) +%!error iwishrnd ([1; 1],1) diff --git a/inst/jackknife.m b/inst/jackknife.m new file mode 100644 index 0000000..1870239 --- /dev/null +++ b/inst/jackknife.m @@ -0,0 +1,141 @@ +## Copyright (C) 2011 Alexander Klein +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn{Function File} {@var{jackstat} =} jackknife (@var{E}, @var{x}, @dots{}) +## Compute jackknife estimates of a parameter taking one or more given samples as parameters. +## In particular, @var{E} is the estimator to be jackknifed as a function name, handle, +## or inline function, and @var{x} is the sample for which the estimate is to be taken. +## The @var{i}-th entry of @var{jackstat} will contain the value of the estimator +## on the sample @var{x} with its @var{i}-th row omitted. +## +## @example +## @group +## jackstat(@var{i}) = @var{E}(@var{x}(1 : @var{i} - 1, @var{i} + 1 : length(@var{x}))) +## @end group +## @end example +## +## Depending on the number of samples to be used, the estimator must have the appropriate form: +## If only one sample is used, then the estimator need not be concerned with cell arrays, +## for example jackknifing the standard deviation of a sample can be performed with +## @code{@var{jackstat} = jackknife (@@std, rand (100, 1))}. +## If, however, more than one sample is to be used, the samples must all be of equal size, +## and the estimator must address them as elements of a cell-array, +## in which they are aggregated in their order of appearance: +## +## @example +## @group +## @var{jackstat} = jackknife(@@(x) std(x@{1@})/var(x@{2@}), rand (100, 1), randn (100, 1) +## @end group +## @end example +## +## If all goes well, a theoretical value @var{P} for the parameter is already known, +## @var{n} is the sample size, +## @code{@var{t} = @var{n} * @var{E}(@var{x}) - (@var{n} - 1) * mean(@var{jackstat})}, and +## @code{@var{v} = sumsq(@var{n} * @var{E}(@var{x}) - (@var{n} - 1) * @var{jackstat} - @var{t}) / (@var{n} * (@var{n} - 1))}, then +## @code{(@var{t}-@var{P})/sqrt(@var{v})} should follow a t-distribution with @var{n}-1 degrees of freedom. +## +## Jackknifing is a well known method to reduce bias; further details can be found in: +## @itemize @bullet +## @item Rupert G. Miller: The jackknife-a review; Biometrika (1974) 61(1): 1-15; doi:10.1093/biomet/61.1.1 +## @item Rupert G. Miller: Jackknifing Variances; Ann. Math. Statist. Volume 39, Number 2 (1968), 567-582; doi:10.1214/aoms/1177698418 +## @item M. H. Quenouille: Notes on Bias in Estimation; Biometrika Vol. 43, No. 3/4 (Dec., 1956), pp. 353-360; doi:10.1093/biomet/43.3-4.353 +## @end itemize +## @end deftypefn + +## Author: Alexander Klein +## Created: 2011-11-25 + +function jackstat = jackknife ( anEstimator, varargin ) + + + ## Convert function name to handle if necessary, or throw + ## an error. + if ( !strcmp ( typeinfo ( anEstimator ), "function handle" ) ) + + if ( isascii ( anEstimator ) ) + + anEstimator = str2func ( anEstimator ); + + else + + error ( "Estimators must be passed as function names or handles!" ); + end + end + + + ## Simple jackknifing can be done with a single vector argument, and + ## first and foremost with a function that does not care about + ## cell-arrays. + if ( length ( varargin ) == 1 && isnumeric ( varargin { 1 } ) ) + + aSample = varargin { 1 }; + + g = length ( aSample ); + + jackstat = zeros ( 1, g ); + + for k = 1 : g + jackstat ( k ) = anEstimator ( aSample ( [ 1 : k - 1, k + 1 : g ] ) ); + end + + ## More complicated input requires more work, however. + else + + g = cellfun ( @(x) length ( x ), varargin ); + + if ( any ( g - g ( 1 ) ) ) + + error ( "All passed data must be of equal length!" ); + end + + g = g ( 1 ); + + jackstat = zeros ( 1, g ); + + for k = 1 : g + + jackstat ( k ) = anEstimator ( cellfun ( @(x) x( [ 1 : k - 1, k + 1 : g ] ), varargin, "UniformOutput", false ) ); + end + + end +endfunction + + +%!test +%! ##Example from Quenouille, Table 1 +%! d=[0.18 4.00 1.04 0.85 2.14 1.01 3.01 2.33 1.57 2.19]; +%! jackstat = jackknife ( @(x) 1/mean(x), d ); +%! assert ( 10 / mean(d) - 9 * mean(jackstat), 0.5240, 1e-5 ); + +%!demo +%! for k = 1:1000 +%! x=rand(10,1); +%! s(k)=std(x); +%! jackstat=jackknife(@std,x); +%! j(k)=10*std(x) - 9*mean(jackstat); +%! end +%! figure();hist([s',j'], 0:sqrt(1/12)/10:2*sqrt(1/12)) + +%!demo +%! for k = 1:1000 +%! x=randn(1,50); +%! y=rand(1,50); +%! jackstat=jackknife(@(x) std(x{1})/std(x{2}),y,x); +%! j(k)=50*std(y)/std(x) - 49*mean(jackstat); +%! v(k)=sumsq((50*std(y)/std(x) - 49*jackstat) - j(k)) / (50 * 49); +%! end +%! t=(j-sqrt(1/12))./sqrt(v); +%! figure();plot(sort(tcdf(t,49)),"-;Almost linear mapping indicates good fit with t-distribution.;") diff --git a/inst/jsucdf.m b/inst/jsucdf.m new file mode 100644 index 0000000..59ffaf0 --- /dev/null +++ b/inst/jsucdf.m @@ -0,0 +1,61 @@ +## Copyright (C) 2006 Frederick (Rick) A Niles +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} jsucdf (@var{x}, @var{alpha1}, @var{alpha2}) +## For each element of @var{x}, compute the cumulative distribution +## function (CDF) at @var{x} of the Johnson SU distribution with shape parameters +## @var{alpha1} and @var{alpha2}. +## +## Default values are @var{alpha1} = 1, @var{alpha2} = 1. +## @end deftypefn + +## Author: Frederick (Rick) A Niles +## Description: CDF of the Johnson SU distribution + +## This function is derived from normcdf.m + +## This is the TeX equation of this function: +## +## \[ F(x) = \Phi\left(\alpha_1 + \alpha_2 +## \log\left(x + \sqrt{x^2 + 1} \right)\right) \] +## +## where \[ -\infty < x < \infty ; \alpha_2 > 0 \] and $\Phi$ is the +## standard normal cumulative distribution function. $\alpha_1$ and +## $\alpha_2$ are shape parameters. + + +function cdf = jsucdf (x, alpha1, alpha2) + + if (! ((nargin == 1) || (nargin == 3))) + print_usage; + endif + + if (nargin == 1) + m = 0; + v = 1; + endif + + if (!isscalar (alpha1) || !isscalar(alpha2)) + [retval, x, alpha1, alpha2] = common_size (x, alpha1, alpha2); + if (retval > 0) + error ("normcdf: x, alpha1 and alpha2 must be of common size or scalar"); + endif + endif + + one = ones (size (x)); + cdf = stdnormal_cdf (alpha1 .* one + alpha2 .* log (x + sqrt(x.*x + one))); + +endfunction diff --git a/inst/jsupdf.m b/inst/jsupdf.m new file mode 100644 index 0000000..c4ec1af --- /dev/null +++ b/inst/jsupdf.m @@ -0,0 +1,62 @@ +## Copyright (C) 2006 Frederick (Rick) A Niles +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} jsupdf (@var{x}, @var{alpha1}, @var{alpha2}) +## For each element of @var{x}, compute the probability density function +## (PDF) at @var{x} of the Johnson SU distribution with shape parameters @var{alpha1} +## and @var{alpha2}. +## +## Default values are @var{alpha1} = 1, @var{alpha2} = 1. +## @end deftypefn + +## Author: Frederick (Rick) A Niles +## Description: PDF of Johnson SU distribution + +## This function is derived from normpdf.m + +## This is the TeX equation of this function: +## +## \[ f(x) = \frac{\alpha_2}{\sqrt{x^2+1}} \phi\left(\alpha_1+\alpha_2 +## \log{\left(x+\sqrt{x^2+1}\right)}\right) \] +## +## where \[ -\infty < x < \infty ; \alpha_2 > 0 \] and $\phi$ is the +## standard normal probability distribution function. $\alpha_1$ and +## $\alpha_2$ are shape parameters. + +function pdf = jsupdf (x, alpha1, alpha2) + + if (nargin != 1 && nargin != 3) + print_usage; + endif + + if (nargin == 1) + alpha1 = 1; + alpha2 = 1; + endif + + if (!isscalar (alpha1) || !isscalar(alpha2)) + [retval, x, alpha1, alpha2] = common_size (x, alpha1, alpha2); + if (retval > 0) + error ("normpdf: x, alpha1 and alpha2 must be of common size or scalars"); + endif + endif + + one = ones(size(x)); + sr = sqrt(x.*x + one); + pdf = (alpha2 ./ sr) .* stdnormal_pdf (alpha1 .* one + + alpha2 .* log (x + sr)); + +endfunction diff --git a/inst/kmeans.m b/inst/kmeans.m new file mode 100644 index 0000000..e29989b --- /dev/null +++ b/inst/kmeans.m @@ -0,0 +1,564 @@ +## Copyright (C) 2011 Soren Hauberg +## Copyright (C) 2012 Daniel Ward +## Copyright (C) 2015-2016 Lachlan Andrew +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {} {[@var{idx}, @var{centers}, @var{sumd}, @var{dist}] =} kmeans (@var{data}, @var{k}, @var{param1}, @var{value1}, @dots{}) +## Perform a @var{k}-means clustering of the @var{N}x@var{D} table @var{data}. +## If parameter @qcode{start} is specified, then @var{k} may be empty +## in which case @var{k} is set to the number of rows of @var{start}. +## +## The outputs are: +## @table @code +## @item @var{idx} +## An @var{N}x1 vector whose @var{i}th element is the class to which row @var{i} +## of @var{data} is assigned. +## +## @item @var{centers} +## A @var{K}x@var{D} array whose @var{i}th row is the centroid of cluster +## @var{i}. +## +## @item @var{sumd} +## A @var{k}x1 vector whose @var{i}th entry is the sum of the distances +## from samples in cluster @var{i} to centroid @var{i}. +## +## @item @var{dist} +## An @var{N}x@var{k} matrix whose @var{i}@var{j}th element is +## the distance from sample @var{i} to centroid @var{j}. +## @end table +## +## The following parameters may be placed in any order. Each parameter +## must be followed by its value. +## @table @code +## @item @var{Start} +## The initialization method for the centroids. +## @table @code +## @item @code{plus} +## (Default) The k-means++ algorithm. +## @item @code{sample} +# A subset of @var{k} rows from @var{data}, +## sampled uniformly without replacement. +## @item @code{cluster} +## Perform a pilot clustering on 10% of the rows of @var{data}. +## @item @code{uniform} +## Each component of each centroid is drawn uniformly +## from the interval between the maximum and minimum values of that +## component within @var{data}. +## This performs poorly and is implemented only for Matlab compatibility. +## @item A +## A @var{k}x@var{D}x@var{r} matrix, where @var{r} is the number of +## replicates. +## @end table +## +## @item @var{Replicates} +## An positive integer specifying the number of independent clusterings to +## perform. +## The output values are the values for the best clustering, i.e., +## the one with the smallest value of @var{sumd}. +## If @var{Start} is numeric, then @var{Replicates} defaults to +# (and must equal) the size of the third dimension of @var{Start}. +## Otherwise it defaults to 1. +## +## @item @var{MaxIter} +## The maximum number of iterations to perform for each replicate. +## If the maximum change of any centroid is less than 0.001, then +## the replicate terminates even if @var{MaxIter} iterations have no occurred. +## The default is 100. +## +## @item @var{Distance} +## The distance measure used for partitioning and calculating centroids. +## @table @code +## @item @qcode{sqeuclidean} +## The squared Euclidean distance, i.e., +## the sum of the squares of the differences between corresponding components. +## In this case, the centroid is the arithmetic mean of all samples in +## its cluster. +## This is the only distance for which this algorithm is truly "k-means". +## +## @item @qcode{cityblock} +## The sum metric, or L1 distance, i.e., +## the sum of the absolute differences between corresponding components. +## In this case, the centroid is the median of all samples in its cluster. +## This gives the k-medians algorithm. +## +## @item @qcode{cosine} +## (Documentation incomplete.) +## +## @item @qcode{correlation} +## (Documentation incomplete.) +## +## @item @qcode{hamming} +## The number of components in which the sample and the centroid differ. +## In this case, the centroid is the median of all samples in its cluster. +## Unlike Matlab, Octave allows non-logical @var{data}. +## +## @end table +## +## @item @var{EmptyAction} +## What to do when a centroid is not the closest to any data sample. +## @table @code +## @item @qcode{error} +## Throw an error. +## @item @qcode{singleton} +## (Default) Select the row of @var{data} that has the highest error and +## use that as the new centroid. +## @item @qcode{drop} +## Remove the centroid, and continue computation with one fewer centroid. +## The dimensions of the outputs @var{centroids} and @var{d} +## are unchanged, with values for omitted centroids replaced by NA. +## +## @end table +## +## @item @var{Display} +## Display a text summary. +## @table @code +## @item @qcode{off} +## (Default) Display no summary. +## @item @qcode{final} +## Display a summary for each clustering operation. +## @item @qcode{iter} +## Display a summary for each iteration of a clustering operation. +## +## @end table +## @end table +## +## Example: +## +## [~,c] = kmeans (rand(10, 3), 2, "emptyaction", "singleton"); +## +## @seealso{linkage} +## @end deftypefn + +function [classes, centers, sumd, D] = kmeans (data, k, varargin) + [reg, prop] = parseparams (varargin); + + ## defaults for options + emptyaction = "singleton"; + start = "plus"; + replicates = 1; + max_iter = 100; + distance = "sqeuclidean"; + display = "off"; + + replicates_set_explicitly = false; + + ## Remove rows containing NaN / NA, but record which rows are used + data_idx = ! any (isnan (data), 2); + original_rows = rows (data); + data = data(data_idx,:); + + #used for getting the number of samples + n_rows = rows (data); + + #used for convergence of the centroids + err = 1; + + ## Input checking, validate the matrix + if (! isnumeric (data) || ! ismatrix (data) || ! isreal (data)) + error ("kmeans: first input argument must be a DxN real data matrix"); + elseif (! isnumeric (k)) + error ("kmeans: second argument must be numeric"); + endif + + ## Parse options + while (length (prop) > 0) + if (length (prop) < 2) + error ("kmeans: Option '%s' has no argument", prop{1}); + endif + switch (lower (prop{1})) + case "emptyaction" emptyaction = prop{2}; + case "start" start = prop{2}; + case "maxiter" max_iter = prop{2}; + case "distance" distance = prop{2}; + + case "replicates" replicates = prop{2}; + replicates_set_explicitly = true; + + case "display" display = prop{2}; + case {"onlinephase", "options"} + warning ("kmeans: Ignoring unimplemented option '%s'", prop{1}); + + otherwise + error ("kmeans: Unknown option %s", prop{1}); + endswitch + prop = {prop{3:end}}; + endwhile + + ## Process options + + ## check for the 'emptyaction' property + switch (emptyaction) + case {"singleton", "error", "drop"} + ; + otherwise + d = [", " disp(emptyaction)] (1:end-1); # strip trailing \n + if (length (d) > 20) + d = ""; + endif + error ("kmeans: unsupported empty cluster action parameter%s", d); + endswitch + + ## check for the 'replicates' property + if (! isnumeric (replicates) || ! isscalar (replicates) + || ! isreal (replicates) || replicates < 1) + d = [", " disp(replicates)] (1:end-1); # strip trailing \n + if (length (d) > 20) + d = ""; + endif + error ("kmeans: invalid number of replicates%s", d); + endif + + ## check for the 'MaxIter' property + if (! isnumeric (max_iter) || ! isscalar (max_iter) + || ! isreal (max_iter) || max_iter < 1) + d = [", " disp(max_iter)] (1:end-1); # strip trailing \n + if (length (d) > 20) + d = ""; + endif + error ("kmeans: invalid MaxIter%s", d); + endif + + ## check for the 'start' property + switch (lower (start)) + case {"sample", "plus", "cluster"} + start = lower (start); + case {"uniform"} + start = "uniform"; + min_data = min (data); + range = max (data) - min_data; + otherwise + if (! isnumeric (start)) + d = [", " disp(start)] (1:end-1); # strip trailing \n + if (length (d) > 20) + d = ""; + endif + error ("kmeans: invalid start parameter%s", d); + endif + if (isempty (k)) + k = rows (start); + elseif (rows (start) != k) + error (["kmeans: Number of initializers (%d) " ... + "should match number of centroids (%d)"], rows (start), k); + endif + if (replicates_set_explicitly) + if (replicates != size (start, 3)) + error (["kmeans: The third dimension of the initializer (%d) " ... + "should match the number of replicates (%d)"], ... + size (start, 3), replicates); + endif + else + replicates = size (start, 3); + endif + endswitch + + ## check for the 'distance' property + ## dist returns the distance btwn each row of matrix x and a row vector c + switch (lower (distance)) + case "sqeuclidean" + dist = @(x, c) sumsq (bsxfun (@minus, x, c), 2); + centroid = @(x) mean (x, 1); + case "cityblock" + dist = @(x, c) sum (abs (bsxfun (@minus, x, c)), 2); + centroid = @(x) median (x, 1); + case "cosine" + ## Pre-normalize all data. + ## (when Octave implements normr, will use data = normr (data) ) + for i = 1:rows (data) + data(i,:) = data(i,:) / sqrt (sumsq (data(i,:))); + endfor + dist = @(x, c) 1 - (x * c') ./ sqrt (sumsq (c)); + centroid = @(x) mean (x, 1); ## already normalized + case "correlation" + ## Pre-normalize all data. + data = data - mean (data, 2); + ## (when Octave implements normr, will use data = normr (data) ) + for i = 1:rows (data) + data(i,:) = data(i,:) / sqrt (sumsq (data(i,:))); + endfor + dist = @(x, c) 1 - (x * (c - mean (c))') ... + ./ sqrt (sumsq (c - mean (c))); + centroid = @(x) mean (x, 1); ## already normalized + case "hamming" + dist = @(x, c) sum (bsxfun (@ne, x, c), 2); + centroid = @(x) median (x, 1); + otherwise + error ("kmeans: unsupported distance parameter %s", distance); + endswitch + + ## check for the 'display' property + if (! strcmp (display, "off")) + display = lower (display); + switch (display) + case {"off", "final"} ; + case "iter" + printf ("%6s\t%6s\t%8s\t%12s\n", "iter", "phase", "num", "sum"); + otherwise + error ("kmeans: invalid display parameter %s", display); + endswitch + endif + + + ## Done processing options + ######################################## + + ## Now that k has been set (possibly by 'replicates' option), check/use it. + if (! isscalar (k)) + error ("kmeans: second input argument must be a scalar"); + endif + + ## used to hold the distances from each sample to each class + D = zeros (n_rows, k); + + best = Inf; + best_centers = []; + for rep = 1:replicates + ## keep track of the number of data points that change class + old_classes = zeros (rows (data), 1); + n_changes = -1; + + ## check for the 'start' property + switch (lower (start)) + case "sample" + idx = randperm (n_rows, k); + centers = data(idx, :); + case "plus" # k-means++, by Arthur and Vassilios(?) + centers(1,:) = data(randi (n_rows),:); + d = inf (n_rows, 1); # Distance to nearest centroid so far + for i = 2:k + d = min (d, dist (data, centers(i - 1, :))); + centers(i,:) = data(find (cumsum (d) > rand * sum (d), 1), :); + endfor + case "cluster" + idx = randperm (n_rows, max (k, ceil (n_rows / 10))); + [~, centers] = kmeans (data(idx,:), k, "start", "sample", ... + "distance", distance); + case "uniform" + # vectorised 'min_data + range .* rand' + centers = bsxfun (@plus, min_data, + bsxfun (@times, range, rand (k, columns (data)))); + otherwise + centers = start(:,:,rep); + endswitch + + ## Run the algorithm + iter = 1; + + ## Classify once before the loop; to set sumd, and if max_iter == 0 + ## Compute distances and classify + [D, classes, sumd] = update_dist (data, centers, D, k, dist); + + while (err > 0.001 && iter++ <= max_iter && n_changes != 0) + ## Calculate new centroids + replaced_centroids = []; ## Used by "emptyaction = singleton" + for i = 1:k + ## Get binary vector indicating membership in cluster i + membership = (classes == i); + + ## Check for empty clusters + if (! any (membership)) + switch emptyaction + ## if 'singleton', then find the point that is the + ## farthest from any centroid (and not replacing an empty cluster + ## from earlier in this pass) and add it to the empty cluster + case 'singleton' + available = setdiff (1:n_rows, replaced_centroids); + [~, idx] = max (min (D(available,:)')); + idx = available(idx); + replaced_centroids = [replaced_centroids, idx]; + + classes(idx) = i; + membership(idx) = 1; + + ## if 'drop' then set C and D to NA + case 'drop' + centers(i,:) = NA; + D(i,:) = NA; + + ## if 'error' then throw the error + otherwise + error ("kmeans: empty cluster created"); + endswitch + endif ## end check for empty clusters + + ## update the centroids + if (any (membership)) ## if we didn't "drop" the cluster + centers(i, :) = centroid (data(membership, :)); + endif + endfor + + ## Compute distances, classes and sums + [D, classes, new_sumd] = update_dist (data, centers, D, k, dist); + ## calculate the difference in the sum of distances + err = sum (sumd - new_sumd); + ## update the current sum of distances + sumd = new_sumd; + ## compute the number of class changes + n_changes = sum (old_classes != classes); + old_classes = classes; + + ## display iteration status + if (strcmp (display, "iter")) + printf ("%6d\t%6d\t%8d\t%12.3f\n", (iter - 1), 1, ... + n_changes, sum (sumd)); + endif + endwhile + ## throw a warning if the algorithm did not converge + if (iter > max_iter && err > 0.001 && n_changes != 0) + warning ("kmeans: failed to converge in %d iterations", max_iter); + endif + + if (sum (sumd) < sum (best) || isinf (best)) + best = sumd; + best_centers = centers; + endif + + ## display final results + if (strcmp (display, "final")) + printf ("Replicate %d, %d iterations, total sum of distances = %.3f.\n", ... + rep, iter, sum (sumd)); + endif + endfor + centers = best_centers; + ## Compute final distances, classes and sums + [D, classes, sumd] = update_dist (data, centers, D, k, dist); + + ## display final results + if (strcmp (display, "final") || strcmp (display, "iter")) + printf ("Best total sum of distances = %.3f\n", sum (sumd)); + endif + + ## Return with equal size as inputs + if (original_rows != rows (data)) + final = NA (original_rows,1); + final(data_idx) = classes; ## other positions already NaN / NA + classes = final; + endif + +endfunction + +## Update distances, classes and sums +function [D, classes, sumd] = update_dist (data, centers, D, k, dist) + for i = 1:k + D (:, i) = dist (data, centers(i, :)); + endfor + [~, classes] = min (D, [], 2); + ## calculate the sum of within-class distances + sumd = zeros (k, 1); + for i = 1:k + sumd(i) = sum (D(classes == i,i)); + endfor +endfunction +## Test input parsing +%!error kmeans (rand (3,2), 4); + +%!test +%! samples = 4; +%! dims = 3; +%! k = 2; +%! [cls, c, d, z] = kmeans (rand (samples,dims), k, "start", rand (k,dims, 5), +%! "emptyAction", "singleton"); +%! assert (size (cls), [samples, 1]); +%! assert (size (c), [k, dims]); +%! assert (size (d), [k, 1]); +%! assert (size (z), [samples, k]); + +%!test +%! samples = 4; +%! dims = 3; +%! k = 2; +%! [cls, c, d, z] = kmeans (rand (samples,dims), [], "start", rand (k,dims, 5), +%! "emptyAction", "singleton"); +%! assert (size (cls), [samples, 1]); +%! assert (size (c), [k, dims]); +%! assert (size (d), [k, 1]); +%! assert (size (z), [samples, k]); + +%!test +%! kmeans (rand (4,3), 2, "start", rand (2,3, 5), "replicates", 5, +%! "emptyAction", "singleton"); + +%!error kmeans (rand (4,3), 2, "start", rand (2,3, 5), "replicates", 1); + +%!error kmeans (rand (4,3), 2, "start", rand (2,2)); + +%!test +%! kmeans (rand (3,4), 2, "start", "sample", "emptyAction", "singleton"); +%!test +%! kmeans (rand (3,4), 2, "start", "plus", "emptyAction", "singleton"); +%!test +%! kmeans (rand (3,4), 2, "start", "cluster", "emptyAction", "singleton"); +%!test +%! kmeans (rand (3,4), 2, "start", "uniform", "emptyAction", "singleton"); + +%!error kmeans (rand (3,4), 2, "start", "normal"); + +%!error kmeans (rand (4,3), 2, "replicates", i); +%!error kmeans (rand (4,3), 2, "replicates", -1); +%!error kmeans (rand (4,3), 2, "replicates", []); +%!error kmeans (rand (4,3), 2, "replicates", [1 2]); +%!error kmeans (rand (4,3), 2, "replicates", "one"); + +%!error kmeans (rand (4,3), 2, "MAXITER", i); +%!error kmeans (rand (4,3), 2, "MaxIter", -1); +%!error kmeans (rand (4,3), 2, "maxiter", []); +%!error kmeans (rand (4,3), 2, "maxiter", [1 2]); +%!error kmeans (rand (4,3), 2, "maxiter", "one"); + +%!test +%! kmeans (rand (4,3), 2, "distance", "sqeuclidean", "emptyAction", "singleton"); + +%!test +%! kmeans (rand (4,3), 2, "distance", "cityblock", "emptyAction", "singleton"); + +%!test +%! kmeans (rand (4,3), 2, "distance", "cosine", "emptyAction", "singleton"); + +%!test +%! kmeans (rand (4,3), 2, "distance", "correlation", "emptyAction", "singleton"); + +%!test +%! kmeans (rand (4,3), 2, "distance", "hamming", "emptyAction", "singleton"); + +%!error kmeans (rand (4,3), 2, "distance", "manhattan"); + +%!error kmeans ([1 0; 1.1 0], 2, "start", eye(2), "emptyaction", "error"); + +%!test +%! kmeans ([1 0; 1.1 0], 2, "start", eye(2), "emptyaction", "singleton"); + +%!test +%! [cls, c] = kmeans ([1 0; 2 0], 2, "start", [8,0;0,8], "emptyaction", "drop"); +%! assert (cls, [1; 1]); +%! assert (c, [1.5, 0; NA, NA]); + +%!error kmeans ([1 0; 1.1 0], 2, "start", eye(2), "emptyaction", "panic"); + +%!demo +%! ## Generate a two-cluster problem +%! C1 = randn (100, 2) + 1; +%! C2 = randn (100, 2) - 1; +%! data = [C1; C2]; +%! +%! ## Perform clustering +%! [idx, centers] = kmeans (data, 2); +%! +%! ## Plot the result +%! figure; +%! plot (data (idx==1, 1), data (idx==1, 2), 'ro'); +%! hold on; +%! plot (data (idx==2, 1), data (idx==2, 2), 'bs'); +%! plot (centers (:, 1), centers (:, 2), 'kv', 'markersize', 10); +%! hold off; diff --git a/inst/kruskalwallis.m b/inst/kruskalwallis.m new file mode 100644 index 0000000..ff5f94a --- /dev/null +++ b/inst/kruskalwallis.m @@ -0,0 +1,287 @@ +## Copyright (C) 2021 Andreas Bertsatos +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} kruskalwallis (@var{x}) +## @deftypefnx {Function File} {@var{p} =} kruskalwallis (@var{x}, @var{group}) +## @deftypefnx {Function File} {@var{p} =} kruskalwallis (@var{x}, @var{group}, @var{displayopt}) +## @deftypefnx {Function File} {[@var{p}, @var{tbl}] =} kruskalwallis (@var{x}, @dots{}) +## @deftypefnx {Function File} {[@var{p}, @var{tbl}, @var{stats}] =} kruskalwallis (@var{x}, @dots{}) +## +## Perform a Kruskal-Wallis test, the non-parametric alternative of a one-way +## analysis of variance (ANOVA), for comparing the means of two or more groups +## of data under the null hypothesis that the groups are drawn from the same +## population, i.e. the group means are equal. +## +## kruskalwallis can take up to three input arguments: +## +## @itemize +## @item +## @var{x} contains the data and it can either be a vector or matrix. +## If @var{x} is a matrix, then each column is treated as a separate group. +## If @var{x} is a vector, then the @var{group} argument is mandatory. +## @item +## @var{group} contains the names for each group. If @var{x} is a matrix, then +## @var{group} can either be a cell array of strings of a character array, with +## one row per column of @var{x}. If you want to omit this argument, enter an +## empty array ([]). If @var{x} is a vector, then @var{group} must be a vector +## of the same lenth, or a string array or cell array of strings with one row +## for each element of @var{x}. @var{x} values corresponding to the same value +## of @var{group} are placed in the same group. +## @item +## @var{displayopt} is an optional parameter for displaying the groups contained +## in the data in a boxplot. If omitted, it is 'on' by default. If group names +## are defined in @var{group}, these are used to identify the groups in the +## boxplot. Use 'off' to omit displaying this figure. +## @end itemize +## +## kruskalwallis can return up to three output arguments: +## +## @itemize +## @item +## @var{p} is the p-value of the null hypothesis that all group means are equal. +## @item +## @var{tbl} is a cell array containing the results in a standard ANOVA table. +## @item +## @var{stats} is a structure containing statistics useful for performing +## a multiple comparison of means with the MULTCOMPARE function. +## @end itemize +## +## If kruskalwallis is called without any output arguments, then it prints the +## results in a one-way ANOVA table to the standard output. It is also printed +## when @var{displayopt} is 'on'. +## +## Examples: +## +## @example +## x = meshgrid (1:6); +## x = x + normrnd (0, 1, 6, 6); +## [p, atab] = kruskalwallis(x); +## @end example +## +## +## @example +## x = ones (50, 4) .* [-2, 0, 1, 5]; +## x = x + normrnd (0, 2, 50, 4); +## group = @{"A", "B", "C", "D"@}; +## kruskalwallis (x, group); +## @end example +## +## @end deftypefn + +function [p, tbl, stats] = kruskalwallis (x, group, displayopt) + + ## check for valid number of input arguments + narginchk (1, 3); + ## add defaults + if (nargin < 2) + group = []; + endif + if (nargin < 3) + displayopt = 'on'; + endif + plotdata = ~(strcmp (displayopt, 'off')); + + ## Convert group to cell array from character array, make it a column + if (! isempty (group) && ischar (group)) + group = cellstr(group); + endif + if (size (group, 1) == 1) + group = group'; + endif + + ## If X is a matrix, convert it to column vector and create a + ## corresponging column vector for groups + if (length (x) < prod (size (x))) + [n, m] = size (x); + x = x(:); + gi = reshape (repmat ((1:m), n, 1), n*m, 1); + if (length (group) == 0) ## no group names are provided + group = gi; + elseif (size (group, 1) == m) ## group names exist and match columns + group = group(gi,:); + else + error("X columns and GROUP length do not match."); + endif + endif + + ## Identify NaN values (if any) and remove them from X along with + ## their corresponding values from group vector + nonan = ~isnan (x); + x = x(nonan); + group = group(nonan, :); + + ## Convert group to indices and separate names + [group_id, group_names] = grp2idx (group); + group_id = group_id(:); + named = 1; + + ## Rank data for non-parametric analysis + [xr, tieadj] = tieranks (x); + + ## Get group size and mean for each group + groups = size (group_names, 1); + xs = zeros (1, groups); + xm = xs; + for j = 1:groups + group_size = find (group_id == j); + xs(j) = length (group_size); + xm(j) = mean (xr(group_size)); + endfor + + ## Calculate statistics + lx = length (xr); ## Number of samples in groups + gm = mean (xr); ## Grand mean of groups + dfm = length (xm) - 1; ## degrees of freedom for model + dfe = lx - dfm - 1; ## degrees of freedom for error + SSM = xs .* (xm - gm) * (xm - gm)'; ## Sum of Squares for Model + SST = (xr(:) - gm)' * (xr(:) - gm); ## Sum of Squares Total + SSE = SST - SSM; ## Sum of Squares Error + if (dfm > 0) + MSM = SSM / dfm; ## Mean Square for Model + else + MSM = NaN; + endif + if (dfe > 0) + MSE = SSE / dfe; ## Mean Squared Error + else + MSE = NaN; + endif + ## Calculate Chi-sq statistic + ChiSq = (12 * SSM) / (lx * (lx + 1)); + if (tieadj > 0) + ChiSq = ChiSq / (1 - 2 * tieadj / (lx ^ 3 - lx)); + end + p = 1 - chi2cdf (ChiSq, dfm); + + ## Create results table (if requested) + if (nargout > 1) + tbl = {"Source", "SS", "df", "MS", "Chi-sq", "Prob>Chi-sq"; ... + "Groups", SSM, dfm, MSM, ChiSq, p; ... + "Error", SSE, dfe, MSE, "", ""; ... + "Total", SST, dfm + dfe, "", "", ""}; + endif + ## Create stats structure (if requested) for MULTCOMPARE + if (nargout > 2) + if (length (group_names) > 0) + stats.gnames = group_names; + else + stats.gnames = strjust (num2str ((1:length (xm))'), 'left'); + end + stats.n = xs; + stats.source = 'kruskalwallis'; + stats.meanranks = xm; + stats.sumt = 2 * tieadj; + endif + ## Print results table on screen if no output argument was requested + if (nargout == 0 || plotdata) + printf(" Kruskal-Wallis ANOVA Table\n"); + printf("Source SS df MS Chi-sq Prob>Chi-sq\n"); + printf("---------------------------------------------------------\n"); + printf("Columns %10.2f %5.0f %10.2f %8.2f %11.5e\n", ... + SSM, dfm, MSM, ChiSq, p); + printf("Error %10.2f %5.0f %10.2f\n", SSE, dfe, MSE); + printf("Total %10.2f %5.0f\n", SST, dfm + dfe); + endif + ## Plot data using BOXPLOT (unless opted out) + if (plotdata) + boxplot (x, group_id, 'Notch', "on", 'Labels', group_names); + endif +endfunction + +## local function for computing tied ranks on column vectors +function [r, tieadj] = tieranks (x) + ## Sort data + [value, x_idx] = sort (x); + epsx = zeros (size (x)); + epsx = epsx(x_idx); + x_l = numel (x); + ## Count ranks from start (min value) + ranks = [1:x_l]'; + ## Initialize tie adjustments + tieadj = 0; + ## Adjust for ties. + ties = value(1:x_l-1) + epsx(1:x_l-1) >= value(2:x_l) - epsx(2:x_l); + t_idx = find (ties); + t_idx(end+1) = 0; + maxTies = numel (t_idx); + ## Calculate tie adjustments + tiecount = 1; + while (tiecount < maxTies) + tiestart = t_idx(tiecount); + ntied = 2; + while (t_idx(tiecount+1) == t_idx(tiecount) + 1) + tiecount = tiecount + 1; + ntied = ntied + 1; + endwhile + ## Check for tieflag + tieadj = tieadj + ntied * (ntied - 1) * (ntied + 1) / 2; + ## Average tied ranks + ranks(tiestart:tiestart + ntied - 1) = ... + sum (ranks(tiestart:tiestart + ntied - 1)) / ntied; + tiecount = tiecount + 1; + endwhile + ## Remap data to original dimensions + r(x_idx) = ranks; +endfunction + + +%!demo +%! x = meshgrid (1:6); +%! x = x + normrnd (0, 1, 6, 6); +%! kruskalwallis (x, [], 'off'); + +%!demo +%! x = meshgrid (1:6); +%! x = x + normrnd (0, 1, 6, 6); +%! [p, atab] = kruskalwallis(x); + +%!demo +%! x = ones (30, 4) .* [-2, 0, 1, 5]; +%! x = x + normrnd (0, 2, 30, 4); +%! group = {"A", "B", "C", "D"}; +%! kruskalwallis (x, group); + +## testing results against SPSS and R on the GEAR.DAT data file available from +## https://www.itl.nist.gov/div898/handbook/eda/section3/eda354.htm +%!test +%! data = [1.006, 0.996, 0.998, 1.000, 0.992, 0.993, 1.002, 0.999, 0.994, 1.000, ... +%! 0.998, 1.006, 1.000, 1.002, 0.997, 0.998, 0.996, 1.000, 1.006, 0.988, ... +%! 0.991, 0.987, 0.997, 0.999, 0.995, 0.994, 1.000, 0.999, 0.996, 0.996, ... +%! 1.005, 1.002, 0.994, 1.000, 0.995, 0.994, 0.998, 0.996, 1.002, 0.996, ... +%! 0.998, 0.998, 0.982, 0.990, 1.002, 0.984, 0.996, 0.993, 0.980, 0.996, ... +%! 1.009, 1.013, 1.009, 0.997, 0.988, 1.002, 0.995, 0.998, 0.981, 0.996, ... +%! 0.990, 1.004, 0.996, 1.001, 0.998, 1.000, 1.018, 1.010, 0.996, 1.002, ... +%! 0.998, 1.000, 1.006, 1.000, 1.002, 0.996, 0.998, 0.996, 1.002, 1.006, ... +%! 1.002, 0.998, 0.996, 0.995, 0.996, 1.004, 1.004, 0.998, 0.999, 0.991, ... +%! 0.991, 0.995, 0.984, 0.994, 0.997, 0.997, 0.991, 0.998, 1.004, 0.997]; +%! group = [1:10] .* ones (10,10); +%! group = group(:); +%! [p, tbl] = kruskalwallis (data, group, "off"); +%! assert (p, 0.048229, 1e-6); +%! assert (tbl{2,5}, 17.03124, 1e-5); +%! assert (tbl{2,3}, 9, 0); +%! assert (tbl{4,2}, 82655.5, 1e-16); +%! data = reshape (data, 10, 10); +%! [p, tbl, stats] = kruskalwallis (data, [], "off"); +%! assert (p, 0.048229, 1e-6); +%! assert (tbl{2,5}, 17.03124, 1e-5); +%! assert (tbl{2,3}, 9, 0); +%! assert (tbl{4,2}, 82655.5, 1e-16); +%! means = [51.85, 60.45, 37.6, 51.1, 29.5, 54.25, 64.55, 66.7, 53.65, 35.35]; +%! N = 10 * ones (1, 10); +%! assert (stats.meanranks, means, 1e-6); +%! assert (length (stats.gnames), 10, 0); +%! assert (stats.n, N, 0); diff --git a/inst/linkage.m b/inst/linkage.m new file mode 100644 index 0000000..93b370c --- /dev/null +++ b/inst/linkage.m @@ -0,0 +1,263 @@ +## Copyright (C) 2008 Francesco Potortì +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{y} =} linkage (@var{d}) +## @deftypefnx {Function File} {@var{y} =} linkage (@var{d}, @var{method}) +## @deftypefnx {Function File} {@var{y} =} linkage (@var{x}) +## @deftypefnx {Function File} {@var{y} =} linkage (@var{x}, @var{method}) +## @deftypefnx {Function File} @ +## {@var{y} =} linkage (@var{x}, @var{method}, @var{metric}) +## @deftypefnx {Function File} @ +## {@var{y} =} linkage (@var{x}, @var{method}, @var{arglist}) +## +## Produce a hierarchical clustering dendrogram +## +## @var{d} is the dissimilarity matrix relative to n observations, +## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}. +## Alternatively, @var{x} contains data formatted for input to +## @code{pdist}, @var{metric} is a metric for @code{pdist} and +## @var{arglist} is a cell array containing arguments that are passed to +## @code{pdist}. +## +## @code{linkage} starts by putting each observation into a singleton +## cluster and numbering those from 1 to n. Then it merges two +## clusters, chosen according to @var{method}, to create a new cluster +## numbered n+1, and so on until all observations are grouped into +## a single cluster numbered 2(n-1). Row k of the +## (m-1)x3 output matrix relates to cluster n+k: the first +## two columns are the numbers of the two component clusters and column +## 3 contains their distance. +## +## @var{method} defines the way the distance between two clusters is +## computed and how they are recomputed when two clusters are merged: +## +## @table @samp +## @item "single" (default) +## Distance between two clusters is the minimum distance between two +## elements belonging each to one cluster. Produces a cluster tree +## known as minimum spanning tree. +## +## @item "complete" +## Furthest distance between two elements belonging each to one cluster. +## +## @item "average" +## Unweighted pair group method with averaging (UPGMA). +## The mean distance between all pair of elements each belonging to one +## cluster. +## +## @item "weighted" +## Weighted pair group method with averaging (WPGMA). +## When two clusters A and B are joined together, the new distance to a +## cluster C is the mean between distances A-C and B-C. +## +## @item "centroid" +## Unweighted Pair-Group Method using Centroids (UPGMC). +## Assumes Euclidean metric. The distance between cluster centroids, +## each centroid being the center of mass of a cluster. +## +## @item "median" +## Weighted pair-group method using centroids (WPGMC). +## Assumes Euclidean metric. Distance between cluster centroids. When +## two clusters are joined together, the new centroid is the midpoint +## between the joined centroids. +## +## @item "ward" +## Ward's sum of squared deviations about the group mean (ESS). +## Also known as minimum variance or inner squared distance. +## Assumes Euclidean metric. How much the moment of inertia of the +## merged cluster exceeds the sum of those of the individual clusters. +## @end table +## +## @strong{Reference} +## Ward, J. H. Hierarchical Grouping to Optimize an Objective Function +## J. Am. Statist. Assoc. 1963, 58, 236-244, +## @url{http://iv.slis.indiana.edu/sw/data/ward.pdf}. +## @end deftypefn +## +## @seealso{pdist,squareform} + +## Author: Francesco Potortì + +function dgram = linkage (d, method = "single", distarg, savememory) + + ## check the input + if (nargin == 4) && (strcmpi (savememory, "savememory")) + warning ("Octave:linkage_savemem", ... + "linkage: option 'savememory' not implemented"); + elseif (nargin < 1) || (nargin > 3) + print_usage (); + endif + + if (isempty (d)) + error ("linkage: d cannot be empty"); + endif + + methods = struct ... + ("name", { "single"; "complete"; "average"; "weighted"; + "centroid"; "median"; "ward" }, + "distfunc", {(@(x) min(x)) # single + (@(x) max(x)) # complete + (@(x,i,j,w) sum(diag(w([i,j]))*x)/sum(w([i,j]))) # average + (@(x) mean(x)) # weighted + (@massdist) # centroid + (@(x,i) massdist(x,i)) # median + (@inertialdist) # ward + }); + mask = strcmp (lower (method), {methods.name}); + if (! any (mask)) + error ("linkage: %s: unknown method", method); + endif + dist = {methods.distfunc}{mask}; + + if (nargin >= 3 && ! isvector (d)) + if (ischar (distarg)) + d = pdist (d, distarg); + elseif (iscell (distarg)) + d = pdist (d, distarg{:}); + else + print_usage (); + endif + elseif (nargin < 3) + if (! isvector (d)) + d = pdist (d); + endif + else + print_usage (); + endif + + d = squareform (d, "tomatrix"); # dissimilarity NxN matrix + n = rows (d); # the number of observations + diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements + d(diagidx) = Inf; # consider a cluster as far from itself + ## For equal-distance nodes, the order in which clusters are + ## merged is arbitrary. Rotating the initial matrix produces an + ## ordering similar to Matlab's. + cname = n:-1:1; # cluster names in d + d = rot90 (d, 2); # exchange low and high cluster numbers + weight = ones (1, n); # cluster weights + dgram = zeros (n-1, 3); # clusters from n+1 to 2*n-1 + for cluster = n+1 : 2*n-1 + ## Find the two nearest clusters + [m midx] = min (d(:)); + [r, c] = ind2sub (size (d), midx); + ## Here is the new cluster + dgram(cluster-n, :) = [cname(r) cname(c) d(r, c)]; + ## Put it in place of the first one and remove the second + cname(r) = cluster; + cname(c) = []; + ## Compute the new distances. + ## (Octave-7+ needs switch stmt to avoid 'called with too many inputs' err.) + switch find (mask) + case {1, 2, 4} # 1 arg + newd = dist (d([r c], :)); + case {3, 5, 7} # 4 args + newd = dist (d([r c], :), r, c, weight); + case 6 # 2 args + newd = dist (d([r c], :), r); + otherwise + endswitch + newd(r) = Inf; # Take care of the diagonal element + ## Put distances in place of the first ones, remove the second ones + d(r,:) = newd; + d(:,r) = newd'; + d(c,:) = []; + d(:,c) = []; + ## The new weight is the sum of the components' weights + weight(r) += weight(c); + weight(c) = []; + endfor + ## Sort the cluster numbers, as Matlab does + dgram(:,1:2) = sort (dgram(:,1:2), 2); + + ## Check that distances are monotonically increasing + if (any (diff (dgram(:,3)) < 0)) + warning ("Octave:clustering", + "linkage: cluster distances do not monotonically increase\n\ + you should probably use a method different from \"%s\"", method); + endif + +endfunction + + +## Take two row vectors, which are the Euclidean distances of clusters I +## and J from the others. Column I of second row contains the distance +## between clusters I and J. The centre of gravity of the new cluster +## is on the segment joining the old ones. W are the weights of all +## clusters. Use the law of cosines to find the distances of the new +## cluster from all the others. +function y = massdist (x, i, j, w) + x .^= 2; # Squared Euclidean distances + if (nargin == 2) # Median distance + qi = 0.5; # Equal weights ("weighted") + else # Centroid distance + qi = 1 / (1 + w(j) / w(i)); # Proportional weights ("unweighted") + endif + y = sqrt (qi * x(1, :) + (1 - qi) * (x(2, :) - qi * x(2, i))); +endfunction + + +## Take two row vectors, which are the inertial distances of clusters I +## and J from the others. Column I of second row contains the inertial +## distance between clusters I and J. The centre of gravity of the new +## cluster K is on the segment joining I and J. W are the weights of +## all clusters. Convert inertial to Euclidean distances, then use the +## law of cosines to find the Euclidean distances of K from all the +## other clusters, convert them back to inertial distances and return +## them. +function y = inertialdist (x, i, j, w) + wi = w(i); # The cluster + wj = w(j); # weights. + s = [wi + w; # Sum of weights for + wj + w]; # all cluster pairs. + p = [wi * w; # Product of weights for + wj * w]; # all cluster pairs. + x = x.^2 .* s ./ p; # Convert inertial dist. to squared Eucl. + sij = wi + wj; # Sum of weights of I and J + qi = wi / sij; # Normalise the weight of I + ## Squared Euclidean distances between all clusters and new cluster K + x = qi * x(1, :) + (1 - qi) * (x(2, :) - qi * x(2, i)); + y = sqrt (x * sij .* w ./ (sij + w)); # convert Eucl. dist. to inertial +endfunction + + +%!shared x, t +%! x = reshape (mod (magic (6),5), [], 3); +%! t = 1e-6; + +%!assert (cond (linkage (pdist (x))), 34.119045, t); +%!assert (cond (linkage (pdist (x), "complete")), 21.793345, t); +%!assert (cond (linkage (pdist (x), "average")), 27.045012, t); +%!assert (cond (linkage (pdist (x), "weighted")), 27.412889, t); + +%! lastwarn(); # Clear last warning before the test +%!warning linkage (pdist (x), "centroid"); + +%!test +%! warning off Octave:clustering +%! assert (cond (linkage (pdist (x), "centroid")), 27.457477, t); +%! warning on Octave:clustering + +%!warning linkage (pdist (x), "median"); + +%!test +%! warning off Octave:clustering +%! assert (cond (linkage (pdist (x), "median")), 27.683325, t); +%! warning on Octave:clustering + +%!assert (cond (linkage (pdist (x), "ward")), 17.195198, t); +%!assert (cond (linkage (x, "ward", "euclidean")), 17.195198, t); +%!assert (cond (linkage (x, "ward", {"euclidean"})), 17.195198, t); +%!assert (cond (linkage (x, "ward", {"minkowski", 2})), 17.195198, t); diff --git a/inst/lognstat.m b/inst/lognstat.m new file mode 100644 index 0000000..6f17fa2 --- /dev/null +++ b/inst/lognstat.m @@ -0,0 +1,123 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} lognstat (@var{mu}, @var{sigma}) +## Compute mean and variance of the lognormal distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{mu} is the first parameter of the lognormal distribution +## +## @item +## @var{sigma} is the second parameter of the lognormal distribution. +## @var{sigma} must be positive or zero +## @end itemize +## @var{mu} and @var{sigma} must be of common size or one of them must be +## scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the lognormal distribution +## +## @item +## @var{v} is the variance of the lognormal distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## mu = 0:0.2:1; +## sigma = 0.2:0.2:1.2; +## [m, v] = lognstat (mu, sigma) +## @end group +## +## @group +## [m, v] = lognstat (0, sigma) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the lognormal distribution + +function [m, v] = lognstat (mu, sigma) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (mu) && ! ismatrix (mu)) + error ("lognstat: mu must be a numeric matrix"); + endif + if (! isempty (sigma) && ! ismatrix (sigma)) + error ("lognstat: sigma must be a numeric matrix"); + endif + + if (! isscalar (mu) || ! isscalar (sigma)) + [retval, mu, sigma] = common_size (mu, sigma); + if (retval > 0) + error ("lognstat: mu and sigma must be of common size or scalar"); + endif + endif + + # Calculate moments + m = exp (mu + (sigma .^ 2) ./ 2); + v = (exp (sigma .^ 2) - 1) .* exp (2 .* mu + sigma .^ 2); + + # Continue argument check + k = find (! (sigma >= 0) | ! (sigma < Inf)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! mu = 0:0.2:1; +%! sigma = 0.2:0.2:1.2; +%! [m, v] = lognstat (mu, sigma); +%! expected_m = [1.0202, 1.3231, 1.7860, 2.5093, 3.6693, 5.5845]; +%! expected_v = [0.0425, 0.3038, 1.3823, 5.6447, 23.1345, 100.4437]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! sigma = 0.2:0.2:1.2; +%! [m, v] = lognstat (0, sigma); +%! expected_m = [1.0202, 1.0833, 1.1972, 1.3771, 1.6487, 2.0544]; +%! expected_v = [0.0425, 0.2036, 0.6211, 1.7002, 4.6708, 13.5936]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/mahal.m b/inst/mahal.m new file mode 100644 index 0000000..1b553d1 --- /dev/null +++ b/inst/mahal.m @@ -0,0 +1,93 @@ +## Copyright (C) 2015 Lachlan Andrew +## +## This program is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## This program, is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {Function File} {} mahal (@var{y}, @var{x}) +## Mahalanobis' D-square distance. +## +## Return the Mahalanobis' D-square distance of the points in +## @var{y} from the distribution implied by points @var{x}. +## +## Specifically, it uses a Cholesky decomposition to set +## +## @example +## answer(i) = (@var{y}(i,:) - mean (@var{x})) * inv (A) * (@var{y}(i,:)-mean (@var{x}))' +## @end example +## +## where A is the covariance of @var{x}. +## +## The data @var{x} and @var{y} must have the same number of components +## (columns), but may have a different number of observations (rows). +## +## @end deftypefn + +## Author: Lachlan Andrew +## Created: September 2015 +## Based on function mahalanobis by Friedrich Leisch + +function retval = mahal (y, x) + + if (nargin != 2) + print_usage (); + endif + + if (! (isnumeric (x) || islogical (x)) || ! (isnumeric (y) || islogical (y))) + error ("mahal: X and Y must be numeric matrices or vectors"); + endif + + if (! ismatrix (x) || ! ismatrix (y)) + error ("mahal: X and Y must be 2-D matrices or vectors"); + endif + + [xr, xc] = size (x); + [yr, yc] = size (y); + + if (xc != yc) + error ("mahal: X and Y must have the same number of columns"); + endif + + if (isinteger (x)) + x = double (x); + endif + + xm = mean (x, 1); + + ## Center data by subtracting mean of x + x = bsxfun (@minus, x, xm); + y = bsxfun (@minus, y, xm); + + w = (x' * x) / (xr - 1); + + retval = sumsq (y / chol (w), 2); + +endfunction + + +## Test input validation +%!error mahal () +%!error mahal (1, 2, 3) +%!error mahal ("A", "B") +%!error mahal ([1, 2], ["A", "B"]) +%!error mahal (ones (2, 2, 2)) +%!error mahal (ones (2, 2), ones (2, 2, 2)) +%!error mahal (ones (2, 2), ones (2, 3)) + +%!test +%! X = [1 0; 0 1; 1 1; 0 0]; +%! assert (mahal (X, X), [1.5; 1.5; 1.5; 1.5], 10*eps) +%! assert (mahal (X, X+1), [7.5; 7.5; 1.5; 13.5], 10*eps) + +%!assert (mahal ([true; true], [false; true]), [0.5; 0.5], eps) diff --git a/inst/mhsample.m b/inst/mhsample.m new file mode 100644 index 0000000..3863fd3 --- /dev/null +++ b/inst/mhsample.m @@ -0,0 +1,365 @@ +######################################################################## +## +## Copyright (C) 1993-2021 The Octave Project Developers +## +## See the file COPYRIGHT.md in the top-level directory of this +## distribution or . +## +## This file is part of Octave. +## +## Octave is free software: you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . +## +######################################################################## + +## -*- texinfo -*- +## @deftypefn {} {[@var{smpl}, @var{accept}] =} mhsample (@var{start}, @var{nsamples}, @var{property}, @var{value}, ...) +## Draws @var{nsamples} samples from a target stationary distribution @var{pdf} +## using Metropolis-Hastings algorithm. +## +## Inputs: +## +## @itemize +## @item +## @var{start} is a @var{nchain} by @var{dim} matrix of starting points for each +## Markov chain. Each row is the starting point of a different chain and each +## column corresponds to a different dimension. +## +## @item +## @var{nsamples} is the number of samples, the length of each Markov chain. +## @end itemize +## +## Some property-value pairs can or must be specified, they are: +## +## (Required) One of: +## +## @itemize +## @item +## "pdf" @var{pdf}: a function handle of the target stationary distribution to +## be sampled. The function should accept different locations in each row and +## each column corresponds to a different dimension. +## +## or +## +## @item +## "logpdf" @var{logpdf}: a function handle of the log of the target stationary +## distribution to be sampled. The function should accept different locations +## in each row and each column corresponds to a different dimension. +## @end itemize +## +## In case optional argument @var{symmetric} is set to false (the default), one +## of: +## +## @itemize +## @item +## "proppdf" @var{proppdf}: a function handle of the proposal distribution that +## is sampled from with @var{proprnd} to give the next point in the chain. The +## function should accept two inputs, the random variable and the current +## location each input should accept different locations in each row and each +## column corresponds to a different dimension. +## +## or +## +## @item +## "logproppdf" @var{logproppdf}: the log of "proppdf". +## @end itemize +## +## The following input property/pair values may be needed depending on the +## desired outut: +## +## @itemize +## @item +## "proprnd" @var{proprnd}: (Required) a function handle which generates random +## numbers from @var{proppdf}. The function should accept different locations +## in each row and each column corresponds to a different dimension +## corresponding with the current location. +## +## @item +## "symmetric" @var{symmetric}: true or false based on whether @var{proppdf} is +## a symmetric distribution. If true, @var{proppdf} (or @var{logproppdf}) need +## not be specified. The default is false. +## +## @item +## "burnin" @var{burnin} the number of points to discard at the beginning, the +## default is 0. +## +## @item +## "thin" @var{thin}: omits @var{thin}-1 of every @var{thin} points in the +## generated Markov chain. The default is 1. +## +## @item +## "nchain" @var{nchain}: the number of Markov chains to generate. The default +## is 1. +## @end itemize +## +## Outputs: +## +## @itemize +## @item +## @var{smpl}: a @var{nsamples} x @var{dim} x @var{nchain} tensor of random +## values drawn from @var{pdf}, where the rows are different random values, the +## columns correspond to the dimensions of @var{pdf}, and the third dimension +## corresponds to different Markov chains. +## +## @item +## @var{accept} is a vector of the acceptance rate for each chain. +## @end itemize +## +## Example : Sampling from a normal distribution +## +## @example +## @group +## start = 1; +## nsamples = 1e3; +## pdf = @@(x) exp (-.5 * x .^ 2) / (pi ^ .5 * 2 ^ .5); +## proppdf = @@(x,y) 1 / 6; +## proprnd = @@(x) 6 * (rand (size (x)) - .5) + x; +## [smpl, accept] = mhsample (start, nsamples, "pdf", pdf, "proppdf", ... +## proppdf, "proprnd", proprnd, "thin", 4); +## histfit (smpl); +## @end group +## @end example +## +## @seealso{rand, slicesample} +## @end deftypefn + +function [smpl, accept] = mhsample (start, nsamples, varargin) + + if (nargin < 6) + print_usage (); + endif + + sizestart = size (start); + pdf = []; + proppdf = []; + logpdf = []; + logproppdf = []; + proprnd = []; + sym = false; + K = 0; # burnin + m = 1; # thin + nchain = 1; + + for k = 1:2:length (varargin) + if (ischar (varargin{k})) + switch lower(varargin{k}) + case "pdf" + if (isa (varargin{k+1}, "function_handle")) + pdf = varargin{k+1}; + else + error ("mhsample: pdf must be a function handle"); + endif + + case "proppdf" + if (isa (varargin{k+1}, "function_handle")) + proppdf = varargin{k+1}; + else + error ("mhsample: proppdf must be a function handle"); + endif + + case "logpdf" + if (isa (varargin{k+1}, "function_handle")) + pdf = varargin{k+1}; + else + error ("mhsample: logpdf must be a function handle"); + endif + + case "logproppdf" + if (isa (varargin{k+1}, "function_handle")) + proppdf = varargin{k+1}; + else + error ("mhsample: logproppdf must be a function handle"); + endif + + case "proprnd" + if (isa (varargin{k+1}, "function_handle")) + proprnd = varargin{k+1}; + else + error ("mhsample: proprnd must be a function handle"); + endif + + case "symmetric" + if (isa (varargin{k+1}, "logical")) + sym = varargin{k+1}; + else + error ("mhsample: sym must be true or false"); + endif + + case "burnin" + if (varargin{k+1}>=0) + K = varargin{k+1}; + else + error ("mhsample: K must be greater than or equal to 0"); + endif + + case "thin" + if (varargin{k+1} >= 1) + m = varargin{k+1}; + else + error ("mhsample: m must be greater than or equal to 1"); + endif + + case "nchain" + if (varargin{k+1} >= 1) + nchain = varargin{k+1}; + else + error ("mhsample: nchain must be greater than or equal to 1"); + endif + + otherwise + warning (["mhsample: Ignoring unknown option " varargin{k}]); + endswitch + else + error (["mhsample: " varargin{k} " is not a valid property."]); + endif + endfor + + if (! isempty (pdf) && isempty (logpdf)) + logpdf=@(x) rloge (pdf (x)); + elseif (isempty (pdf) && isempty (logpdf)) + error ("mhsample: pdf or logpdf must be input."); + endif + if (! isempty (proppdf) && isempty (logproppdf)) + logproppdf = @(x, y) rloge (proppdf (x, y)); + elseif (isempty (proppdf) && isempty (logproppdf) && ! sym) + error ("mhsample: proppdf or logproppdf must be input unless 'symetrical' is true."); + endif + if (! isa (proprnd, "function_handle")) + error ("mhsample: proprnd must be a function handle."); + endif + if (length (sizestart) == 2) + sizestart = [sizestart 0]; + end + smpl = zeros (nsamples, sizestart(2), nchain); + + if (all (sizestart([1 3]) == [1 nchain])) + ## Could remove, not Matlab compatable but allows continuing chains + smpl(1, :, :) = start; + elseif (all (sizestart([1 3]) == [nchain 0])) + smpl(1, :, :) = permute (start, [3, 2, 1]); + elseif (all (sizestart([1 3]) == [1 0])) + ## Could remove, not Matlab compatable but allows all chains to start + ## at the same location + smpl(1, :, :) = repmat (start,[1, 1, nchain]); + else + error ("mhsample: start must be a nchain by dim matrix."); + endif + cx = permute (smpl(1, :, :),[3, 2, 1]); + accept = zeros (nchain, 1); + i = 1; + rnd = log (rand (nchain, nsamples*m+K)); + for k = 1:nsamples*m+K + canacc = rem (k-K, m) == 0; + px = proprnd (cx); + if (sym) + A = logpdf (px) - logpdf(cx); + else + A = (logpdf (px) + logproppdf (cx, px)) - (logpdf (cx) + logproppdf (px, cx)); + endif + ac = rnd(:, k) < min (A, 0); + cx(ac, :) = px(ac, :); + accept(ac)++; + if (canacc) + smpl(i, :, :) = permute (cx, [3, 2, 1]); + end + if (k > K && canacc) + i++; + endif + endfor + accept ./= (nsamples * m + K); + +endfunction + + +function y = rloge (x) + + y = -inf (size (x)); + xg0 = x > 0; + y(xg0) = log (x(xg0)); + +endfunction + + +%!demo +%! ## Define function to sample +%! d = 2; +%! mu = [-1; 2]; +%! Sigma = rand (d); +%! Sigma = (Sigma + Sigma'); +%! Sigma += eye (d) * abs (eigs (Sigma, 1, "sa")) * 1.1; +%! pdf = @(x)(2*pi)^(-d/2)*det(Sigma)^-.5*exp(-.5*sum((x.'-mu).*(Sigma\(x.'-mu)),1)); +%! ## Inputs +%! start = ones (1, 2); +%! nsamples = 500; +%! sym = true; +%! K = 500; +%! m = 10; +%! proprnd = @(x) (rand (size (x)) - .5) * 3 + x; +%! [smpl, accept] = mhsample (start, nsamples, "pdf", pdf, "proprnd", proprnd, ... +%! "symmetric", sym, "burnin", K, "thin", m); +%! figure; +%! hold on; +%! plot (smpl(:, 1), smpl(:, 2), 'x'); +%! [x, y] = meshgrid (linspace (-6, 4), linspace(-3, 7)); +%! z = reshape (pdf ([x(:), y(:)]), size(x)); +%! mesh (x, y, z, "facecolor", "None"); +%! ## Using sample points to find the volume of half a sphere with radius of .5 +%! f = @(x) ((.25-(x(:,1)+1).^2-(x(:,2)-2).^2).^.5.*(((x(:,1)+1).^2+(x(:,2)-2).^2)<.25)).'; +%! int = mean (f (smpl) ./ pdf (smpl)); +%! errest = std (f (smpl) ./ pdf (smpl)) / nsamples ^ .5; +%! trueerr = abs (2 / 3 * pi * .25 ^ (3 / 2) - int); +%! printf ("Monte Carlo integral estimate int f(x) dx = %f\n", int); +%! printf ("Monte Carlo integral error estimate %f\n", errest); +%! printf ("The actual error %f\n", trueerr); +%! mesh (x, y, reshape (f([x(:), y(:)]), size(x)), "facecolor", "None"); + +%!demo +%! ## Integrate truncated normal distribution to find normilization constant +%! pdf = @(x) exp (-.5*x.^2)/(pi^.5*2^.5); +%! nsamples = 1e3; +%! proprnd = @(x) (rand (size (x)) - .5) * 3 + x; +%! [smpl, accept] = mhsample (1, nsamples, "pdf", pdf, "proprnd", proprnd, ... +%! "symmetric", true, "thin", 4); +%! f = @(x) exp(-.5 * x .^ 2) .* (x >= -2 & x <= 2); +%! x = linspace (-3, 3, 1000); +%! area(x, f(x)); +%! xlabel ('x'); +%! ylabel ('f(x)'); +%! int = mean (f (smpl) ./ pdf (smpl)); +%! errest = std (f (smpl) ./ pdf (smpl)) / nsamples^ .5; +%! trueerr = abs (erf (2 ^ .5) * 2 ^ .5 * pi ^ .5 - int); +%! printf ("Monte Carlo integral estimate int f(x) dx = %f\n", int); +%! printf ("Monte Carlo integral error estimate %f\n", errest); +%! printf ("The actual error %f\n", trueerr); + + +%!test +%! nchain = 1e4; +%! start = rand (nchain, 1); +%! nsamples = 1e3; +%! pdf = @(x) exp (-.5*(x-1).^2)/(2*pi)^.5; +%! proppdf = @(x, y) 1/3; +%! proprnd = @(x) 3 * (rand (size (x)) - .5) + x; +%! [smpl, accept] = mhsample (start, nsamples, "pdf", pdf, "proppdf", proppdf, ... +%! "proprnd", proprnd, "thin", 2, "nchain", nchain, ... +%! "burnin", 0); +%! assert (mean (mean (smpl, 1), 3), 1, .01); +%! assert (mean (var (smpl, 1), 3), 1, .01) + +%!error mhsample (); +%!error mhsample (1); +%!error mhsample (1, 1); +%!error mhsample (1, 1, "pdf", @(x)x); +%!error mhsample (1, 1, "pdf", @(x)x, "proprnd", @(x)x+rand(size(x))); + diff --git a/inst/mnpdf.m b/inst/mnpdf.m new file mode 100644 index 0000000..fb1920e --- /dev/null +++ b/inst/mnpdf.m @@ -0,0 +1,134 @@ +## Copyright (C) 2012 Arno Onken +## +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{y} =} mnpdf (@var{x}, @var{p}) +## Compute the probability density function of the multinomial distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is vector with a single sample of a multinomial distribution with +## parameter @var{p} or a matrix of random samples from multinomial +## distributions. In the latter case, each row of @var{x} is a sample from a +## multinomial distribution with the corresponding row of @var{p} being its +## parameter. +## +## @item +## @var{p} is a vector with the probabilities of the categories or a matrix +## with each row containing the probabilities of a multinomial sample. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{y} is a vector of probabilites of the random samples @var{x} from the +## multinomial distribution with corresponding parameter @var{p}. The parameter +## @var{n} of the multinomial distribution is the sum of the elements of each +## row of @var{x}. The length of @var{y} is the number of columns of @var{x}. +## If a row of @var{p} does not sum to @code{1}, then the corresponding element +## of @var{y} will be @code{NaN}. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = [1, 4, 2]; +## p = [0.2, 0.5, 0.3]; +## y = mnpdf (x, p); +## @end group +## +## @group +## x = [1, 4, 2; 1, 0, 9]; +## p = [0.2, 0.5, 0.3; 0.1, 0.1, 0.8]; +## y = mnpdf (x, p); +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, 2001. +## +## @item +## Merran Evans, Nicholas Hastings and Brian Peacock. @cite{Statistical +## Distributions}. pages 134-136, Wiley, New York, third edition, 2000. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: PDF of the multinomial distribution + +function y = mnpdf (x, p) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! ismatrix (x) || any (x(:) < 0 | round (x(:) != x(:)))) + error ("mnpdf: x must be a matrix of non-negative integer values"); + endif + if (! ismatrix (p) || any (p(:) < 0)) + error ("mnpdf: p must be a non-empty matrix with rows of probabilities"); + endif + + # Adjust input sizes + if (! isvector (x) || ! isvector (p)) + if (isvector (x)) + x = x(:)'; + endif + if (isvector (p)) + p = p(:)'; + endif + if (size (x, 1) == 1 && size (p, 1) > 1) + x = repmat (x, size (p, 1), 1); + elseif (size (x, 1) > 1 && size (p, 1) == 1) + p = repmat (p, size (x, 1), 1); + endif + endif + # Continue argument check + if (any (size (x) != size (p))) + error ("mnpdf: x and p must have compatible sizes"); + endif + + # Count total number of elements of each multinomial sample + n = sum (x, 2); + # Compute probability density function of the multinomial distribution + t = x .* log (p); + t(x == 0) = 0; + y = exp (gammaln (n+1) - sum (gammaln (x+1), 2) + sum (t, 2)); + # Set invalid rows to NaN + k = (abs (sum (p, 2) - 1) > 1e-6); + y(k) = NaN; + +endfunction + +%!test +%! x = [1, 4, 2]; +%! p = [0.2, 0.5, 0.3]; +%! y = mnpdf (x, p); +%! assert (y, 0.11812, 0.001); + +%!test +%! x = [1, 4, 2; 1, 0, 9]; +%! p = [0.2, 0.5, 0.3; 0.1, 0.1, 0.8]; +%! y = mnpdf (x, p); +%! assert (y, [0.11812; 0.13422], 0.001); diff --git a/inst/mnrnd.m b/inst/mnrnd.m new file mode 100644 index 0000000..1332faf --- /dev/null +++ b/inst/mnrnd.m @@ -0,0 +1,184 @@ +## Copyright (C) 2012 Arno Onken +## +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{x} =} mnrnd (@var{n}, @var{p}) +## @deftypefnx {Function File} {@var{x} =} mnrnd (@var{n}, @var{p}, @var{s}) +## Generate random samples from the multinomial distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{n} is the first parameter of the multinomial distribution. @var{n} can +## be scalar or a vector containing the number of trials of each multinomial +## sample. The elements of @var{n} must be non-negative integers. +## +## @item +## @var{p} is the second parameter of the multinomial distribution. @var{p} can +## be a vector with the probabilities of the categories or a matrix with each +## row containing the probabilities of a multinomial sample. If @var{p} has +## more than one row and @var{n} is non-scalar, then the number of rows of +## @var{p} must match the number of elements of @var{n}. +## +## @item +## @var{s} is the number of multinomial samples to be generated. @var{s} must +## be a non-negative integer. If @var{s} is specified, then @var{n} must be +## scalar and @var{p} must be a vector. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{x} is a matrix of random samples from the multinomial distribution with +## corresponding parameters @var{n} and @var{p}. Each row corresponds to one +## multinomial sample. The number of columns, therefore, corresponds to the +## number of columns of @var{p}. If @var{s} is not specified, then the number +## of rows of @var{x} is the maximum of the number of elements of @var{n} and +## the number of rows of @var{p}. If a row of @var{p} does not sum to @code{1}, +## then the corresponding row of @var{x} will contain only @code{NaN} values. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## n = 10; +## p = [0.2, 0.5, 0.3]; +## x = mnrnd (n, p); +## @end group +## +## @group +## n = 10 * ones (3, 1); +## p = [0.2, 0.5, 0.3]; +## x = mnrnd (n, p); +## @end group +## +## @group +## n = (1:2)'; +## p = [0.2, 0.5, 0.3; 0.1, 0.1, 0.8]; +## x = mnrnd (n, p); +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, 2001. +## +## @item +## Merran Evans, Nicholas Hastings and Brian Peacock. @cite{Statistical +## Distributions}. pages 134-136, Wiley, New York, third edition, 2000. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Random samples from the multinomial distribution + +function x = mnrnd (n, p, s) + + # Check arguments + if (nargin == 3) + if (! isscalar (n) || n < 0 || round (n) != n) + error ("mnrnd: n must be a non-negative integer"); + endif + if (! isvector (p) || any (p < 0 | p > 1)) + error ("mnrnd: p must be a vector of probabilities"); + endif + if (! isscalar (s) || s < 0 || round (s) != s) + error ("mnrnd: s must be a non-negative integer"); + endif + elseif (nargin == 2) + if (isvector (p) && size (p, 1) > 1) + p = p'; + endif + if (! isvector (n) || any (n < 0 | round (n) != n) || size (n, 2) > 1) + error ("mnrnd: n must be a non-negative integer column vector"); + endif + if (! ismatrix (p) || isempty (p) || any (p < 0 | p > 1)) + error ("mnrnd: p must be a non-empty matrix with rows of probabilities"); + endif + if (! isscalar (n) && size (p, 1) > 1 && length (n) != size (p, 1)) + error ("mnrnd: the length of n must match the number of rows of p"); + endif + else + print_usage (); + endif + + # Adjust input sizes + if (nargin == 3) + n = n * ones (s, 1); + p = repmat (p(:)', s, 1); + elseif (nargin == 2) + if (isscalar (n) && size (p, 1) > 1) + n = n * ones (size (p, 1), 1); + elseif (size (p, 1) == 1) + p = repmat (p, length (n), 1); + endif + endif + sz = size (p); + + # Upper bounds of categories + ub = cumsum (p, 2); + # Make sure that the greatest upper bound is 1 + gub = ub(:, end); + ub(:, end) = 1; + # Lower bounds of categories + lb = [zeros(sz(1), 1) ub(:, 1:(end-1))]; + + # Draw multinomial samples + x = zeros (sz); + for i = 1:sz(1) + # Draw uniform random numbers + r = repmat (rand (n(i), 1), 1, sz(2)); + # Compare the random numbers of r to the cumulated probabilities of p and + # count the number of samples for each category + x(i, :) = sum (r <= repmat (ub(i, :), n(i), 1) & r > repmat (lb(i, :), n(i), 1), 1); + endfor + # Set invalid rows to NaN + k = (abs (gub - 1) > 1e-6); + x(k, :) = NaN; + +endfunction + +%!test +%! n = 10; +%! p = [0.2, 0.5, 0.3]; +%! x = mnrnd (n, p); +%! assert (size (x), size (p)); +%! assert (all (x >= 0)); +%! assert (all (round (x) == x)); +%! assert (sum (x) == n); + +%!test +%! n = 10 * ones (3, 1); +%! p = [0.2, 0.5, 0.3]; +%! x = mnrnd (n, p); +%! assert (size (x), [length(n), length(p)]); +%! assert (all (x >= 0)); +%! assert (all (round (x) == x)); +%! assert (all (sum (x, 2) == n)); + +%!test +%! n = (1:2)'; +%! p = [0.2, 0.5, 0.3; 0.1, 0.1, 0.8]; +%! x = mnrnd (n, p); +%! assert (size (x), size (p)); +%! assert (all (x >= 0)); +%! assert (all (round (x) == x)); +%! assert (all (sum (x, 2) == n)); diff --git a/inst/monotone_smooth.m b/inst/monotone_smooth.m new file mode 100644 index 0000000..59f199f --- /dev/null +++ b/inst/monotone_smooth.m @@ -0,0 +1,162 @@ +## Copyright (C) 2011 Nir Krakauer +## Copyright (C) 2011 Carnë Draug +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{yy} =} monotone_smooth (@var{x}, @var{y}, @var{h}) +## Produce a smooth monotone increasing approximation to a sampled functional +## dependence +## +## A kernel method is used (an Epanechnikov smoothing kernel is +## applied to y(x); this is integrated to yield the monotone increasing form. +## See Reference 1 for details.) +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is a vector of values of the independent variable. +## +## @item +## @var{y} is a vector of values of the dependent variable, of the same size as +## @var{x}. For best performance, it is recommended that the @var{y} already be +## fairly smooth, e.g. by applying a kernel smoothing to the original values if +## they are noisy. +## +## @item +## @var{h} is the kernel bandwidth to use. If @var{h} is not given, a "reasonable" +## value is computed. +## +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{yy} is the vector of smooth monotone increasing function values at @var{x}. +## +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = 0:0.1:10; +## y = (x .^ 2) + 3 * randn(size(x)); %typically non-monotonic from the added noise +## ys = ([y(1) y(1:(end-1))] + y + [y(2:end) y(end)])/3; %crudely smoothed via +## moving average, but still typically non-monotonic +## yy = monotone_smooth(x, ys); %yy is monotone increasing in x +## plot(x, y, '+', x, ys, x, yy) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Holger Dette, Natalie Neumeyer and Kay F. Pilz (2006), A simple nonparametric +## estimator of a strictly monotone regression function, @cite{Bernoulli}, 12:469-490 +## @item +## Regine Scheder (2007), R Package 'monoProc', Version 1.0-6, +## @url{http://cran.r-project.org/web/packages/monoProc/monoProc.pdf} (The +## implementation here is based on the monoProc function mono.1d) +## @end enumerate +## @end deftypefn + +## Author: Nir Krakauer +## Description: Nonparametric monotone increasing regression + +function yy = monotone_smooth (x, y, h) + + if (nargin < 2 || nargin > 3) + print_usage (); + elseif (!isnumeric (x) || !isvector (x)) + error ("first argument x must be a numeric vector") + elseif (!isnumeric (y) || !isvector (y)) + error ("second argument y must be a numeric vector") + elseif (numel (x) != numel (y)) + error ("x and y must have the same number of elements") + elseif (nargin == 3 && (!isscalar (h) || !isnumeric (h))) + error ("third argument 'h' (kernel bandwith) must a numeric scalar") + endif + + n = numel(x); + + %set filter bandwidth at a reasonable default value, if not specified + if (nargin != 3) + s = std(x); + h = s / (n^0.2); + end + + x_min = min(x); + x_max = max(x); + + y_min = min(y); + y_max = max(y); + + %transform range of x to [0, 1] + xl = (x - x_min) / (x_max - x_min); + + yy = ones(size(y)); + + %Epanechnikov smoothing kernel (with finite support) + %K_epanech_kernel = @(z) (3/4) * ((1 - z).^2) .* (abs(z) < 1); + + K_epanech_int = @(z) mean(((abs(z) < 1)/2) - (3/4) * (z .* (abs(z) < 1) - (1/3) * (z.^3) .* (abs(z) < 1)) + (z < -1)); + + %integral of kernels up to t + monotone_inverse = @(t) K_epanech_int((y - t) / h); + + %find the value of the monotone smooth function at each point in x + niter_max = 150; %maximum number of iterations for estimating each value (should not be reached in most cases) + for l = 1:n + + tmax = y_max; + tmin = y_min; + wmin = monotone_inverse(tmin); + wmax = monotone_inverse(tmax); + if (wmax == wmin) + yy(l) = tmin; + else + wt = xl(l); + iter_max_reached = 1; + for i = 1:niter_max + wt_scaled = (wt - wmin) / (wmax - wmin); + tn = tmin + wt_scaled * (tmax - tmin) ; + wn = monotone_inverse(tn); + wn_scaled = (wn - wmin) / (wmax - wmin); + + %if (abs(wt-wn) < 1E-4) || (tn < (y_min-0.1)) || (tn > (y_max+0.1)) + %% criterion for break in the R code -- replaced by the following line to + %% hopefully be less dependent on the scale of y + if (abs(wt_scaled-wn_scaled) < 1E-4) || (wt_scaled < -0.1) || (wt_scaled > 1.1) + iter_max_reached = 0; + break + endif + if wn > wt + tmax = tn; + wmax = wn; + else + tmin = tn; + wmin = wn; + endif + endfor + if iter_max_reached + warning("at x = %g, maximum number of iterations %d reached without convergence; approximation may not be optimal", x(l), niter_max) + endif + yy(l) = tmin + (wt - wmin) * (tmax - tmin) / (wmax - wmin); + endif + endfor +endfunction diff --git a/inst/mvncdf.m b/inst/mvncdf.m new file mode 100644 index 0000000..864badc --- /dev/null +++ b/inst/mvncdf.m @@ -0,0 +1,173 @@ +## Copyright (C) 2008 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} mvncdf (@var{x}, @var{mu}, @var{sigma}) +## @deftypefnx {Function File} {} mvncdf (@var{a}, @var{x}, @var{mu}, @var{sigma}) +## @deftypefnx {Function File} {[@var{p}, @var{err}] =} mvncdf (@dots{}) +## Compute the cumulative distribution function of the multivariate +## normal distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is the upper limit for integration where each row corresponds +## to an observation. +## +## @item +## @var{mu} is the mean. +## +## @item +## @var{sigma} is the correlation matrix. +## +## @item +## @var{a} is the lower limit for integration where each row corresponds +## to an observation. @var{a} must have the same size as @var{x}. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{p} is the cumulative distribution at each row of @var{x} and +## @var{a}. +## +## @item +## @var{err} is the estimated error. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = [1 2]; +## mu = [0.5 1.5]; +## sigma = [1.0 0.5; 0.5 1.0]; +## p = mvncdf (x, mu, sigma) +## @end group +## +## @group +## a = [-inf 0]; +## p = mvncdf (a, x, mu, sigma) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Alan Genz and Frank Bretz. Numerical Computation of Multivariate +## t-Probabilities with Application to Power Calculation of Multiple +## Constrasts. @cite{Journal of Statistical Computation and Simulation}, +## 63, pages 361-378, 1999. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: CDF of the multivariate normal distribution + +function [p, err] = mvncdf (varargin) + + # Monte-Carlo confidence factor for the standard error: 99 % + gamma = 2.5; + # Tolerance + err_eps = 1e-3; + + if (length (varargin) == 1) + x = varargin{1}; + mu = []; + sigma = eye (size (x, 2)); + a = -Inf .* ones (size (x)); + elseif (length (varargin) == 3) + x = varargin{1}; + mu = varargin{2}; + sigma = varargin{3}; + a = -Inf .* ones (size (x)); + elseif (length (varargin) == 4) + a = varargin{1}; + x = varargin{2}; + mu = varargin{3}; + sigma = varargin{4}; + else + print_usage (); + endif + + # Dimension + q = size (sigma, 1); + cases = size (x, 1); + + # Default value for mu + if (isempty (mu)) + mu = zeros (1, q); + endif + + # Check parameters + if (size (x, 2) != q) + error ("mvncdf: x must have the same number of columns as sigma"); + endif + + if (any (size (x) != size (a))) + error ("mvncdf: a must have the same size as x"); + endif + + if (isscalar (mu)) + mu = ones (1, q) .* mu; + elseif (! isvector (mu) || size (mu, 2) != q) + error ("mvncdf: mu must be a scalar or a vector with the same number of columns as x"); + endif + + x = x - repmat (mu, cases, 1); + + if (q < 1 || size (sigma, 2) != q || any (any (sigma != sigma')) || min (eig (sigma)) <= 0) + error ("mvncdf: sigma must be nonempty symmetric positive definite"); + endif + + c = chol (sigma)'; + + # Number of integral transformations + n = 1; + + p = zeros (cases, 1); + varsum = zeros (cases, 1); + + err = ones (cases, 1) .* err_eps; + # Apply crude Monte-Carlo estimation + while any (err >= err_eps) + # Sample from q-1 dimensional unit hypercube + w = rand (cases, q - 1); + + # Transformation of the multivariate normal integral + dvev = normcdf ([a(:, 1) / c(1, 1), x(:, 1) / c(1, 1)]); + dv = dvev(:, 1); + ev = dvev(:, 2); + fv = ev - dv; + y = zeros (cases, q - 1); + for i = 1:(q - 1) + y(:, i) = norminv (dv + w(:, i) .* (ev - dv)); + dvev = normcdf ([(a(:, i + 1) - c(i + 1, 1:i) .* y(:, 1:i)) ./ c(i + 1, i + 1), (x(:, i + 1) - c(i + 1, 1:i) .* y(:, 1:i)) ./ c(i + 1, i + 1)]); + dv = dvev(:, 1); + ev = dvev(:, 2); + fv = (ev - dv) .* fv; + endfor + + n++; + # Estimate standard error + varsum += (n - 1) .* ((fv - p) .^ 2) ./ n; + err = gamma .* sqrt (varsum ./ (n .* (n - 1))); + p += (fv - p) ./ n; + endwhile + +endfunction diff --git a/inst/mvnpdf.m b/inst/mvnpdf.m new file mode 100644 index 0000000..2d8b2e2 --- /dev/null +++ b/inst/mvnpdf.m @@ -0,0 +1,132 @@ +## Author: Paul Kienzle +## This program is granted to the public domain. + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{y} =} mvnpdf (@var{x}) +## @deftypefnx{Function File} {@var{y} =} mvnpdf (@var{x}, @var{mu}) +## @deftypefnx{Function File} {@var{y} =} mvnpdf (@var{x}, @var{mu}, @var{sigma}) +## Compute multivariate normal pdf for @var{x} given mean @var{mu} and covariance matrix +## @var{sigma}. The dimension of @var{x} is @var{d} x @var{p}, @var{mu} is +## @var{1} x @var{p} and @var{sigma} is @var{p} x @var{p}. The normal pdf is +## defined as +## +## @example +## @iftex +## @tex +## $$ 1/y^2 = (2 pi)^p |\Sigma| \exp \{ (x-\mu)^T \Sigma^{-1} (x-\mu) \} $$ +## @end tex +## @end iftex +## @ifnottex +## 1/@var{y}^2 = (2 pi)^@var{p} |@var{Sigma}| exp @{ (@var{x}-@var{mu})' inv(@var{Sigma})@ +## (@var{x}-@var{mu}) @} +## @end ifnottex +## @end example +## +## @strong{References} +## +## NIST Engineering Statistics Handbook 6.5.4.2 +## http://www.itl.nist.gov/div898/handbook/pmc/section5/pmc542.htm +## +## @strong{Algorithm} +## +## Using Cholesky factorization on the positive definite covariance matrix: +## +## @example +## @var{r} = chol (@var{sigma}); +## @end example +## +## where @var{r}'*@var{r} = @var{sigma}. Being upper triangular, the determinant +## of @var{r} is trivially the product of the diagonal, and the determinant of +## @var{sigma} is the square of this: +## +## @example +## @var{det} = prod (diag (@var{r}))^2; +## @end example +## +## The formula asks for the square root of the determinant, so no need to +## square it. +## +## The exponential argument @var{A} = @var{x}' * inv (@var{sigma}) * @var{x} +## +## @example +## @var{A} = @var{x}' * inv (@var{sigma}) * @var{x} +## = @var{x}' * inv (@var{r}' * @var{r}) * @var{x} +## = @var{x}' * inv (@var{r}) * inv(@var{r}') * @var{x} +## @end example +## +## Given that inv (@var{r}') == inv(@var{r})', at least in theory if not numerically, +## +## @example +## @var{A} = (@var{x}' / @var{r}) * (@var{x}'/@var{r})' = sumsq (@var{x}'/@var{r}) +## @end example +## +## The interface takes the parameters to the multivariate normal in columns rather than +## rows, so we are actually dealing with the transpose: +## +## @example +## @var{A} = sumsq (@var{x}/r) +## @end example +## +## and the final result is: +## +## @example +## @var{r} = chol (@var{sigma}) +## @var{y} = (2*pi)^(-@var{p}/2) * exp (-sumsq ((@var{x}-@var{mu})/@var{r}, 2)/2) / prod (diag (@var{r})) +## @end example +## +## @seealso{mvncdf, mvnrnd} +## @end deftypefn + +function pdf = mvnpdf (x, mu = 0, sigma = 1) + ## Check input + if (!ismatrix (x)) + error ("mvnpdf: first input must be a matrix"); + endif + + if (!isvector (mu) && !isscalar (mu)) + error ("mvnpdf: second input must be a real scalar or vector"); + endif + + if (!ismatrix (sigma) || !issquare (sigma)) + error ("mvnpdf: third input must be a square matrix"); + endif + + [ps, ps] = size (sigma); + [d, p] = size (x); + if (p != ps) + error ("mvnpdf: dimensions of data and covariance matrix does not match"); + endif + + if (numel (mu) != p && numel (mu) != 1) + error ("mvnpdf: dimensions of data does not match dimensions of mean value"); + endif + + mu = mu (:).'; + if (all (size (mu) == [1, p])) + mu = repmat (mu, [d, 1]); + endif + + if (nargin < 3) + pdf = (2*pi)^(-p/2) * exp (-sumsq (x-mu, 2)/2); + else + r = chol (sigma); + pdf = (2*pi)^(-p/2) * exp (-sumsq ((x-mu)/r, 2)/2) / prod (diag (r)); + endif +endfunction + +%!demo +%! mu = [0, 0]; +%! sigma = [1, 0.1; 0.1, 0.5]; +%! [X, Y] = meshgrid (linspace (-3, 3, 25)); +%! XY = [X(:), Y(:)]; +%! Z = mvnpdf (XY, mu, sigma); +%! mesh (X, Y, reshape (Z, size (X))); +%! colormap jet + +%!test +%! mu = [1,-1]; +%! sigma = [.9 .4; .4 .3]; +%! x = [ 0.5 -1.2; -0.5 -1.4; 0 -1.5]; +%! p = [ 0.41680003660313; 0.10278162359708; 0.27187267524566 ]; +%! q = mvnpdf (x, mu, sigma); +%! assert (p, q, 10*eps); diff --git a/inst/mvnrnd.m b/inst/mvnrnd.m new file mode 100644 index 0000000..a39231e --- /dev/null +++ b/inst/mvnrnd.m @@ -0,0 +1,140 @@ +## Copyright (C) 2003 Iain Murray +## +## This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} @var{s} = mvnrnd (@var{mu}, @var{Sigma}) +## @deftypefnx{Function File} @var{s} = mvnrnd (@var{mu}, @var{Sigma}, @var{n}) +## @deftypefnx{Function File} @var{s} = mvnrnd (@dots{}, @var{tol}) +## Draw @var{n} random @var{d}-dimensional vectors from a multivariate Gaussian distribution with mean @var{mu}(@var{n}x@var{d}) and covariance matrix +## @var{Sigma}(@var{d}x@var{d}). +## +## @var{mu} must be @var{n}-by-@var{d} (or 1-by-@var{d} if @var{n} is given) or a scalar. +## +## If the argument @var{tol} is given the eigenvalues of @var{Sigma} are checked for positivity against -100*tol. The default value of tol is @code{eps*norm (Sigma, "fro")}. +## +## @end deftypefn + +function s = mvnrnd (mu, Sigma, K, tol=eps*norm (Sigma, "fro")) + + % Iain Murray 2003 -- I got sick of this simple thing not being in Octave and locking up a stats-toolbox license in Matlab for no good reason. + % May 2004 take a third arg, cases. Makes it more compatible with Matlab's. + + % Paul Kienzle + % * Add GPL notice. + % * Add docs for argument K + + % 2012 Juan Pablo Carbajal + % * Uses Octave 3.6.2 broadcast. + % * Stabilizes chol by perturbing Sigma with a epsilon multiple of the identity. + % The effect on the generated samples is to add additional independent noise of variance epsilon. Ref: GPML Rasmussen & Williams. 2006. pp 200-201 + % * Improved doc. + % * Added tolerance to the positive definite check + % * Used chol with option 'upper'. + + % 2014 Nir Krakauer + % * Add tests. + % * Allow mu to be scalar, in which case it's assumed that all elements share this mean. + + + %perform some input checking + if ~issquare (Sigma) + error ('Sigma must be a square covariance matrix.'); + end + + d = size(Sigma, 1); + + % If mu is column vector and Sigma not a scalar then assume user didn't read help but let them off and flip mu. Don't be more liberal than this or it will encourage errors (eg what should you do if mu is square?). + if (size (mu, 2) == 1) && (d != 1) + mu = mu'; + end + + if nargin >= 3 + n = K; + else + n = size(mu, 1); %1 if mu is scalar + end + + if (~isscalar (mu)) && any(size (mu) != [1,d]) && any(size (mu) != [n,d]) + error ('mu must be nxd, 1xd, or scalar, where Sigma has dimensions dxd.'); + end + + warning ("off", "Octave:broadcast","local"); + + try + U = chol (Sigma + tol*eye (d),"upper"); + catch + [E , Lambda] = eig (Sigma); + + if min (diag (Lambda)) < -100*tol + error('Sigma must be positive semi-definite. Lowest eigenvalue %g', ... + min (diag (Lambda))); + else + Lambda(Lambda<0) = 0; + end + warning ("mvnrnd:InvalidInput","Cholesky factorization failed. Using diagonalized matrix.") + U = sqrt (Lambda) * E'; + end + + s = randn(n,d)*U + mu; + + warning ("on", "Octave:broadcast"); +endfunction + +% {{{ END OF CODE --- Guess I should provide an explanation: +% +% We can draw from axis aligned unit Gaussians with randn(d) +% x ~ A*exp(-0.5*x'*x) +% We can then rotate this distribution using +% y = U'*x +% Note that +% x = inv(U')*y +% Our new variable y is distributed according to: +% y ~ B*exp(-0.5*y'*inv(U'*U)*y) +% or +% y ~ N(0,Sigma) +% where +% Sigma = U'*U +% For a given Sigma we can use the chol function to find the corresponding U, +% draw x and find y. We can adjust for a non-zero mean by just adding it on. +% +% But the Cholsky decomposition function doesn't always work... +% Consider Sigma=[1 1;1 1]. Now inv(Sigma) doesn't actually exist, but Matlab's +% mvnrnd provides samples with this covariance st x(1)~N(0,1) x(2)=x(1). The +% fast way to deal with this would do something similar to chol but be clever +% when the rows aren't linearly independent. However, I can't be bothered, so +% another way of doing the decomposition is by diagonalising Sigma (which is +% slower but works). +% if +% [E,Lambda]=eig(Sigma) +% then +% Sigma = E*Lambda*E' +% so +% U = sqrt(Lambda)*E' +% If any Lambdas are negative then Sigma just isn't even positive semi-definite +% so we can give up. +% +% Paul Kienzle adds: +% Where it exists, chol(Sigma) is numerically well behaved. chol(hilb(12)) for doubles and for 100 digit floating point differ in the last digit. +% Where chol(Sigma) doesn't exist, X*sqrt(Lambda)*E' will be somewhat accurate. For example, the elements of sqrt(Lambda)*E' for hilb(12), hilb(55) and hilb(120) are accurate to around 1e-8 or better. This was tested using the TNT+JAMA for eig and chol templates, and qlib for 100 digit precision. +% }}} + +%!shared m, n, C, rho +%! m = 10; n = 3; rho = 0.4; C = rho*ones(n, n) + (1 - rho)*eye(n); +%!assert(size(mvnrnd(0, C, m)), [m n]) +%!assert(size(mvnrnd(zeros(1, n), C, m)), [m n]) +%!assert(size(mvnrnd(zeros(n, 1), C, m)), [m n]) +%!assert(size(mvnrnd(zeros(m, n), C, m)), [m n]) +%!assert(size(mvnrnd(zeros(m, n), C)), [m n]) +%!assert(size(mvnrnd(zeros(1, n), C)), [1 n]) +%!assert(size(mvnrnd(zeros(n, 1), C)), [1 n]) +%!error(mvnrnd(zeros(m+1, n), C, m)) +%!error(mvnrnd(zeros(1, n+1), C, m)) +%!error(mvnrnd(zeros(n+1, 1), C, m)) +%!error(mvnrnd(zeros(m, n), eye(n+1), m)) +%!error(mvnrnd(zeros(m, n), eye(n+1, n), m)) + diff --git a/inst/mvtcdf.m b/inst/mvtcdf.m new file mode 100644 index 0000000..8a271a1 --- /dev/null +++ b/inst/mvtcdf.m @@ -0,0 +1,166 @@ +## Copyright (C) 2008 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} mvtcdf (@var{x}, @var{sigma}, @var{nu}) +## @deftypefnx {Function File} {} mvtcdf (@var{a}, @var{x}, @var{sigma}, @var{nu}) +## @deftypefnx {Function File} {[@var{p}, @var{err}] =} mvtcdf (@dots{}) +## Compute the cumulative distribution function of the multivariate +## Student's t distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is the upper limit for integration where each row corresponds +## to an observation. +## +## @item +## @var{sigma} is the correlation matrix. +## +## @item +## @var{nu} is the degrees of freedom. +## +## @item +## @var{a} is the lower limit for integration where each row corresponds +## to an observation. @var{a} must have the same size as @var{x}. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{p} is the cumulative distribution at each row of @var{x} and +## @var{a}. +## +## @item +## @var{err} is the estimated error. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = [1 2]; +## sigma = [1.0 0.5; 0.5 1.0]; +## nu = 4; +## p = mvtcdf (x, sigma, nu) +## @end group +## +## @group +## a = [-inf 0]; +## p = mvtcdf (a, x, sigma, nu) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Alan Genz and Frank Bretz. Numerical Computation of Multivariate +## t-Probabilities with Application to Power Calculation of Multiple +## Constrasts. @cite{Journal of Statistical Computation and Simulation}, +## 63, pages 361-378, 1999. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: CDF of the multivariate Student's t distribution + +function [p, err] = mvtcdf (varargin) + + # Monte-Carlo confidence factor for the standard error: 99 % + gamma = 2.5; + # Tolerance + err_eps = 1e-3; + + if (length (varargin) == 3) + x = varargin{1}; + sigma = varargin{2}; + nu = varargin{3}; + a = -Inf .* ones (size (x)); + elseif (length (varargin) == 4) + a = varargin{1}; + x = varargin{2}; + sigma = varargin{3}; + nu = varargin{4}; + else + print_usage (); + endif + + # Dimension + q = size (sigma, 1); + cases = size (x, 1); + + # Check parameters + if (size (x, 2) != q) + error ("mvtcdf: x must have the same number of columns as sigma"); + endif + + if (any (size (x) != size (a))) + error ("mvtcdf: a must have the same size as x"); + endif + + if (! isscalar (nu) && (! isvector (nu) || length (nu) != cases)) + error ("mvtcdf: nu must be a scalar or a vector with the same number of rows as x"); + endif + + # Convert to correlation matrix if necessary + if (any (diag (sigma) != 1)) + svar = repmat (diag (sigma), 1, q); + sigma = sigma ./ sqrt (svar .* svar'); + endif + if (q < 1 || size (sigma, 2) != q || any (any (sigma != sigma')) || min (eig (sigma)) <= 0) + error ("mvtcdf: sigma must be nonempty symmetric positive definite"); + endif + + nu = nu(:); + c = chol (sigma)'; + + # Number of integral transformations + n = 1; + + p = zeros (cases, 1); + varsum = zeros (cases, 1); + + err = ones (cases, 1) .* err_eps; + # Apply crude Monte-Carlo estimation + while any (err >= err_eps) + # Sample from q-1 dimensional unit hypercube + w = rand (cases, q - 1); + + # Transformation of the multivariate t-integral + dvev = tcdf ([a(:, 1) / c(1, 1), x(:, 1) / c(1, 1)], nu); + dv = dvev(:, 1); + ev = dvev(:, 2); + fv = ev - dv; + y = zeros (cases, q - 1); + for i = 1:(q - 1) + y(:, i) = tinv (dv + w(:, i) .* (ev - dv), nu + i - 1) .* sqrt ((nu + sum (y(:, 1:(i-1)) .^ 2, 2)) ./ (nu + i - 1)); + tf = (sqrt ((nu + i) ./ (nu + sum (y(:, 1:i) .^ 2, 2)))) ./ c(i + 1, i + 1); + dvev = tcdf ([(a(:, i + 1) - c(i + 1, 1:i) .* y(:, 1:i)) .* tf, (x(:, i + 1) - c(i + 1, 1:i) .* y(:, 1:i)) .* tf], nu + i); + dv = dvev(:, 1); + ev = dvev(:, 2); + fv = (ev - dv) .* fv; + endfor + + n++; + # Estimate standard error + varsum += (n - 1) .* ((fv - p) .^ 2) ./ n; + err = gamma .* sqrt (varsum ./ (n .* (n - 1))); + p += (fv - p) ./ n; + endwhile + +endfunction diff --git a/inst/mvtpdf.m b/inst/mvtpdf.m new file mode 100644 index 0000000..711836f --- /dev/null +++ b/inst/mvtpdf.m @@ -0,0 +1,105 @@ +## Copyright (C) 2015 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} mvtpdf (@var{x}, @var{sigma}, @var{nu}) +## Compute the probability density function of the multivariate Student's t distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} are the points at which to find the probability, where each row corresponds +## to an observation. (@var{n} by @var{d} matrix) +## +## @item +## @var{sigma} is the scale matrix. (@var{d} by @var{d} symmetric positive definite matrix) +## +## @item +## @var{nu} is the degrees of freedom. (scalar or @var{n} vector) +## +## @end itemize +## +## The distribution is assumed to be centered (zero mean). +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{p} is the probability density for each row of @var{x}. (@var{n} by 1 vector) +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = [1 2]; +## sigma = [1.0 0.5; 0.5 1.0]; +## nu = 4; +## p = mvtpdf (x, sigma, nu) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Michael Roth, On the Multivariate t Distribution, Technical report from Automatic Control at Linkoepings universitet, @url{http://users.isy.liu.se/en/rt/roth/student.pdf} +## @end enumerate +## @end deftypefn + +## Author: Nir Krakauer +## Description: PDF of the multivariate Student's t distribution + +function [p] = mvtpdf (x, sigma, nu) + + if (nargin != 3) + print_usage (); + endif + + # Dimensions + d = size (sigma, 1); + n = size (x, 1); + + # Check parameters + if (size (x, 2) != d) + error ("mvtpdf: x must have the same number of columns as sigma"); + endif + if (! isscalar (nu) && (! isvector (nu) || numel (nu) != n)) + error ("mvtpdf: nu must be a scalar or a vector with the same number of rows as x"); + endif + if (d < 1 || size (sigma, 2) != d || ! issymmetric (sigma)) + error ("mvtpdf: sigma must be nonempty and symmetric"); + endif + + try + U = chol (sigma); + catch + error ("mvtpdf: sigma must be positive definite"); + end_try_catch + + nu = nu(:); + sqrt_det_sigma = prod(diag(U)); #square root of determinant of sigma + + c = (gamma((nu+d)/2) ./ gamma(nu/2)) ./ (sqrt_det_sigma * (nu*pi).^(d/2)); #scale factor for PDF + p = c ./ ((1 + sumsq(U' \ x') ./ nu') .^ ((nu' + d)/2))'; #note: sumsq(U' \ x') is equivalent to the quadratic form x*inv(sigma)*x' + + +endfunction + +#test results verified with R mvtnorm package dmvt function +%!assert (mvtpdf ([0 0], eye(2), 1), 0.1591549, 1E-7) #dmvt(x = c(0,0), sigma = diag(2), log = FALSE) +%!assert (mvtpdf ([1 0], [1 0.5; 0.5 1], 2), 0.06615947, 1E-7) #dmvt(x = c(1,0), sigma = matrix(c(1, 0.5, 0.5, 1), nrow=2, ncol=2), df = 2, log = FALSE) +%!assert (mvtpdf ([1 0.4 0; 1.2 0.5 0.5; 1.4 0.6 1], [1 0.5 0.3; 0.5 1 0.6; 0.3 0.6 1], [5 6 7]), [0.04713313 0.03722421 0.02069011]', 1E-7) #dmvt(x = c(1,0.4,0), sigma = matrix(c(1, 0.5, 0.3, 0.5, 1, 0.6, 0.3, 0.6, 1), nrow=3, ncol=3), df = 5, log = FALSE); dmvt(x = c(1.2,0.5,0.5), sigma = matrix(c(1, 0.5, 0.3, 0.5, 1, 0.6, 0.3, 0.6, 1), nrow=3, ncol=3), df = 6, log = FALSE); dmvt(x = c(1.4,0.6,1), sigma = matrix(c(1, 0.5, 0.3, 0.5, 1, 0.6, 0.3, 0.6, 1), nrow=3, ncol=3), df = 7, log = FALSE) diff --git a/inst/mvtrnd.m b/inst/mvtrnd.m new file mode 100644 index 0000000..251d2c8 --- /dev/null +++ b/inst/mvtrnd.m @@ -0,0 +1,144 @@ +## Copyright (C) 2012 Arno Onken , Iñigo Urteaga +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{x} =} mvtrnd (@var{sigma}, @var{nu}) +## @deftypefnx {Function File} {@var{x} =} mvtrnd (@var{sigma}, @var{nu}, @var{n}) +## Generate random samples from the multivariate t-distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{sigma} is the matrix of correlation coefficients. If there are any +## non-unit diagonal elements then @var{sigma} will be normalized, so that the +## resulting covariance of the obtained samples @var{x} follows: +## @code{cov (x) = nu/(nu-2) * sigma ./ (sqrt (diag (sigma) * diag (sigma)))}. +## In order to obtain samples distributed according to a standard multivariate +## t-distribution, @var{sigma} must be equal to the identity matrix. To generate +## multivariate t-distribution samples @var{x} with arbitrary covariance matrix +## @var{sigma}, the following scaling might be used: +## @code{x = mvtrnd (sigma, nu, n) * diag (sqrt (diag (sigma)))}. +## +## @item +## @var{nu} is the degrees of freedom for the multivariate t-distribution. +## @var{nu} must be a vector with the same number of elements as samples to be +## generated or be scalar. +## +## @item +## @var{n} is the number of rows of the matrix to be generated. @var{n} must be +## a non-negative integer and corresponds to the number of samples to be +## generated. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{x} is a matrix of random samples from the multivariate t-distribution +## with @var{n} row samples. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## sigma = [1, 0.5; 0.5, 1]; +## nu = 3; +## n = 10; +## x = mvtrnd (sigma, nu, n); +## @end group +## +## @group +## sigma = [1, 0.5; 0.5, 1]; +## nu = [2; 3]; +## n = 2; +## x = mvtrnd (sigma, nu, 2); +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, 2001. +## +## @item +## Samuel Kotz and Saralees Nadarajah. @cite{Multivariate t Distributions and +## Their Applications}. Cambridge University Press, Cambridge, 2004. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Random samples from the multivariate t-distribution + +function x = mvtrnd (sigma, nu, n) + + # Check arguments + if (nargin < 2) + print_usage (); + endif + + if (! ismatrix (sigma) || any (any (sigma != sigma')) || min (eig (sigma)) <= 0) + error ("mvtrnd: sigma must be a positive definite matrix"); + endif + + if (!isvector (nu) || any (nu <= 0)) + error ("mvtrnd: nu must be a positive scalar or vector"); + endif + nu = nu(:); + + if (nargin > 2) + if (! isscalar (n) || n < 0 | round (n) != n) + error ("mvtrnd: n must be a non-negative integer") + endif + if (isscalar (nu)) + nu = nu * ones (n, 1); + else + if (length (nu) != n) + error ("mvtrnd: n must match the length of nu") + endif + endif + else + n = length (nu); + endif + + # Normalize sigma + if (any (diag (sigma) != 1)) + sigma = sigma ./ sqrt (diag (sigma) * diag (sigma)'); + endif + + # Dimension + d = size (sigma, 1); + # Draw samples + y = mvnrnd (zeros (1, d), sigma, n); + u = repmat (chi2rnd (nu), 1, d); + x = y .* sqrt (repmat (nu, 1, d) ./ u); +endfunction + +%!test +%! sigma = [1, 0.5; 0.5, 1]; +%! nu = 3; +%! n = 10; +%! x = mvtrnd (sigma, nu, n); +%! assert (size (x), [10, 2]); + +%!test +%! sigma = [1, 0.5; 0.5, 1]; +%! nu = [2; 3]; +%! n = 2; +%! x = mvtrnd (sigma, nu, 2); +%! assert (size (x), [2, 2]); diff --git a/inst/nakacdf.m b/inst/nakacdf.m new file mode 100644 index 0000000..0685bc7 --- /dev/null +++ b/inst/nakacdf.m @@ -0,0 +1,103 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} nakacdf (@var{x}, @var{m}, @var{w}) +## For each element of @var{x}, compute the cumulative distribution function +## (CDF) at @var{x} of the Nakagami distribution with shape parameter @var{m} +## and scale parameter @var{w}. +## +## @end deftypefn + +## Author: Dag Lyberg +## Description: CDF of the Nakagami distribution + +function cdf = nakacdf (x, m, w) + + if (nargin != 3) + print_usage (); + endif + + if (! isscalar (m) || ! isscalar (w)) + [retval, x, m, w] = common_size (x, m, w); + if (retval > 0) + error ("nakacdf: X, M and W must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (m) || iscomplex (w)) + error ("nakacdf: X, M and W must not be complex"); + endif + + if (isa (x, "single") || isa (m, "single") || isa (w, "single")) + inv = zeros (size (x), "single"); + else + inv = zeros (size (x)); + endif + + k = isnan (x) | ! (m > 0) | ! (w > 0); + cdf(k) = NaN; + + k = (x == Inf) & (0 < m) & (m < Inf) & (0 < w) & (w < Inf); + cdf(k) = 1; + + k = (0 < x) & (x < Inf) & (0 < m) & (m < Inf) & (0 < w) & (w < Inf); + if (isscalar(x) && isscalar (m) && isscalar(w)) + left = m; + right = (m/w) * x^2; + cdf(k) = gammainc(right, left); + elseif (isscalar (m) && isscalar(w)) + left = m * ones(size(x)); + right = (m/w) * x.^2; + cdf(k) = gammainc(right(k), left(k)); + else + left = m .* ones(size(x)); + right = (m./w) .* x.^2; + cdf(k) = gammainc(right(k), left(k)); + endif + +endfunction + + +%!shared x,y +%! x = [-1, 0, 1, 2, Inf]; +%! y = [0, 0, 0.63212055882855778, 0.98168436111126578, 1]; +%!assert (nakacdf (x, ones (1,5), ones (1,5)), y, eps) +%!assert (nakacdf (x, 1, 1), y, eps) +%!assert (nakacdf (x, [1, 1, NaN, 1, 1], 1), [y(1:2), NaN, y(4:5)]) +%!assert (nakacdf (x, 1, [1, 1, NaN, 1, 1]), [y(1:2), NaN, y(4:5)]) +%!assert (nakacdf ([x, NaN], 1, 1), [y, NaN], eps) + +## Test class of input preserved +%!assert (nakacdf (single ([x, NaN]), 1, 1), single ([y, NaN]), eps('single')) +%!assert (nakacdf ([x, NaN], single (1), 1), single ([y, NaN]), eps('single')) +%!assert (nakacdf ([x, NaN], 1, single (1)), single ([y, NaN]), eps('single')) + +## Test input validation +%!error nakacdf () +%!error nakacdf (1) +%!error nakacdf (1,2) +%!error nakacdf (1,2,3,4) +%!error nakacdf (ones (3), ones (2), ones(2)) +%!error nakacdf (ones (2), ones (3), ones(2)) +%!error nakacdf (ones (2), ones (2), ones(3)) +%!error nakacdf (i, 2, 2) +%!error nakacdf (2, i, 2) +%!error nakacdf (2, 2, i) + diff --git a/inst/nakainv.m b/inst/nakainv.m new file mode 100644 index 0000000..777218b --- /dev/null +++ b/inst/nakainv.m @@ -0,0 +1,102 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} nakainv (@var{x}, @var{m}, @var{w}) +## For each element of @var{x}, compute the quantile (the inverse of the CDF) +## at @var{x} of the Nakagami distribution with shape parameter @var{m} and +## scale parameter @var{w}. +## +## @end deftypefn + +## Author: Dag Lyberg +## Description: Quantile function of the Nakagami distribution + +function inv = nakainv (x, m, w) + + if (nargin != 3) + print_usage (); + endif + + if (! isscalar (m) || ! isscalar (w)) + [retval, x, m, w] = common_size (x, m, w); + if (retval > 0) + error ("nakainv: X, M and W must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (m) || iscomplex (w)) + error ("nakainv: X, M, and W must not be complex"); + endif + + if (isa (x, "single") || isa (m, "single") || isa (w, "single")) + inv = zeros (size (x), "single"); + else + inv = zeros (size (x)); + endif + + k = isnan (x) | ! (0 <= x) | ! (x <= 1) | ! (-Inf < m) | ! (m < Inf) ... + | ! (0 < w) | ! (w < Inf); + inv(k) = NaN; + + k = (x == 1) & (-Inf < m) & (m < Inf) & (0 < w) & (w < Inf); + inv(k) = Inf; + + k = (0 < x) & (x < 1) & (0 < m) & (m < Inf) & (0 < w) & (w < Inf); + if (isscalar (m) && isscalar(w)) + m_gamma = m; + w_gamma = w/m; + inv(k) = gaminv(x(k), m_gamma, w_gamma); + inv(k) = sqrt(inv(k)); + else + m_gamma = m; + w_gamma = w./m; + inv(k) = gaminv(x(k), m_gamma(k), w_gamma(k)); + inv(k) = sqrt(inv(k)); + endif + +endfunction + + +%!shared x,y +%! x = [-Inf, -1, 0, 1/2, 1, 2, Inf]; +%! y = [NaN, NaN, 0, 0.83255461115769769, Inf, NaN, NaN]; +%!assert (nakainv (x, ones (1,7), ones (1,7)), y, eps) +%!assert (nakainv (x, 1, 1), y, eps) +%!assert (nakainv (x, [1, 1, 1, NaN, 1, 1, 1], 1), [y(1:3), NaN, y(5:7)], eps) +%!assert (nakainv (x, 1, [1, 1, 1, NaN, 1, 1, 1]), [y(1:3), NaN, y(5:7)], eps) +%!assert (nakainv ([x, NaN], 1, 1), [y, NaN], eps) + +## Test class of input preserved +%!assert (nakainv (single ([x, NaN]), 1, 1), single ([y, NaN])) +%!assert (nakainv ([x, NaN], single (1), 1), single ([y, NaN])) +%!assert (nakainv ([x, NaN], 1, single (1)), single ([y, NaN])) + +## Test input validation +%!error nakainv () +%!error nakainv (1) +%!error nakainv (1,2) +%!error nakainv (1,2,3,4) +%!error nakainv (ones (3), ones (2), ones(2)) +%!error nakainv (ones (2), ones (3), ones(2)) +%!error nakainv (ones (2), ones (2), ones(3)) +%!error nakainv (i, 2, 2) +%!error nakainv (2, i, 2) +%!error nakainv (2, 2, i) + diff --git a/inst/nakapdf.m b/inst/nakapdf.m new file mode 100644 index 0000000..6e01600 --- /dev/null +++ b/inst/nakapdf.m @@ -0,0 +1,95 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} nakapdf (@var{x}, @var{m}, @var{w}) +## For each element of @var{x}, compute the probability density function (PDF) +## at @var{x} of the Nakagami distribution with shape parameter @var{m} and +## scale parameter @var{w}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: PDF of the Nakagami distribution + +function pdf = nakapdf (x, m, w) + + if (nargin != 3) + print_usage (); + endif + + if (! isscalar (m) || ! isscalar (w)) + [retval, x, m, w] = common_size (x, m, w); + if (retval > 0) + error ("nakapdf: X, M and W must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (m) || iscomplex (w)) + error ("nakapdf: X, M and W must not be complex"); + endif + + if (isa (x, "single") || isa (m, "single") || isa (w, "single")) + pdf = zeros (size (x), "single"); + else + pdf = zeros (size (x)); + endif + + k = isnan (x) | ! (m > 0.5) | ! (w > 0); + pdf(k) = NaN; + + k = (0 < x) & (x < Inf) & (0 < m) & (m < Inf) & (0 < w) & (w < Inf); + if (isscalar (m) && isscalar(w)) + pdf(k) = exp (log (2) + m*log (m) - log (gamma (m)) - ... + m*log (w) + (2*m-1) * ... + log (x(k)) - (m/w) * x(k).^2); + else + pdf(k) = exp(log(2) + m(k).*log (m(k)) - log (gamma (m(k))) - ... + m(k).*log (w(k)) + (2*m(k)-1) ... + .* log (x(k)) - (m(k)./w(k)) .* x(k).^2); + endif + +endfunction + + +%!shared x,y +%! x = [-1, 0, 1, 2, Inf]; +%! y = [0, 0, 0.73575888234288467, 0.073262555554936715, 0]; +%!assert (nakapdf (x, ones (1,5), ones (1,5)), y, eps) +%!assert (nakapdf (x, 1, 1), y, eps) +%!assert (nakapdf (x, [1, 1, NaN, 1, 1], 1), [y(1:2), NaN, y(4:5)], eps) +%!assert (nakapdf (x, 1, [1, 1, NaN, 1, 1]), [y(1:2), NaN, y(4:5)], eps) +%!assert (nakapdf ([x, NaN], 1, 1), [y, NaN], eps) + +## Test class of input preserved +%!assert (nakapdf (single ([x, NaN]), 1, 1), single ([y, NaN])) +%!assert (nakapdf ([x, NaN], single (1), 1), single ([y, NaN])) +%!assert (nakapdf ([x, NaN], 1, single (1)), single ([y, NaN])) + +## Test input validation +%!error nakapdf () +%!error nakapdf (1) +%!error nakapdf (1,2) +%!error nakapdf (1,2,3,4) +%!error nakapdf (ones (3), ones (2), ones(2)) +%!error nakapdf (ones (2), ones (3), ones(2)) +%!error nakapdf (ones (2), ones (2), ones(3)) +%!error nakapdf (i, 2, 2) +%!error nakapdf (2, i, 2) +%!error nakapdf (2, 2, i) + diff --git a/inst/nakarnd.m b/inst/nakarnd.m new file mode 100644 index 0000000..d86d0d5 --- /dev/null +++ b/inst/nakarnd.m @@ -0,0 +1,137 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} nakarnd (@var{m}, @var{w}) +## @deftypefnx {} {} nakarnd (@var{m}, @var{w}, @var{r}) +## @deftypefnx {} {} nakarnd (@var{m}, @var{w}, @var{r}, @var{c}, @dots{}) +## @deftypefnx {} {} nakarnd (@var{m}, @var{w}, [@var{sz}]) +## Return a matrix of random samples from the Nakagami distribution with +## shape parameter @var{m} and scale @var{w}. +## +## When called with a single size argument, return a square matrix with +## the dimension specified. When called with more than one scalar argument the +## first two arguments are taken as the number of rows and columns and any +## further arguments specify additional matrix dimensions. The size may also +## be specified with a vector of dimensions @var{sz}. +## +## If no size arguments are given then the result matrix is the common size of +## @var{m} and @var{w}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Random deviates from the Nakagami distribution + +function rnd = nakarnd (m, w, varargin) + + if (nargin < 2) + print_usage (); + endif + + if (! isscalar (m) || ! isscalar (w)) + [retval, m, w] = common_size (m, w); + if (retval > 0) + error ("nakarnd: M and W must be of common size or scalars"); + endif + endif + + if (iscomplex (m) || iscomplex (w)) + error ("nakarnd: M and W must not be complex"); + endif + + if (nargin == 2) + sz = size (m); + elseif (nargin == 3) + if (isscalar (varargin{1}) && varargin{1} >= 0) + sz = [varargin{1}, varargin{1}]; + elseif (isrow (varargin{1}) && all (varargin{1} >= 0)) + sz = varargin{1}; + else + error ("nakarnd: dimension vector must be row vector of non-negative integers"); + endif + elseif (nargin > 3) + if (any (cellfun (@(x) (! isscalar (x) || x < 0), varargin))) + error ("nakarnd: dimensions must be non-negative integers"); + endif + sz = [varargin{:}]; + endif + + if (! isscalar (m) && ! isequal (size (w), sz)) + error ("nakagrnd: M and W must be scalar or of size SZ"); + endif + + if (isa (m, "single") || isa (w, "single")) + cls = "single"; + else + cls = "double"; + endif + + if (isscalar (m) && isscalar (w)) + if ((0 < m) && (m < Inf) && (0 < w) && (w < Inf)) + m_gamma = m; + w_gamma = w/m; + rnd = gamrnd(m_gamma, w_gamma, sz); + rnd = sqrt(rnd); + else + rnd = NaN (sz, cls); + endif + else + rnd = NaN (sz, cls); + k = (0 < m) & (m < Inf) & (0 < w) & (w < Inf); + m_gamma = m; + w_gamma = w./m; + rnd(k) = gamrnd(m_gamma(k), w_gamma(k)); + rnd(k) = sqrt(rnd(k)); + endif + +endfunction + + +%!assert (size (nakarnd (1,1)), [1, 1]) +%!assert (size (nakarnd (ones (2,1), 1)), [2, 1]) +%!assert (size (nakarnd (ones (2,2), 1)), [2, 2]) +%!assert (size (nakarnd (1, ones (2,1))), [2, 1]) +%!assert (size (nakarnd (1, ones (2,2))), [2, 2]) +%!assert (size (nakarnd (1,1, 3)), [3, 3]) +%!assert (size (nakarnd (1,1, [4 1])), [4, 1]) +%!assert (size (nakarnd (1,1, 4, 1)), [4, 1]) + +## Test class of input preserved +%!assert (class (nakarnd (1,1)), "double") +%!assert (class (nakarnd (single (1),1)), "single") +%!assert (class (nakarnd (single ([1 1]),1)), "single") +%!assert (class (nakarnd (1,single (1))), "single") +%!assert (class (nakarnd (1,single ([1 1]))), "single") + +## Test input validation +%!error nakarnd () +%!error nakarnd (1) +%!error nakarnd (zeros (3), ones (2)) +%!error nakarnd (zeros (2), ones (3)) +%!error nakarnd (i, 2) +%!error nakarnd (1, i) +%!error nakarnd (1,2, -1) +%!error nakarnd (1,2, ones (2)) +%!error nakarnd (1, 2, [2 -1 2]) +%!error nakarnd (1,2, 1, ones (2)) +%!error nakarnd (1,2, 1, -1) +%!error nakarnd (ones (2,2), 2, 3) +%!error nakarnd (ones (2,2), 2, [3, 2]) +%!error nakarnd (ones (2,2), 2, 2, 3) + diff --git a/inst/nanmax.m b/inst/nanmax.m new file mode 100644 index 0000000..f998dd3 --- /dev/null +++ b/inst/nanmax.m @@ -0,0 +1,53 @@ +## Copyright (C) 2001 Paul Kienzle +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{v}, @var{idx}] =} nanmax (@var{X}) +## @deftypefnx{Function File} {[@var{v}, @var{idx}] =} nanmax (@var{X}, @var{Y}) +## Find the maximal element while ignoring NaN values. +## +## @code{nanmax} is identical to the @code{max} function except that NaN values +## are ignored. If all values in a column are NaN, the maximum is +## returned as NaN rather than []. +## +## @seealso{max, nansum, nanmin, nanmean, nanmedian} +## @end deftypefn + +function [v, idx] = nanmax (X, Y, DIM) + if nargin < 1 || nargin > 3 + print_usage; + elseif nargin == 1 || (nargin == 2 && isempty(Y)) + nanvals = isnan(X); + X(nanvals) = -Inf; + [v, idx] = max (X); + v(all(nanvals)) = NaN; + elseif (nargin == 3 && isempty(Y)) + nanvals = isnan(X); + X(nanvals) = -Inf; + [v, idx] = max (X,[],DIM); + v(all(nanvals,DIM)) = NaN; + else + Xnan = isnan(X); + Ynan = isnan(Y); + X(Xnan) = -Inf; + Y(Ynan) = -Inf; + if (nargin == 3) + [v, idx] = max(X,Y,DIM); + else + [v, idx] = max(X,Y); + endif + v(Xnan & Ynan) = NaN; + endif +endfunction diff --git a/inst/nanmean.m b/inst/nanmean.m new file mode 100644 index 0000000..9b3c224 --- /dev/null +++ b/inst/nanmean.m @@ -0,0 +1,36 @@ +## Copyright (C) 2001 Paul Kienzle +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{v} =} nanmean (@var{X}) +## @deftypefnx{Function File} {@var{v} =} nanmean (@var{X}, @var{dim}) +## Compute the mean value while ignoring NaN values. +## +## @code{nanmean} is identical to the @code{mean} function except that NaN values +## are ignored. If all values are NaN, the mean is returned as NaN. +## +## @seealso{mean, nanmin, nanmax, nansum, nanmedian} +## @end deftypefn + +function v = nanmean (X, varargin) + if nargin < 1 + print_usage; + else + n = sum (!isnan(X), varargin{:}); + n(n == 0) = NaN; + X(isnan(X)) = 0; + v = sum (X, varargin{:}) ./ n; + endif +endfunction diff --git a/inst/nanmedian.m b/inst/nanmedian.m new file mode 100644 index 0000000..5df58a4 --- /dev/null +++ b/inst/nanmedian.m @@ -0,0 +1,69 @@ +## Copyright (C) 2001 Paul Kienzle +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} @var{v} = nanmedian (@var{x}) +## @deftypefnx{Function File} @var{v} = nanmedian (@var{x}, @var{dim}) +## Compute the median of data while ignoring NaN values. +## +## This function is identical to the @code{median} function except that NaN values +## are ignored. If all values are NaN, the median is returned as NaN. +## +## @seealso{median, nanmin, nanmax, nansum, nanmean} +## @end deftypefn + +function v = nanmedian (X, varargin) + if nargin < 1 || nargin > 2 + print_usage; + endif + if nargin < 2 + dim = min(find(size(X)>1)); + if isempty(dim), dim=1; endif; + else + dim = varargin{:}; + endif + + sz = size (X); + if (prod (sz) > 1) + ## Find lengths of datasets after excluding NaNs; valid datasets + ## are those that are not empty after you remove all the NaNs + n = sz(dim) - sum (isnan(X),varargin{:}); + + ## When n is equal to zero, force it to one, so that median + ## picks up a NaN value below + n (n==0) = 1; + + ## Sort the datasets, with the NaN going to the end of the data + X = sort (X, varargin{:}); + + ## Determine the offset for each column in single index mode + colidx = reshape((0:(prod(sz) / sz(dim) - 1)), size(n)); + colidx = floor(colidx / prod(sz(1:dim-1))) * prod(sz(1:dim)) + ... + mod(colidx,prod(sz(1:dim-1))); + stride = prod(sz(1:dim-1)); + + ## Average the two central values of the sorted list to compute + ## the median, but only do so for valid rows. If the dataset + ## is odd length, the single central value will be used twice. + ## E.g., + ## for n==5, ceil(2.5+0.5) is 3 and floor(2.5+0.5) is also 3 + ## for n==6, ceil(3.0+0.5) is 4 and floor(3.0+0.5) is 3 + ## correction made for stride of data "stride*ceil(2.5-0.5)+1" + v = (X(colidx + stride*ceil(n./2-0.5) + 1) + ... + X(colidx + stride*floor(n./2-0.5) + 1)) ./ 2; + else + error ("nanmedian: invalid matrix argument"); + endif +endfunction diff --git a/inst/nanmin.m b/inst/nanmin.m new file mode 100644 index 0000000..fa41d8e --- /dev/null +++ b/inst/nanmin.m @@ -0,0 +1,54 @@ +## Copyright (C) 2001 Paul Kienzle +## Copyright (C) 2003 Alois Schloegl +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{v}, @var{idx}] =} nanmin (@var{X}) +## @deftypefnx{Function File} {[@var{v}, @var{idx}] =} nanmin (@var{X}, @var{Y}) +## Find the minimal element while ignoring NaN values. +## +## @code{nanmin} is identical to the @code{min} function except that NaN values +## are ignored. If all values in a column are NaN, the minimum is +## returned as NaN rather than []. +## +## @seealso{min, nansum, nanmax, nanmean, nanmedian} +## @end deftypefn + +function [v, idx] = nanmin (X, Y, DIM) + if nargin < 1 || nargin > 3 + print_usage; + elseif nargin == 1 || (nargin == 2 && isempty(Y)) + nanvals = isnan(X); + X(nanvals) = Inf; + [v, idx] = min (X); + v(all(nanvals)) = NaN; + elseif (nargin == 3 && isempty(Y)) + nanvals = isnan(X); + X(nanvals) = Inf; + [v, idx] = min (X,[],DIM); + v(all(nanvals,DIM)) = NaN; + else + Xnan = isnan(X); + Ynan = isnan(Y); + X(Xnan) = Inf; + Y(Ynan) = Inf; + if (nargin == 3) + [v, idx] = min(X,Y,DIM); + else + [v, idx] = min(X,Y); + endif + v(Xnan & Ynan) = NaN; + endif +endfunction diff --git a/inst/nanstd.m b/inst/nanstd.m new file mode 100644 index 0000000..d80b8c5 --- /dev/null +++ b/inst/nanstd.m @@ -0,0 +1,88 @@ +## Copyright (C) 2001 Paul Kienzle +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{v} =} nanstd (@var{X}) +## @deftypefnx{Function File} {@var{v} =} nanstd (@var{X}, @var{opt}) +## @deftypefnx{Function File} {@var{v} =} nanstd (@var{X}, @var{opt}, @var{dim}) +## Compute the standard deviation while ignoring NaN values. +## +## @code{nanstd} is identical to the @code{std} function except that NaN values are +## ignored. If all values are NaN, the standard deviation is returned as NaN. +## If there is only a single non-NaN value, the deviation is returned as 0. +## +## The argument @var{opt} determines the type of normalization to use. Valid values +## are +## +## @table @asis +## @item 0: +## normalizes with @math{N-1}, provides the square root of best unbiased estimator of +## the variance [default] +## @item 1: +## normalizes with @math{N}, this provides the square root of the second moment around +## the mean +## @end table +## +## The third argument @var{dim} determines the dimension along which the standard +## deviation is calculated. +## +## @seealso{std, nanmin, nanmax, nansum, nanmedian, nanmean} +## @end deftypefn + +function v = nanstd (X, opt, varargin) + if nargin < 1 + print_usage; + else + if nargin < 3 + dim = min(find(size(X)>1)); + if isempty(dim), dim=1; endif; + else + dim = varargin{1}; + endif + if ((nargin < 2) || isempty(opt)) + opt = 0; + endif + + ## determine the number of non-missing points in each data set + n = sum (!isnan(X), varargin{:}); + + ## replace missing data with zero and compute the mean + X(isnan(X)) = 0; + meanX = sum (X, varargin{:}) ./ n; + + ## subtract the mean from the data and compute the sum squared + sz = ones(1,length(size(X))); + sz(dim) = size(X,dim); + v = sumsq (X - repmat(meanX,sz), varargin{:}); + + ## because the missing data was set to zero each missing data + ## point will contribute (-meanX)^2 to sumsq, so remove these + v = v - (meanX .^ 2) .* (size(X,dim) - n); + + if (opt == 0) + ## compute the standard deviation from the corrected sumsq using + ## max(n-1,1) in the denominator so that the std for a single point is 0 + v = sqrt ( v ./ max(n - 1, 1) ); + elseif (opt == 1) + ## compute the standard deviation from the corrected sumsq + v = sqrt ( v ./ n ); + else + error ("std: unrecognized normalization type"); + endif + + ## make sure that we return a real number + v = real (v); + endif +endfunction diff --git a/inst/nansum.m b/inst/nansum.m new file mode 100644 index 0000000..33d8598 --- /dev/null +++ b/inst/nansum.m @@ -0,0 +1,52 @@ +## Copyright (C) 2001 Paul Kienzle +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Built-in Function} {} nansum (@var{x}) +## @deftypefnx {Built-in Function} {} nansum (@var{x}, @var{dim}) +## @deftypefnx {Built-in Function} {} nansum (@dots{}, @qcode{"native"}) +## @deftypefnx {Built-in Function} {} nansum (@dots{}, @qcode{"double"}) +## @deftypefnx {Built-in Function} {} nansum (@dots{}, @qcode{"extra"}) +## Compute the sum while ignoring NaN values. +## +## @code{nansum} is identical to the @code{sum} function except that NaN +## values are treated as 0 and so ignored. If all values are NaN, the sum is +## returned as 0. +## +## See help text of @code{sum} for details on the options. +## +## @seealso{sum, nanmin, nanmax, nanmean, nanmedian} +## @end deftypefn + +function v = nansum (X, varargin) + if (nargin < 1) + print_usage (); + else + X(isnan (X)) = 0; + v = sum (X, varargin{:}); + endif +endfunction + +%!assert (nansum ([2 4 NaN 7]), 13) +%!assert (nansum ([2 4 NaN Inf]), Inf) + +%!assert (nansum ([1 NaN 3; NaN 5 6; 7 8 NaN]), [8 13 9]) +%!assert (nansum ([1 NaN 3; NaN 5 6; 7 8 NaN], 2), [4; 11; 15]) +%!assert (nansum (single ([1 NaN 3; NaN 5 6; 7 8 NaN])), single ([8 13 9])) +%!assert (nansum (single ([1 NaN 3; NaN 5 6; 7 8 NaN]), "double"), [8 13 9]) + +%!assert (nansum (uint8 ([2 4 1 7])), 14) +%!assert (nansum (uint8 ([2 4 1 7]), "native"), uint8 (14)) +%!assert (nansum (uint8 ([2 4 1 7])), 14) diff --git a/inst/nanvar.m b/inst/nanvar.m new file mode 100644 index 0000000..e8e15aa --- /dev/null +++ b/inst/nanvar.m @@ -0,0 +1,65 @@ +# Copyright (C) 2008 Sylvain Pelissier +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} nanvar (@var{x}) +## @deftypefnx{Function File} {@var{v} =} nanvar (@var{X}, @var{opt}) +## @deftypefnx{Function File} {@var{v} =} nanvar (@var{X}, @var{opt}, @var{dim}) +## Compute the variance while ignoring NaN values. +## +## For vector arguments, return the (real) variance of the values. +## For matrix arguments, return a row vector containing the variance for +## each column. +## +## The argument @var{opt} determines the type of normalization to use. +## Valid values are +## +## @table @asis +## @item 0: +## Normalizes with @math{N-1}, provides the best unbiased estimator of the +## variance [default]. +## @item 1: +## Normalizes with @math{N}, this provides the second moment around the mean. +## @end table +## +## The third argument @var{dim} determines the dimension along which the +## variance is calculated. +## +## @seealso{var, nanmean, nanstd, nanmax, nanmin} +## @end deftypefn + +function y = nanvar(x,w,dim) + if nargin < 1 + print_usage (); + else + if ((nargin < 2) || isempty(w)) + w = 0; + endif + + if nargin < 3 + dim = min(find(size(x)>1)); + if isempty(dim) + dim=1; + endif + endif + + y = nanstd(x,w,dim).^2; + endif +endfunction + +## Tests +%!shared x +%! x = [1 2 nan 3 4 5]; +%!assert (nanvar (x), var (x(! isnan (x))), 10*eps) diff --git a/inst/nbinstat.m b/inst/nbinstat.m new file mode 100644 index 0000000..3908421 --- /dev/null +++ b/inst/nbinstat.m @@ -0,0 +1,124 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} nbinstat (@var{n}, @var{p}) +## Compute mean and variance of the negative binomial distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{n} is the first parameter of the negative binomial distribution. The elements +## of @var{n} must be natural numbers +## +## @item +## @var{p} is the second parameter of the negative binomial distribution. The +## elements of @var{p} must be probabilities +## @end itemize +## @var{n} and @var{p} must be of common size or one of them must be scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the negative binomial distribution +## +## @item +## @var{v} is the variance of the negative binomial distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## n = 1:4; +## p = 0.2:0.2:0.8; +## [m, v] = nbinstat (n, p) +## @end group +## +## @group +## [m, v] = nbinstat (n, 0.5) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the negative binomial distribution + +function [m, v] = nbinstat (n, p) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (n) && ! ismatrix (n)) + error ("nbinstat: n must be a numeric matrix"); + endif + if (! isempty (p) && ! ismatrix (p)) + error ("nbinstat: p must be a numeric matrix"); + endif + + if (! isscalar (n) || ! isscalar (p)) + [retval, n, p] = common_size (n, p); + if (retval > 0) + error ("nbinstat: n and p must be of common size or scalar"); + endif + endif + + # Calculate moments + q = 1 - p; + m = n .* q ./ p; + v = n .* q ./ (p .^ 2); + + # Continue argument check + k = find (! (n > 0) | ! (n < Inf) | ! (p > 0) | ! (p < 1)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! n = 1:4; +%! p = 0.2:0.2:0.8; +%! [m, v] = nbinstat (n, p); +%! expected_m = [ 4.0000, 3.0000, 2.0000, 1.0000]; +%! expected_v = [20.0000, 7.5000, 3.3333, 1.2500]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! n = 1:4; +%! [m, v] = nbinstat (n, 0.5); +%! expected_m = [1, 2, 3, 4]; +%! expected_v = [2, 4, 6, 8]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/ncx2pdf.m b/inst/ncx2pdf.m new file mode 100644 index 0000000..ebfe070 --- /dev/null +++ b/inst/ncx2pdf.m @@ -0,0 +1,45 @@ +## Copyright (C) 2018 gold holk +## +## This program is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} ncx2pdf (@var{X}, @var{N}, @var{LAMBDA}) +## @deftypefnx {Function File} ncx2pdf (@dots{}, @var{TERM}) +## compute the non-central chi square probalitity density function +## at @var{X} , degree of freedom @var{N} , +## and non-centrality parameter @var{LAMBDA} . +## +## @var{TERM} is the term number of series, default is 32. +## +## @end deftypefn + +## Author: gold holk +## Created: 2018-10-25 + +function f = ncx2pdf(x, n, lambda, term = 32) + f = exp(-lambda/2) * arrayfun(@(x) sum_expression([0:term],x,n,lambda), x); +end + +function t = sum_expression(j,v,n,l) + # j is vector, v is scalar. + numerator = (l/2).^j .* v.^(n/2+j-1) * exp(-v/2); + denominator = factorial(j) .* 2.^(n/2+j) .* gamma(n/2+j); + t = sum(numerator ./ denominator); +end + + +%!assert (ncx2pdf (3, 4, 0), chi2pdf(3, 4), eps) +%!assert (ncx2pdf (5, 3, 1), 0.091858459565020, 1E-15) #compared with Matlab's values +%!assert (ncx2pdf (4, 5, 2), 0.109411958414115, 1E-15) + diff --git a/inst/normalise_distribution.m b/inst/normalise_distribution.m new file mode 100644 index 0000000..d679a05 --- /dev/null +++ b/inst/normalise_distribution.m @@ -0,0 +1,284 @@ +## Copyright (C) 2011 Alexander Klein +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA}) +## @deftypefnx{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA}, @var{DISTRIBUTION}) +## @deftypefnx{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA}, @var{DISTRIBUTION}, @var{DIMENSION}) +## +## Transform a set of data so as to be N(0,1) distributed according to an idea +## by van Albada and Robinson. +## This is achieved by first passing it through its own cumulative distribution +## function (CDF) in order to get a uniform distribution, and then mapping +## the uniform to a normal distribution. +## The data must be passed as a vector or matrix in @var{DATA}. +## If the CDF is unknown, then [] can be passed in @var{DISTRIBUTION}, and in +## this case the empirical CDF will be used. +## Otherwise, if the CDFs for all data are known, they can be passed in +## @var{DISTRIBUTION}, +## either in the form of a single function name as a string, +## or a single function handle, +## or a cell array consisting of either all function names as strings, +## or all function handles. +## In the latter case, the number of CDFs passed must match the number +## of rows, or columns respectively, to normalise. +## If the data are passed as a matrix, then the transformation will +## operate either along the first non-singleton dimension, +## or along @var{DIMENSION} if present. +## +## Notes: +## The empirical CDF will map any two sets of data +## having the same size and their ties in the same places after sorting +## to some permutation of the same normalised data: +## @example +## @code{normalise_distribution([1 2 2 3 4])} +## @result{} -1.28 0.00 0.00 0.52 1.28 +## +## @code{normalise_distribution([1 10 100 10 1000])} +## @result{} -1.28 0.00 0.52 0.00 1.28 +## @end example +## +## Original source: +## S.J. van Albada, P.A. Robinson +## "Transformation of arbitrary distributions to the +## normal distribution with application to EEG +## test-retest reliability" +## Journal of Neuroscience Methods, Volume 161, Issue 2, +## 15 April 2007, Pages 205-211 +## ISSN 0165-0270, 10.1016/j.jneumeth.2006.11.004. +## (http://www.sciencedirect.com/science/article/pii/S0165027006005668) +## @end deftypefn + +function [ normalised ] = normalise_distribution ( data, distribution, dimension ) + + if ( nargin < 1 || nargin > 3 ) + print_usage; + elseif ( !ismatrix ( data ) || length ( size ( data ) ) > 2 ) + error ( "First argument must be a vector or matrix" ); + end + + + if ( nargin >= 2 ) + + if ( !isempty ( distribution ) ) + + #Wrap a single handle in a cell array. + if ( strcmp ( typeinfo ( distribution ), typeinfo ( @(x)(x) ) ) ) + + distribution = { distribution }; + + #Do we have a string argument instead? + elseif ( ischar ( distribution ) ) + + ##Is it a single string? + if ( rows ( distribution ) == 1 ) + + distribution = { str2func( distribution ) }; + else + error ( ["Second argument cannot contain more than one string" ... + " unless in a cell array"] ); + end + + + ##Do we have a cell array of distributions instead? + elseif ( iscell ( distribution ) ) + + ##Does it consist of strings only? + if ( all ( cellfun ( @ischar, distribution ) ) ) + + distribution = cellfun ( @str2func, distribution, "UniformOutput", false ); + end + + ##Does it eventually consist of function handles only + if ( !all ( cellfun ( @ ( h ) ( strcmp ( typeinfo ( h ), typeinfo ( @(x)(x) ) ) ), distribution ) ) ) + + error ( ["Second argument must contain either" ... + " a single function name or handle or " ... + " a cell array of either all function names or handles!"] ); + end + + else + error ( "Illegal second argument: ", typeinfo ( distribution ) ); + end + + end + + else + + distribution = []; + end + + + if ( nargin == 3 ) + + if ( !isscalar ( dimension ) || ( dimension != 1 && dimension != 2 ) ) + error ( "Third argument must be either 1 or 2" ); + end + + else + if ( isvector ( data ) && rows ( data ) == 1 ) + + dimension = 2; + + else + + dimension = 1; + end + end + + trp = ( dimension == 2 ); + + if ( trp ) + data = data'; + end + + r = rows ( data ); + c = columns ( data ); + normalised = NA ( r, c ); + + ##Do we know the distribution of the sample? + if ( isempty ( distribution ) ) + + precomputed_normalisation = []; + + + for k = 1 : columns ( data ) + + ##Note that this line is in accordance with equation (16) in the + ##original text. The author's original program, however, produces + ##different values in the presence of ties, namely those you'd + ##get replacing "last" by "first". + [ uniq, indices ] = unique ( sort ( data ( :, k ) ), "last" ); + + + ##Does the sample have ties? + if ( rows ( uniq ) != r ) + + ##Transform to uniform, then normal distribution. + uniform = ( indices - 1/2 ) / r; + normal = norminv ( uniform ); + + else + ## Without ties everything is pretty much straightforward as + ## stated in the text. + if ( isempty ( precomputed_normalisation ) ) + + precomputed_normalisation = norminv ( 1 / (2*r) : 1/r : 1 - 1 / (2*r) ); + end + + normal = precomputed_normalisation; + end + + #Find the original indices in the unsorted sample. + #This somewhat quirky way of doing it is still faster than + #using a for-loop. + [ ignore, ignore, target_indices ] = unique ( data (:, k ) ); + + #Put normalised values in the places where they belong. + f_remap = @( k ) ( normal ( k ) ); + normalised ( :, k ) = arrayfun ( f_remap, target_indices ); + + end + + else + ##With known distributions, everything boils down to a few lines of code + + ##The same distribution for all data? + if ( all ( size ( distribution ) == 1 ) ) + + normalised = norminv ( distribution {1,1} ( data ) ); + + elseif ( length ( vec ( distribution ) ) == c ) + + for k = 1 : c + + normalised ( :, k ) = norminv ( distribution { k } ( data ) ( :, k ) ); + end + + else + error ( "Number of distributions does not match data size! ") + + end + end + + if ( trp ) + + normalised = normalised'; + end + +endfunction + +%!test +%! v = normalise_distribution ( [ 1 2 3 ], [], 1 ); +%! assert ( v, [ 0 0 0 ] ) + +%!test +%! v = normalise_distribution ( [ 1 2 3 ], [], 2 ); +%! assert ( v, norminv ( [ 1 3 5 ] / 6 ), 3 * eps ) + +%!test +%! v = normalise_distribution ( [ 1 2 3 ]', [], 2 ); +%! assert ( v, [ 0 0 0 ]' ) + +%!test +%! v = normalise_distribution ( [ 1 2 3 ]' , [], 1 ); +%! assert ( v, norminv ( [ 1 3 5 ]' / 6 ), 3 * eps ) + +%!test +%! v = normalise_distribution ( [ 1 1 2 2 3 3 ], [], 2 ); +%! assert ( v, norminv ( [ 3 3 7 7 11 11 ] / 12 ), 3 * eps ) + +%!test +%! v = normalise_distribution ( [ 1 1 2 2 3 3 ]', [], 1 ); +%! assert ( v, norminv ( [ 3 3 7 7 11 11 ]' / 12 ), 3 * eps ) + +%!test +%! A = randn ( 10 ); +%! N = normalise_distribution ( A, @normcdf ); +%! assert ( A, N, 1000 * eps ) + +%!xtest +%! A = exprnd ( 1, 100 ); +%! N = normalise_distribution ( A, @ ( x ) ( expcdf ( x, 1 ) ) ); +%! assert ( mean ( vec ( N ) ), 0, 0.1 ) +%! assert ( std ( vec ( N ) ), 1, 0.1 ) + +%!xtest +%! A = rand (1000,1); +%! N = normalise_distribution ( A, "unifcdf" ); +%! assert ( mean ( vec ( N ) ), 0, 0.1 ) +%! assert ( std ( vec ( N ) ), 1, 0.1 ) + +%!xtest +%! A = [rand(1000,1), randn( 1000, 1)]; +%! N = normalise_distribution ( A, { "unifcdf", "normcdf" } ); +%! assert ( mean ( N ), [ 0, 0 ], 0.1 ) +%! assert ( std ( N ), [ 1, 1 ], 0.1 ) + +%!xtest +%! A = [rand(1000,1), randn( 1000, 1), exprnd( 1, 1000, 1 )]'; +%! N = normalise_distribution ( A, { @unifcdf; @normcdf; @( x )( expcdf ( x, 1 ) ) }, 2 ); +%! assert ( mean ( N, 2 ), [ 0, 0, 0 ]', 0.1 ) +%! assert ( std ( N, [], 2 ), [ 1, 1, 1 ]', 0.1 ) + +%!xtest +%! A = exprnd ( 1, 1000, 9 ); A ( 300 : 500, 4:6 ) = 17; +%! N = normalise_distribution ( A ); +%! assert ( mean ( N ), [ 0 0 0 0.38 0.38 0.38 0 0 0 ], 0.1 ); +%! assert ( var ( N ), [ 1 1 1 2.59 2.59 2.59 1 1 1 ], 0.1 ); + +%!test +%! fail ("normalise_distribution( zeros ( 3, 4 ), { @unifcdf; @normcdf; @( x )( expcdf ( x, 1 ) ) } )", ... +%! "Number of distributions does not match data size!"); diff --git a/inst/normplot.m b/inst/normplot.m new file mode 100644 index 0000000..fb42e49 --- /dev/null +++ b/inst/normplot.m @@ -0,0 +1,75 @@ +## Author: Paul Kienzle +## This program is granted to the public domain. + +## -*- texinfo -*- +## @deftypefn {Function File} normplot (@var{X}) +## Produce normal probability plot for each column of @var{X}. +## +## The line joing the 1st and 3rd quantile is drawn on the +## graph. If the underlying distribution is normal, the +## points will cluster around this line. +## +## Note that this function sets the title, xlabel, ylabel, +## axis, grid, tics and hold properties of the graph. These +## need to be cleared before subsequent graphs using 'clf'. +## @end deftypefn + +function normplot(X) + if nargin!=1, print_usage; end + if (rows(X) == 1), X=X(:); end + + # Transform data + n = rows(X); + if n<2, error("normplot requires a vector"); end + q = norminv([1:n]'/(n+1)); + Y = sort(X); + + # Find the line joining the first to the third quartile for each column + q1 = ceil(n/4); + q3 = n-q1+1; + m = (q(q3)-q(q1))./(Y(q3,:)-Y(q1,:)); + p = [ m; q(q1)-m.*Y(q1,:) ]; + + # Plot the lines one at a time. Plot the lines before overlaying the + # normals so that the default label is 'line n'. + if columns(Y)==1, + leg = "+;;"; + else + leg = "%d+;Column %d;"; + endif + + for i=1:columns(Y) + plot(Y(:,i),q,sprintf(leg,i,i)); hold on; + + # estimate the mean and standard deviation by linear regression + # [v,dv] = wpolyfit(q,Y(:,i),1) + end + + # Overlay the estimated normal lines. + for i=1:columns(Y) + # Use the end points and one point guaranteed to be in the view since + # gnuplot skips any lines whose points are all outside the view. + pts = [Y(1,i);Y(q1,i);Y(end,i)]; + plot(pts, polyval(p(:,i),pts), [num2str(i),";;"]); + end + hold off; + + # plot labels + title "Normal Probability Plot" + ylabel "% Probability" + xlabel "Data" + + # plot grid + t = [0.00001;0.0001;0.001;0.01;0.1;0.3;1;2;5;10;25;50; + 75;90;95;98;99;99.7;99.9;99.99;99.999;99.9999;99.99999]; + set(gca, "ytick", norminv(t/100), "yticklabel", num2str(t)); + grid on + + # Set view range with a bit of space around data + miny = min(Y(:)); minq = min(q(1),norminv(0.05)); + maxy = max(Y(:)); maxq = max(q(end),norminv(0.95)); + yspace = (maxy-miny)*0.05; qspace = (q(end)-q(1))*0.05; + axis ([miny-yspace, maxy+yspace, minq-qspace, maxq+qspace]); + +end + diff --git a/inst/normstat.m b/inst/normstat.m new file mode 100644 index 0000000..3e590fe --- /dev/null +++ b/inst/normstat.m @@ -0,0 +1,122 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{mn}, @var{v}] =} normstat (@var{m}, @var{s}) +## Compute mean and variance of the normal distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{m} is the mean of the normal distribution +## +## @item +## @var{s} is the standard deviation of the normal distribution. +## @var{s} must be positive +## @end itemize +## @var{m} and @var{s} must be of common size or one of them must be +## scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{mn} is the mean of the normal distribution +## +## @item +## @var{v} is the variance of the normal distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## m = 1:6; +## s = 0:0.2:1; +## [mn, v] = normstat (m, s) +## @end group +## +## @group +## [mn, v] = normstat (0, s) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the normal distribution + +function [mn, v] = normstat (m, s) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (m) && ! ismatrix (m)) + error ("normstat: m must be a numeric matrix"); + endif + if (! isempty (s) && ! ismatrix (s)) + error ("normstat: s must be a numeric matrix"); + endif + + if (! isscalar (m) || ! isscalar (s)) + [retval, m, s] = common_size (m, s); + if (retval > 0) + error ("normstat: m and s must be of common size or scalar"); + endif + endif + + # Set moments + mn = m; + v = s .* s; + + # Continue argument check + k = find (! (s > 0) | ! (s < Inf)); + if (any (k)) + mn(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! m = 1:6; +%! s = 0.2:0.2:1.2; +%! [mn, v] = normstat (m, s); +%! expected_v = [0.0400, 0.1600, 0.3600, 0.6400, 1.0000, 1.4400]; +%! assert (mn, m); +%! assert (v, expected_v, 0.001); + +%!test +%! s = 0.2:0.2:1.2; +%! [mn, v] = normstat (0, s); +%! expected_mn = [0, 0, 0, 0, 0, 0]; +%! expected_v = [0.0400, 0.1600, 0.3600, 0.6400, 1.0000, 1.4400]; +%! assert (mn, expected_mn, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/optimalleaforder.m b/inst/optimalleaforder.m new file mode 100644 index 0000000..9b51928 --- /dev/null +++ b/inst/optimalleaforder.m @@ -0,0 +1,326 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {@var{leafOrder} =} optimalleaforder (@var{tree}, @var{D}) +## @deftypefnx {Function File} @ +## {@var{leafOrder} =} optimalleaforder (..., @var{Name}, @var{Value}) +## +## Compute the optimal leaf ordering of a hierarchical binary cluster tree. +## +## The optimal leaf ordering of a tree is the ordering which minimizes the sum +## of the distances between each leaf and its adjacent leaves, without altering +## the structure of the tree, that is without redefining the clusters of the +## tree. +## +## Required inputs: +## @itemize +## @item +## @var{tree}: a hierarchical cluster tree @var{tree} generated by the +## @code{linkage} function. +## +## @item +## @var{D}: a matrix of distances as computed by @code{pdist}. +## @end itemize +## +## Optional inputs can be the following property/value pairs: +## @itemize +## @item +## property 'Criteria' at the moment can only have the value 'adjacent', +## for minimizing the distances between leaves. +## +## @item +## property 'Transformation' can have one of the values 'linear', 'inverse' +## or a handle to a custom function which computes @var{S} the similarity +## matrix. +## @end itemize +## +## optimalleaforder's output @var{leafOrder} is the optimal leaf ordering. +## +## @strong{Reference} +## Bar-Joseph, Z., Gifford, D.K., and Jaakkola, T.S. Fast optimal leaf ordering +## for hierarchical clustering. Bioinformatics vol. 17 suppl. 1, 2001. +## @end deftypefn +## +## @seealso{dendrogram,linkage,pdist} + +function leafOrder = optimalleaforder ( varargin ) + + ## check the input + if ( nargin < 2 ) + print_usage (); + endif + + tree = varargin{1}; + D = varargin{2}; + criterion = "adjacent"; # default and only value at the moment + transformation = "linear"; + + if ((columns (tree) != 3) || (! isnumeric (tree)) || ... + (! (max (tree(end, 1:2)) == rows (tree) * 2))) + error (["optimalleaforder: tree must be a matrix as generated by the " ... + "linkage function"]); + endif + + ## read the paired arguments + if (! all (cellfun ("ischar", varargin(3:end)))) + error ("optimalleaforder: character inputs expected for arguments 3 and up"); + else + varargin(3:end) = lower (varargin(3:end)); + endif + pair_index = 3; + while (pair_index <= (nargin - 1)) + switch (varargin{pair_index}) + case "criteria" + criterion = varargin{pair_index + 1}; + if (strcmp (criterion, "group")) + ## MATLAB compatibility: + ## the 'group' criterion is not implemented + error ("optimalleaforder: unavailable criterion 'group'"); + elseif (! strcmp (criterion, "adjacent")) + error ("optimalleaforder: invalid criterion %s", criterion); + endif + case "transformation" + transformation = varargin{pair_index + 1}; + otherwise + error ("optimalleaforder: unknown property %s", varargin{pair_index}); + endswitch + + pair_index += 2; + endwhile + + ## D can be either a vector or a matrix, + ## but it is easier to work with a matrix + if (isvector (D)) + D = squareform (D); + endif + + n = rows (D); + m = rows (tree); + + if (n != (m + 1)) + error (["optimalleaforder: D must be a matrix or vector generated by " ... + "the pdist function"]); + endif + + + ## the similarity matrix, basically an inverted distance matrix + S = zeros (n); + + if (strcmpi (transformation, "linear")) + ## linear similarity + maxD = max (max (D)); + S = maxD - D; + elseif (strcmpi (transformation, "inverse")) + ## similarity as inverted distance + S = 1 ./ D; + elseif (is_function_handle (transformation)) + ## custom similarity + S = feval (transformation, D); + else + error ("optimalleaforder: invalid transformation %s", transformation); + endif + + + ## main body + + ## for each node v we compute the maximum similarity of the subtree M(w,u,v), + ## where the leftmost leaf is w and the rightmost is u; remember that + ## M(w,u,v) = M(u,w,v) + M = zeros (n, n, n + m); + + ## O is a utility matrix: for each node of the tree we store the left and + ## right leaves of the optimal subtree + O = [1:( n + m ); 1:( n + m ); (zeros (1, (n + m)))]'; + + ## compute M for every node v + for iter = 1 : m + v = iter + n; # current node + l = optimalleaforder_getLeafList (tree(iter, 1)); # the left subtree + r = optimalleaforder_getLeafList (tree(iter, 2)); # the right subtree + + if (tree(iter,1) > n) + l_l = optimalleaforder_getLeafList (tree(tree(iter, 1) - n, 1)); + l_r = optimalleaforder_getLeafList (tree(tree(iter, 1) - n, 2)); + else + l_l = l_r = l; + endif + + if (tree(iter,2) > n) + r_l = optimalleaforder_getLeafList (tree(tree(iter, 2) - n, 1)); + r_r = optimalleaforder_getLeafList (tree(tree(iter, 2) - n, 2)); + else + r_l = r_r = r; + endif + + ## let's find the maximum value of M(w,u,v) when: w is a leaf of the left + ## subtree of v and u is a leaf of the right subtree of v + for i = 1 : length (l) + if (isempty (find (l(i) == l_l))) + x = l_l; + else + x = l_r; + endif + for j = 1 : length (r) + if (isempty (find (r(j) == r_l))) + y = r_l; + else + y = r_r; + endif + + ## max(M(w,u,v)) = max(M(w,k,v_l)) + max(M(h,u,v_r)) + S(k,h) + ## where: v_l is the left child of v and v_r the right child of v + M_tmp = repmat (M(l(i), x(:), tree(iter, 1)), length (y), 1) + ... + repmat (M(y(:), r(j), tree(iter, 2)), 1, length (x)) + ... + S(y(:), x(:)); + M_max = max (max (M_tmp)); # this is M(l(i), r(j), v) + [h, k] = find (M_tmp == M_max); + + M(l(i), r(j), v) = M_max; + M(r(j), l(i), v) = M(l(i), r(j), v); + + if (M_max > O(v,3)) + O(v, 1) = l(i); # this is w + O(v, 2) = r(j); # this is u + O(v, 3) = M_max; # this is M(w, u, v) + endif + endfor + endfor + endfor + + ## reordering: + ## we found the M(w,u,v) corresponding to the optimal leaf order, now we can + ## compute the optimal leaf order given our M(w,u,v) + + ## the return value + leafOrder = zeros ( 1, n ); + leafOrder(1) = O(end, 1); + leafOrder(n) = O(end, 2); + + ## the inverse operation, only easier, to get the leaf order: now we know the + ## leftmost and rightmost leaves of the best subtree, we may have to flip it + ## though + for iter = m : -1 : 1 + v = iter + n; + + extremes = O(v, [1, 2]); + + l_node = tree(iter, 1); + r_node = tree(iter, 2); + + l = optimalleaforder_getLeafList (l_node); + r = optimalleaforder_getLeafList (r_node); + + if (l_node > n) + l_l = optimalleaforder_getLeafList (tree(l_node - n, 1)); + l_r = optimalleaforder_getLeafList (tree(l_node - n, 2)); + else + l_l = l_r = l; + endif + + if (r_node > n) + r_l = optimalleaforder_getLeafList (tree(r_node - n, 1)); + r_r = optimalleaforder_getLeafList (tree(r_node - n, 2)); + else + r_l = r_r = r; + endif + + ## this means that we need to flip the subtree + if (isempty (find (extremes(1) == l))) + l_tmp = l; + l_l_tmp = l_l; + l_r_tmp = l_r; + + l = r; + l_l = r_l; + l_r = r_r; + + r = l_tmp; + r_l = l_l_tmp; + r_r = l_r_tmp; + + node_tmp = l_node; + l_node = r_node; + r_node = node_tmp; + endif + + if (isempty (find (extremes(1) == l_l))) + x = l_l; + else + x = l_r; + endif + + if (isempty (find (extremes(2) == r_l))) + y = r_l; + else + y = r_r; + endif + + M_tmp = repmat (M(extremes(1), x(:), l_node), length (y), 1) + ... + repmat (M(y(:), extremes(2), r_node), 1, length (x)) + ... + S(y(:), x(:)); + M_max = max (max (M_tmp)); + [h, k] = find (M_tmp == M_max); + + O(l_node, 1) = extremes(1); + O(l_node, 2) = x(k); + O(r_node, 1) = y(h); + O(r_node, 2) = extremes(2); + + p_1 = find (leafOrder == extremes(1)); + p_2 = find (leafOrder == extremes(2)); + + leafOrder (p_1 + (length (l)) - 1) = x(k); + leafOrder (p_1 + (length (l))) = y(h); + endfor + + ## function: optimalleaforder_getLeafList + ## get the list of leaves under a given node + function vector = optimalleaforder_getLeafList (nodes_to_visit) + vector = []; + while (! isempty (nodes_to_visit)) + currentnode = nodes_to_visit(1); + nodes_to_visit(1) = []; + if (currentnode > n) + node = currentnode - n; + nodes_to_visit = [tree(node, [2 1]) nodes_to_visit]; + endif + + if (currentnode <= n) + vector = [vector currentnode]; + endif + endwhile + endfunction + +endfunction + + +## Test input validation +%!error optimalleaforder () +%!error optimalleaforder (1) +%!error optimalleaforder (ones (2, 2), 1) +%!error optimalleaforder ([1 2 3], [1 2; 3 4], "criteria", 5) +%!error optimalleaforder ([1 2 1], [1 2 3]) +%!error optimalleaforder ([1 2 1], 1, "xxx", "xxx") +%!error optimalleaforder ([1 2 1], 1, "Transformation", "xxx") + +## Demonstration +%!demo +%! X = randn (10, 2); +%! D = pdist (X); +%! tree = linkage(D, 'average'); +%! optimalleaforder (tree, D, 'Transformation', 'linear') + diff --git a/inst/pca.m b/inst/pca.m new file mode 100644 index 0000000..2c47354 --- /dev/null +++ b/inst/pca.m @@ -0,0 +1,534 @@ +## Copyright (C) 2013-2019 Fernando Damian Nieuwveldt +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License +## as published by the Free Software Foundation; either version 3 +## of the License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{coeff}]} = pca(@var{X}) +## @deftypefnx {Function File} {[@var{coeff}]} = pca(@var{X}, Name, Value) +## @deftypefnx {Function File} {[@var{coeff},@var{score},@var{latent}]} = pca(@dots{}) +## @deftypefnx {Function File} {[@var{coeff},@var{score},@var{latent},@var{tsquared}]} = pca(@dots{}) +## @deftypefnx {Function File} {[@var{coeff},@var{score},@var{latent},@var{tsquared},@var{explained},@var{mu}]} = pca(@dots{}) +## Performs a principal component analysis on a data matrix X +## +## A principal component analysis of a data matrix of @code{n} observations in a +## @code{p}-dimensional space returns a @code{p}-by-@code{p} transformation +## matrix, to perform a change of basis on the data. The first component of the +## new basis is the direction that maximizes the variance of the projected data. +## +## Input argument: +## @itemize @bullet +## @item +## @var{x} : a @code{n}-by-@code{p} data matrix +## @end itemize +## +## Pair arguments: +## @itemize @bullet +## @item +## @code{Algorithm} : the algorithm to use, it can be either @code{eig}, +## for eigenvalue decomposition, or @code{svd} (default), for singular value +## decomposition +## @item +## @code{Centered} : boolean indicator for centering the observation data, it is +## @code{true} by default +## @item +## @code{Economy} : boolean indicator for the economy size output, it is +## @code{true} by default; @code{pca} returns only the elements of @var{latent} +## that are not necessarily zero, and the corresponding columns of @var{coeff} +## and @var{score}, that is, when @code{n <= p}, only the first @code{n - 1} +## @item +## @code{NumComponents} : the number of components @code{k} to return, if +## @code{k < p}, then only the first @code{k} columns of @var{coeff} +## and @var{score} are returned +## @item +## @code{Rows} : action to take with missing values, it can be either +## @code{complete} (default), missing values are removed before computation, +## @code{pairwise} (only with algorithm @code{eig}), the covariance of rows with +## missing data is computed using the available data, but the covariance matrix +## could be not positive definite, which triggers the termination of @code{pca}, +## @code{complete}, missing values are not allowed, @code{pca} terminates with +## an error if there are any +## @item +## @code{Weights} : observation weights, it is a vector of positive values of +## length @code{n} +## @item +## @code{VariableWeights} : variable weights, it can be either a vector of +## positive values of length @code{p} or the string @code{variance} to use the +## sample variance as weights +## @end itemize +## +## Return values: +## @itemize @bullet +## @item +## @var{coeff} : the principal component coefficients, a @code{p}-by-@code{p} +## transformation matrix +## @item +## @var{score} : the principal component scores, the representation of @var{x} +## in the principal component space +## @item +## @var{latent} : the principal component variances, i.e., the eigenvalues of +## the covariance matrix of @var{x} +## @item +## @var{tsquared} : Hotelling's T-squared Statistic for each observation in +## @var{x} +## @item +## @var{explained} : the percentage of the variance explained by each principal +## component +## @item +## @var{mu} : the estimated mean of each variable of @var{x}, it is zero if the +## data are not centered +## @end itemize +## +## Matlab compatibility note: the alternating least square method 'als' and +## associated options 'Coeff0', 'Score0', and 'Options' are not yet implemented +## +## @subheading References +## +## @enumerate +## @item +## Jolliffe, I. T., Principal Component Analysis, 2nd Edition, Springer, 2002 +## +## @end enumerate +## @end deftypefn + +## BUGS: +## - tsquared with weights (xtest below) + +##FIXME +## -- can change isnan to ismissing once the latter is implemented in Octave +## -- change mystd to std and remove the helper function mystd once weighting is available in the Octave function + +function [coeff, score, latent, tsquared, explained, mu] = pca (X, varargin) + + if (nargin < 1) + print_usage (); + endif + + [nobs, nvars] = size (X); + + ## default options + optAlgorithmS = "svd"; + optCenteredB = true; + optEconomyB = true; + optNumComponentsI = nvars; + optWeights = []; + optVariableWeights = []; + optRowsB = false; + TF = []; + + ## parse parameters + pair_index = 1; + while (pair_index <= (nargin - 1)) + switch (lower (varargin{pair_index})) + ## decomposition algorithm: singular value decomposition, eigenvalue + ## decomposition or (currently unavailable) alternating least square + case "algorithm" + optAlgorithmS = varargin{pair_index + 1}; + switch (optAlgorithmS) + case {"svd", "eig"} + ; + case "als" + error ("pca: alternating least square algorithm not implemented"); + otherwise + error ("pca: invalid algorithm %s", optAlgorithmS); + endswitch + ## centering of the columns, around the mean + case "centered" + if (isbool (varargin{pair_index + 1})) + optCenteredB = varargin{pair_index + 1}; + else + error ("pca: 'centered' requires a boolean value"); + endif + ## limit the size of the output to the degrees of freedom, when a smaller + ## number than the number of variables + case "economy" + if (isbool (varargin{pair_index + 1})) + optEconomyB = varargin{pair_index + 1}; + else + error ("pca: 'economy' requires a boolean value"); + endif + ## choose the number of components to show + case "numcomponents" + optNumComponentsI = varargin{pair_index + 1}; + if ((! isscalar (optNumComponentsI)) || + (! isnumeric (optNumComponentsI)) || + optNumComponentsI != floor (optNumComponentsI) || + optNumComponentsI <= 0 || + optNumComponentsI > nvars) + error (["pca: the number of components must be a positive integer"... + "number smaller or equal to the number of variables"]); + endif + ## observation weights: some observations can be more accurate than others + case "weights" + optWeights = varargin{pair_index + 1}; + if ((! isvector (optWeights)) || + length (optWeights) != nobs || + length (find (optWeights < 0)) > 0) + error ("pca: weights must be a numerical array of positive numbers"); + endif + + if (rows (optWeights) == 1 ) + optWeights = transpose (optWeights); + endif + ## variable weights: weights used for the variables + case "variableweights" + optVariableWeights = varargin{pair_index + 1}; + if (ischar (optVariableWeights) && + strcmpi (optVariableWeights, "variance")) + optVariableWeights = "variance"; # take care of this later + elseif ((! isvector (optVariableWeights)) || + length (optVariableWeights) != nvars || + (! isnumeric (optVariableWeights)) || + length (find (optVariableWeights < 0)) > 0) + error (["pca: variable weights must be a numerical array of "... + "positive numbers or the string 'variance'"]); + else + optVariableWeights = 1 ./ sqrt (optVariableWeights); + + ## it is used as a row vector + if (columns (optVariableWeights) == 1 ) + optVariableWeights = transpose (optVariableWeights); + endif + endif + ## rows: policy for missing values + case "rows" + switch (varargin{pair_index + 1}) + case "complete" + optRowsB = false; + case "pairwise" + optRowsB = true; + case "all" + if (any (isnan (X))) + error (["pca: when all rows are requested the dataset cannot"... + " include NaN values"]); + endif + otherwise + error ("pca: %s is an invalid value for rows", ... + varargin{pair_index + 1}); + endswitch + case {"coeff0", "score0", "options"} + error ("pca: parameter %s is only valid with the 'als' method, which is not yet implemented", varargin{pair_index}); + otherwise + error ("pca: unknown property %s", varargin{pair_index}); + endswitch + + pair_index += 2; + endwhile + + ## Preparing the dataset according to the chosen policy for missing values + if (optRowsB) + if (! strcmp (optAlgorithmS, "eig")) + optAlgorithmS = "eig"; + warning (["pca: setting algorithm to 'eig' because 'rows' option is "... + "set to 'pairwise'"]); + endif + + TF = isnan (X); + missingRows = zeros (nobs, 1); + nmissing = 0; + else + ## "complete": remove all the rows with missing values + TF = isnan (X); + missingRows = any (TF, 2); + nmissing = sum (missingRows); + endif + + ## indices of the available rows + ridcs = find (missingRows == 0); + + ## Center the columns to mean zero if requested + if (optCenteredB) + if (isempty (optWeights) && nmissing == 0 && ! optRowsB) + ## no weights and no missing values + mu = mean (X); + elseif (nmissing == 0 && ! optRowsB) + ## weighted observations: some observations are more valuable, i.e. they + ## can be trusted more + mu = sum (optWeights .* X) ./ sum (optWeights); + else + ## missing values: the mean is computed column by column + mu = zeros (1, nvars); + + if (isempty (optWeights)) + for iter = 1 : nvars + mu(iter) = mean (X(find (TF(:, iter) == 0), iter)); + endfor + else + ## weighted mean with missing data + for iter = 1 : nvars + mu(iter) = sum (X(find (TF(:, iter) == 0), iter) .* ... + optWeights(find (TF(:, iter) == 0))) ./ ... + sum (optWeights(find (TF(:, iter) == 0))); + endfor + endif + endif + + Xc = X - mu; + else + Xc = X; + + ## The mean of the variables of the original dataset: + ## return zero if the dataset is not centered + mu = zeros (1, nvars); + endif + + ## Change the columns according to the variable weights + if (! isempty (optVariableWeights)) + if (ischar (optVariableWeights)) + if (isempty (optWeights)) + sqrtBias = 1; # see below + optVariableWeights = std (X); + else + ## unbiased variance estimation: the bias when using reliability weights + ## is 1 - var(weights) / std(weigths)^2 + sqrtBias = sqrt (1 - (sumsq (optWeights) / sum (optWeights) ^ 2)); + optVariableWeights = mystd (X, optWeights) / sqrtBias; + endif + endif + Xc = Xc ./ optVariableWeights; + endif + + ## Compute the observation weight matrix + if (isempty (optWeights)) + Wd = eye (nobs - nmissing); + else + Wd = diag (optWeights) ./ sum (optWeights); + endif + + ## Compute the coefficients + switch (optAlgorithmS) + case "svd" + ## Check if there are more variables than observations + if (nvars <= nobs) + [U, S, coeff] = svd (sqrt (Wd) * Xc(ridcs,:), "econ"); + else + ## Calculate the svd on the transpose matrix, much faster + if (optEconomyB) + [coeff, S, V] = svd (Xc(ridcs,:)' * sqrt (Wd), "econ"); + else + [coeff, S, V] = svd (Xc(ridcs,:)' * sqrt (Wd)); + endif + endif + case "eig" + ## this method requires the computation of the sample covariance matrix + if (optRowsB) + ## pairwise: + ## in this case the degrees of freedom for each element of the matrix + ## are equal to the number of valid rows for the couple of columns + ## used to compute the element + Xpairwise = Xc; + Xpairwise(find (isnan (Xc))) = 0; + + Ndegrees = (nobs - 1) * ones (nvars, nvars); + for i_iter = 1 : nvars + for j_iter = i_iter : nvars + Ndegrees(i_iter, j_iter) = Ndegrees(i_iter, j_iter) - ... + sum (any (TF(:,[i_iter j_iter]), 2)); + Ndegrees(j_iter, i_iter) = Ndegrees(i_iter, j_iter); + endfor + endfor + + Mcov = Xpairwise' * Wd * Xpairwise ./ Ndegrees; + else + ## the degrees of freedom are not really important here + ndegrees = nobs - nmissing - 1; + Mcov = Xc(ridcs, :)' * Wd * Xc(ridcs, :) / ndegrees; + endif + + [coeff, S] = eigs (Mcov, nvars); + endswitch + + ## Change the coefficients according to the variable weights + if (! isempty (optVariableWeights)) + coeff = coeff .* transpose (optVariableWeights); + endif + + ## MATLAB compatibility: the sign convention is that the + ## greatest absolute value for each column is positive + switchSignV = find (max (coeff) < abs (min (coeff))); + if (! isempty (switchSignV)) + coeff(:, switchSignV) = -1 * coeff(:, switchSignV); + endif + + ## Compute the scores + if (nargout > 1) + ## This is for the score when using variable weights, it is not really + ## a new definition of Xc + if (! isempty (optVariableWeights)) + Xc = Xc ./ optVariableWeights; + endif + + ## Get the Scores + score = Xc(ridcs,:) * coeff; + + ## Get the rank of the score matrix + r = rank (score); + + ## If there is missing data, put it back + ## FIXME: this needs tests + if (nmissing) + scoretmp = zeros (nobs, nvars); + scoretmp(find (missingRows == 0), :) = score; + scoretmp(find (missingRows), :) = NaN; + score = scoretmp; + endif + + ## Only use the first r columns, pad rest with zeros if economy != true + score = score(:, 1:r) ; + + if (! optEconomyB) + score = [score, (zeros (nobs , nvars-r))]; + else + coeff = coeff(: , 1:r); + endif + endif + + ## Compute the variances + if (nargout > 2) + ## degrees of freedom: n - 1 for centered data + if (optCenteredB) + dof = size (Xc(ridcs,:), 1) - 1; + else + dof = size (Xc(ridcs,:), 1); + endif + + ## This is the same as the eigenvalues of the covariance matrix of X + if (strcmp (optAlgorithmS, "eig")) + latent = diag (S, 0); + else + latent = (diag (S'*S) / dof)(1:r); + endif + + ## If observation weights were used, we need to scale back these values + if (! isempty (optWeights)) + latent = latent .* sum (optWeights(ridcs)); + endif + + if (! optEconomyB) + latent= [latent; (zeros (nvars - r, 1))]; + endif + endif + + ## Compute the Hotelling T-square statistics + ## MATLAB compatibility: when using weighted observations the T-square + ## statistics differ by some rounding error + if (nargout > 3) + ## Calculate the Hotelling T-Square statistic for the observations + ## formally: tsquared = sumsq (zscore (score(:, 1:r)),2); + if (! isempty (optWeights)) + ## probably splitting the weights, using the square roots, is not the + ## best solution, numerically + weightedScore = score .* sqrt (optWeights); + tsquared = mahal (weightedScore(ridcs, 1:r), weightedScore(ridcs, 1:r))... + ./ optWeights; + else + tsquared = mahal (score(ridcs, 1:r), score(ridcs, 1:r)); + endif + endif + + ## Compute the variance explained by each principal component + if (nargout > 4) + explained = 100 * latent / sum (latent); + endif + + ## When a number of components is chosen, the coefficients and score matrix + ## only show that number of columns + if (optNumComponentsI != nvars) + coeff = coeff(:, 1:optNumComponentsI); + score = score(:, 1:optNumComponentsI); + endif +endfunction + +#return the weighted standard deviation +function retval = mystd (x, w) + (dim = find (size(x) != 1, 1)) || (dim = 1); + den = sum (w); + mu = sum (w .* x, dim) ./ sum (w); + retval = sum (w .* ((x - mu) .^ 2), dim) / den; + retval = sqrt (retval); +endfunction + +%!shared COEFF,SCORE,latent,tsquare,m,x,R,V,lambda,i,S,F + +#NIST Engineering Statistics Handbook example (6.5.5.2) +%!test +%! x=[7 4 3 +%! 4 1 8 +%! 6 3 5 +%! 8 6 1 +%! 8 5 7 +%! 7 2 9 +%! 5 3 3 +%! 9 5 8 +%! 7 4 5 +%! 8 2 2]; +%! R = corrcoef (x); +%! [V, lambda] = eig (R); +%! [~, i] = sort(diag(lambda), "descend"); #arrange largest PC first +%! S = V(:, i) * diag(sqrt(diag(lambda)(i))); +%!assert(diag(S(:, 1:2)*S(:, 1:2)'), [0.8662; 0.8420; 0.9876], 1E-4); #contribution of first 2 PCs to each original variable +%! B = V(:, i) * diag( 1./ sqrt(diag(lambda)(i))); +%! F = zscore(x)*B; +%! [COEFF,SCORE,latent,tsquare] = pca(zscore(x, 1)); +%!assert(tsquare,sumsq(F, 2),1E4*eps); + +%!test +%! x=[1,2,3;2,1,3]'; +%! [COEFF,SCORE,latent,tsquare] = pca(x, "Economy", false); +%! m=[sqrt(2),sqrt(2);sqrt(2),-sqrt(2);-2*sqrt(2),0]/2; +%! m(:,1) = m(:,1)*sign(COEFF(1,1)); +%! m(:,2) = m(:,2)*sign(COEFF(1,2)); + +%!assert(COEFF,m(1:2,:),10*eps); +%!assert(SCORE,-m,10*eps); +%!assert(latent,[1.5;.5],10*eps); +%!assert(tsquare,[4;4;4]/3,10*eps); + +#test with observation weights (using Matlab's results for this case as a reference) +%! [COEFF,SCORE,latent,tsquare] = pca(x, "Economy", false, "weights", [1 2 1], "variableweights", "variance"); +%!assert(COEFF, [0.632455532033676 -0.632455532033676; 0.741619848709566 0.741619848709566], 10*eps); +%!assert(SCORE, [-0.622019449426284 0.959119380657905; -0.505649896847432 -0.505649896847431; 1.633319243121148 0.052180413036957], 10*eps); +%!assert(latent, [1.783001790889027; 0.716998209110974], 10*eps); +%!xtest assert(tsquare, [1.5; 0.5; 1.5], 10*eps); #currently, [4; 2; 4]/3 is actually returned; see comments above + +%!test +%! x=x'; +%! [COEFF,SCORE,latent,tsquare] = pca(x, "Economy", false); +%! m=[sqrt(2),sqrt(2),0;-sqrt(2),sqrt(2),0;0,0,2]/2; +%! m(:,1) = m(:,1)*sign(COEFF(1,1)); +%! m(:,2) = m(:,2)*sign(COEFF(1,2)); +%! m(:,3) = m(:,3)*sign(COEFF(3,3)); + +%!assert(COEFF,m,10*eps); +%!assert(SCORE(:,1),-m(1:2,1),10*eps); +%!assert(SCORE(:,2:3),zeros(2),10*eps); +%!assert(latent,[1;0;0],10*eps); +%!assert(tsquare,[0.5;0.5],10*eps) + +%!test +%! [COEFF,SCORE,latent,tsquare] = pca(x); + +%!assert(COEFF,m(:, 1),10*eps); +%!assert(SCORE,-m(1:2,1),10*eps); +%!assert(latent,[1],10*eps); +%!assert(tsquare,[0.5;0.5],10*eps) + +%!error pca([1 2; 3 4], "Algorithm", "xxx") +%!error <'centered' requires a boolean value> pca([1 2; 3 4], "Centered", "xxx") +%!error pca([1 2; 3 4], "NumComponents", -4) +%!error pca([1 2; 3 4], "Rows", 1) +%!error pca([1 2; 3 4], "Weights", [1 2 3]) +%!error pca([1 2; 3 4], "Weights", [-1 2]) +%!error pca([1 2; 3 4], "VariableWeights", [-1 2]) +%!error pca([1 2; 3 4], "VariableWeights", "xxx") +%!error pca([1 2; 3 4], "XXX", 1) diff --git a/inst/pcacov.m b/inst/pcacov.m new file mode 100644 index 0000000..8952f96 --- /dev/null +++ b/inst/pcacov.m @@ -0,0 +1,74 @@ +## Copyright (C) 2013-2019 Fernando Damian Nieuwveldt +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License +## as published by the Free Software Foundation; either version 3 +## of the License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{COEFF}]} = pcacov(@var{X}) +## @deftypefnx {Function File} {[@var{COEFF},@var{latent}]} = pcacov(@var{X}) +## @deftypefnx {Function File} {[@var{COEFF},@var{latent},@var{explained}]} = pcacov(@var{X}) +## Perform principal component analysis on the nxn covariance matrix X +## +## @itemize @bullet +## @item +## @var{COEFF} : a nxn matrix with columns containing the principal component coefficients +## @item +## @var{latent} : a vector containing the principal component variances +## @item +## @var{explained} : a vector containing the percentage of the total variance explained by each principal component +## +## @end itemize +## +## @subheading References +## +## @enumerate +## @item +## Jolliffe, I. T., Principal Component Analysis, 2nd Edition, Springer, 2002 +## +## @end enumerate +## @end deftypefn + +## Author: Fernando Damian Nieuwveldt +## Description: Principal Components Analysis using a covariance matrix +function [COEFF, latent, explained] = pcacov(X) + + [U,S,V] = svd(X); + + if nargout == 1 + COEFF = U; + elseif nargout == 2 + COEFF = U; + latent = diag(S); + else + COEFF = U; + latent = diag(S); + explained = 100*latent./sum(latent); + end +endfunction +%!demo +%! X = [ 7 26 6 60; +%! 1 29 15 52; +%! 11 56 8 20; +%! 11 31 8 47; +%! 7 52 6 33; +%! 11 55 9 22; +%! 3 71 17 6; +%! 1 31 22 44; +%! 2 54 18 22; +%! 21 47 4 26; +%! 1 40 23 34; +%! 11 66 9 12; +%! 10 68 8 12 +%! ]; +%! covx = cov(X); +%! [COEFF,latent,explained] = pcacov(covx) + diff --git a/inst/pcares.m b/inst/pcares.m new file mode 100644 index 0000000..9603fa7 --- /dev/null +++ b/inst/pcares.m @@ -0,0 +1,89 @@ +## Copyright (C) 2013-2019 Fernando Damian Nieuwveldt +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License +## as published by the Free Software Foundation; either version 3 +## of the License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{residuals},@var{reconstructed}]}=pcares(@var{X}, @var{NDIM}) +## Calulate residuals from principal component analysis +## +## @itemize @bullet +## @item +## @var{X} : N x P Matrix with N observations and P variables, the variables will be mean centered +## @item +## @var{ndim} : Is a scalar indicating the number of principal components to use and should be <= P +## @end itemize +## +## @subheading References +## +## @enumerate +## @item +## Jolliffe, I. T., Principal Component Analysis, 2nd Edition, Springer, 2002 +## +## @end enumerate +## @end deftypefn + +## Author: Fernando Damian Nieuwveldt +## Description: Residuals from Principal Components Analysis + +function [residuals,reconstructed] = pcares(X,NDIM) + + if (nargin ~= 2) + error('pcares takes two inputs: The data Matrix X and number of principal components NDIM') + endif + + # Mean center data + Xcentered = bsxfun(@minus,X,mean(X)); + + # Apply svd to get the principal component coefficients + [U,S,V] = svd(Xcentered); + + # Use only the first ndim PCA components + v = V(:,1:NDIM); + + if (nargout == 2) + # Calculate the residuals + residuals = Xcentered - Xcentered * (v*v'); + + # Reconstructed data using ndim PCA components + reconstructed = X - residuals; + else + # Calculate the residuals + residuals = Xcentered - Xcentered * (v*v'); + endif +endfunction +%!demo +%! X = [ 7 26 6 60; +%! 1 29 15 52; +%! 11 56 8 20; +%! 11 31 8 47; +%! 7 52 6 33; +%! 11 55 9 22; +%! 3 71 17 6; +%! 1 31 22 44; +%! 2 54 18 22; +%! 21 47 4 26; +%! 1 40 23 34; +%! 11 66 9 12; +%! 10 68 8 12 +%! ]; +%! # As we increase the number of principal components, the norm +%! # of the residuals matrix will decrease +%! r1 = pcares(X,1); +%! n1 = norm(r1) +%! r2 = pcares(X,2); +%! n2 = norm(r2) +%! r3 = pcares(X,3); +%! n3 = norm(r3) +%! r4 = pcares(X,4); +%! n4 = norm(r4) + diff --git a/inst/pdf.m b/inst/pdf.m new file mode 100644 index 0000000..c15dbce --- /dev/null +++ b/inst/pdf.m @@ -0,0 +1,109 @@ +## Copyright (C) 2016 Andreas Stahel +## strongly based on cdf.m by 2013 Pantxo Diribarne +## +## This program is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{retval} =} pdf (@var{name}, @var{X}, @dots{}) +## Return probability density function of @var{name} function for value +## @var{x}. +## This is a wrapper around various @var{name}pdf and @var{name}_pdf +## functions. See the individual functions help to learn the signification of +## the arguments after @var{x}. Supported functions and corresponding number of +## additional arguments are: +## +## @multitable @columnfractions 0.02 0.3 0.45 0.2 +## @headitem @tab function @tab alternative @tab args +## @item @tab "beta" @tab "beta" @tab 2 +## @item @tab "bino" @tab "binomial" @tab 2 +## @item @tab "cauchy" @tab @tab 2 +## @item @tab "chi2" @tab "chisquare" @tab 1 +## @item @tab "discrete" @tab @tab 2 +## @item @tab "exp" @tab "exponential" @tab 1 +## @item @tab "f" @tab @tab 2 +## @item @tab "gam" @tab "gamma" @tab 2 +## @item @tab "geo" @tab "geometric" @tab 1 +## @item @tab "gev" @tab "generalized extreme value" @tab 3 +## @item @tab "hyge" @tab "hypergeometric" @tab 3 +## @item @tab "kolmogorov_smirnov" @tab @tab 1 +## @item @tab "laplace" @tab @tab 2 +## @item @tab "logistic" @tab @tab 0 +## @item @tab "logn" @tab "lognormal" @tab 2 +## @item @tab "norm" @tab "normal" @tab 2 +## @item @tab "poiss" @tab "poisson" @tab 1 +## @item @tab "rayl" @tab "rayleigh" @tab 1 +## @item @tab "t" @tab @tab 1 +## @item @tab "unif" @tab "uniform" @tab 2 +## @item @tab "wbl" @tab "weibull" @tab 2 +## @end multitable +## +## @seealso{betapdf, binopdf, cauchy_pdf, chi2pdf, discrete_pdf, +## exppdf, fpdf, gampdf, geopdf, gevpdf, hygepdf, laplace_pdf, +## logistic_pdf, lognpdf, normpdf, poisspdf, raylpdf, tpdf, +## unifpdf, wblpdf} +## @end deftypefn + +function [retval] = pdf (varargin) + ## implemented functions + persistent allpdf = {{"beta", "beta"}, @betapdf, 2, ... + {"bino", "binomial"}, @binopdf, 2, ... + {"cauchy"}, @cauchy_pdf, 2, ... + {"chi2", "chisquare"}, @chi2pdf, 1, ... + {"discrete"}, @discrete_pdf, 2, ... + {"exp", "exponential"}, @exppdf, 1, ... + {"f"}, @fpdf, 2, ... + {"gam", "gamma"}, @gampdf, 2, ... + {"geo", "geometric"}, @geopdf, 1, ... + {"gev", "generalized extreme value"}, @gevpdf, 3, ... + {"hyge", "hypergeometric"}, @hygepdf, 3, ... + {"laplace"}, @laplace_pdf, 1, ... + {"logistic"}, @logistic_pdf, 0, ... # ML has 2 args here + {"logn", "lognormal"}, @lognpdf, 2, ... + {"norm", "normal"}, @normpdf, 2, ... + {"poiss", "poisson"}, @poisspdf, 1, ... + {"rayl", "rayleigh"}, @raylpdf, 1, ... + {"t"}, @tpdf, 1, ... + {"unif", "uniform"}, @unifpdf, 2, ... + {"wbl", "weibull"}, @wblpdf, 2}; + + if (numel (varargin) < 2 || ! ischar (varargin{1})) + print_usage (); + endif + + name = varargin{1}; + x = varargin{2}; + + varargin(1:2) = []; + nargs = numel (varargin); + + pdfnames = allpdf(1:3:end); + pdfhdl = allpdf(2:3:end); + pdfargs = allpdf(3:3:end); + + idx = cellfun (@(x) any (strcmpi (name, x)), pdfnames); + + if (any (idx)) + if (nargs == pdfargs{idx}) + retval = feval (pdfhdl{idx}, x, varargin{:}); + else + error ("pdf: %s requires %d arguments", name, pdfargs{idx}) + endif + else + error ("pdf: %s not implemented", name); + endif + +endfunction + +%!test +%! assert(pdf ('norm', 1, 0, 1), normpdf (1, 0, 1)) \ No newline at end of file diff --git a/inst/pdist.m b/inst/pdist.m new file mode 100644 index 0000000..3ba6377 --- /dev/null +++ b/inst/pdist.m @@ -0,0 +1,229 @@ +## Copyright (C) 2008 Francesco Potortì +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{y} =} pdist (@var{x}) +## @deftypefnx {Function File} {@var{y} =} pdist (@var{x}, @var{metric}) +## @deftypefnx {Function File} {@var{y} =} pdist (@var{x}, @var{metric}, @var{metricarg}, @dots{}) +## +## Return the distance between any two rows in @var{x}. +## +## @var{x} is the @var{n}x@var{d} matrix representing @var{q} row +## vectors of size @var{d}. +## +## The output is a dissimilarity matrix formatted as a row vector +## @var{y}, @math{(n-1)*n/2} long, where the distances are in +## the order [(1, 2) (1, 3) @dots{} (2, 3) @dots{} (n-1, n)]. You can +## use the @code{squareform} function to display the distances between +## the vectors arranged into an @var{n}x@var{n} matrix. +## +## @code{metric} is an optional argument specifying how the distance is +## computed. It can be any of the following ones, defaulting to +## "euclidean", or a user defined function that takes two arguments +## @var{x} and @var{y} plus any number of optional arguments, +## where @var{x} is a row vector and and @var{y} is a matrix having the +## same number of columns as @var{x}. @code{metric} returns a column +## vector where row @var{i} is the distance between @var{x} and row +## @var{i} of @var{y}. Any additional arguments after the @code{metric} +## are passed as metric (@var{x}, @var{y}, @var{metricarg1}, +## @var{metricarg2} @dots{}). +## +## Predefined distance functions are: +## +## @table @samp +## @item "euclidean" +## Euclidean distance (default). +## +## @item "squaredeuclidean" +## Squared Euclidean distance. It omits the square root from the calculation +## of the Euclidean distance. It does not satisfy the triangle inequality. +## +## @item "seuclidean" +## Standardized Euclidean distance. Each coordinate in the sum of +## squares is inverse weighted by the sample variance of that +## coordinate. +## +## @item "mahalanobis" +## Mahalanobis distance: see the function mahalanobis. +## +## @item "cityblock" +## City Block metric, aka Manhattan distance. +## +## @item "minkowski" +## Minkowski metric. Accepts a numeric parameter @var{p}: for @var{p}=1 +## this is the same as the cityblock metric, with @var{p}=2 (default) it +## is equal to the euclidean metric. +## +## @item "cosine" +## One minus the cosine of the included angle between rows, seen as +## vectors. +## +## @item "correlation" +## One minus the sample correlation between points (treated as +## sequences of values). +## +## @item "spearman" +## One minus the sample Spearman's rank correlation between +## observations, treated as sequences of values. +## +## @item "hamming" +## Hamming distance: the quote of the number of coordinates that differ. +## +## @item "jaccard" +## One minus the Jaccard coefficient, the quote of nonzero +## coordinates that differ. +## +## @item "chebychev" +## Chebychev distance: the maximum coordinate difference. +## @end table +## @seealso{linkage, mahalanobis, squareform, pdist2} +## @end deftypefn + +## Author: Francesco Potortì + +function y = pdist (x, metric, varargin) + + if (nargin < 1) + print_usage (); + elseif ((nargin > 1) + && ! ischar (metric) + && ! isa (metric, "function_handle")) + error (["pdist: the distance function must be either a string or a " + "function handle."]); + endif + + if (nargin < 2) + metric = "euclidean"; + endif + + if (! ismatrix (x) || isempty (x)) + error ("pdist: x must be a nonempty matrix"); + elseif (length (size (x)) > 2) + error ("pdist: x must be 1 or 2 dimensional"); + endif + + y = []; + if (rows(x) == 1) + return; + endif + + if (ischar (metric)) + order = nchoosek(1:rows(x),2); + Xi = order(:,1); + Yi = order(:,2); + X = x'; + metric = lower (metric); + switch (metric) + case "euclidean" + d = X(:,Xi) - X(:,Yi); + y = norm (d, "cols"); + + case "squaredeuclidean" + d = X(:,Xi) - X(:,Yi); + y = sumsq (d); + + case "seuclidean" + d = X(:,Xi) - X(:,Yi); + weights = inv (diag (var (x, 0, 1))); + y = sqrt (sum ((weights * d) .* d, 1)); + + case "mahalanobis" + d = X(:,Xi) - X(:,Yi); + weights = inv (cov (x)); + y = sqrt (sum ((weights * d) .* d, 1)); + + case "cityblock" + d = X(:,Xi) - X(:,Yi); + if (str2num(version()(1:3)) > 3.1) + y = norm (d, 1, "cols"); + else + y = sum (abs (d), 1); + endif + + case "minkowski" + d = X(:,Xi) - X(:,Yi); + p = 2; # default + if (nargin > 2) + p = varargin{1}; # explicitly assigned + endif; + y = norm (d, p, "cols"); + + case "cosine" + prod = X(:,Xi) .* X(:,Yi); + weights = sumsq (X(:,Xi), 1) .* sumsq (X(:,Yi), 1); + y = 1 - sum (prod, 1) ./ sqrt (weights); + + case "correlation" + if (rows(X) == 1) + error ("pdist: correlation distance between scalars not defined") + endif + cor = corr (X); + y = 1 - cor (sub2ind (size (cor), Xi, Yi))'; + + case "spearman" + if (rows(X) == 1) + error ("pdist: spearman distance between scalars not defined") + endif + cor = spearman (X); + y = 1 - cor (sub2ind (size (cor), Xi, Yi))'; + + case "hamming" + d = logical (X(:,Xi) - X(:,Yi)); + y = sum (d, 1) / rows (X); + + case "jaccard" + d = logical (X(:,Xi) - X(:,Yi)); + weights = X(:,Xi) | X(:,Yi); + y = sum (d & weights, 1) ./ sum (weights, 1); + + case "chebychev" + d = X(:,Xi) - X(:,Yi); + y = norm (d, Inf, "cols"); + + endswitch + endif + + if (isempty (y)) + ## Metric is a function handle or the name of an external function + l = rows (x); + y = zeros (1, nchoosek (l, 2)); + idx = 1; + for ii = 1:l-1 + for jj = ii+1:l + y(idx++) = feval (metric, x(ii,:), x, varargin{:})(jj); + endfor + endfor + endif + +endfunction + +%!shared xy, t, eucl +%! xy = [0 1; 0 2; 7 6; 5 6]; +%! t = 1e-3; +%! eucl = @(v,m) sqrt(sumsq(repmat(v,rows(m),1)-m,2)); +%!assert(pdist(xy), [1.000 8.602 7.071 8.062 6.403 2.000],t); +%!assert(pdist(xy,eucl), [1.000 8.602 7.071 8.062 6.403 2.000],t); +%!assert(pdist(xy,"euclidean"), [1.000 8.602 7.071 8.062 6.403 2.000],t); +%!assert(pdist(xy,"seuclidean"), [0.380 2.735 2.363 2.486 2.070 0.561],t); +%!assert(pdist(xy,"mahalanobis"),[1.384 1.967 2.446 2.384 1.535 2.045],t); +%!assert(pdist(xy,"cityblock"), [1.000 12.00 10.00 11.00 9.000 2.000],t); +%!assert(pdist(xy,"minkowski"), [1.000 8.602 7.071 8.062 6.403 2.000],t); +%!assert(pdist(xy,"minkowski",3),[1.000 7.763 6.299 7.410 5.738 2.000],t); +%!assert(pdist(xy,"cosine"), [0.000 0.349 0.231 0.349 0.231 0.013],t); +%!assert(pdist(xy,"correlation"),[0.000 2.000 0.000 2.000 0.000 2.000],t); +%!assert(pdist(xy,"spearman"), [0.000 2.000 0.000 2.000 0.000 2.000],t); +%!assert(pdist(xy,"hamming"), [0.500 1.000 1.000 1.000 1.000 0.500],t); +%!assert(pdist(xy,"jaccard"), [1.000 1.000 1.000 1.000 1.000 0.500],t); +%!assert(pdist(xy,"chebychev"), [1.000 7.000 5.000 7.000 5.000 2.000],t); diff --git a/inst/pdist2.m b/inst/pdist2.m new file mode 100644 index 0000000..4db52bb --- /dev/null +++ b/inst/pdist2.m @@ -0,0 +1,176 @@ +## Copyright (C) 2014-2019 Piotr Dollar +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, see +## . + +## -*- texinfo -*- +## @deftypefn {Function File} {} pdist2 (@var{x}, @var{y}) +## @deftypefnx {Function File} {} pdist2 (@var{x}, @var{y}, @var{metric}) +## Compute pairwise distance between two sets of vectors. +## +## Let @var{X} be an MxP matrix representing m points in P-dimensional space +## and @var{Y} be an NxP matrix representing another set of points in the same +## space. This function computes the M-by-N distance matrix @var{D} where +## @code{@var{D}(i,j)} is the distance between @code{@var{X}(i,:)} and +## @code{@var{Y}(j,:)}. +## +## The optional argument @var{metric} can be used to select different +## distances: +## +## @table @asis +## @item @qcode{"euclidean"} (default) +## +## @item @qcode{"sqeuclidean"} +## Compute the squared euclidean distance, i.e., the euclidean distance +## before computing square root. This is ideal when the interest is on the +## order of the euclidean distances rather than the actual distance value +## because it performs significantly faster while preserving the order. +## +## @item @qcode{"chisq'"} +## The chi-squared distance between two vectors is defined as: +## @code{d(x, y) = sum ((xi-yi)^2 / (xi+yi)) / 2}. +## The chi-squared distance is useful when comparing histograms. +## +## @item @qcode{"cosine"} +## Distance is defined as the cosine of the angle between two vectors. +## +## @item @qcode{"emd"} +## Earth Mover's Distance (EMD) between positive vectors (histograms). +## Note for 1D, with all histograms having equal weight, there is a simple +## closed form for the calculation of the EMD. The EMD between histograms +## @var{x} and @var{y} is given by @code{sum (abs (cdf (x) - cdf (y)))}, +## where @code{cdf} is the cumulative distribution function (computed +## simply by @code{cumsum}). +## +## @item @qcode{"L1"} +## The L1 distance between two vectors is defined as: @code{sum (abs (x-y))} +## +## @end table +## +## @seealso{pdist} +## @end deftypefn + +## Taken from Piotr's Computer Vision Matlab Toolbox Version 2.52, with +## author permission to distribute under GPLv3 + +function D = pdist2 (X, Y, metric = "euclidean") + + if (nargin < 2 || nargin > 3) + print_usage (); + elseif (columns (X) != columns (Y)) + error ("pdist2: X and Y must have equal number of columns"); + elseif (ndims (X) != 2 || ndims (Y) != 2) + error ("pdist2: X and Y must be 2 dimensional matrices"); + endif + + switch (tolower (metric)) + case "sqeuclidean", D = distEucSq (X, Y); + case "euclidean", D = sqrt (distEucSq (X, Y)); + case "l1", D = distL1 (X, Y); + case "cosine", D = distCosine (X, Y); + case "emd", D = distEmd (X, Y); + case "chisq", D = distChiSq (X, Y); + otherwise + error ("pdist2: unknown distance METRIC %s", metric); + endswitch + D = max (0, D); + +endfunction + +## TODO we could check the value of p and n first, and choose one +## or the other loop accordingly. +## L1 COMPUTATION WITH LOOP OVER p, FAST FOR SMALL p. +## function D = distL1( X, Y ) +## m = size(X,1); n = size(Y,1); p = size(X,2); +## mOnes = ones(1,m); nOnes = ones(1,n); D = zeros(m,n); +## for i=1:p +## yi = Y(:,i); yi = yi( :, mOnes ); +## xi = X(:,i); xi = xi( :, nOnes ); +## D = D + abs( xi-yi' ); +## end + +function D = distL1 (X, Y) + m = rows (X); + n = rows (Y); + mOnes = ones (1, m); + D = zeros (m, n); + for i = 1:n + yi = Y(i,:); + yi = yi(mOnes,:); + D(:,i) = sum (abs (X-yi), 2); + endfor +endfunction + +function D = distCosine (X, Y) + p = columns (X); + X = X ./ repmat (sqrt (sumsq (X, 2)), [1 p]); + Y = Y ./ repmat (sqrt (sumsq (Y, 2)), [1 p]); + D = 1 - X*Y'; +endfunction + +function D = distEmd (X, Y) + Xcdf = cumsum (X,2); + Ycdf = cumsum (Y,2); + m = rows (X); + n = rows (Y); + mOnes = ones (1, m); + D = zeros (m, n); + for i=1:n + ycdf = Ycdf(i,:); + ycdfRep = ycdf(mOnes,:); + D(:,i) = sum (abs (Xcdf - ycdfRep), 2); + endfor +endfunction + +function D = distChiSq (X, Y) + ## note: supposedly it's possible to implement this without a loop! + m = rows (X); + n = rows (Y); + mOnes = ones (1, m); + D = zeros (m, n); + for i = 1:n + yi = Y(i, :); + yiRep = yi(mOnes, :); + s = yiRep + X; + d = yiRep - X; + D(:,i) = sum (d.^2 ./ (s+eps), 2); + endfor + D = D/2; +endfunction + +function dists = distEucSq (x, y) + xx = sumsq (x, 2); + yy = sumsq (y, 2)'; + dists = max (0, bsxfun (@plus, xx, yy) - 2 * x * (y')); +endfunction + +## euclidean distance as loop for testing purposes +%!function dist = euclidean_distance (x, y) +%! [m, p] = size (X); +%! [n, p] = size (Y); +%! D = zeros (m, n); +%! for i = 1:n +%! d = X - repmat (Y(i,:), [m 1]); +%! D(:,i) = sumsq (d, 2); +%! endfor +%!endfunction + +%!test +%! x = [1 1 1; 2 2 2; 3 3 3]; +%! y = [0 0 0; 1 2 3; 0 2 4; 4 7 1]; +%! d = sqrt([ 3 5 11 45 +%! 12 2 8 30 +%! 27 5 11 21]); +%! assert (pdist2 (x, y), d) + diff --git a/inst/plsregress.m b/inst/plsregress.m new file mode 100644 index 0000000..db1b1a8 --- /dev/null +++ b/inst/plsregress.m @@ -0,0 +1,124 @@ +## Copyright (C) 2012-2019 Fernando Damian Nieuwveldt +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License +## as published by the Free Software Foundation; either version 3 +## of the License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{XLOADINGS},@var{YLOADINGS},@var{XSCORES},@var{YSCORES},@var{coefficients},@var{fitted}] =} plsregress(@var{X}, @var{Y}, @var{NCOMP}) +## Calculate partial least squares regression +## +## @itemize @bullet +## @item +## @var{X}: Matrix of observations +## @item +## @var{Y}: Is a vector or matrix of responses +## @item +## @var{NCOMP}: number of components used for modelling +## @item +## @var{X} and @var{Y} will be mean centered to improve accuracy +## @end itemize +## +## @subheading References +## +## @enumerate +## @item +## SIMPLS: An alternative approach to partial least squares regression. Chemometrics and Intelligent Laboratory +## Systems (1993) +## +## @end enumerate +## @end deftypefn + +## Author: Fernando Damian Nieuwveldt +## Description: Partial least squares regression using SIMPLS algorithm + +function [XLOADINGS, YLOADINGS, XSCORES, YSCORES, coefficients, fitted] = plsregress (X, Y, NCOMP) + + if nargout != 6 + print_usage(); + end + + nobs = rows (X); # Number of observations + npred = columns (X); # Number of predictor variables + nresp = columns (Y); # Number of responses + + if (! isnumeric (X) || ! isnumeric (Y)) + error ("plsregress:Data matrix X and reponse matrix Y must be real matrices"); + elseif (nobs != rows (Y)) + error ("plsregress:Number of observations for Data matrix X and Response Matrix Y must be equal"); + elseif(! isscalar (NCOMP)) + error ("plsregress: Third argument must be a scalar"); + end + + ## Mean centering Data matrix + Xmeans = mean (X); + X = bsxfun (@minus, X, Xmeans); + + ## Mean centering responses + Ymeans = mean (Y); + Y = bsxfun (@minus, Y, Ymeans); + + S = X'*Y; + + R = P = V = zeros (npred, NCOMP); + T = U = zeros (nobs, NCOMP); + Q = zeros (nresp, NCOMP); + + for a = 1:NCOMP + [eigvec eigval] = eig (S'*S); # Y factor weights + domindex = find (diag (eigval) == max (diag (eigval))); # get dominant eigenvector + q = eigvec(:,domindex); + + r = S*q; # X block factor weights + t = X*r; # X block factor scores + t = t - mean (t); + + nt = sqrt (t'*t); # compute norm + t = t/nt; + r = r/nt; # normalize + + p = X'*t; # X block factor loadings + q = Y'*t; # Y block factor loadings + u = Y*q; # Y block factor scores + v = p; + + ## Ensure orthogonality + if a > 1 + v = v - V*(V'*p); + u = u - T*(T'*u); + endif + + v = v/sqrt(v'*v); # normalize orthogonal loadings + S = S - v*(v'*S); # deflate S wrt loadings + + ## Store data + R(:,a) = r; + T(:,a) = t; + P(:,a) = p; + Q(:,a) = q; + U(:,a) = u; + V(:,a) = v; + endfor + + ## Regression coefficients + B = R*Q'; + + fitted = bsxfun (@plus, T*Q', Ymeans); # Add mean + + ## Return + coefficients = B; + XSCORES = T; + XLOADINGS = P; + YSCORES = U; + YLOADINGS = Q; + projection = R; + +endfunction diff --git a/inst/poisstat.m b/inst/poisstat.m new file mode 100644 index 0000000..707e405 --- /dev/null +++ b/inst/poisstat.m @@ -0,0 +1,92 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} poisstat (@var{lambda}) +## Compute mean and variance of the Poisson distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{lambda} is the parameter of the Poisson distribution. The +## elements of @var{lambda} must be positive +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the Poisson distribution +## +## @item +## @var{v} is the variance of the Poisson distribution +## @end itemize +## +## @subheading Example +## +## @example +## @group +## lambda = 1 ./ (1:6); +## [m, v] = poisstat (lambda) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the Poisson distribution + +function [m, v] = poisstat (lambda) + + # Check arguments + if (nargin != 1) + print_usage (); + endif + + if (! isempty (lambda) && ! ismatrix (lambda)) + error ("poisstat: lambda must be a numeric matrix"); + endif + + # Set moments + m = lambda; + v = lambda; + + # Continue argument check + k = find (! (lambda > 0) | ! (lambda < Inf)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! lambda = 1 ./ (1:6); +%! [m, v] = poisstat (lambda); +%! assert (m, lambda); +%! assert (v, lambda); diff --git a/inst/princomp.m b/inst/princomp.m new file mode 100644 index 0000000..012954b --- /dev/null +++ b/inst/princomp.m @@ -0,0 +1,176 @@ +## Copyright (C) 2013-2019 Fernando Damian Nieuwveldt +## +## This program is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License +## as published by the Free Software Foundation; either version 3 +## of the License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{COEFF}]} = princomp(@var{X}) +## @deftypefnx {Function File} {[@var{COEFF},@var{SCORE}]} = princomp(@var{X}) +## @deftypefnx {Function File} {[@var{COEFF},@var{SCORE},@var{latent}]} = princomp(@var{X}) +## @deftypefnx {Function File} {[@var{COEFF},@var{SCORE},@var{latent},@var{tsquare}]} = princomp(@var{X}) +## @deftypefnx {Function File} {[...]} = princomp(@var{X},'econ') +## Performs a principal component analysis on a NxP data matrix X +## +## @itemize @bullet +## @item +## @var{COEFF} : returns the principal component coefficients +## @item +## @var{SCORE} : returns the principal component scores, the representation of X +## in the principal component space +## @item +## @var{LATENT} : returns the principal component variances, i.e., the +## eigenvalues of the covariance matrix X. +## @item +## @var{TSQUARE} : returns Hotelling's T-squared Statistic for each observation in X +## @item +## [...] = princomp(X,'econ') returns only the elements of latent that are not +## necessarily zero, and the corresponding columns of COEFF and SCORE, that is, +## when n <= p, only the first n-1. This can be significantly faster when p is +## much larger than n. In this case the svd will be applied on the transpose of +## the data matrix X +## +## @end itemize +## +## @subheading References +## +## @enumerate +## @item +## Jolliffe, I. T., Principal Component Analysis, 2nd Edition, Springer, 2002 +## +## @end enumerate +## @end deftypefn + +function [COEFF,SCORE,latent,tsquare] = princomp(X,varargin) + + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + if (nargin == 2 && ! strcmpi (varargin{:}, "econ")) + error ("princomp: if a second input argument is present, it must be the string 'econ'"); + endif + + [nobs nvars] = size(X); + + # Center the columns to mean zero + Xcentered = bsxfun(@minus,X,mean(X)); + + # Check if there are more variables then observations + if nvars <= nobs + + [U,S,COEFF] = svd(Xcentered, "econ"); + + else + + # Calculate the svd on the transpose matrix, much faster + if (nargin == 2 && strcmpi ( varargin{:} , "econ")) + [COEFF,S,V] = svd(Xcentered' , 'econ'); + else + [COEFF,S,V] = svd(Xcentered'); + endif + + endif + + if nargout > 1 + + # Get the Scores + SCORE = Xcentered*COEFF; + + # Get the rank of the SCORE matrix + r = rank(SCORE); + + # Only use the first r columns, pad rest with zeros if economy != 'econ' + SCORE = SCORE(:,1:r) ; + + if !(nargin == 2 && strcmpi ( varargin{:} , "econ")) + SCORE = [SCORE, zeros(nobs , nvars-r)]; + else + COEFF = COEFF(: , 1:r); + endif + + endif + + if nargout > 2 + + # This is the same as the eigenvalues of the covariance matrix of X + latent = (diag(S'*S)/(size(Xcentered,1)-1))(1:r); + + if !(nargin == 2 && strcmpi ( varargin{:} , "econ")) + latent= [latent;zeros(nvars-r,1)]; + endif + endif + + if nargout > 3 + # Calculate the Hotelling T-Square statistic for the observations + tsquare = sumsq(zscore(SCORE(:,1:r)),2); + endif + +endfunction + +%!shared COEFF,SCORE,latent,tsquare,m,x,R,V,lambda,i,S,F + +#NIST Engineering Statistics Handbook example (6.5.5.2) +%!test +%! x=[7 4 3 +%! 4 1 8 +%! 6 3 5 +%! 8 6 1 +%! 8 5 7 +%! 7 2 9 +%! 5 3 3 +%! 9 5 8 +%! 7 4 5 +%! 8 2 2]; +%! R = corrcoef (x); +%! [V, lambda] = eig (R); +%! [~, i] = sort(diag(lambda), "descend"); #arrange largest PC first +%! S = V(:, i) * diag(sqrt(diag(lambda)(i))); +%!assert(diag(S(:, 1:2)*S(:, 1:2)'), [0.8662; 0.8420; 0.9876], 1E-4); #contribution of first 2 PCs to each original variable +%! B = V(:, i) * diag( 1./ sqrt(diag(lambda)(i))); +%! F = zscore(x)*B; +%! [COEFF,SCORE,latent,tsquare] = princomp(zscore(x, 1)); +%!assert(tsquare,sumsq(F, 2),1E4*eps); + +%!test +%! x=[1,2,3;2,1,3]'; +%! [COEFF,SCORE,latent,tsquare] = princomp(x); +%! m=[sqrt(2),sqrt(2);sqrt(2),-sqrt(2);-2*sqrt(2),0]/2; +%! m(:,1) = m(:,1)*sign(COEFF(1,1)); +%! m(:,2) = m(:,2)*sign(COEFF(1,2)); + +%!assert(COEFF,m(1:2,:),10*eps); +%!assert(SCORE,-m,10*eps); +%!assert(latent,[1.5;.5],10*eps); +%!assert(tsquare,[4;4;4]/3,10*eps); + +%!test +%! x=x'; +%! [COEFF,SCORE,latent,tsquare] = princomp(x); +%! m=[sqrt(2),sqrt(2),0;-sqrt(2),sqrt(2),0;0,0,2]/2; +%! m(:,1) = m(:,1)*sign(COEFF(1,1)); +%! m(:,2) = m(:,2)*sign(COEFF(1,2)); +%! m(:,3) = m(:,3)*sign(COEFF(3,3)); + +%!assert(COEFF,m,10*eps); +%!assert(SCORE(:,1),-m(1:2,1),10*eps); +%!assert(SCORE(:,2:3),zeros(2),10*eps); +%!assert(latent,[1;0;0],10*eps); +%!assert(tsquare,[0.5;0.5],10*eps) + +%!test +%! [COEFF,SCORE,latent,tsquare] = princomp(x, "econ"); + +%!assert(COEFF,m(:, 1),10*eps); +%!assert(SCORE,-m(1:2,1),10*eps); +%!assert(latent,[1],10*eps); +%!assert(tsquare,[0.5;0.5],10*eps) + diff --git a/inst/private/CalinskiHarabaszEvaluation.m b/inst/private/CalinskiHarabaszEvaluation.m new file mode 100644 index 0000000..e34796c --- /dev/null +++ b/inst/private/CalinskiHarabaszEvaluation.m @@ -0,0 +1,205 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef CalinskiHarabaszEvaluation < ClusterCriterion + ## -*- texinfo -*- + ## @deftypefn {Function File} {@var{eva} =} evalclusters (@var{x}, @var{clust}, @qcode{CalinskiHarabasz}) + ## @deftypefnx {Function File} {@var{eva} =} evalclusters (@dots{}, @qcode{Name}, @qcode{Value}) + ## + ## A Calinski-Harabasz object to evaluate clustering solutions. + ## + ## A @code{CalinskiHarabaszEvaluation} object is a @code{ClusterCriterion} + ## object used to evaluate clustering solutions using the Calinski-Harabasz + ## criterion. + ## + ## The Calinski-Harabasz index is based on the ratio between SSb and SSw. + ## SSb is the overall variance between clusters, that is the variance of the + ## distances between the centroids. + ## SSw is the overall variance within clusters, that is the sum of the + ## variances of the distances between each datapoint and its centroid. + ## + ## The best solution according to the Calinski-Harabasz criterion is the one + ## that scores the highest value. + ## @end deftypefn + ## + ## @seealso{ClusterCriterion, evalclusters} + + properties (GetAccess = public, SetAccess = private) + + endproperties + + properties (Access = protected) + Centroids = {}; # a list of the centroids for every solution + endproperties + + methods (Access = public) + ## constructor + function this = CalinskiHarabaszEvaluation (x, clust, KList) + this@ClusterCriterion(x, clust, KList); + + this.CriterionName = "CalinskiHarabasz"; + this.evaluate(this.InspectedK); # evaluate the list of cluster numbers + endfunction + + ## set functions + + ## addK + ## add new cluster sizes to evaluate + function this = addK (this, K) + addK@ClusterCriterion(this, K); + + ## if we have new data, we need a new evaluation + if (this.OptimalK == 0) + Centroids_tmp = {}; + pS = 0; # position shift of the elements of Centroids + for iter = 1 : length (this.InspectedK) + ## reorganize Centroids according to the new list of cluster numbers + if (any (this.InspectedK(iter) == K)) + pS += 1; + else + Centroids_tmp{iter} = this.Centroids{iter - pS}; + endif + endfor + this.Centroids = Centroids_tmp; + this.evaluate(K); # evaluate just the new cluster numbers + endif + endfunction + + ## compact + ## ... + function this = compact (this) + # FIXME: stub! + warning ("CalinskiHarabaszEvaluation: compact is unavailable"); + endfunction + endmethods + + methods (Access = protected) + ## evaluate + ## do the evaluation + function this = evaluate (this, K) + ## use complete observations only + UsableX = this.X(find (this.Missing == false), :); + if (! isempty (this.ClusteringFunction)) + ## build the clusters + for iter = 1 : length (this.InspectedK) + ## do it only for the specified K values + if (any (this.InspectedK(iter) == K)) + if (isa (this.ClusteringFunction, "function_handle")) + ## custom function + ClusteringSolution = ... + this.ClusteringFunction(UsableX, this.InspectedK(iter)); + if (ismatrix (ClusteringSolution) && ... + rows (ClusteringSolution) == this.NumObservations && ... + columns (ClusteringSolution) == this.P) + ## the custom function returned a matrix: + ## we take the index of the maximum value for every row + [~, this.ClusteringSolutions(:, iter)] = ... + max (ClusteringSolution, [], 2); + elseif (iscolumn (ClusteringSolution) && + length (ClusteringSolution) == this.NumObservations) + this.ClusteringSolutions(:, iter) = ClusteringSolution; + elseif (isrow (ClusteringSolution) && + length (ClusteringSolution) == this.NumObservations) + this.ClusteringSolutions(:, iter) = ClusteringSolution'; + else + error (["CalinskiHarabaszEvaluation: invalid return value "... + "from custom clustering function"]); + endif + this.ClusteringSolutions(:, iter) = ... + this.ClusteringFunction(UsableX, this.InspectedK(iter)); + else + switch (this.ClusteringFunction) + case "kmeans" + [this.ClusteringSolutions(:, iter), this.Centroids{iter}] =... + kmeans (UsableX, this.InspectedK(iter), ... + "Distance", "sqeuclidean", "EmptyAction", "singleton", ... + "Replicates", 5); + + case "linkage" + ## use clusterdata + this.ClusteringSolutions(:, iter) = clusterdata (UsableX, ... + "MaxClust", this.InspectedK(iter), ... + "Distance", "euclidean", "Linkage", "ward"); + this.Centroids{iter} = this.computeCentroids (UsableX, iter); + + case "gmdistribution" + gmm = fitgmdist (UsableX, this.InspectedK(iter), ... + "SharedCov", true, "Replicates", 5); + this.ClusteringSolutions(:, iter) = cluster (gmm, UsableX); + this.Centroids{iter} = gmm.mu; + + otherwise + error (["CalinskiHarabaszEvaluation: unexpected error, " ... + "report this bug"]); + endswitch + endif + endif + endfor + endif + + ## get the criterion values for every clustering solution + for iter = 1 : length (this.InspectedK) + ## do it only for the specified K values + if (any (this.InspectedK(iter) == K)) + ## not defined for one cluster + if (this.InspectedK(iter) == 1) + this.CriterionValues(iter) = NaN; + continue; + endif + + ## Caliński-Harabasz index + ## reference: calinhara function from the fpc package of R, + ## by Christian Hennig + ## https://CRAN.R-project.org/package=fpc + W = zeros (columns (UsableX)); # between clusters covariance + for i = 1 : this.InspectedK(iter) + vIndicesI = find (this.ClusteringSolutions(:, iter) == i); + ni = length (vIndicesI); # size of cluster i + if (ni == 1) + ## if the cluster has just one member the covariance is zero + continue; + endif + ## weighted update of the covariance matrix + W += cov (UsableX(vIndicesI, :)) * (ni - 1); + endfor + S = (this.NumObservations - 1) * cov (UsableX); # within clusters cov. + B = S - W; # between clusters means + + ## tr(B) / tr(W) * (N-k) / (k-1) + this.CriterionValues(iter) = (this.NumObservations - ... + this.InspectedK(iter)) * trace (B) / ... + ((this.InspectedK(iter) - 1) * trace (W)); + endif + endfor + + [~, this.OptimalIndex] = max (this.CriterionValues); + this.OptimalK = this.InspectedK(this.OptimalIndex(1)); + this.OptimalY = this.ClusteringSolutions(:, this.OptimalIndex(1)); + endfunction + endmethods + + methods (Access = private) + ## computeCentroids + ## compute the centroids if they are not available by other means + function C = computeCentroids (this, X, index) + C = zeros (this.InspectedK(index), columns (X)); + + for iter = 1 : this.InspectedK(index) + vIndicesI = find (this.ClusteringSolutions(:, index) == iter); + C(iter, :) = mean (X(vIndicesI, :)); + endfor + endfunction + endmethods +endclassdef diff --git a/inst/private/ClusterCriterion.m b/inst/private/ClusterCriterion.m new file mode 100644 index 0000000..a732f3b --- /dev/null +++ b/inst/private/ClusterCriterion.m @@ -0,0 +1,239 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef ClusterCriterion < handle + ## -*- texinfo -*- + ## @deftypefn {} {} ClusterCriterion () + ## + ## A clustering evaluation object as created by @code{evalclusters}. + ## + ## @code{ClusterCriterion} is a superclass for clustering evaluation objects + ## as created by @code{evalclusters}. + ## + ## List of public properties: + ## @table @code + ## @item @qcode{ClusteringFunction} + ## a valid clustering funtion name or function handle. It can be empty if + ## the clustering solutions are passed as an input matric. + ## + ## @item @qcode{CriterionName} + ## a valid criterion name to evaluate the clustering solutions. + ## + ## @item @qcode{CriterionValues} + ## a vector of values as generated by the evaluation criterion for each + ## clustering solution. + ## + ## @item @qcode{InspectedK} + ## the list of proposed cluster numbers. + ## + ## @item @qcode{Missing} + ## a logical vector of missing observations. When there are @code{NaN} + ## values in the data matrix, the corresponding observation is excluded. + ## + ## @item @qcode{NumObservations} + ## the number of non-missing observations in the data matrix. + ## + ## @item @qcode{OptimalK} + ## the optimal number of clusters. + ## + ## @item @qcode{OptimalY} + ## the clustering solution corresponding to @code{OptimalK}. + ## + ## @item @qcode{X} + ## the data matrix. + ## + ## @end table + ## + ## List of public methods: + ## @table @code + ## @item @qcode{addK} + ## add a list of numbers of clusters to evaluate. + ## + ## @item @qcode{compact} + ## return a compact clustering evaluation object. Not implemented + ## + ## @item @qcode{plot} + ## plot the clustering evaluation values against the corresponding number of + ## clusters. + ## + ## @end table + ## @end deftypefn + ## + ## @seealso{CalinskiHarabaszEvaluation, DaviesBouldinEvaluation, evalclusters, + ## GapEvaluation, SilhouetteEvaluation} + + properties (Access = public) + ## public properties + endproperties + + properties (GetAccess = public, SetAccess = protected) + ClusteringFunction = ""; + CriterionName = ""; + CriterionValues = []; + InspectedK = []; + Missing = []; + NumObservations = 0; + OptimalK = 0; + OptimalY = []; + X = []; + endproperties + + properties (Access = protected) + N = 0; # number of observations + P = 0; # number of variables + ClusteringSolutions = []; # + OptimalIndex = 0; # index of the optimal K + endproperties + + methods (Access = public) + ## constructor + function this = ClusterCriterion (x, clust, KList) + ## parsing input data + if ((! ismatrix (x)) || (! isnumeric (x))) + error ("ClusterCriterion: 'x' must be a numeric matrix"); + endif + this.X = x; + this.N = rows (this.X); + this.P = columns (this.X); + ## look for missing values + for iter = 1 : this.N + if (any (find (x(iter, :) == NaN))) + this.Missing(iter) = true; + else + this.Missing(iter) = false; + endif + endfor + ## number of usable observations + this.NumObservations = sum (this.Missing == false); + + ## parsing the clustering algorithm + if (ischar (clust)) + if (any (strcmpi (clust, {"kmeans", "linkage", "gmdistribution"}))) + this.ClusteringFunction = lower (clust); + else + error ("ClusterCriterion: unknown clustering algorithm '%s'", clust); + endif + elseif (isa (clust, "function_handle")) + this.ClusteringFunction = clust; + elseif (ismatrix (clust)) + if (isnumeric (clust) && (length (size (clust)) == 2) && ... + (rows (clust) == this.N)) + this.ClusteringFunction = ""; + this.ClusteringSolutions = clust(find (this.Missing == false), :); + else + error ("ClusterCriterion: invalid matrix of clustering solutions"); + endif + else + error ("ClusterCriterion: invalid argument"); + endif + + ## parsing the list of cluster sizes to inspect + this.InspectedK = parseKList (this, KList); + endfunction + + ## addK + ## add one or more new cluster sizes to evaluate + function this = addK (this, k) + ## -*- texinfo -*- + ## @deftypefn {} {} ClusterCriterion.addK (@var{K}) + ## + ## Add an array of cluster numbers to inspect to the evaluation object. + ## @end deftypefn + + ## if there is not a clustering function, then we are using a predefined + ## set of clustering solutions, hence we cannot redefine the number of + ## solutions + if (isempty (this.ClusteringFunction)) + warning (["ClusterCriterion: cannot redefine the list of cluster"... + "numbers to evaluate when there is not a clustering function"]); + return; + endif + + ## otherwise go on + newList = this.parseKList ([this.InspectedK k]); + + ## check if the list has changed + if (length (newList) == length (this.InspectedK)) + warning ("ClusterCriterion: the list has not changed"); + else + ## update ClusteringSolutions and CriterionValues + ClusteringSolutions_tmp = zeros (this.NumObservations, ... + length (newList)); + CriterionValues_tmp = zeros (length (newList), 1); + for iter = 1 : length (this.InspectedK) + idx = find (newList == this.InspectedK(iter)); + + if (! isempty (idx)) + ClusteringSolutions_tmp(:, idx) = this.ClusteringSolutions(:, iter); + CriterionValues_tmp(idx) = this.CriterionValues(iter); + endif + endfor + this.ClusteringSolutions = ClusteringSolutions_tmp; + this.CriterionValues = CriterionValues_tmp; + + ## reset the old results + this.OptimalK = 0; + this.OptimalY = []; + this.OptimalIndex = 0; + + ## update the list of cluster numbers to evaluate + this.InspectedK = newList; + endif + endfunction + + ## plot + ## plot the CriterionValues against InspectedK and return a handle to the + ## plot + function h = plot (this) + ## -*- texinfo -*- + ## @deftypefn {} {} ClusterCriterion.plot () + ## + ## Plot the evaluation results. + ## @end deftypefn + + yLabel = sprintf ("%s value", this.CriterionName); + h = gca (); + hold on; + plot (this.InspectedK, this.CriterionValues, "bo-"); + plot (this.OptimalK, this.CriterionValues(this.OptimalIndex), "b*"); + xlabel ("number of clusters"); + ylabel (yLabel); + hold off; + endfunction + endmethods + + methods (Abstract = true) + function compact () + ## -*- texinfo -*- + ## @deftypefn {} {@var{eva} =} ClusterCriterion.compact () + ## + ## Return a compact evaluation object. + ## @end deftypefn + endfunction + endmethods + + methods (Access = private) + ## check if a list of cluster sizes is correct + function retList = parseKList (this, KList) + if (isnumeric (KList) && isvector (KList) && all (find (KList > 0)) && ... + all (floor (KList) == KList)) + retList = unique (KList); + else + error (["ClusterCriterion: the list of cluster sizes must be an " ... + "array of positive integer numbers"]); + endif + endfunction + endmethods +endclassdef diff --git a/inst/private/DaviesBouldinEvaluation.m b/inst/private/DaviesBouldinEvaluation.m new file mode 100644 index 0000000..a2f002f --- /dev/null +++ b/inst/private/DaviesBouldinEvaluation.m @@ -0,0 +1,208 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef DaviesBouldinEvaluation < ClusterCriterion + ## -*- texinfo -*- + ## @deftypefn {Function File} {@var{eva} =} evalclusters (@var{x}, @var{clust}, @qcode{DaviesBouldin}) + ## @deftypefnx {Function File} {@var{eva} =} evalclusters (@dots{}, @qcode{Name}, @qcode{Value}) + ## + ## A Davies-Bouldin object to evaluate clustering solutions. + ## + ## A @code{DaviesBouldinEvaluation} object is a @code{ClusterCriterion} + ## object used to evaluate clustering solutions using the Davies-Bouldin + ## criterion. + ## + ## The Davies-Bouldin criterion is based on the ratio between the distances + ## between clusters and within clusters, that is the distances between the + ## centroids and the distances between each datapoint and its centroid. + ## + ## The best solution according to the Davies-Bouldin criterion is the one + ## that scores the lowest value. + ## @end deftypefn + ## + ## @seealso{ClusterCriterion, evalclusters} + + properties (GetAccess = public, SetAccess = private) + + endproperties + + properties (Access = protected) + Centroids = {}; # a list of the centroids for every solution + endproperties + + methods (Access = public) + ## constructor + function this = DaviesBouldinEvaluation (x, clust, KList) + this@ClusterCriterion(x, clust, KList); + + this.CriterionName = "DaviesBouldin"; + this.evaluate(this.InspectedK); # evaluate the list of cluster numbers + endfunction + + ## set functions + + ## addK + ## add new cluster sizes to evaluate + function this = addK (this, K) + addK@ClusterCriterion(this, K); + + ## if we have new data, we need a new evaluation + if (this.OptimalK == 0) + Centroids_tmp = {}; + pS = 0; # position shift of the elements of Centroids + for iter = 1 : length (this.InspectedK) + ## reorganize Centroids according to the new list of cluster numbers + if (any (this.InspectedK(iter) == K)) + pS += 1; + else + Centroids_tmp{iter} = this.Centroids{iter - pS}; + endif + endfor + this.Centroids = Centroids_tmp; + this.evaluate(K); # evaluate just the new cluster numbers + endif + endfunction + + ## compact + ## ... + function this = compact (this) + # FIXME: stub! + warning ("DaviesBouldinEvaluation: compact is unavailable"); + endfunction + endmethods + + methods (Access = protected) + ## evaluate + ## do the evaluation + function this = evaluate (this, K) + ## use complete observations only + UsableX = this.X(find (this.Missing == false), :); + if (! isempty (this.ClusteringFunction)) + ## build the clusters + for iter = 1 : length (this.InspectedK) + ## do it only for the specified K values + if (any (this.InspectedK(iter) == K)) + if (isa (this.ClusteringFunction, "function_handle")) + ## custom function + ClusteringSolution = ... + this.ClusteringFunction(UsableX, this.InspectedK(iter)); + if (ismatrix (ClusteringSolution) && ... + rows (ClusteringSolution) == this.NumObservations && ... + columns (ClusteringSolution) == this.P) + ## the custom function returned a matrix: + ## we take the index of the maximum value for every row + [~, this.ClusteringSolutions(:, iter)] = ... + max (ClusteringSolution, [], 2); + elseif (iscolumn (ClusteringSolution) && + length (ClusteringSolution) == this.NumObservations) + this.ClusteringSolutions(:, iter) = ClusteringSolution; + elseif (isrow (ClusteringSolution) && + length (ClusteringSolution) == this.NumObservations) + this.ClusteringSolutions(:, iter) = ClusteringSolution'; + else + error (["DaviesBouldinEvaluation: invalid return value "... + "from custom clustering function"]); + endif + this.ClusteringSolutions(:, iter) = ... + this.ClusteringFunction(UsableX, this.InspectedK(iter)); + else + switch (this.ClusteringFunction) + case "kmeans" + [this.ClusteringSolutions(:, iter), this.Centroids{iter}] =... + kmeans (UsableX, this.InspectedK(iter), ... + "Distance", "sqeuclidean", "EmptyAction", "singleton", ... + "Replicates", 5); + + case "linkage" + ## use clusterdata + this.ClusteringSolutions(:, iter) = clusterdata (UsableX, ... + "MaxClust", this.InspectedK(iter), ... + "Distance", "euclidean", "Linkage", "ward"); + this.Centroids{iter} = this.computeCentroids (UsableX, iter); + + case "gmdistribution" + gmm = fitgmdist (UsableX, this.InspectedK(iter), ... + "SharedCov", true, "Replicates", 5); + this.ClusteringSolutions(:, iter) = cluster (gmm, UsableX); + this.Centroids{iter} = gmm.mu; + + otherwise + error (["DaviesBouldinEvaluation: unexpected error, " ... + "report this bug"]); + endswitch + endif + endif + endfor + endif + + ## get the criterion values for every clustering solution + for iter = 1 : length (this.InspectedK) + ## do it only for the specified K values + if (any (this.InspectedK(iter) == K)) + ## not defined for one cluster + if (this.InspectedK(iter) == 1) + this.CriterionValues(iter) = NaN; + continue; + endif + + ## Davies-Bouldin value + ## an evaluation of the ratio between within-cluster and + ## between-cluster distances + + ## mean distances between cluster members and their centroid + vD = zeros (this.InspectedK(iter), 1); + for i = 1 : this.InspectedK(iter) + vIndicesI = find (this.ClusteringSolutions(:, iter) == i); + vD(i) = mean (vecnorm (UsableX(vIndicesI, :) - ... + this.Centroids{iter}(i, :), 2, 2)); + endfor + + ## within-to-between cluster distance ratio + Dij = zeros (this.InspectedK(iter)); + for i = 1 : (this.InspectedK(iter) - 1) + for j = (i + 1) : this.InspectedK(iter) + ## centroid to centroid distance + dij = vecnorm (this.Centroids{iter}(i, :) - ... + this.Centroids{iter}(j, :)); + ## within-to-between cluster distance ratio for clusters i and j + Dij(i, j) = (vD(i) + vD(j)) / dij; + endfor + endfor + + ## ( max_j D1j + max_j D2j + ... + max_j Dkj) / k + this.CriterionValues(iter) = sum (max (Dij(i, :), [], 2)) / ... + this.InspectedK(iter); + endif + endfor + + [~, this.OptimalIndex] = min (this.CriterionValues); + this.OptimalK = this.InspectedK(this.OptimalIndex(1)); + this.OptimalY = this.ClusteringSolutions(:, this.OptimalIndex(1)); + endfunction + endmethods + + methods (Access = private) + ## computeCentroids + ## compute the centroids if they are not available by other means + function C = computeCentroids (this, X, index) + C = zeros (this.InspectedK(index), columns (X)); + + for iter = 1 : this.InspectedK(index) + vIndicesI = find (this.ClusteringSolutions(:, index) == iter); + C(iter, :) = mean (X(vIndicesI, :)); + endfor + endfunction + endmethods +endclassdef diff --git a/inst/private/GapEvaluation.m b/inst/private/GapEvaluation.m new file mode 100644 index 0000000..34bc71e --- /dev/null +++ b/inst/private/GapEvaluation.m @@ -0,0 +1,364 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef GapEvaluation < ClusterCriterion + ## -*- texinfo -*- + ## @deftypefn {Function File} {@var{eva} =} evalclusters (@var{x}, @var{clust}, @qcode{gap}) + ## @deftypefnx {Function File} {@var{eva} =} evalclusters (@dots{}, @qcode{Name}, @qcode{Value}) + ## + ## A gap object to evaluate clustering solutions. + ## + ## A @code{GapEvaluation} object is a @code{ClusterCriterion} + ## object used to evaluate clustering solutions using the gap criterion, + ## which is a mathematical formalization of the elbow method. + ## + ## List of public properties specific to @code{SilhouetteEvaluation}: + ## @table @code + ## @item @qcode{B} + ## the number of reference datasets to generate. + ## + ## @item @qcode{Distance} + ## a valid distance metric name, or a function handle as accepted by the + ## @code{pdist} function. + ## + ## @item @qcode{ExpectedLogW} + ## a vector of the expected values for the logarithm of the within clusters + ## dispersion. + ## + ## @item @qcode{LogW} + ## a vector of the values of the logarithm of the within clusters dispersion. + ## + ## @item @qcode{ReferenceDistribution} + ## a valid name for the reference distribution, namely: @code{PCA} (default) + ## or @code{uniform}. + ## + ## @item @qcode{SE} + ## a vector of the standard error of the expected values for the logarithm + ## of the within clusters dispersion. + ## + ## @item @qcode{SearchMethod} + ## a valid name for the search method to use: @code{globalMaxSE} (default) or + ## @code{firstMaxSE}. + ## + ## @item @qcode{StdLogW} + ## a vector of the standard deviation of the expected values for the logarithm + ## of the within clusters dispersion. + ## @end table + ## + ## The best solution according to the gap criterion depends on the chosen + ## search method. When the search method is @code{globalMaxSE}, the chosen + ## gap value is the smaller one which is inside a standard error from the + ## max gap value; when the search method is @code{firstMaxSE}, the chosen + ## gap value is the first one which is inside a standard error from the next + ## gap value. + ## @end deftypefn + ## + ## @seealso{ClusterCriterion, evalclusters} + + properties (GetAccess = public, SetAccess = private) + B = 0; # number of reference datasets + Distance = ""; # pdist parameter + ReferenceDistribution = ""; # distribution to use as reference + SearchMethod = ""; # the method do identify the optimal number of clusters + ExpectedLogW = []; # expected value for the natural logarithm of W + LogW = []; # natural logarithm of W + SE = []; # standard error for the natural logarithm of W + StdLogW = []; # standard deviation of the natural logarithm of W + endproperties + + properties (Access = protected) + DistanceVector = []; # vector of pdist distances + mExpectedLogW = []; # the result of the Monte-Carlo simulations + endproperties + + methods (Access = public) + ## constructor + function this = GapEvaluation (x, clust, KList, b = 100, ... + distanceMetric = "sqeuclidean", ... + referenceDistribution = "pca", searchMethod = "globalmaxse") + this@ClusterCriterion(x, clust, KList); + + ## parsing the distance criterion + if (ischar (distanceMetric)) + if (any (strcmpi (distanceMetric, {"sqeuclidean", "euclidean", ... + "cityblock", "cosine", "correlation", "hamming", "jaccard"}))) + this.Distance = lower (distanceMetric); + + ## kmeans can use only a subset + if (strcmpi (clust, "kmeans") && any (strcmpi (this.Distance, ... + {"euclidean", "jaccard"}))) + error (["GapEvaluation: invalid distance criterion '%s' "... + "for 'kmeans'"], distanceMetric); + endif + else + error ("GapEvaluation: unknown distance criterion '%s'", ... + distanceMetric); + endif + elseif (isa (distanceMetric, "function_handle")) + this.Distance = distanceMetric; + + ## kmeans cannot use a function handle + if (strcmpi (clust, "kmeans")) + error ("GapEvaluation: invalid distance criterion for 'kmeans'"); + endif + elseif (isvector (distanceMetric) && isnumeric (distanceMetric)) + this.Distance = ""; + this.DistanceVector = distanceMetric; # the validity check is delegated + + ## kmeans cannot use a distance vector + if (strcmpi (clust, "kmeans")) + error (["GapEvaluation: invalid distance criterion for "... + "'kmeans'"]); + endif + else + error ("GapEvaluation: invalid distance metric"); + endif + + ## B: number of Monte-Carlo iterations + if (! isnumeric (b) || ! isscalar (b) || b != floor (b) || b < 1) + error ("GapEvaluation: b must a be positive integer number"); + endif + this.B = b; + + ## reference distribution + if (! ischar (referenceDistribution) || ! any (strcmpi ... + (referenceDistribution, {"pca", "uniform"}))) + error (["GapEvaluation: the reference distribution must be either" ... + "'PCA' or 'uniform'"]); + elseif (strcmpi (referenceDistribution, "pca")) + warning (["GapEvaluation: 'PCA' distribution not implemented, " ... + "using 'uniform'"]); + endif + this.ReferenceDistribution = lower (referenceDistribution); + + if (! ischar (searchMethod) || ! any (strcmpi (searchMethod, ... + {"globalmaxse", "firstmaxse"}))) + error (["evalclusters: the search method must be either" ... + "'globalMaxSE' or 'firstMaxSE'"]); + endif + this.SearchMethod = lower (searchMethod); + + ## a matrix to store the results from the Monte-Carlo runs + this.mExpectedLogW = zeros (this.B, length (this.InspectedK)); + + this.CriterionName = "gap"; + this.evaluate(this.InspectedK); # evaluate the list of cluster numbers + endfunction + + ## set functions + + ## addK + ## add new cluster sizes to evaluate + function this = addK (this, K) + addK@ClusterCriterion(this, K); + + ## if we have new data, we need a new evaluation + if (this.OptimalK == 0) + mExpectedLogW_tmp = zeros (this.B, length (this.InspectedK)); + pS = 0; # position shift + for iter = 1 : length (this.InspectedK) + ## reorganize all the arrays according to the new list + ## of cluster numbers + if (any (this.InspectedK(iter) == K)) + pS += 1; + else + mExpectedLogW_tmp(:, iter) = this.mExpectedLogW(:, iter - pS); + endif + endfor + this.mExpectedLogW = mExpectedLogW_tmp; + + this.evaluate(K); # evaluate just the new cluster numbers + endif + endfunction + + ## compact + ## ... + function this = compact (this) + # FIXME: stub! + warning ("GapEvaluation: compact is unavailable"); + endfunction + + ## plot + ## plot the CriterionValues against InspectedK, show the standard deviation + ## and return a handle to the plot + function h = plot (this) + yLabel = sprintf ("%s value", this.CriterionName); + h = gca (); + hold on; + errorbar (this.InspectedK, this.CriterionValues, this.StdLogW); + plot (this.InspectedK, this.CriterionValues, "bo"); + plot (this.OptimalK, this.CriterionValues(this.OptimalIndex), "b*"); + xlabel ("number of clusters"); + ylabel (yLabel); + hold off; + endfunction + endmethods + + methods (Access = protected) + ## evaluate + ## do the evaluation + function this = evaluate (this, K) + ## Monte-Carlo runs + for mcrun = 1 : (this.B + 1) + ## use complete observations only + UsableX = this.X(find (this.Missing == false), :); + + ## the last run use tha actual data, + ## the others are Monte-Carlo runs with reconstructed data + if (mcrun <= this.B) + ## uniform distribution + colMins = min (UsableX); + colMaxs = max (UsableX); + for col = 1 : columns (UsableX) + UsableX(:, col) = colMins(col) + rand (this.NumObservations, 1) *... + (colMaxs(col) - colMins(col)); + endfor + endif + + if (! isempty (this.ClusteringFunction)) + ## build the clusters + for iter = 1 : length (this.InspectedK) + ## do it only for the specified K values + if (any (this.InspectedK(iter) == K)) + if (isa (this.ClusteringFunction, "function_handle")) + ## custom function + ClusteringSolution = ... + this.ClusteringFunction(UsableX, this.InspectedK(iter)); + if (ismatrix (ClusteringSolution) && ... + rows (ClusteringSolution) == this.NumObservations && ... + columns (ClusteringSolution) == this.P) + ## the custom function returned a matrix: + ## we take the index of the maximum value for every row + [~, this.ClusteringSolutions(:, iter)] = ... + max (ClusteringSolution, [], 2); + elseif (iscolumn (ClusteringSolution) && + length (ClusteringSolution) == this.NumObservations) + this.ClusteringSolutions(:, iter) = ClusteringSolution; + elseif (isrow (ClusteringSolution) && + length (ClusteringSolution) == this.NumObservations) + this.ClusteringSolutions(:, iter) = ClusteringSolution'; + else + error (["GapEvaluation: invalid return value from " ... + "custom clustering function"]); + endif + this.ClusteringSolutions(:, iter) = ... + this.ClusteringFunction(UsableX, this.InspectedK(iter)); + else + switch (this.ClusteringFunction) + case "kmeans" + this.ClusteringSolutions(:, iter) = kmeans (UsableX, ... + this.InspectedK(iter), "Distance", this.Distance, ... + "EmptyAction", "singleton", "Replicates", 5); + + case "linkage" + if (! isempty (this.Distance)) + ## use clusterdata + Distance_tmp = this.Distance; + LinkageMethod = "average"; # for non euclidean methods + if (strcmpi (this.Distance, "sqeuclidean")) + ## pdist uses different names for its algorithms + Distance_tmp = "squaredeuclidean"; + LinkageMethod = "ward"; + elseif (strcmpi (this.Distance, "euclidean")) + LinkageMethod = "ward"; + endif + this.ClusteringSolutions(:, iter) = clusterdata ... + (UsableX, "MaxClust", this.InspectedK(iter), ... + "Distance", Distance_tmp, "Linkage", LinkageMethod); + else + ## use linkage + Z = linkage (this.DistanceVector, "average"); + this.ClusteringSolutions(:, iter) = ... + cluster (Z, "MaxClust", this.InspectedK(iter)); + endif + + case "gmdistribution" + gmm = fitgmdist (UsableX, this.InspectedK(iter), ... + "SharedCov", true, "Replicates", 5); + this.ClusteringSolutions(:, iter) = cluster (gmm, UsableX); + + otherwise + ## this should not happen + error (["GapEvaluation: unexpected error, " ... + "report this bug"]); + endswitch + endif + endif + endfor + endif + + ## get the gap values for every clustering + distance_pdist = this.Distance; + if (strcmpi (distance_pdist, "sqeuclidean")) + distance_pdist = "squaredeuclidean"; + endif + + ## compute LogW + for iter = 1 : length (this.InspectedK) + ## do it only for the specified K values + if (any (this.InspectedK(iter) == K)) + wk = 0; + for r = 1 : this.InspectedK(iter) + vIndicesR = find (this.ClusteringSolutions(:, iter) == r); + nr = length (vIndicesR); + Dr = pdist (UsableX(vIndicesR, :), distance_pdist); + wk += sum (Dr) / (2 * nr); + endfor + if (mcrun <= this.B) + this.mExpectedLogW(mcrun, iter) = log (wk); + else + this.LogW(iter) = log (wk); + endif + endif + endfor + endfor + + this.ExpectedLogW = mean (this.mExpectedLogW); + this.SE = sqrt ((1 + 1 / this.B) * sumsq (this.mExpectedLogW - ... + this.ExpectedLogW) / this.B); + this.StdLogW = std (this.mExpectedLogW); + this.CriterionValues = this.ExpectedLogW - this.LogW; + + this.OptimalIndex = this.gapSearch (); + this.OptimalK = this.InspectedK(this.OptimalIndex(1)); + this.OptimalY = this.ClusteringSolutions(:, this.OptimalIndex(1)); + endfunction + + ## gapSearch + ## find the best solution according to the gap method + function ind = gapSearch (this) + if (strcmpi (this.SearchMethod, "globalmaxse")) + [gapmax, indgp] = max (this.CriterionValues); + for iter = 1 : length (this.InspectedK) + ind = iter; + if (this.CriterionValues(iter) > (gapmax - this.SE(indgp))) + return + endif + endfor + elseif (strcmpi (this.SearchMethod, "firstmaxse")) + for iter = 1 : (length (this.InspectedK) - 1) + ind = iter; + if (this.CriterionValues(iter) > (this.CriterionValues(iter + 1) - ... + this.SE(iter + 1))) + return + endif + endfor + else + ## this should not happen + error (["GapEvaluation: unexpected error, please report this bug"]); + endif + endfunction + endmethods +endclassdef diff --git a/inst/private/SilhouetteEvaluation.m b/inst/private/SilhouetteEvaluation.m new file mode 100644 index 0000000..40a7652 --- /dev/null +++ b/inst/private/SilhouetteEvaluation.m @@ -0,0 +1,257 @@ +## Copyright (C) 2021 Stefano Guidoni +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef SilhouetteEvaluation < ClusterCriterion + ## -*- texinfo -*- + ## @deftypefn {Function File} {@var{eva} =} evalclusters (@var{x}, @var{clust}, @qcode{silhouette}) + ## @deftypefnx {Function File} {@var{eva} =} evalclusters (@dots{}, @qcode{Name}, @qcode{Value}) + ## + ## A silhouette object to evaluate clustering solutions. + ## + ## A @code{SilhouetteEvaluation} object is a @code{ClusterCriterion} + ## object used to evaluate clustering solutions using the silhouette + ## criterion. + ## + ## List of public properties specific to @code{SilhouetteEvaluation}: + ## @table @code + ## @item @qcode{Distance} + ## a valid distance metric name, or a function handle or a numeric array as + ## generated by the @code{pdist} function. + ## + ## @item @qcode{ClusterPriors} + ## a valid name for the evaluation of silhouette values: @code{empirical} + ## (default) or @code{equal}. + ## + ## @item @qcode{ClusterSilhouettes} + ## a cell array with the silhouette values of each data point for each cluster + ## number. + ## + ## @end table + ## + ## The best solution according to the silhouette criterion is the one + ## that scores the highest average silhouette value. + ## @end deftypefn + ## + ## @seealso{ClusterCriterion, evalclusters, silhouette} + + properties (GetAccess = public, SetAccess = private) + Distance = ""; # pdist parameter + ClusterPriors = ""; # evaluation of silhouette values: equal or empirical + ClusterSilhouettes = {}; # results of the silhoutte function for each K + endproperties + + properties (Access = protected) + DistanceVector = []; # vector of pdist distances + endproperties + + methods (Access = public) + ## constructor + function this = SilhouetteEvaluation (x, clust, KList, ... + distanceMetric = "sqeuclidean", clusterPriors = "empirical") + this@ClusterCriterion(x, clust, KList); + + ## parsing the distance criterion + if (ischar (distanceMetric)) + if (any (strcmpi (distanceMetric, {"sqeuclidean", ... + "euclidean", "cityblock", "cosine", "correlation", ... + "hamming", "jaccard"}))) + this.Distance = lower (distanceMetric); + + ## kmeans can use only a subset + if (strcmpi (clust, "kmeans") && any (strcmpi (this.Distance, ... + {"euclidean", "jaccard"}))) + error (["SilhouetteEvaluation: invalid distance criterion '%s' "... + "for 'kmeans'"], distanceMetric); + endif + else + error ("SilhouetteEvaluation: unknown distance criterion '%s'", ... + distanceMetric); + endif + elseif (isa (distanceMetric, "function_handle")) + this.Distance = distanceMetric; + + ## kmeans cannot use a function handle + if (strcmpi (clust, "kmeans")) + error (["SilhouetteEvaluation: invalid distance criterion for "... + "'kmeans'"]); + endif + elseif (isvector (distanceMetric) && isnumeric (distanceMetric)) + this.Distance = ""; + this.DistanceVector = distanceMetric; # the validity check is delegated + + ## kmeans cannot use a distance vector + if (strcmpi (clust, "kmeans")) + error (["SilhouetteEvaluation: invalid distance criterion for "... + "'kmeans'"]); + endif + else + error ("SilhouetteEvaluation: invalid distance metric"); + endif + + ## parsing the prior probabilities of each cluster + if (ischar (distanceMetric)) + if (any (strcmpi (clusterPriors, {"empirical", "equal"}))) + this.ClusterPriors = lower (clusterPriors); + else + error (["SilhouetteEvaluation: unknown prior probability criterion"... + " '%s'"], clusterPriors); + endif + else + error ("SilhouetteEvaluation: invalid prior probabilities"); + endif + + this.CriterionName = "silhouette"; + this.evaluate(this.InspectedK); # evaluate the list of cluster numbers + endfunction + + ## set functions + + ## addK + ## add new cluster sizes to evaluate + function this = addK (this, K) + addK@ClusterCriterion(this, K); + + ## if we have new data, we need a new evaluation + if (this.OptimalK == 0) + ClusterSilhouettes_tmp = {}; + pS = 0; # position shift of the elements of ClusterSilhouettes + for iter = 1 : length (this.InspectedK) + ## reorganize ClusterSilhouettes according to the new list + ## of cluster numbers + if (any (this.InspectedK(iter) == K)) + pS += 1; + else + ClusterSilhouettes_tmp{iter} = this.ClusterSilhouettes{iter - pS}; + endif + endfor + this.ClusterSilhouettes = ClusterSilhouettes_tmp; + this.evaluate(K); # evaluate just the new cluster numbers + endif + endfunction + + ## compact + ## ... + function this = compact (this) + # FIXME: stub! + warning ("SilhouetteEvaluation: compact is unavailable"); + endfunction + endmethods + + methods (Access = protected) + ## evaluate + ## do the evaluation + function this = evaluate (this, K) + ## use complete observations only + UsableX = this.X(find (this.Missing == false), :); + if (! isempty (this.ClusteringFunction)) + ## build the clusters + for iter = 1 : length (this.InspectedK) + ## do it only for the specified K values + if (any (this.InspectedK(iter) == K)) + if (isa (this.ClusteringFunction, "function_handle")) + ## custom function + ClusteringSolution = ... + this.ClusteringFunction(UsableX, this.InspectedK(iter)); + if (ismatrix (ClusteringSolution) && ... + rows (ClusteringSolution) == this.NumObservations && ... + columns (ClusteringSolution) == this.P) + ## the custom function returned a matrix: + ## we take the index of the maximum value for every row + [~, this.ClusteringSolutions(:, iter)] = ... + max (ClusteringSolution, [], 2); + elseif (iscolumn (ClusteringSolution) && + length (ClusteringSolution) == this.NumObservations) + this.ClusteringSolutions(:, iter) = ClusteringSolution; + elseif (isrow (ClusteringSolution) && + length (ClusteringSolution) == this.NumObservations) + this.ClusteringSolutions(:, iter) = ClusteringSolution'; + else + error (["SilhouetteEvaluation: invalid return value from " ... + "custom clustering function"]); + endif + this.ClusteringSolutions(:, iter) = ... + this.ClusteringFunction(UsableX, this.InspectedK(iter)); + else + switch (this.ClusteringFunction) + case "kmeans" + this.ClusteringSolutions(:, iter) = kmeans (UsableX, ... + this.InspectedK(iter), "Distance", this.Distance, ... + "EmptyAction", "singleton", "Replicates", 5); + + case "linkage" + if (! isempty (this.Distance)) + ## use clusterdata + Distance_tmp = this.Distance; + LinkageMethod = "average"; # for non euclidean methods + if (strcmpi (this.Distance, "sqeuclidean")) + ## pdist uses different names for its algorithms + Distance_tmp = "squaredeuclidean"; + LinkageMethod = "ward"; + elseif (strcmpi (this.Distance, "euclidean")) + LinkageMethod = "ward"; + endif + this.ClusteringSolutions(:, iter) = clusterdata (UsableX,... + "MaxClust", this.InspectedK(iter), ... + "Distance", Distance_tmp, "Linkage", LinkageMethod); + else + ## use linkage + Z = linkage (this.DistanceVector, "average"); + this.ClusteringSolutions(:, iter) = ... + cluster (Z, "MaxClust", this.InspectedK(iter)); + endif + + case "gmdistribution" + gmm = fitgmdist (UsableX, this.InspectedK(iter), ... + "SharedCov", true, "Replicates", 5); + this.ClusteringSolutions(:, iter) = cluster (gmm, UsableX); + + otherwise + error (["SilhouetteEvaluation: unexpected error, " ... + "report this bug"]); + endswitch + endif + endif + endfor + endif + + ## get the silhouette values for every clustering + set (0, 'DefaultFigureVisible', 'off'); # temporarily disable figures + for iter = 1 : length (this.InspectedK) + ## do it only for the specified K values + if (any (this.InspectedK(iter) == K)) + this.ClusterSilhouettes{iter} = silhouette (UsableX, ... + this.ClusteringSolutions(:, iter)); + if (strcmpi (this.ClusterPriors, "empirical")) + this.CriterionValues(iter) = mean (this.ClusterSilhouettes{iter}); + else + ## equal + this.CriterionValues(iter) = 0; + si = this.ClusterSilhouettes{iter}; + for k = 1 : this.InspectedK(iter) + this.CriterionValues(iter) += mean (si(find ... + (this.ClusteringSolutions(:, iter) == k))); + endfor + this.CriterionValues(iter) /= this.InspectedK(iter); + endif + endif + endfor + set (0, 'DefaultFigureVisible', 'on'); # enable figures again + + [~, this.OptimalIndex] = max (this.CriterionValues); + this.OptimalK = this.InspectedK(this.OptimalIndex(1)); + this.OptimalY = this.ClusteringSolutions(:, this.OptimalIndex(1)); + endfunction + endmethods +endclassdef diff --git a/inst/private/tbl_delim.m b/inst/private/tbl_delim.m new file mode 100644 index 0000000..b93d684 --- /dev/null +++ b/inst/private/tbl_delim.m @@ -0,0 +1,73 @@ +## Copyright (C) 2008 Bill Denney +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{d}, @var{err}] = } tbl_delim (@var{d}) +## Return the delimiter for tblread or tblwrite. +## +## The delimeter, @var{d} may be any single character or +## @itemize +## @item "space" " " (default) +## @item "tab" "\t" +## @item "comma" "," +## @item "semi" ";" +## @item "bar" "|" +## @end itemize +## +## @var{err} will be empty if there is no error, and @var{d} will be NaN +## if there is an error. You MUST check the value of @var{err}. +## @seealso{tblread, tblwrite} +## @end deftypefn + +function [d, err] = tbl_delim (d) + + ## Check arguments + if nargin != 1 + print_usage (); + endif + + err = ""; + ## Format the delimiter + if ischar (d) + ## allow for escape characters + d = sprintf (d); + if numel (d) > 1 + ## allow the word forms + s.space = " "; + s.tab = "\t"; + s.comma = ","; + s.semi = ";"; + s.bar = "|"; + if ! ismember (d, fieldnames (s)) + err = ["tblread: delimiter must be either a single " ... + "character or one of\n" ... + sprintf("%s, ", fieldnames (s){:})(1:end-2)]; + d = NaN; + else + d = s.(d); + endif + endif + else + err = "delimiter must be a character"; + d = NaN; + endif + if isempty (d) + err = "the delimiter may not be empty"; + d = NaN; + endif + +endfunction + +#tested in tblwrite diff --git a/inst/qrandn.m b/inst/qrandn.m new file mode 100644 index 0000000..1843624 --- /dev/null +++ b/inst/qrandn.m @@ -0,0 +1,93 @@ +## Copyright (C) 2014 - Juan Pablo Carbajal +## +## This progrm is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## Author: Juan Pablo Carbajal + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{z} =} qrandn (@var{q}, @var{r},@var{c}) +## @deftypefnx {Function File} {@var{z} =} qrandn (@var{q}, [@var{r},@var{c}]) +## Returns random deviates drawn from a q-Gaussian distribution. +## +## Parameter @var{q} charcterizes the q-Gaussian distribution. +## The result has the size indicated by @var{s}. +## +## Reference: +## W. Thistleton, J. A. Marsh, K. Nelson, C. Tsallis (2006) +## "Generalized Box-Muller method for generating q-Gaussian random deviates" +## arXiv:cond-mat/0605570 http://arxiv.org/abs/cond-mat/0605570 +## +## @seealso{rand, randn} +## @end deftypefn + +function z = qrandn(q,R,C=[]) + if !isscalar (q) + error ('Octave:invalid-input-arg', 'The parameter q must be a scalar.') + endif + + # Check that q < 3 + if q > 3 + error ('Octave:invalid-input-arg', 'The parameter q must be lower than 3.'); + endif + + if numel (R) > 1 + S = R; + elseif numel (R) ==1 && isempty (C) + S = [R,1]; + elseif numel (R) ==1 && !isempty (C) + S = [R,C]; + endif + + # Calaulate the q to be used on the q-log + qGen = (1 + q) / (3 - q); + + # Initialize the output vector + z = sqrt (-2 * log_q (rand (S),qGen)) .* sin (2*pi*rand (S)); + +endfunction + +function a = log_q (x,q) + # + # Returns the q-log of x, using q + # + dq = 1 - q; + # Check to see if q = 1 (to double precision) + if abs (dq) < 10*eps + # If q is 1, use the usual natural logarithm + a = log (x); + else + # If q differs from 1, use the definition of the q-log + a = ( x .^ dq - 1 ) ./ dq; + endif + +endfunction + +%!demo +%! z = qrandn (-5, 5e6); +%! [c x] = hist (z,linspace(-1.5,1.5,200),1); +%! figure(1) +%! plot(x,c,"r."); axis tight; axis([-1.5,1.5]); +%! +%! z = qrandn (-0.14286, 5e6); +%! [c x] = hist (z,linspace(-2,2,200),1); +%! figure(2) +%! plot(x,c,"r."); axis tight; axis([-2,2]); +%! +%! z = qrandn (2.75, 5e6); +%! [c x] = hist (z,linspace(-1e3,1e3,1e3),1); +%! figure(3) +%! semilogy(x,c,"r."); axis tight; axis([-100,100]); +%! +%! # --------- +%! # Figures from the reference paper. diff --git a/inst/random.m b/inst/random.m new file mode 100644 index 0000000..14074f8 --- /dev/null +++ b/inst/random.m @@ -0,0 +1,171 @@ +## Copyright (C) 2007 Soren Hauberg +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} @var{r} = random(@var{name}, @var{arg1}) +## @deftypefnx{Function File} @var{r} = random(@var{name}, @var{arg1}, @var{arg2}) +## @deftypefnx{Function File} @var{r} = random(@var{name}, @var{arg1}, @var{arg2}, @var{arg3}) +## @deftypefnx{Function File} @var{r} = random(@var{name}, ..., @var{s1}, ...) +## Generates pseudo-random numbers from a given one-, two-, or three-parameter +## distribution. +## +## The variable @var{name} must be a string that names the distribution from +## which to sample. If this distribution is a one-parameter distribution @var{arg1} +## should be supplied, if it is a two-paramter distribution @var{arg2} must also +## be supplied, and if it is a three-parameter distribution @var{arg3} must also +## be present. Any arguments following the distribution paramters will determine +## the size of the result. +## +## As an example, the following code generates a 10 by 20 matrix containing +## random numbers from a normal distribution with mean 5 and standard deviation +## 2. +## @example +## R = random("normal", 5, 2, [10, 20]); +## @end example +## +## The variable @var{name} can be one of the following strings +## +## @table @asis +## @item "beta" +## @itemx "beta distribution" +## Samples are drawn from the Beta distribution. +## @item "bino" +## @itemx "binomial" +## @itemx "binomial distribution" +## Samples are drawn from the Binomial distribution. +## @item "chi2" +## @itemx "chi-square" +## @itemx "chi-square distribution" +## Samples are drawn from the Chi-Square distribution. +## @item "exp" +## @itemx "exponential" +## @itemx "exponential distribution" +## Samples are drawn from the Exponential distribution. +## @item "f" +## @itemx "f distribution" +## Samples are drawn from the F distribution. +## @item "gam" +## @itemx "gamma" +## @itemx "gamma distribution" +## Samples are drawn from the Gamma distribution. +## @item "geo" +## @itemx "geometric" +## @itemx "geometric distribution" +## Samples are drawn from the Geometric distribution. +## @item "hyge" +## @itemx "hypergeometric" +## @itemx "hypergeometric distribution" +## Samples are drawn from the Hypergeometric distribution. +## @item "logn" +## @itemx "lognormal" +## @itemx "lognormal distribution" +## Samples are drawn from the Log-Normal distribution. +## @item "nbin" +## @itemx "negative binomial" +## @itemx "negative binomial distribution" +## Samples are drawn from the Negative Binomial distribution. +## @item "norm" +## @itemx "normal" +## @itemx "normal distribution" +## Samples are drawn from the Normal distribution. +## @item "poiss" +## @itemx "poisson" +## @itemx "poisson distribution" +## Samples are drawn from the Poisson distribution. +## @item "rayl" +## @itemx "rayleigh" +## @itemx "rayleigh distribution" +## Samples are drawn from the Rayleigh distribution. +## @item "t" +## @itemx "t distribution" +## Samples are drawn from the T distribution. +## @item "unif" +## @itemx "uniform" +## @itemx "uniform distribution" +## Samples are drawn from the Uniform distribution. +## @item "unid" +## @itemx "discrete uniform" +## @itemx "discrete uniform distribution" +## Samples are drawn from the Uniform Discrete distribution. +## @item "wbl" +## @itemx "weibull" +## @itemx "weibull distribution" +## Samples are drawn from the Weibull distribution. +## @end table +## @seealso{rand, betarnd, binornd, chi2rnd, exprnd, frnd, gamrnd, geornd, hygernd, +## lognrnd, nbinrnd, normrnd, poissrnd, raylrnd, trnd, unifrnd, unidrnd, wblrnd} +## @end deftypefn + +function retval = random(name, varargin) + ## General input checking + if (nargin < 2) + print_usage(); + endif + if (!ischar(name)) + error("random: first input argument must be a string"); + endif + + ## Select distribution + switch (lower(name)) + case {"beta", "beta distribution"} + retval = betarnd(varargin{:}); + case {"bino", "binomial", "binomial distribution"} + retval = binornd(varargin{:}); + case {"chi2", "chi-square", "chi-square distribution"} + retval = chi2rnd(varargin{:}); + case {"exp", "exponential", "exponential distribution"} + retval = exprnd(varargin{:}); + case {"ev", "extreme value", "extreme value distribution"} + error("random: distribution type '%s' is not yet implemented", name); + case {"f", "f distribution"} + retval = frnd(varargin{:}); + case {"gam", "gamma", "gamma distribution"} + retval = gamrnd(varargin{:}); + case {"gev", "generalized extreme value", "generalized extreme value distribution"} + error("random: distribution type '%s' is not yet implemented", name); + case {"gp", "generalized pareto", "generalized pareto distribution"} + error("random: distribution type '%s' is not yet implemented", name); + case {"geo", "geometric", "geometric distribution"} + retval = geornd(varargin{:}); + case {"hyge", "hypergeometric", "hypergeometric distribution"} + retval = hygernd(varargin{:}); + case {"logn", "lognormal", "lognormal distribution"} + retval = lognrnd(varargin{:}); + case {"nbin", "negative binomial", "negative binomial distribution"} + retval = nbinrnd(varargin{:}); + case {"ncf", "noncentral f", "noncentral f distribution"} + error("random: distribution type '%s' is not yet implemented", name); + case {"nct", "noncentral t", "noncentral t distribution"} + error("random: distribution type '%s' is not yet implemented", name); + case {"ncx2", "noncentral chi-square", "noncentral chi-square distribution"} + error("random: distribution type '%s' is not yet implemented", name); + case {"norm", "normal", "normal distribution"} + retval = normrnd(varargin{:}); + case {"poiss", "poisson", "poisson distribution"} + retval = poissrnd(varargin{:}); + case {"rayl", "rayleigh", "rayleigh distribution"} + retval = raylrnd(varargin{:}); + case {"t", "t distribution"} + retval = trnd(varargin{:}); + case {"unif", "uniform", "uniform distribution"} + retval = unifrnd(varargin{:}); + case {"unid", "discrete uniform", "discrete uniform distribution"} + retval = unidrnd(varargin{:}); + case {"wbl", "weibull", "weibull distribution"} + retval = wblrnd(varargin{:}); + otherwise + error("random: unsupported distribution type '%s'", name); + endswitch +endfunction diff --git a/inst/randsample.m b/inst/randsample.m new file mode 100644 index 0000000..032c003 --- /dev/null +++ b/inst/randsample.m @@ -0,0 +1,127 @@ +## Copyright (C) 2014 - Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## Author: Nir Krakauer + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{y} =} randsample (@var{v}, @var{k}, @var{replacement}=false [, @var{w}]) +## Elements sampled from a vector. +## +## Returns @var{k} random elements from a vector @var{v} with @var{n} elements, sampled without or with @var{replacement}. +## +## If @var{v} is a scalar, samples from 1:@var{v}. +## +## If a weight vector @var{w} of the same size as @var{v} is specified, the probablility of each element being sampled is proportional to @var{w}. Unlike Matlab's function of the same name, this can be done for sampling with or without replacement. +## +## Randomization is performed using rand(). +## +## @seealso{datasample, randperm} +## @end deftypefn + +function y = randsample(v,k,replacement=false,w=[]) + + if (isscalar (v) && isreal (v)) + n = v; + vector_v = false; + elseif (isvector (v)) + n = numel (v); + vector_v = true; + else + error ('Octave:invalid-input-arg', 'randsample: The input v must be a vector or positive integer.'); + endif + + if k < 0 || ( k > n && !replacement ) + error ('Octave:invalid-input-arg', 'randsample: The input k must be a non-negative integer. Sampling without replacement needs k <= n.'); + endif + + if (all (length (w) != [0, n])) + error ('Octave:invalid-input-arg', 'randsample: the size w (%d) must match the first argument (%d)', length(w), n); + endif + + + if (replacement) # sample with replacement + if (isempty (w)) # all elements are equally likely to be sampled + y = round (n * rand(1, k) + 0.5); + else + y = weighted_replacement (k, w); + endif + else # sample without replacement + if (isempty (w)) # all elements are equally likely to be sampled + y = randperm (n, k); + else # use "accept-reject"-like sampling + y = weighted_replacement (k, w); + while (1) + [yy, idx] = sort (y); # Note: sort keeps order of equal elements. + Idup = [false, (diff (yy)==0)]; + if !any (Idup) + break + else + Idup(idx) = Idup; # find duplicates in original vector + w(y) = 0; # don't permit resampling + # remove duplicates, then sample again + y = [y(~Idup), (weighted_replacement (sum (Idup), w))]; + endif + endwhile + endif + endif + + if vector_v + y = v(y); + endif + +endfunction + +function y = weighted_replacement (k, w) + w = w / sum(w); + w = [0 cumsum(w(:))']; + # distribute k uniform random deviates based on the given weighting + y = arrayfun (@(x) find (w <= x, 1, "last"), rand (1, k)); +endfunction + +%!test +%! n = 20; +%! k = 5; +%! x = randsample(n, k); +%! assert (size(x), [1 k]); +%! x = randsample(n, k, true); +%! assert (size(x), [1 k]); +%! x = randsample(n, k, false); +%! assert (size(x), [1 k]); +%! x = randsample(n, k, true, ones(n, 1)); +%! assert (size(x), [1 k]); +%! x = randsample(1:n, k); +%! assert (size(x), [1 k]); +%! x = randsample(1:n, k, true); +%! assert (size(x), [1 k]); +%! x = randsample(1:n, k, false); +%! assert (size(x), [1 k]); +%! x = randsample(1:n, k, true, ones(n, 1)); +%! assert (size(x), [1 k]); +%! x = randsample((1:n)', k); +%! assert (size(x), [k 1]); +%! x = randsample((1:n)', k, true); +%! assert (size(x), [k 1]); +%! x = randsample((1:n)', k, false); +%! assert (size(x), [k 1]); +%! x = randsample((1:n)', k, true, ones(n, 1)); +%! assert (size(x), [k 1]); +%! n = 10; +%! k = 100; +%! x = randsample(n, k, true, 1:n); +%! assert (size(x), [1 k]); +%! x = randsample((1:n)', k, true); +%! assert (size(x), [k 1]); +%! x = randsample(k, k, false, 1:k); +%! assert (size(x), [1 k]); diff --git a/inst/raylcdf.m b/inst/raylcdf.m new file mode 100644 index 0000000..8e57d1f --- /dev/null +++ b/inst/raylcdf.m @@ -0,0 +1,117 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{p} =} raylcdf (@var{x}, @var{sigma}) +## Compute the cumulative distribution function of the Rayleigh +## distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is the support. The elements of @var{x} must be non-negative. +## +## @item +## @var{sigma} is the parameter of the Rayleigh distribution. The elements +## of @var{sigma} must be positive. +## @end itemize +## @var{x} and @var{sigma} must be of common size or one of them must be +## scalar. +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{p} is the cumulative distribution of the Rayleigh distribution at +## each element of @var{x} and corresponding parameter @var{sigma}. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = 0:0.5:2.5; +## sigma = 1:6; +## p = raylcdf (x, sigma) +## @end group +## +## @group +## p = raylcdf (x, 0.5) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. pages 104 and 148, McGraw-Hill, New York, second edition, +## 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: CDF of the Rayleigh distribution + +function p = raylcdf (x, sigma) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (x) && ! ismatrix (x)) + error ("raylcdf: x must be a numeric matrix"); + endif + if (! isempty (sigma) && ! ismatrix (sigma)) + error ("raylcdf: sigma must be a numeric matrix"); + endif + + if (! isscalar (x) || ! isscalar (sigma)) + [retval, x, sigma] = common_size (x, sigma); + if (retval > 0) + error ("raylcdf: x and sigma must be of common size or scalar"); + endif + endif + + # Calculate cdf + p = 1 - exp ((-x .^ 2) ./ (2 * sigma .^ 2)); + + # Continue argument check + k = find (! (x >= 0) | ! (x < Inf) | ! (sigma > 0)); + if (any (k)) + p(k) = NaN; + endif + +endfunction + +%!test +%! x = 0:0.5:2.5; +%! sigma = 1:6; +%! p = raylcdf (x, sigma); +%! expected_p = [0.0000, 0.0308, 0.0540, 0.0679, 0.0769, 0.0831]; +%! assert (p, expected_p, 0.001); + +%!test +%! x = 0:0.5:2.5; +%! p = raylcdf (x, 0.5); +%! expected_p = [0.0000, 0.3935, 0.8647, 0.9889, 0.9997, 1.0000]; +%! assert (p, expected_p, 0.001); diff --git a/inst/raylinv.m b/inst/raylinv.m new file mode 100644 index 0000000..8e332f9 --- /dev/null +++ b/inst/raylinv.m @@ -0,0 +1,123 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{x} =} raylinv (@var{p}, @var{sigma}) +## Compute the quantile of the Rayleigh distribution. The quantile is the +## inverse of the cumulative distribution function. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{p} is the cumulative distribution. The elements of @var{p} must be +## probabilities. +## +## @item +## @var{sigma} is the parameter of the Rayleigh distribution. The elements +## of @var{sigma} must be positive. +## @end itemize +## @var{p} and @var{sigma} must be of common size or one of them must be +## scalar. +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{x} is the quantile of the Rayleigh distribution at each element of +## @var{p} and corresponding parameter @var{sigma}. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## p = 0:0.1:0.5; +## sigma = 1:6; +## x = raylinv (p, sigma) +## @end group +## +## @group +## x = raylinv (p, 0.5) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. pages 104 and 148, McGraw-Hill, New York, second edition, +## 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Quantile of the Rayleigh distribution + +function x = raylinv (p, sigma) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (p) && ! ismatrix (p)) + error ("raylinv: p must be a numeric matrix"); + endif + if (! isempty (sigma) && ! ismatrix (sigma)) + error ("raylinv: sigma must be a numeric matrix"); + endif + + if (! isscalar (p) || ! isscalar (sigma)) + [retval, p, sigma] = common_size (p, sigma); + if (retval > 0) + error ("raylinv: p and sigma must be of common size or scalar"); + endif + endif + + # Calculate quantile + x = sqrt (-2 .* log (1 - p) .* sigma .^ 2); + + k = find (p == 1); + if (any (k)) + x(k) = Inf; + endif + + # Continue argument check + k = find (! (p >= 0) | ! (p <= 1) | ! (sigma > 0)); + if (any (k)) + x(k) = NaN; + endif + +endfunction + +%!test +%! p = 0:0.1:0.5; +%! sigma = 1:6; +%! x = raylinv (p, sigma); +%! expected_x = [0.0000, 0.9181, 2.0041, 3.3784, 5.0538, 7.0645]; +%! assert (x, expected_x, 0.001); + +%!test +%! p = 0:0.1:0.5; +%! x = raylinv (p, 0.5); +%! expected_x = [0.0000, 0.2295, 0.3340, 0.4223, 0.5054, 0.5887]; +%! assert (x, expected_x, 0.001); diff --git a/inst/raylpdf.m b/inst/raylpdf.m new file mode 100644 index 0000000..744dc10 --- /dev/null +++ b/inst/raylpdf.m @@ -0,0 +1,116 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{y} =} raylpdf (@var{x}, @var{sigma}) +## Compute the probability density function of the Rayleigh distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is the support. The elements of @var{x} must be non-negative. +## +## @item +## @var{sigma} is the parameter of the Rayleigh distribution. The elements +## of @var{sigma} must be positive. +## @end itemize +## @var{x} and @var{sigma} must be of common size or one of them must be +## scalar. +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{y} is the probability density of the Rayleigh distribution at each +## element of @var{x} and corresponding parameter @var{sigma}. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## x = 0:0.5:2.5; +## sigma = 1:6; +## y = raylpdf (x, sigma) +## @end group +## +## @group +## y = raylpdf (x, 0.5) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. pages 104 and 148, McGraw-Hill, New York, second edition, +## 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: PDF of the Rayleigh distribution + +function y = raylpdf (x, sigma) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (x) && ! ismatrix (x)) + error ("raylpdf: x must be a numeric matrix"); + endif + if (! isempty (sigma) && ! ismatrix (sigma)) + error ("raylpdf: sigma must be a numeric matrix"); + endif + + if (! isscalar (x) || ! isscalar (sigma)) + [retval, x, sigma] = common_size (x, sigma); + if (retval > 0) + error ("raylpdf: x and sigma must be of common size or scalar"); + endif + endif + + # Calculate pdf + y = x .* exp ((-x .^ 2) ./ (2 .* sigma .^ 2)) ./ (sigma .^ 2); + + # Continue argument check + k = find (! (x >= 0) | ! (x < Inf) | ! (sigma > 0)); + if (any (k)) + y(k) = NaN; + endif + +endfunction + +%!test +%! x = 0:0.5:2.5; +%! sigma = 1:6; +%! y = raylpdf (x, sigma); +%! expected_y = [0.0000, 0.1212, 0.1051, 0.0874, 0.0738, 0.0637]; +%! assert (y, expected_y, 0.001); + +%!test +%! x = 0:0.5:2.5; +%! y = raylpdf (x, 0.5); +%! expected_y = [0.0000, 1.2131, 0.5413, 0.0667, 0.0027, 0.0000]; +%! assert (y, expected_y, 0.001); diff --git a/inst/raylrnd.m b/inst/raylrnd.m new file mode 100644 index 0000000..950270c --- /dev/null +++ b/inst/raylrnd.m @@ -0,0 +1,157 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{x} =} raylrnd (@var{sigma}) +## @deftypefnx {Function File} {@var{x} =} raylrnd (@var{sigma}, @var{sz}) +## @deftypefnx {Function File} {@var{x} =} raylrnd (@var{sigma}, @var{r}, @var{c}) +## Generate a matrix of random samples from the Rayleigh distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{sigma} is the parameter of the Rayleigh distribution. The elements +## of @var{sigma} must be positive. +## +## @item +## @var{sz} is the size of the matrix to be generated. @var{sz} must be a +## vector of non-negative integers. +## +## @item +## @var{r} is the number of rows of the matrix to be generated. @var{r} must +## be a non-negative integer. +## +## @item +## @var{c} is the number of columns of the matrix to be generated. @var{c} +## must be a non-negative integer. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{x} is a matrix of random samples from the Rayleigh distribution with +## corresponding parameter @var{sigma}. If neither @var{sz} nor @var{r} and +## @var{c} are specified, then @var{x} is of the same size as @var{sigma}. +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## sigma = 1:6; +## x = raylrnd (sigma) +## @end group +## +## @group +## sz = [2, 3]; +## x = raylrnd (0.5, sz) +## @end group +## +## @group +## r = 2; +## c = 3; +## x = raylrnd (0.5, r, c) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. pages 104 and 148, McGraw-Hill, New York, second edition, +## 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Random samples from the Rayleigh distribution + +function x = raylrnd (sigma, r, c) + + # Check arguments + if (nargin == 1) + sz = size (sigma); + elseif (nargin == 2) + if (! isvector (r) || any ((r < 0) | round (r) != r)) + error ("raylrnd: sz must be a vector of non-negative integers") + endif + sz = r(:)'; + if (! isscalar (sigma) && ! isempty (sigma) && (length (size (sigma)) != length (sz) || any (size (sigma) != sz))) + error ("raylrnd: sigma must be scalar or of size sz"); + endif + elseif (nargin == 3) + if (! isscalar (r) || any ((r < 0) | round (r) != r)) + error ("raylrnd: r must be a non-negative integer") + endif + if (! isscalar (c) || any ((c < 0) | round (c) != c)) + error ("raylrnd: c must be a non-negative integer") + endif + sz = [r, c]; + if (! isscalar (sigma) && ! isempty (sigma) && (length (size (sigma)) != length (sz) || any (size (sigma) != sz))) + error ("raylrnd: sigma must be scalar or of size [r, c]"); + endif + else + print_usage (); + endif + + if (! isempty (sigma) && ! ismatrix (sigma)) + error ("raylrnd: sigma must be a numeric matrix"); + endif + + if (isempty (sigma)) + x = []; + elseif (isscalar (sigma) && ! (sigma > 0)) + x = NaN .* ones (sz); + else + # Draw random samples + x = sqrt (-2 .* log (1 - rand (sz)) .* sigma .^ 2); + + # Continue argument check + k = find (! (sigma > 0)); + if (any (k)) + x(k) = NaN; + endif + endif + +endfunction + +%!test +%! sigma = 1:6; +%! x = raylrnd (sigma); +%! assert (size (x), size (sigma)); +%! assert (all (x >= 0)); + +%!test +%! sigma = 0.5; +%! sz = [2, 3]; +%! x = raylrnd (sigma, sz); +%! assert (size (x), sz); +%! assert (all (x >= 0)); + +%!test +%! sigma = 0.5; +%! r = 2; +%! c = 3; +%! x = raylrnd (sigma, r, c); +%! assert (size (x), [r, c]); +%! assert (all (x >= 0)); diff --git a/inst/raylstat.m b/inst/raylstat.m new file mode 100644 index 0000000..96f4d8b --- /dev/null +++ b/inst/raylstat.m @@ -0,0 +1,94 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} raylstat (@var{sigma}) +## Compute mean and variance of the Rayleigh distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{sigma} is the parameter of the Rayleigh distribution. The elements +## of @var{sigma} must be positive. +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the Rayleigh distribution. +## +## @item +## @var{v} is the variance of the Rayleigh distribution. +## @end itemize +## +## @subheading Example +## +## @example +## @group +## sigma = 1:6; +## [m, v] = raylstat (sigma) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the Rayleigh distribution + +function [m, v] = raylstat (sigma) + + # Check arguments + if (nargin != 1) + print_usage (); + endif + + if (! isempty (sigma) && ! ismatrix (sigma)) + error ("raylstat: sigma must be a numeric matrix"); + endif + + # Calculate moments + m = sigma .* sqrt (pi ./ 2); + v = (2 - pi ./ 2) .* sigma .^ 2; + + # Continue argument check + k = find (! (sigma > 0)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! sigma = 1:6; +%! [m, v] = raylstat (sigma); +%! expected_m = [1.2533, 2.5066, 3.7599, 5.0133, 6.2666, 7.5199]; +%! expected_v = [0.4292, 1.7168, 3.8628, 6.8673, 10.7301, 15.4513]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/regress.m b/inst/regress.m new file mode 100644 index 0000000..32bd921 --- /dev/null +++ b/inst/regress.m @@ -0,0 +1,214 @@ +## Copyright (C) 2005, 2006 William Poetra Yoga Hadisoeseno +## Copyright (C) 2011 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats}] =} regress (@var{y}, @var{X}, [@var{alpha}]) +## Multiple Linear Regression using Least Squares Fit of @var{y} on @var{X} +## with the model @code{y = X * beta + e}. +## +## Here, +## +## @itemize +## @item +## @code{y} is a column vector of observed values +## @item +## @code{X} is a matrix of regressors, with the first column filled with +## the constant value 1 +## @item +## @code{beta} is a column vector of regression parameters +## @item +## @code{e} is a column vector of random errors +## @end itemize +## +## Arguments are +## +## @itemize +## @item +## @var{y} is the @code{y} in the model +## @item +## @var{X} is the @code{X} in the model +## @item +## @var{alpha} is the significance level used to calculate the confidence +## intervals @var{bint} and @var{rint} (see `Return values' below). If not +## specified, ALPHA defaults to 0.05 +## @end itemize +## +## Return values are +## +## @itemize +## @item +## @var{b} is the @code{beta} in the model +## @item +## @var{bint} is the confidence interval for @var{b} +## @item +## @var{r} is a column vector of residuals +## @item +## @var{rint} is the confidence interval for @var{r} +## @item +## @var{stats} is a row vector containing: +## +## @itemize +## @item The R^2 statistic +## @item The F statistic +## @item The p value for the full model +## @item The estimated error variance +## @end itemize +## @end itemize +## +## @var{r} and @var{rint} can be passed to @code{rcoplot} to visualize +## the residual intervals and identify outliers. +## +## NaN values in @var{y} and @var{X} are removed before calculation begins. +## +## @end deftypefn + +## References: +## - Matlab 7.0 documentation (pdf) +## - ¡¶´óѧÊýѧʵÑé¡· ½ªÆôÔ´ µÈ (textbook) +## - http://www.netnam.vn/unescocourse/statistics/12_5.htm +## - wsolve.m in octave-forge +## - http://www.stanford.edu/class/ee263/ls_ln_matlab.pdf + +function [b, bint, r, rint, stats] = regress (y, X, alpha) + + if (nargin < 2 || nargin > 3) + print_usage; + endif + + if (! ismatrix (y)) + error ("regress: y must be a numeric matrix"); + endif + if (! ismatrix (X)) + error ("regress: X must be a numeric matrix"); + endif + + if (columns (y) != 1) + error ("regress: y must be a column vector"); + endif + + if (rows (y) != rows (X)) + error ("regress: y and X must contain the same number of rows"); + endif + + if (nargin < 3) + alpha = 0.05; + elseif (! isscalar (alpha)) + error ("regress: alpha must be a scalar value") + endif + + notnans = ! logical (sum (isnan ([y X]), 2)); + y = y(notnans); + X = X(notnans,:); + + [Xq Xr] = qr (X, 0); + pinv_X = Xr \ Xq'; + + b = pinv_X * y; + + if (nargout > 1) + + n = rows (X); + p = columns (X); + dof = n - p; + t_alpha_2 = tinv (alpha / 2, dof); + + r = y - X * b; # added -- Nir + SSE = sum (r .^ 2); + v = SSE / dof; + + # c = diag(inv (X' * X)) using (economy) QR decomposition + # which means that we only have to use Xr + c = diag (inv (Xr' * Xr)); + + db = t_alpha_2 * sqrt (v * c); + + bint = [b + db, b - db]; + + endif + + if (nargout > 3) + + dof1 = n - p - 1; + h = sum(X.*pinv_X', 2); #added -- Nir (same as diag(X*pinv_X), without doing the matrix multiply) + + # From Matlab's documentation on Multiple Linear Regression, + # sigmaihat2 = norm (r) ^ 2 / dof1 - r .^ 2 / (dof1 * (1 - h)); + # dr = -tinv (1 - alpha / 2, dof) * sqrt (sigmaihat2 .* (1 - h)); + # Substitute + # norm (r) ^ 2 == sum (r .^ 2) == SSE + # -tinv (1 - alpha / 2, dof) == tinv (alpha / 2, dof) == t_alpha_2 + # We get + # sigmaihat2 = (SSE - r .^ 2 / (1 - h)) / dof1; + # dr = t_alpha_2 * sqrt (sigmaihat2 .* (1 - h)); + # Combine, we get + # dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1); + + dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1); + + rint = [r + dr, r - dr]; + + endif + + if (nargout > 4) + + R2 = 1 - SSE / sum ((y - mean (y)) .^ 2); +# F = (R2 / (p - 1)) / ((1 - R2) / dof); + F = dof / (p - 1) / (1 / R2 - 1); + pval = 1 - fcdf (F, p - 1, dof); + + stats = [R2 F pval v]; + + endif + +endfunction + + +%!test +%! % Longley data from the NIST Statistical Reference Dataset +%! Z = [ 60323 83.0 234289 2356 1590 107608 1947 +%! 61122 88.5 259426 2325 1456 108632 1948 +%! 60171 88.2 258054 3682 1616 109773 1949 +%! 61187 89.5 284599 3351 1650 110929 1950 +%! 63221 96.2 328975 2099 3099 112075 1951 +%! 63639 98.1 346999 1932 3594 113270 1952 +%! 64989 99.0 365385 1870 3547 115094 1953 +%! 63761 100.0 363112 3578 3350 116219 1954 +%! 66019 101.2 397469 2904 3048 117388 1955 +%! 67857 104.6 419180 2822 2857 118734 1956 +%! 68169 108.4 442769 2936 2798 120445 1957 +%! 66513 110.8 444546 4681 2637 121950 1958 +%! 68655 112.6 482704 3813 2552 123366 1959 +%! 69564 114.2 502601 3931 2514 125368 1960 +%! 69331 115.7 518173 4806 2572 127852 1961 +%! 70551 116.9 554894 4007 2827 130081 1962 ]; +%! % Results certified by NIST using 500 digit arithmetic +%! % b and standard error in b +%! V = [ -3482258.63459582 890420.383607373 +%! 15.0618722713733 84.9149257747669 +%! -0.358191792925910E-01 0.334910077722432E-01 +%! -2.02022980381683 0.488399681651699 +%! -1.03322686717359 0.214274163161675 +%! -0.511041056535807E-01 0.226073200069370 +%! 1829.15146461355 455.478499142212 ]; +%! Rsq = 0.995479004577296; +%! F = 330.285339234588; +%! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)]; +%! alpha = 0.05; +%! [b, bint, r, rint, stats] = regress (y, X, alpha); +%! assert(b,V(:,1),3e-6); +%! assert(stats(1),Rsq,1e-12); +%! assert(stats(2),F,3e-8); +%! assert(((bint(:,1)-bint(:,2))/2)/tinv(alpha/2,9),V(:,2),-1.e-5); diff --git a/inst/regress_gp.m b/inst/regress_gp.m new file mode 100644 index 0000000..40310d8 --- /dev/null +++ b/inst/regress_gp.m @@ -0,0 +1,136 @@ +## Copyright (c) 2012 Juan Pablo Carbajal +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{K}] =} regress_gp (@var{x}, @var{y}, @var{Sp}) +## @deftypefnx {Function File} {[@dots{} @var{yi} @var{dy}] =} regress_gp (@dots{}, @var{xi}) +## Linear scalar regression using gaussian processes. +## +## It estimates the model @var{y} = @var{x}'*m for @var{x} R^D and @var{y} in R. +## The information about errors of the predictions (interpolation/extrapolation) is given +## by the covarianve matrix @var{K}. If D==1 the inputs must be column vectors, +## if D>1 then @var{x} is n-by-D, with n the number of data points. @var{Sp} defines +## the prior covariance of @var{m}, it should be a (D+1)-by-(D+1) positive definite matrix, +## if it is empty, the default is @code{Sp = 100*eye(size(x,2)+1)}. +## +## If @var{xi} inputs are provided, the model is evaluated and returned in @var{yi}. +## The estimation of the variation of @var{yi} are given in @var{dy}. +## +## Run @code{demo regress_gp} to see an examples. +## +## The function is a direc implementation of the formulae in pages 11-12 of +## Gaussian Processes for Machine Learning. Carl Edward Rasmussen and @ +## Christopher K. I. Williams. The MIT Press, 2006. ISBN 0-262-18253-X. +## available online at @url{http://gaussianprocess.org/gpml/}. +## +## @seealso{regress} +## @end deftypefn + +function [wm K yi dy] = regress_gp (x,y,Sp=[],xi=[]) + + if isempty(Sp) + Sp = 100*eye(size(x,2)+1); + end + + x = [ones(1,size(x,1)); x']; + + ## Juan Pablo Carbajal + ## Note that in the book the equation (below 2.11) for the A reads + ## A = (1/sy^2)*x*x' + inv (Vp); + ## where sy is the scalar variance of the of the residuals (i.e y = x' * w + epsilon) + ## and epsilon is drawn from N(0,sy^2). Vp is the variance of the parameters w. + ## Note that + ## (sy^2 * A)^{-1} = (1/sy^2)*A^{-1} = (x*x' + sy^2 * inv(Vp))^{-1}; + ## and that the formula for the w mean is + ## (1/sy^2)*A^{-1}*x*y + ## Then one obtains + ## inv(x*x' + sy^2 * inv(Vp))*x*y + ## Looking at the formula bloew we see that Sp = (1/sy^2)*Vp + ## making the regression depend on only one parameter, Sp, and not two. + A = x*x' + inv (Sp); + K = inv (A); + wm = K*x*y; + + yi =[]; + dy =[]; + if !isempty (xi); + xi = [ones(size(xi,1),1) xi]; + yi = xi*wm; + dy = diag (xi*K*xi'); + end + +endfunction + +%!demo +%! % 1D Data +%! x = 2*rand (5,1)-1; +%! y = 2*x -1 + 0.3*randn (5,1); +%! +%! % Points for interpolation/extrapolation +%! xi = linspace (-2,2,10)'; +%! +%! [m K yi dy] = regress_gp (x,y,[],xi); +%! +%! plot (x,y,'xk',xi,yi,'r-',xi,bsxfun(@plus, yi, [-dy +dy]),'b-'); + +%!demo +%! % 2D Data +%! x = 2*rand (4,2)-1; +%! y = 2*x(:,1)-3*x(:,2) -1 + 1*randn (4,1); +%! +%! % Mesh for interpolation/extrapolation +%! [xi yi] = meshgrid (linspace (-1,1,10)); +%! +%! [m K zi dz] = regress_gp (x,y,[],[xi(:) yi(:)]); +%! zi = reshape (zi, 10,10); +%! dz = reshape (dz,10,10); +%! +%! plot3 (x(:,1),x(:,2),y,'.g','markersize',8); +%! hold on; +%! h = mesh (xi,yi,zi,zeros(10,10)); +%! set(h,'facecolor','none'); +%! h = mesh (xi,yi,zi+dz,ones(10,10)); +%! set(h,'facecolor','none'); +%! h = mesh (xi,yi,zi-dz,ones(10,10)); +%! set(h,'facecolor','none'); +%! hold off +%! axis tight +%! view(80,25) + +%!demo +%! % Projection over basis function +%! pp = [2 2 0.3 1]; +%! n = 10; +%! x = 2*rand (n,1)-1; +%! y = polyval(pp,x) + 0.3*randn (n,1); +%! +%! % Powers +%! px = [sqrt(abs(x)) x x.^2 x.^3]; +%! +%! % Points for interpolation/extrapolation +%! xi = linspace (-1,1,100)'; +%! pxi = [sqrt(abs(xi)) xi xi.^2 xi.^3]; +%! +%! Sp = 100*eye(size(px,2)+1); +%! Sp(2,2) = 1; # We don't believe the sqrt is present +%! [m K yi dy] = regress_gp (px,y,Sp,pxi); +%! disp(m) +%! +%! plot (x,y,'xk;Data;',xi,yi,'r-;Estimation;',xi,polyval(pp,xi),'g-;True;'); +%! axis tight +%! axis manual +%! hold on +%! plot (xi,bsxfun(@plus, yi, [-dy +dy]),'b-'); +%! hold off diff --git a/inst/repanova.m b/inst/repanova.m new file mode 100644 index 0000000..e9d525c --- /dev/null +++ b/inst/repanova.m @@ -0,0 +1,100 @@ +## Copyright (C) 2011 Kyle Winfree +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{pval}, @var{table}, @var{st}] =} repanova (@var{X}, @var{cond}) +## @deftypefnx {Function File} {[@var{pval}, @var{table}, @var{st}] =} repanova (@var{X}, @var{cond}, ['string' | 'cell']) +## Perform a repeated measures analysis of variance (Repeated ANOVA). +## X is formated such that each row is a subject and each column is a condition. +## +## condition is typically a point in time, say t=1 then t=2, etc +## condition can also be thought of as groups. +## +## The optional flag can be either 'cell' or 'string' and reflects +## the format of the table returned. Cell is the default. +## +## NaNs are ignored using nanmean and nanstd. +## +## This fuction does not currently support multiple columns of the same +## condition! +## @end deftypefn + +function [p, table, st] = repanova(varargin) + +switch nargin + case 0 + error('Too few inputs.'); + case 1 + X = varargin{1}; + for c = 1:size(X, 2) + condition{c} = ['time', num2str(c)]; + end + option = 'cell'; + case 2 + X = varargin{1}; + condition = varargin{2}; + option = 'cell'; + case 3 + X = varargin{1}; + condition = varargin{2}; + option = varargin{3}; + otherwise + error('Too many inputs.'); +end + % Find the means of the subjects and measures, ignoring any NaNs + u_subjects = nanmean(X,2); + u_measures = nanmean(X,1); + u_grand = nansum(nansum(X)) / (size(X,1) * size(X,2)); + % Differences between rows will be reflected in SS subjects, differences + % between columns will be reflected in SS_within subjects. + N = size(X,1); % number of subjects + J = size(X,2); % number of samples per subject + SS_measures = N * nansum((u_measures - u_grand).^2); + SS_subjects = J * nansum((u_subjects - u_grand).^2); + SS_total = nansum(nansum((X - u_grand).^2)); + SS_error = SS_total - SS_measures - SS_subjects; + df_measures = J - 1; + df_subjects = N - 1; + df_grand = (N*J) - 1; + df_error = df_grand - df_measures - df_subjects; + MS_measures = SS_measures / df_measures; + MS_subjects = SS_subjects / df_subjects; + MS_error = SS_error / df_error; % variation expected as a result of sampling error alone + F = MS_measures / MS_error; + p = 1 - fcdf(F, df_measures, df_error); % Probability of F given equal means. + + if strcmp(option, 'string') + table = [sprintf('\nSource\tSS\tdf\tMS\tF\tProb > F'), ... + sprintf('\nSubject\t%g\t%i\t%g', SS_subjects, df_subjects, MS_subjects), ... + sprintf('\nMeasure\t%g\t%i\t%g\t%g\t%g', SS_measures, df_measures, MS_measures, F, p), ... + sprintf('\nError\t%g\t%i\t%g', SS_error, df_error, MS_error), ... + sprintf('\n')]; + else + table = {'Source', 'Partial SS', 'df', 'MS', 'F', 'Prob > F'; ... + 'Subject', SS_subjects, df_subjects, MS_subjects, '', ''; ... + 'Measure', SS_measures, df_measures, MS_measures, F, p}; + end + + st.gnames = condition'; % this is the same struct format used in anova1 + st.n = repmat(N, 1, J); + st.source = 'anova1'; % it cannot be assumed that 'repanova' is a supported source for multcompare + st.means = u_measures; + st.df = df_error; + st.s = sqrt(MS_error); +end + +% This function was created with guidance from the following websites: +% http://courses.washington.edu/stat217/rmANOVA.html +% http://grants.hhp.coe.uh.edu/doconnor/PEP6305/Topic%20010%20Repeated%20Measures.htm diff --git a/inst/runstest.m b/inst/runstest.m new file mode 100644 index 0000000..4ce8d47 --- /dev/null +++ b/inst/runstest.m @@ -0,0 +1,107 @@ +## Copyright (C) 2013 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{h}, @var{p}, @var{stats} =} runstest (@var{x}, @var{v}) +## Runs test for detecting serial correlation in the vector @var{x}. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{x} is the vector of given values. +## @item +## @var{v} is the value to subtract from @var{x} to get runs (defaults to @code{median(x)}) +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{h} is true if serial correlation is detected at the 95% confidence level (two-tailed), false otherwise. +## @item +## @var{p} is the probablity of obtaining a test statistic of the magnitude found under the null hypothesis of no serial correlation. +## @item +## @var{stats} is the structure containing as fields the number of runs @var{nruns}; the numbers of positive and negative values of @code{x - v}, @var{n1} and @var{n0}; and the test statistic @var{z}. +## +## @end itemize +## +## Note: the large-sample normal approximation is used to find @var{h} and @var{p}. This is accurate if @var{n1}, @var{n0} are both greater than 10. +## +## Reference: +## NIST Engineering Statistics Handbook, 1.3.5.13. Runs Test for Detecting Non-randomness, http://www.itl.nist.gov/div898/handbook/eda/section3/eda35d.htm +## +## @seealso{} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Runs test for detecting serial correlation + +function [h, p, stats] = runstest (x, x2) + + # Check arguments + if (nargin < 1) + print_usage; + endif + + if nargin > 1 && isnumeric(x2) + v = x2; + else + v = median(x); + endif + + x = x(~isnan(x)); #delete missing values + x = sign(x - v); + x = x(x ~= 0); #delete any zeros + + R = sum((x(1:(end-1)) .* x(2:end)) < 0) + 1; #number of runs + + #expected number of runs for an iid sequence + n1 = sum(x > 0); + n2 = sum(x < 0); + R_bar = 1 + 2*n1*n2/(n1 + n2); + + #standard deviation of number of runs for an iid sequence + s_R = sqrt(2*n1*n2*(2*n1*n2 - n1 - n2)/((n1 + n2)^2 * (n1 + n2 - 1))); + + #desired significance level + alpha = 0.05; + + Z = (R - R_bar) / s_R; #test statistic + + p = 2 * normcdf(-abs(Z)); + + h = p < alpha; + + if nargout > 2 + stats.nruns = R; + stats.n1 = n1; + stats.n0 = n2; + stats.z = Z; + endif + +endfunction + + + +%!test +%! data = [-213 -564 -35 -15 141 115 -420 -360 203 -338 -431 194 -220 -513 154 -125 -559 92 -21 -579 -52 99 -543 -175 162 -457 -346 204 -300 -474 164 -107 -572 -8 83 -541 -224 180 -420 -374 201 -236 -531 83 27 -564 -112 131 -507 -254 199 -311 -495 143 -46 -579 -90 136 -472 -338 202 -287 -477 169 -124 -568 17 48 -568 -135 162 -430 -422 172 -74 -577 -13 92 -534 -243 194 -355 -465 156 -81 -578 -64 139 -449 -384 193 -198 -538 110 -44 -577 -6 66 -552 -164 161 -460 -344 205 -281 -504 134 -28 -576 -118 156 -437 -381 200 -220 -540 83 11 -568 -160 172 -414 -408 188 -125 -572 -32 139 -492 -321 205 -262 -504 142 -83 -574 0 48 -571 -106 137 -501 -266 190 -391 -406 194 -186 -553 83 -13 -577 -49 103 -515 -280 201 300 -506 131 -45 -578 -80 138 -462 -361 201 -211 -554 32 74 -533 -235 187 -372 -442 182 -147 -566 25 68 -535 -244 194 -351 -463 174 -125 -570 15 72 -550 -190 172 -424 -385 198 -218 -536 96]; #NIST beam deflection data, http://www.itl.nist.gov/div898/handbook/eda/section4/eda425.htm +%! [h, p, stats] = runstest (data); +%! expected_h = true; +%! expected_p = 0.0070646; +%! expected_z = 2.6938; +%! assert (h, expected_h); +%! assert (p, expected_p, 1E-6); +%! assert (stats.z, expected_z, 1E-4); diff --git a/inst/sigma_pts.m b/inst/sigma_pts.m new file mode 100644 index 0000000..cf01557 --- /dev/null +++ b/inst/sigma_pts.m @@ -0,0 +1,123 @@ +## Copyright (C) 2017 - Juan Pablo Carbajal +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## Author: Juan Pablo Carbajal + +## -*- texinfo -*- +## @deftypefn {} {@var{pts} =} sigma_pts (@var{n}) +## @deftypefnx {@var{pts} =} sigma_pts (@var{n}, @var{m}) +## @deftypefnx {@var{pts} =} sigma_pts (@var{n}, @var{m}, @var{K}) +## @deftypefnx {@var{pts} =} sigma_pts (@var{n}, @var{m}, @var{K}, @var{l}) +## Calculates 2*@var{n}+1 sigma points in @var{n} dimensions. +## +## Sigma points are used in the unscented transfrom to estimate +## the result of applying a given nonlinear transformation to a probability +## distribution that is characterized only in terms of a finite set of statistics. +## +## If only the dimension @var{n} is given the resulting points have zero mean +## and identity covariance matrix. +## If the mean @var{m} or the covaraince matrix @var{K} are given, then the resulting points +## will have those statistics. +## The factor @var{l} scaled the points away from the mean. It is useful to tune +## the accuracy of the unscented transfrom. +## +## There is no unique way of computing sigma points, this function implements the +## algorithm described in section 2.6 "The New Filter" pages 40-41 of +## +## Uhlmann, Jeffrey (1995). "Dynamic Map Building and Localization: New Theoretical Foundations". +## Ph.D. thesis. University of Oxford. +## +## @end deftypefn + +function pts = sigma_pts (n, m = [], K = [], l = 0) + + if isempty (K) + K = eye (n); + endif + if isempty (m) + m = zeros (1, n); + endif + + if (n ~= length (m)) + error ("Dimension and size of mean vector don't match.") + endif + if any(n ~= size (K)) + error ("Dimension and size of covariance matrix don't match.") + endif + + if isdefinite (K) <= 0 + error ("Covariance matrix should be positive definite.") + endif + + pts = zeros (2 * n + 1, n); + pts(1,:) = m; + + K = sqrtm ((n + l) * K); + pts(2:n+1,:) = bsxfun (@plus, m , K); + pts(n+2:end,:) = bsxfun (@minus, m , K); + +endfunction + +%!demo +%! K = [1 0.5; 0.5 1]; # covaraince matrix +%! # calculate and build associated ellipse +%! [R,S,~] = svd (K); +%! theta = atan2 (R(2,1), R(1,1)); +%! v = sqrt (diag (S)); +%! v = v .* [cos(theta) sin(theta); -sin(theta) cos(theta)]; +%! t = linspace (0, 2*pi, 100).'; +%! xe = v(1,1) * cos (t) + v(2,1) * sin (t); +%! ye = v(1,2) * cos (t) + v(2,2) * sin (t); +%! +%! figure(1); clf; hold on +%! # Plot ellipse and axes +%! line ([0 0; v(:,1).'],[0 0; v(:,2).']) +%! plot (xe,ye,'-r'); +%! +%! col = 'rgb'; +%! l = [-1.8 -1 1.5]; +%! for li = 1:3 +%! p = sigma_pts (2, [], K, l(li)); +%! tmp = plot (p(2:end,1), p(2:end,2), ['x' col(li)], ... +%! p(1,1), p(1,2), ['o' col(li)]); +%! h(li) = tmp(1); +%! endfor +%! hold off +%! axis image +%! legend (h, arrayfun (@(x) sprintf ("l:%.2g", x), l, "unif", 0)); + + +%!test +%! p = sigma_pts (5); +%! assert (mean (p), zeros(1,5), sqrt(eps)); +%! assert (cov (p), eye(5), sqrt(eps)); + +%!test +%! m = randn(1, 5); +%! p = sigma_pts (5, m); +%! assert (mean (p), m, sqrt(eps)); +%! assert (cov (p), eye(5), sqrt(eps)); + +%!test +%! x = linspace (0,1,5); +%! K = exp (- (x.' - x).^2/ 0.5); +%! p = sigma_pts (5, [], K); +%! assert (mean (p), zeros(1,5), sqrt(eps)); +%! assert (cov (p), K, sqrt(eps)); + +%!error sigma_pts(2,1); +%!error sigma_pts(2,[],1); +%!error sigma_pts(2,1,1); +%!error sigma_pts(2,[0.5 0.5],[-1 0; 0 0]); diff --git a/inst/signtest.m b/inst/signtest.m new file mode 100644 index 0000000..78fd5e6 --- /dev/null +++ b/inst/signtest.m @@ -0,0 +1,163 @@ +## Copyright (C) 2014 Tony Richardson +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{pval}, @var{h}, @var{stats}] =} signtest (@var{x}) +## @deftypefnx {Function File} {[@var{pval}, @var{h}, @var{stats}] =} signtest (@var{x}, @var{m}) +## @deftypefnx {Function File} {[@var{pval}, @var{h}, @var{stats}] =} signtest (@var{x}, @var{y}) +## @deftypefnx {Function File} {[@var{pval}, @var{h}, @var{stats}] =} signtest (@var{x}, @var{y}, @var{Name}, @var{Value}) +## Test for median. +## +## Perform a signtest of the null hypothesis that @var{x} is from a distribution +## that has a zero median. +## +## If the second argument @var{m} is a scalar, the null hypothesis is that +## X has median m. +## +## If the second argument @var{y} is a vector, the null hypothesis is that +## the distribution of @code{@var{x} - @var{y}} has zero median. +## +## The argument @qcode{"alpha"} can be used to specify the significance level +## of the test (the default value is 0.05). The string +## argument @qcode{"tail"}, can be used to select the desired alternative +## hypotheses. If @qcode{"alt"} is @qcode{"both"} (default) the null is +## tested against the two-sided alternative @code{median (@var{x}) != @var{m}}. +## If @qcode{"alt"} is @qcode{"right"} the one-sided +## alternative @code{median (@var{x}) > @var{m}} is considered. +## Similarly for @qcode{"left"}, the one-sided alternative @code{median +## (@var{x}) < @var{m}} is considered. When @qcode{"method"} is @qcode{"exact"} +## the p-value is computed using an exact method (this is the default). When +## @qcode{"method"} is @qcode{"approximate"} a normal approximation is used for the +## test statistic. +## +## The p-value of the test is returned in @var{pval}. If @var{h} is 0 the +## null hypothesis is accepted, if it is 1 the null hypothesis is rejected. +## @var{stats} is a structure containing the value of the test statistic +## (@var{sign}) and the value of the z statistic (@var{zval}) (only computed +## when the 'method' is 'approximate'. +## +## @end deftypefn + +## Author: Tony Richardson + +function [p, h, stats] = signtest(x, my, varargin) + + my_default = 0; + alpha = 0.05; + tail = 'both'; + method = 'exact'; + + % Find the first non-singleton dimension of x + dim = min(find(size(x)~=1)); + if isempty(dim), dim = 1; end + + if (nargin == 1) + my = my_default; + end + + i = 1; + while ( i <= length(varargin) ) + switch lower(varargin{i}) + case 'alpha' + i = i + 1; + alpha = varargin{i}; + case 'tail' + i = i + 1; + tail = varargin{i}; + case 'method' + i = i + 1; + method = varargin{i}; + case 'dim' + i = i + 1; + dim = varargin{i}; + otherwise + error('Invalid Name argument.',[]); + end + i = i + 1; + end + + if ~isa(tail, 'char') + error('tail argument to signtest must be a string\n',[]); + end + + if ~isa(method, 'char') + error('method argument to signtest must be a string\n',[]); + end + + % Set default values if arguments are present but empty + if isempty(my) + my = my_default; + end + + % This adjustment allows everything else to remain the + % same for both the one-sample t test and paired tests. + % If second argument is a vector + if ~isscalar(my) + x = x - my; + my = my_default; + end + + n = size(x, dim); + + switch lower(method) + case 'exact' + stats.zval = nan; + switch lower(tail) + case 'both' + w = min(sum(xmy)); + pl = binocdf(w, n, 0.5); + p = 2*min(pl,1-pl); + case 'left' + w = sum(xmy); + p = 1 - binocdf(w, n, 0.5); + otherwise + error('Invalid tail argument to signtest\n',[]); + end + case 'approximate' + switch lower(tail) + case 'both' + npos = sum(x>my); + nneg = sum(xmy); + nneg = sum(xmy); + nneg = sum(xmy); + stats.zval = (w - 0.5*n - 0.5*sign(npos-nneg))/sqrt(0.25*n); + p = 1-normcdf(stats.zval); + otherwise + error('Invalid tail argument to signtest\n',[]); + end + otherwise + error('Invalid method argument to signtest\n',[]); + end + + stats.sign = w; + + h = double(p < alpha); + +end diff --git a/inst/silhouette.m b/inst/silhouette.m new file mode 100644 index 0000000..9c240bd --- /dev/null +++ b/inst/silhouette.m @@ -0,0 +1,203 @@ +## Copyright (C) 2021 Stefano Guidoni +## Copyright (C) 2016 Nan Zhou +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} @ +## {[@var{si}, @var{h}] =} silhouette (@var{X}, @var{clust}) +## @deftypefnx {Function File} @ +## {[@var{si}, @var{h}] =} silhouette (@dots{}, @var{Metric}, @var{MetricArg}) +## +## Compute the silhouette values of clustered data and show them on a plot. +## +## @var{X} is a n-by-p matrix of n data points in a p-dimensional space. Each +## datapoint is assigned to a cluster using @var{clust}, a vector of n elements, +## one cluster assignment for each data point. +## +## Each silhouette value of @var{si}, a vector of size n, is a measure of the +## likelihood that a data point is accurately classified to the right cluster. +## Defining "a" as the mean distance between a point and the other points from +## its cluster, and "b" as the mean distance between that point and the points +## from other clusters, the silhouette value of the i-th point is: +## +## @tex +## \def\frac#1#2{{\begingroup#1\endgroup\over#2}} +## $$ S_i = \frac{b_i - a_i}{max(a_1,b_i)} $$ +## @end tex +## @ifnottex +## @verbatim +## bi - ai +## Si = ------------ +## max(ai,bi) +## @end verbatim +## @end ifnottex +## +## Each element of @var{si} ranges from -1, minimum likelihood of a correct +## classification, to 1, maximum likelihood. +## +## Optional input value @var{Metric} is the metric used to compute the distances +## between data points. Since @code{silhouette} uses @code{pdist} to compute +## these distances, @var{Metric} is quite similar to the option @var{Metric} of +## pdist and it can be: +## @itemize @bullet +## @item A known distance metric defined as a string: @qcode{Euclidean}, +## @qcode{sqEuclidean} (default), @qcode{cityblock}, @qcode{cosine}, +## @qcode{correlation}, @qcode{Hamming}, @qcode{Jaccard}. +## +## @item A vector as those created by @code{pdist}. In this case @var{X} does +## nothing. +## +## @item A function handle that is passed to @code{pdist} with @var{MetricArg} +## as optional inputs. +## @end itemize +## +## Optional return value @var{h} is a handle to the silhouette plot. +## +## @strong{Reference} +## Peter J. Rousseeuw, Silhouettes: a Graphical Aid to the Interpretation and +## Validation of Cluster Analysis. 1987. doi:10.1016/0377-0427(87)90125-7 +## @end deftypefn +## +## @seealso{dendrogram, evalcluster, kmeans, linkage, pdist} + +function [si, h] = silhouette (X, clust, metric = "sqeuclidean", varargin) + ## check the input parameters + if (nargin < 2) + print_usage (); + endif + + n = size (clust, 1); + + ## check size + if (! isempty (X)) + if (size (X, 1) != n) + error ("First dimension of X <%d> doesn't match that of clust <%d>",... + size (X, 1), n); + endif + endif + + ## check metric + if (ischar (metric)) + metric = lower (metric); + switch (metric) + case "sqeuclidean" + metric = "squaredeuclidean"; + case { "euclidean", "cityblock", "cosine", ... + "correlation", "hamming", "jaccard" } + ; + otherwise + error ("silhouette: invalid metric '%s'", metric); + endswitch + elseif (isnumeric (metric) && isvector (metric)) + ## X can be omitted when using this + distMatrix = squareform (metric); + if (size (distMatrix, 1) != n) + error ("First dimension of X <%d> doesn't match that of clust <%d>",... + size (distMatrix, 1), n); + endif + endif + + ## main + si = zeros(n, 1); + clusterIDs = unique (clust); # eg [1; 2; 3; 4] + m = length (clusterIDs); + + ## if only one cluster is defined, the silhouette value is not defined + if (m == 1) + si = NaN * ones (n, 1); + return; + endif + + ## distance matrix showing the distance for any two rows of X + if (! exist ('distMatrix', 'var')) + distMatrix = squareform (pdist (X, metric, varargin{:})); + endif + + ## calculate values of si one by one + for iii = 1 : length (si) + + ## allocate values to clusters + groupedValues = {}; + for jjj = 1 : m + groupedValues{clusterIDs(jjj)} = [distMatrix(iii, ... + clust == clusterIDs(jjj))]; + endfor + ## end allocation + + ## calculate a(i) + ## average distance of iii to all other objects in the same cluster + if (length (groupedValues{clust(iii)}) == 1) + si(iii) = 1; + continue; + else + a_i = (sum (groupedValues{clust(iii)})) / ... + (length (groupedValues{clust(iii)}) - 1); + endif + ## end a(i) + + + ## calculate b(i) + clusterIDs_new = clusterIDs; + ## remove the cluster iii in + clusterIDs_new(find (clusterIDs_new == clust(iii))) = []; + ## average distance of iii to all objects of another cluster + a_iii_2others = zeros (length (clusterIDs_new), 1); + for jjj = 1 : length (clusterIDs_new) + a_iii_2others(jjj) = mean (groupedValues{clusterIDs_new(jjj)}); + endfor + b_i = min (a_iii_2others); + ## end b(i) + + + ## calculate s(i) + si(iii) = (b_i - a_i) / (max ([a_i; b_i])); + ## end s(i) + endfor + + ## plot + ## a poor man silhouette graph + vBarsc = zeros (m, 1); + vPadding = [0; 0; 0; 0]; + Bars = vPadding; + + for i = 1 : m + vBar = si(find (clust == clusterIDs(i))); + vBarsc(i) = length (Bars) + (length (vBar) / 2); + Bars = [Bars; (sort (vBar, "descend")); vPadding]; + endfor + + figure(); + h = barh (Bars, "hist", "facecolor", [0 0.4471 0.7412]); + + xlabel ("Silhouette Value"); + ylabel ("Cluster"); + set (gca, "ytick", vBarsc, "yticklabel", clusterIDs); + ylim ([0 (length (Bars))]); + axis ("ij"); +endfunction + +%!error silhouette (); +%!error silhouette ([1 2; 1 1]); +%!error silhouette ([1 2; 1 1], [1 2 3]'); +%!error silhouette ([1 2; 1 1], [1 2]', "xxx"); + +%!demo +%! load fisheriris; +%! X = meas(:,3:4); +%! cidcs = kmeans (X, 3, "Replicates", 5); +%! silhouette (X, cidcs); +%! y_labels(cidcs([1 51 101])) = unique (species); +%! set (gca, "yticklabel", y_labels); +%! title ("Fisher's iris data"); diff --git a/inst/slicesample.m b/inst/slicesample.m new file mode 100644 index 0000000..0ccf9ab --- /dev/null +++ b/inst/slicesample.m @@ -0,0 +1,305 @@ +######################################################################## +## +## Copyright (C) 1993-2021 The Octave Project Developers +## +## See the file COPYRIGHT.md in the top-level directory of this +## distribution or . +## +## This file is part of Octave. +## +## Octave is free software: you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . +## +######################################################################## + +## -*- texinfo -*- +## @deftypefn {} {[@var{smpl}, @var{neval}] =} slicesample (@var{start}, @var{nsamples}, @var{property}, @var{value}, @dots{}) +## Draws @var{nsamples} samples from a target stationary distribution @var{pdf} +## using slice sampling of Radford M. Neal. +## +## Input: +## @itemize +## @item +## @var{start} is a 1 by @var{dim} vector of the starting point of the +## Markov chain. Each column corresponds to a different dimension. +## +## @item +## @var{nsamples} is the number of samples, the length of the Markov chain. +## @end itemize +## +## Next, several property-value pairs can or must be specified, they are: +## +## (Required properties) One of: +## +## @itemize +## @item +## @var{"pdf"}: the value is a function handle of the target stationary +## distribution to be sampled. The function should accept different locations +## in each row and each column corresponds to a different dimension. +## +## or +## +## @item +## @var{logpdf}: the value is a function handle of the log of the target +## stationary distribution to be sampled. The function should accept different +## locations in each row and each column corresponds to a different dimension. +## @end itemize +## +## The following input property/pair values may be needed depending on the +## desired outut: +## +## @itemize +## @item +## "burnin" @var{burnin} the number of points to discard at the beginning, the default +## is 0. +## +## @item +## "thin" @var{thin} omitts @var{m}-1 of every @var{m} points in the generated +## Markov chain. The default is 1. +## +## @item +## "width" @var{width} the maximum Manhattan distance between two samples. +## The default is 10. +## @end itemize +## +## Outputs: +## @itemize +## +## @item +## @var{smpl} is a @var{nsamples} by @var{dim} matrix of random +## values drawn from @var{pdf} where the rows are different random values, the +## columns correspond to the dimensions of @var{pdf}. +## +## @item +## @var{neval} is the number of function evaluations per sample. +## @end itemize +## Example : Sampling from a normal distribution +## +## @example +## @group +## start = 1; +## nsamples = 1e3; +## pdf = @@(x) exp (-.5 * x .^ 2) / (pi ^ .5 * 2 ^ .5); +## [smpl, accept] = slicesample (start, nsamples, "pdf", pdf, "thin", 4); +## histfit (smpl); +## @end group +## @end example +## +## @seealso{rand, mhsample, randsample} +## @end deftypefn + +function [smpl, neval] = slicesample (start, nsamples, varargin) + + if (nargin < 4) + print_usage (); + endif + + sizestart = size (start); + pdf = []; + logpdf = []; + width = 10; + burnin = 0; + thin = 1; + for k = 1:2:length (varargin) + if (ischar (varargin{k})) + switch lower (varargin{k}) + case "pdf" + if (isa (varargin{k+1}, "function_handle")) + pdf = varargin{k+1}; + else + error ("slicesample: pdf must be a function handle"); + endif + case "logpdf" + if (isa (varargin{k+1}, "function_handle")) + pdf = varargin{k+1}; + else + error ("slicesample: logpdf must be a function handle"); + endif + case "width" + if (numel (varargin{k+1}) == 1 || numel (varargin{k+1}) == sizestart(2)) + width = varargin{k+1}(:).'; + else + error ("slicesample: width must be a scalar or 1 by dim vector"); + endif + case "burnin" + if (varargin{k+1}>=0) + burnin = varargin{k+1}; + else + error ("slicesample: burnin must be greater than or equal to 0"); + endif + case "thin" + if (varargin{k+1}>=1) + thin = varargin{k+1}; + else + error ("slicesample: thin must be greater than or equal to 1"); + endif + otherwise + warning (["slicesample: Ignoring unknown option " varargin{k}]); + endswitch + else + error (["slicesample: " varargin{k} " is not a valid property."]); + endif + endfor + + if (! isempty (pdf) && isempty (logpdf)) + logpdf = @(x) rloge (pdf (x)); + elseif (isempty (pdf) && isempty (logpdf)) + error ("slicesample: pdf or logpdf must be input."); + endif + dim = sizestart(2); + smpl = zeros (nsamples, dim); + + if (all (sizestart == [1 dim])) + smpl(1, :) = start; + else + error ("slicesample: start must be a 1 by dim vector."); + endif + + maxit = 100; + neval = 0; + + fgraterthan = @(x, fxc) logpdf (x) >= fxc; + + ti = burnin + nsamples * thin; + + rndexp = rande (ti, 1); + crand = rand (ti, dim); + prand = rand (ti, dim); + + xc = smpl(1, :); + for i = 1:ti + neval++; + sliceheight = logpdf (xc) - rndexp(i); + c = width .* crand(i, :); + lb = xc - c; + ub = xc + width - c; + #Only for single variable as bounds can not be found with point when dim > 1 + if (dim == 1) + for k=1:maxit + neval++; + if (! fgraterthan (lb, sliceheight)) + break + endif + lb -= width; + end + if (k == maxit) + warning ("slicesample: Step out exceeded maximum iterations"); + endif + for k = 1:maxit + neval++; + if (! fgraterthan (ub, sliceheight)) + break + endif + ub += width; + end + if (k == maxit) + warning ("slicesample: Step out exceeded maximum iterations"); + endif + end + xp = (ub - lb) .* prand(i, :) + lb; + for k=1:maxit + neval++; + isgt = fgraterthan (xp,sliceheight); + if (all (isgt)) + break + endif + lc = ! isgt & xp < xc; + uc = ! isgt & xp > xc; + lb(lc) = xp(lc); + ub(uc) = xp(uc); + xp = (ub - lb) .* rand (1, dim) + lb; + end + if (k == maxit) + warning ("slicesample: Step in exceeded maximum iterations"); + endif + xc = xp; + if (i > burnin) + indx = (i - burnin) / thin; + if rem (indx, 1) == 0 + smpl(indx, :) = xc; + end + end + end + neval = neval / (nsamples * thin + burnin); +endfunction + +function y = rloge (x) + + y = -inf (size (x)); + xg0 = x > 0; + y(xg0) = log (x(xg0)); + +endfunction + + +%!demo +%! ## Define function to sample +%! d = 2; +%! mu = [-1; 2]; +%! Sigma = rand (d); +%! Sigma = (Sigma + Sigma'); +%! Sigma += eye (d)*abs (eigs (Sigma, 1, "sa")) * 1.1; +%! pdf = @(x)(2*pi)^(-d/2)*det(Sigma)^-.5*exp(-.5*sum((x.'-mu).*(Sigma\(x.'-mu)),1)); +%! ##Inputs +%! start = ones (1,2); +%! nsamples = 500; +%! K = 500; +%! m = 10; +%! [smpl, accept]=slicesample (start, nsamples, "pdf", pdf, "burnin", K, "thin", m, "width", [20, 30]); +%! figure; +%! hold on; +%! plot (smpl(:,1), smpl(:,2), 'x'); +%! [x, y] = meshgrid (linspace (-6,4), linspace(-3,7)); +%! z = reshape (pdf ([x(:), y(:)]), size(x)); +%! mesh (x, y, z, "facecolor", "None"); +%! ## Using sample points to find the volume of half a sphere with radius of .5 +%! f = @(x) ((.25-(x(:,1)+1).^2-(x(:,2)-2).^2).^.5.*(((x(:,1)+1).^2+(x(:,2)-2).^2)<.25)).'; +%! int = mean (f (smpl) ./ pdf (smpl)); +%! errest = std (f (smpl) ./ pdf (smpl)) / nsamples^.5; +%! trueerr = abs (2/3*pi*.25^(3/2)-int); +%! fprintf("Monte Carlo integral estimate int f(x) dx = %f\n", int); +%! fprintf("Monte Carlo integral error estimate %f\n", errest); +%! fprintf("The actual error %f\n", trueerr); +%! mesh (x,y,reshape (f([x(:), y(:)]), size(x)), "facecolor", "None"); + +%!demo +%! ##Integrate truncated normal distribution to find normilization constant +%! pdf = @(x) exp (-.5*x.^2)/(pi^.5*2^.5); +%! nsamples = 1e3; +%! [smpl,accept] = slicesample (1, nsamples, "pdf", pdf, "thin", 4); +%! f = @(x) exp (-.5 * x .^ 2) .* (x >= -2 & x <= 2); +%! x=linspace(-3,3,1000); +%! area(x,f(x)); +%! xlabel ('x'); +%! ylabel ('f(x)'); +%! int = mean (f (smpl)./pdf(smpl)); +%! errest = std (f (smpl)./pdf(smpl))/nsamples^.5; +%! trueerr = abs (erf (2^.5)*2^.5*pi^.5-int); +%! fprintf("Monte Carlo integral estimate int f(x) dx = %f\n", int); +%! fprintf("Monte Carlo integral error estimate %f\n", errest); +%! fprintf("The actual error %f\n", trueerr); + + +%!test +%! start = 0.5; +%! nsamples = 1e3; +%! pdf = @(x) exp (-.5*(x-1).^2)/(2*pi)^.5; +%! [smpl, accept] = slicesample (start, nsamples, "pdf", pdf, "thin", 2, "burnin", 0, "width", 5); +%! assert (mean (smpl, 1), 1, .1); +%! assert (var (smpl, 1), 1, .1); + +%!error slicesample (); +%!error slicesample (1); +%!error slicesample (1, 1); + diff --git a/inst/squareform.m b/inst/squareform.m new file mode 100644 index 0000000..365b4a7 --- /dev/null +++ b/inst/squareform.m @@ -0,0 +1,122 @@ +## Copyright (C) 2015 Carnë Draug +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{z} =} squareform (@var{y}) +## @deftypefnx {Function File} {@var{y} =} squareform (@var{z}) +## @deftypefnx {Function File} {@var{z} =} squareform (@var{y}, @qcode{"tovector"}) +## @deftypefnx {Function File} {@var{y} =} squareform (@var{z}, @qcode{"tomatrix"}) +## Interchange between distance matrix and distance vector formats. +## +## Converts between an hollow (diagonal filled with zeros), square, and +## symmetric matrix and a vector with of the lower triangular part. +## +## Its target application is the conversion of the vector returned by +## @code{pdist} into a distance matrix. It performs the opposite operation +## if input is a matrix. +## +## If @var{x} is a vector, its number of elements must fit into the +## triangular part of a matrix (main diagonal excluded). In other words, +## @code{numel (@var{x}) = @var{n} * (@var{n} - 1) / 2} for some integer +## @var{n}. The resulting matrix will be @var{n} by @var{n}. +## +## If @var{x} is a distance matrix, it must be square and the diagonal entries +## of @var{x} must all be zeros. @code{squareform} will generate a warning if +## @var{x} is not symmetric. +## +## The second argument is used to specify the output type in case there +## is a single element. It will defaults to @qcode{"tomatrix"} otherwise. +## +## @seealso{pdist} +## @end deftypefn + +## Author: Carnë Draug + +function y = squareform (x, method) + + if (nargin < 1 || nargin > 2) + print_usage (); + elseif (! isnumeric (x) || ! ismatrix (x)) + error ("squareform: Y or Z must be a numeric matrix or vector"); + endif + + if (nargin == 1) + ## This is ambiguous when numel (x) == 1, but that's the whole reason + ## why the "method" option exists. + if (isvector (x)) + method = "tomatrix"; + else + method = "tovector"; + endif + endif + + switch (tolower (method)) + case "tovector" + if (! issquare (x)) + error ("squareform: Z is not a square matrix"); + elseif (any (diag (x) != 0)) + error ("squareform: Z is not a hollow matrix, i.e., with diagonal entries all zero"); + elseif (! issymmetric(x)) + warning ("squareform:symmetric", + "squareform: Z is not a symmetric matrix"); + endif + + y = vec (tril (x, -1, "pack"), 2); + + case "tomatrix" + ## the dimensions of y are the solution to the quadratic formula for: + ## length (x) = (sy - 1) * (sy / 2) + sy = (1 + sqrt (1 + 8 * numel (x))) / 2; + if (fix (sy) != sy) + error ("squareform: the numel of Y cannot form a square matrix"); + endif + + y = zeros (sy, class (x)); + y(tril (true (sy), -1)) = x; # fill lower triangular part + y += y.'; # and then the upper triangular part + + otherwise + error ("squareform: invalid METHOD '%s'", method); + endswitch + +endfunction + +%!shared v, m +%! v = 1:6; +%! m = [0 1 2 3;1 0 4 5;2 4 0 6;3 5 6 0]; + +## make sure that it can go both directions automatically +%!assert (squareform (v), m) +%!assert (squareform (squareform (v)), v) +%!assert (squareform (m), v) + +## treat row and column vectors equally +%!assert (squareform (v'), m) + +## handle 1 element input properly +%!assert (squareform (1), [0 1;1 0]) +%!assert (squareform (1, "tomatrix"), [0 1; 1 0]) +%!assert (squareform (0, "tovector"), zeros (1, 0)) + +%!warning squareform ([0 1 2; 3 0 4; 5 6 0]); + +## confirm that it respects input class +%!test +%! for c = {@single, @double, @uint8, @uint32, @uint64} +%! f = c{1}; +%! assert (squareform (f (v)), f (m)) +%! assert (squareform (f (m)), f (v)) +%! endfor + diff --git a/inst/stepwisefit.m b/inst/stepwisefit.m new file mode 100644 index 0000000..1c51167 --- /dev/null +++ b/inst/stepwisefit.m @@ -0,0 +1,168 @@ +## Copyright (C) 2013-2021 Nir Krakauer +## Copyright (C) 2014 by Mikael Kurula + +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{X_use}, @var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats} =} stepwisefit (@var{y}, @var{X}, @var{penter} = 0.05, @var{premove} = 0.1, @var{method} = "corr") +## Linear regression with stepwise variable selection. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{y} is an @var{n} by 1 vector of data to fit. +## @item +## @var{X} is an @var{n} by @var{k} matrix containing the values of @var{k} potential predictors. No constant term should be included (one will always be added to the regression automatically). +## @item +## @var{penter} is the maximum p-value to enter a new variable into the regression (default: 0.05). +## @item +## @var{premove} is the minimum p-value to remove a variable from the regression (default: 0.1). +## @item +## @var{method} sets how predictors are selected at each step, either based on their correlation with the residuals ("corr", default) +## or on the p values of their regression coefficients when they are successively added ("p"). +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{X_use} contains the indices of the predictors included in the final regression model. The predictors are listed in the order they were added, so typically the first ones listed are the most significant. +## @item +## @var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats} are the results of @code{[b, bint, r, rint, stats] = regress(y, [ones(size(y)) X(:, X_use)], penter);} +## @end itemize +## @subheading References +## +## @enumerate +## @item +## N. R. Draper and H. Smith (1966). @cite{Applied Regression Analysis}. Wiley. Chapter 6. +## +## @end enumerate +## @seealso{regress} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Linear regression with stepwise variable selection + +function [X_use, b, bint, r, rint, stats] = stepwisefit(y, X, penter = 0.05, premove = 0.1, method = "corr") + +if nargin >= 3 && isempty(penter) + penter = 0.05; +endif + +if nargin >= 4 && isempty(premove) + premove = 0.1; +endif + + +#remove any rows with missing entries +notnans = !any (isnan ([y X]) , 2); +y = y(notnans); +X = X(notnans,:); + +n = numel(y); #number of data points +k = size(X, 2); #number of predictors + +X_use = []; +v = 0; #number of predictor variables in regression model + +iter = 0; +max_iters = 100; #maximum number of interations to do + +r = y; +while 1 + + iter++; + #decide which variable to add to regression, if any + added = false; + if numel(X_use) < k + X_inds = zeros(k, 1, "logical"); X_inds(X_use) = 1; + + switch lower (method) + case {"corr"} + [~, i_to_add] = max(abs(corr(X(:, ~X_inds), r))); #try adding the variable with the highest correlation to the residual from current regression + i_to_add = (1:k)(~X_inds)(i_to_add); #index within the original predictor set + [b_new, bint_new, r_new, rint_new, stats_new] = regress(y, [ones(n, 1) X(:, [X_use i_to_add])], penter); + case {"p"} + z_vals=zeros(k,1); + for j=1:k + if ~X_inds(j) + [b_j, bint_j, ~,~ ,~] = regress(y, [ones(n, 1) X(:, [X_use j])], penter); + z_vals(j) = abs(b_j(end)) / (bint_j(end, 2) - b_j(end)); + endif + endfor + [~, i_to_add] = max(z_vals); #try adding the variable with the largest z-value (smallest partial p-value) + [b_new, bint_new, r_new, rint_new, stats_new] = regress(y, [ones(n, 1) X(:, [X_use i_to_add])], penter); + otherwise + error("stepwisefit: invalid value for method") + endswitch + + z_new = abs(b_new(end)) / (bint_new(end, 2) - b_new(end)); + if z_new > 1 #accept new variable + added = true; + X_use = [X_use i_to_add]; + b = b_new; + bint = bint_new; + r = r_new; + rint = rint_new; + stats = stats_new; + v = v + 1; + endif + endif + + #decide which variable to drop from regression, if any + dropped = false; + if v > 0 + t_ratio = tinv(1 - premove/2, n - v - 1) / tinv(1 - penter/2, n - v - 1); #estimate the ratio between the z score corresponding to premove to that corresponding to penter + [z_min, i_min] = min(abs(b(2:end)) ./ (bint(2:end, 2) - b(2:end))); + if z_min < t_ratio #drop a variable + dropped = true; + X_use(i_min) = []; + [b, bint, r, rint, stats] = regress(y, [ones(n, 1) X(:, X_use)], penter); + v = v - 1; + endif + endif + + #terminate if no change in the list of regression variables + if ~added && ~dropped + break + endif + + if iter >= max_iters + warning('stepwisefit: maximum iteration count exceeded before convergence') + break + endif + +endwhile + +endfunction + +%!test +%! % Sample data from Draper and Smith (n = 13, k = 4) +%! X = [7 1 11 11 7 11 3 1 2 21 1 11 10; ... +%! 26 29 56 31 52 55 71 31 54 47 40 66 68; ... +%! 6 15 8 8 6 9 17 22 18 4 23 9 8; ... +%! 60 52 20 47 33 22 6 44 22 26 34 12 12]'; +%! y = [78.5 74.3 104.3 87.6 95.9 109.2 102.7 72.5 93.1 115.9 83.8 113.3 109.4]'; +%! [X_use, b, bint, r, rint, stats] = stepwisefit(y, X); +%! assert(X_use, [4 1]) +%! assert(b, regress(y, [ones(size(y)) X(:, X_use)], 0.05)) +%! [X_use, b, bint, r, rint, stats] = stepwisefit(y, X, 0.05, 0.1, "corr"); +%! assert(X_use, [4 1]) +%! assert(b, regress(y, [ones(size(y)) X(:, X_use)], 0.05)) +%! [X_use, b, bint, r, rint, stats] = stepwisefit(y, X, [], [], "p"); +%! assert(X_use, [4 1]) +%! assert(b, regress(y, [ones(size(y)) X(:, X_use)], 0.05)) + + diff --git a/inst/tabulate.m b/inst/tabulate.m new file mode 100644 index 0000000..e9015c6 --- /dev/null +++ b/inst/tabulate.m @@ -0,0 +1,129 @@ +## Copyright (C) 2003 Alberto Terruzzi +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{table} =} tabulate (@var{data}, @var{edges}) +## +## Compute a frequency table. +## +## For vector data, the function counts the number of +## values in data that fall between the elements in the edges vector +## (which must contain monotonically non-decreasing values). @var{table} is a +## matrix. +## The first column of @var{table} is the number of bin, the second +## is the number of instances in each class (absolute frequency). The +## third column contains the percentage of each value (relative +## frequency) and the fourth column contains the cumulative frequency. +## +## If @var{edges} is missed the width of each class is unitary, if @var{edges} +## is a scalar then represent the number of classes, or you can define the +## width of each bin. +## @var{table}(@var{k}, 2) will count the value @var{data} (@var{i}) if +## @var{edges} (@var{k}) <= @var{data} (@var{i}) < @var{edges} (@var{k}+1). +## The last bin will count the value of @var{data} (@var{i}) if +## @var{edges}(@var{k}) <= @var{data} (@var{i}) <= @var{edges} (@var{k}+1). +## Values outside the values in @var{edges} are not counted. Use -inf and inf +## in @var{edges} to include all values. +## Tabulate with no output arguments returns a formatted table in the +## command window. +## +## Example +## +## @example +## sphere_radius = [1:0.05:2.5]; +## tabulate (sphere_radius) +## @end example +## +## Tabulate returns 2 bins, the first contains the sphere with radius +## between 1 and 2 mm excluded, and the second one contains the sphere with +## radius between 2 and 3 mm. +## +## @example +## tabulate (sphere_radius, 10) +## @end example +## +## Tabulate returns ten bins. +## +## @example +## tabulate (sphere_radius, [1, 1.5, 2, 2.5]) +## @end example +## +## Tabulate returns three bins, the first contains the sphere with radius +## between 1 and 1.5 mm excluded, the second one contains the sphere with +## radius between 1.5 and 2 mm excluded, and the third contains the sphere with +## radius between 2 and 2.5 mm. +## +## @example +## bar (table (:, 1), table (:, 2)) +## @end example +## +## draw histogram. +## +## @seealso{bar, pareto} +## @end deftypefn + +## Author: Alberto Terruzzi +## Version: 1.0 +## Created: 13 February 2003 + +function table = tabulate (varargin) + + if nargin < 1 || nargin > 2 + print_usage; + endif + + data = varargin{1}; + if isvector (data) != 1 + error ("data must be a vector."); + endif + n = length(data); + m = min(data); + M = max(data); + + if nargin == 1 edges = 1:1:max(data)+1; + else edges = varargin{2}; + end + + if isscalar(edges) + h=(M-m)/edges; + edges = [m:h:M]; + end + + # number of classes + bins=length(edges)-1; + # initialize freqency table + freqtable = zeros(bins,4); + + for k=1:1:bins; + if k != bins + freqtable(k,2)=length(find (data >= edges(k) & data < edges(k+1))); + else + freqtable(k,2)=length(find (data >= edges(k) & data <= edges(k+1))); + end + if k == 1 freqtable (k,4) = freqtable(k,2); + else freqtable(k,4) = freqtable(k-1,4) + freqtable(k,2); + end + end + + freqtable(:,1) = edges(1:end-1)(:); + freqtable(:,3) = 100*freqtable(:,2)/n; + + if nargout == 0 + disp(" bin Fa Fr% Fc"); + printf("%8g %5d %6.2f%% %5d\n",freqtable'); + else table = freqtable; + end + +endfunction diff --git a/inst/tblread.m b/inst/tblread.m new file mode 100644 index 0000000..10db122 --- /dev/null +++ b/inst/tblread.m @@ -0,0 +1,99 @@ +## Copyright (C) 2008 Bill Denney +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{data}, @var{varnames}, @var{casenames}] =} tblread (@var{filename}) +## @deftypefnx {Function File} {[@var{data}, @var{varnames}, @var{casenames}] =} tblread (@var{filename}, @var{delimeter}) +## Read tabular data from an ascii file. +## +## @var{data} is read from an ascii data file named @var{filename} with +## an optional @var{delimeter}. The delimeter may be any single +## character or +## @itemize +## @item "space" " " (default) +## @item "tab" "\t" +## @item "comma" "," +## @item "semi" ";" +## @item "bar" "|" +## @end itemize +## +## The @var{data} is read starting at cell (2,2) where the +## @var{varnames} form a char matrix from the first row (starting at +## (1,2)) vertically concatenated, and the @var{casenames} form a char +## matrix read from the first column (starting at (2,1)) vertically +## concatenated. +## @seealso{tblwrite, csv2cell, cell2csv} +## @end deftypefn + +function [data, varnames, casenames] = tblread (f="", d=" ") + + ## Check arguments + if nargin < 1 || nargin > 2 + print_usage (); + endif + if isempty (f) + ## FIXME: open a file dialog box in this case when a file dialog box + ## becomes available + error ("tblread: filename must be given") + endif + [d err] = tbl_delim (d); + if ! isempty (err) + error ("tblread: %s", err) + endif + + d = csv2cell (f, d); + data = cell2mat (d(2:end, 2:end)); + varnames = strvcat (d(1,2:end)); + casenames = strvcat (d(2:end,1)); + +endfunction + +## Tests +%!shared d, v, c, tblreadspacefile, tblreadtabfile +%! d = [1 2;3 4]; +%! v = ["a ";"bc"]; +%! c = ["de";"f "]; +%! tblreadspacefile = file_in_loadpath("test/tblread-space.dat"); +%! tblreadtabfile = file_in_loadpath("test/tblread-tab.dat"); +%!test +%! [dt vt ct] = tblread (tblreadspacefile); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%!test +%! [dt vt ct] = tblread (tblreadspacefile, " "); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%!test +%! [dt vt ct] = tblread (tblreadspacefile, "space"); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%!test +%! [dt vt ct] = tblread (tblreadtabfile, "tab"); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%!test +%! [dt vt ct] = tblread (tblreadtabfile, "\t"); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%!test +%! [dt vt ct] = tblread (tblreadtabfile, '\t'); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); diff --git a/inst/tblwrite.m b/inst/tblwrite.m new file mode 100644 index 0000000..9dd7c84 --- /dev/null +++ b/inst/tblwrite.m @@ -0,0 +1,208 @@ +## Copyright (C) 2008 Bill Denney +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} tblwrite (@var{data}, @var{varnames}, @var{casenames}, @var{filename}) +## @deftypefnx {Function File} {} tblwrite (@var{data}, @var{varnames}, @var{casenames}, @var{filename}, @var{delimeter}) +## Write tabular data to an ascii file. +## +## @var{data} is written to an ascii data file named @var{filename} with +## an optional @var{delimeter}. The delimeter may be any single +## character or +## @itemize +## @item "space" " " (default) +## @item "tab" "\t" +## @item "comma" "," +## @item "semi" ";" +## @item "bar" "|" +## @end itemize +## +## The @var{data} is written starting at cell (2,2) where the +## @var{varnames} are a char matrix or cell vector written to the first +## row (starting at (1,2)), and the @var{casenames} are a char matrix +## (or cell vector) written to the first column (starting at (2,1)). +## @seealso{tblread, csv2cell, cell2csv} +## @end deftypefn + +function tblwrite (data, varnames, casenames, f="", d=" ") + + ## Check arguments + if nargin < 4 || nargin > 5 + print_usage (); + endif + varnames = __makecell__ (varnames, "varnames"); + casenames = __makecell__ (casenames, "varnames"); + if numel (varnames) != columns (data) + error ("tblwrite: the number of rows (or cells) in varnames must equal the number of columns in data") + endif + if numel (varnames) != rows (data) + error ("tblwrite: the number of rows (or cells) in casenames must equal the number of rows in data") + endif + + if isempty (f) + ## FIXME: open a file dialog box in this case when a file dialog box + ## becomes available + error ("tblread: filename must be given") + endif + [d err] = tbl_delim (d); + if ! isempty (err) + error ("tblwrite: %s", err) + endif + + dat = cell (size (data) + 1); + dat(1,2:end) = varnames; + dat(2:end,1) = casenames; + dat(2:end,2:end) = mat2cell (data, + ones (rows (data), 1), + ones (columns (data), 1));; + cell2csv (f, dat, d); + +endfunction + +function x = __makecell__ (x, name) + ## force x into a cell matrix + if ! iscell (x) + if ischar (x) + ## convert varnames into a cell + x = mat2cell (x, ones (rows (x), 1)); + else + error ("tblwrite: %s must be either a char or a cell", name) + endif + endif +endfunction + +## Tests + +%!shared privpath +%! privpath = [fileparts(which('tblwrite')) filesep() 'private']; +## Tests for tbl_delim (private function) +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim (" "); +%! assert (d, " "); +%! assert (err, ""); +%! rmpath (privpath); +## Named delimiters +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ("space"); +%! assert (d, " "); +%! assert (err, ""); +%! rmpath (privpath); +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ("tab"); +%! assert (d, sprintf ("\t")); +%! assert (err, ""); +%! rmpath (privpath); +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ("comma"); +%! assert (d, ","); +%! assert (err, ""); +%! rmpath (privpath); +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ("semi"); +%! assert (d, ";"); +%! assert (err, ""); +%! rmpath (privpath); +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ("bar"); +%! assert (d, "|"); +%! assert (err, ""); +%! rmpath (privpath); +## An arbitrary character +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ("x"); +%! assert (d, "x"); +%! assert (err, ""); +%! rmpath (privpath); +## An arbitrary escape string +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ('\r'); +%! assert (d, sprintf ('\r')) +%! assert (err, ""); +%! rmpath (privpath); +## Errors +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ("bars"); +%! assert (isnan (d)); +%! assert (! isempty (err)); +%! rmpath (privpath); +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim (""); +%! assert (isnan (d)); +%! assert (! isempty (err)); +%! rmpath (privpath); +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim (5); +%! assert (isnan (d)); +%! assert (! isempty (err)); +%! rmpath (privpath); +%!test +%! addpath (privpath,'-end') +%! [d err] = tbl_delim ({"."}); +%! assert (isnan (d)); +%! assert (! isempty (err)); +%! rmpath (privpath); + +## Tests for tblwrite +%!shared d, v, c, tempfilename +%! d = [1 2;3 4]; +%! v = ["a ";"bc"]; +%! c = ["de";"f "]; +%! tempfilename = tempname; +%!test +%! tblwrite (d, v, c, tempfilename); +%! [dt vt ct] = tblread (tempfilename, " "); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%! delete (tempfilename); +%!test +%! tblwrite (d, v, c, tempfilename, " "); +%! [dt vt ct] = tblread (tempfilename, " "); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%! delete (tempfilename); +%!test +%! tblwrite (d, v, c, tempfilename, "space"); +%! [dt vt ct] = tblread (tempfilename); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%! delete (tempfilename); +%!test +%! tblwrite (d, v, c, tempfilename, "tab"); +%! [dt vt ct] = tblread (tempfilename, "tab"); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%! delete (tempfilename); +%!test +%! tblwrite (d, v, c, tempfilename, "\t"); +%! [dt vt ct] = tblread (tempfilename, "\t"); +%! assert (dt, d); +%! assert (vt, v); +%! assert (ct, c); +%! delete (tempfilename); diff --git a/inst/tricdf.m b/inst/tricdf.m new file mode 100644 index 0000000..738fca3 --- /dev/null +++ b/inst/tricdf.m @@ -0,0 +1,131 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1997-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} tricdf (@var{x}, @var{a}, @var{b}, @var{c}) +## Compute the cumulative distribution function (CDF) at @var{x} of the +## triangular distribution with parameters @var{a}, @var{b}, and @var{c} +## on the interval [@var{a}, @var{b}]. +## @end deftypefn + +## Author: Dag Lyberg +## Description: CDF of the triangle distribution + +function cdf = tricdf (x, a, b, c) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (a) || ! isscalar (b) || ! isscalar (c)) + [retval, x, a, b, c] = common_size (x, a, b, c); + if (retval > 0) + error ("tricdf: X, A, B, and C must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (a) || iscomplex (b) || iscomplex (c)) + error ("tricdf: X, A, B, and C must not be complex"); + endif + + if (isa (x, "single") || isa (a, "single") + || isa (b, "single") || isa (c, "single")) + cdf = zeros (size (x), "single"); + else + cdf = zeros (size (x)); + endif + + k = isnan (x) | !(a < b) | !(c >= a) | !(c <= b) ; + cdf(k) = NaN; + + k = (x > a) & (a < b) & (a <= c) & (c <= b); + if (isscalar (a) && isscalar (b) && isscalar (c)) + h = 2 / (b-a); + + k_temp = k & (c <= x); + full_area = (c-a) * h / 2; + cdf(k_temp) += full_area; + + k_temp = k & (a < x) & (x < c); + area = (x(k_temp) - a).^2 * h / (2 * ( c - a)); + cdf(k_temp) += area; + + k_temp = k & (b <= x); + full_area = (b-c) * h / 2; + cdf(k_temp) += full_area; + + k_temp = k & (c < x) & (x < b); + area = (b-x(k_temp)).^2 * h / (2 * (b - c)); + cdf(k_temp) += full_area - area; + else + h = 2 ./ (b-a); + + k_temp = k & (c <= x); + full_area = (c(k_temp)-a(k_temp)) .* h(k_temp) / 2; + cdf(k_temp) += full_area; + + k_temp = k & (a <= x) & (x < c); + area = (x(k_temp) - a(k_temp)).^2 .* h(k_temp) ./ (2 * (c(k_temp) - a(k_temp))); + cdf(k_temp) += area; + + k_temp = k & (b <= x); + full_area = (b(k_temp)-c(k_temp)) .* h(k_temp) / 2; + cdf(k_temp) += full_area; + + k_temp = k & (c <= x) & (x < b); + area = (b(k_temp)-x(k_temp)).^2 .* h(k_temp) ./ (2 * (b(k_temp) - c(k_temp))); + cdf(k_temp) += full_area - area; + endif + +endfunction + + +%!shared x,y +%! x = [-1, 0, 0.1, 0.5, 0.9, 1, 2] + 1; +%! y = [0, 0, 0.02, 0.5, 0.98, 1 1]; +%!assert (tricdf (x, ones (1,7), 2*ones (1,7), 1.5*ones (1,7)), y, eps) +%!assert (tricdf (x, 1*ones (1,7), 2, 1.5), y, eps) +%!assert (tricdf (x, 1, 2*ones (1,7), 1.5), y, eps) +%!assert (tricdf (x, 1, 2, 1.5*ones (1,7)), y, eps) +%!assert (tricdf (x, 1, 2, 1.5), y, eps) +%!assert (tricdf (x, [1, 1, NaN, 1, 1, 1, 1], 2, 1.5), [y(1:2), NaN, y(4:7)], eps) +%!assert (tricdf (x, 1, 2*[1, 1, NaN, 1, 1, 1, 1], 1.5), [y(1:2), NaN, y(4:7)], eps) +%!assert (tricdf (x, 1, 2, 1.5*[1, 1, NaN, 1, 1, 1, 1]), [y(1:2), NaN, y(4:7)], eps) +%!assert (tricdf ([x, NaN], 1, 2, 1.5), [y, NaN], eps) + +## Test class of input preserved +%!assert (tricdf (single ([x, NaN]), 1, 2, 1.5), single ([y, NaN]), eps('single')) +%!assert (tricdf ([x, NaN], single (1), 2, 1.5), single ([y, NaN]), eps('single')) +%!assert (tricdf ([x, NaN], 1, single (2), 1.5), single ([y, NaN]), eps('single')) +%!assert (tricdf ([x, NaN], 1, 2, single (1.5)), single ([y, NaN]), eps('single')) + +## Test input validation +%!error tricdf () +%!error tricdf (1) +%!error tricdf (1,2) +%!error tricdf (1,2,3) +%!error tricdf (1,2,3,4,5) +%!error tricdf (1, ones (3), ones (2), ones (2)) +%!error tricdf (1, ones (2), ones (3), ones (2)) +%!error tricdf (1, ones (2), ones (2), ones (3)) +%!error tricdf (i, 2, 2, 2) +%!error tricdf (2, i, 2, 2) +%!error tricdf (2, 2, i, 2) +%!error tricdf (2, 2, 2, i) + diff --git a/inst/triinv.m b/inst/triinv.m new file mode 100644 index 0000000..bc10f4d --- /dev/null +++ b/inst/triinv.m @@ -0,0 +1,119 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} triinv (@var{x}, @var{a}, @var{b}, @var{c}) +## For each element of @var{x}, compute the quantile (the inverse of the CDF) +## at @var{x} of the triangular distribution with parameters +## @var{a}, @var{b}, and @var{c} on the interval [@var{a}, @var{b}]. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Quantile function of the triangular distribution + +function inv = triinv (x, a, b, c) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (a) || ! isscalar (b) || ! isscalar (c)) + [retval, x, a, b, c] = common_size (x, a, b, c); + if (retval > 0) + error ("triinv: X, A, B, and C must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (a) || iscomplex (b) || iscomplex (c)) + error ("triinv: X, A, B, and C must not be complex"); + endif + + if (isa (x, "single") || isa (a, "single") || isa (b, "single")) + inv = NaN (size (x), "single"); + else + inv = NaN (size (x)); + endif + + k = (x >= 0) & (x <= 1) & (a < b) & (a <= c) & (c <= b); + inv(k) = 0; + + if (isscalar (a) && isscalar (b) && isscalar(c)) + h = 2 / (b-a); + w = c-a; + area1 = h * w / 2; + j = k & (x <= area1); + inv(j) += (x(j) * 2 * w / h).^0.5 + a; + + w = b-c; + j = k & (area1 < x) & (x < 1); + inv(j) += b - ((1-x(j)) * 2 * w / h).^0.5; + + j = k & (x == 1); + inv(j) = b; + else + h = 2 ./ (b-a); + w = c-a; + area1 = h .* w / 2; + j = k & (x <= area1); + inv(j) += (2 * x(j) .* (w(j) ./ h(j))).^0.5 + a(j); + + w = b-c; + j = k & (area1 < x) & (x < 1); + inv(j) += b(j) - (2 * (1-x(j)) .* (w(j) ./ h(j))).^0.5; + + j = k & (x == 1); + inv(j) = b(j); + endif + +endfunction + + +%!shared x,y +%! x = [-1, 0, 0.02, 0.5, 0.98, 1, 2]; +%! y = [NaN, 0, 0.1, 0.5, 0.9, 1, NaN] + 1; +%!assert (triinv (x, ones (1,7), 2*ones (1,7), 1.5*ones (1,7)), y, eps) +%!assert (triinv (x, 1*ones (1,7), 2, 1.5), y, eps) +%!assert (triinv (x, 1, 2*ones (1,7), 1.5), y, eps) +%!assert (triinv (x, 1, 2, 1.5*ones (1,7)), y, eps) +%!assert (triinv (x, 1, 2, 1.5), y, eps) +%!assert (triinv (x, [1, 1, NaN, 1, 1, 1, 1], 2, 1.5), [y(1:2), NaN, y(4:7)], eps) +%!assert (triinv (x, 1, 2*[1, 1, NaN, 1, 1, 1, 1], 1.5), [y(1:2), NaN, y(4:7)], eps) +%!assert (triinv (x, 1, 2, 1.5*[1, 1, NaN, 1, 1, 1, 1]), [y(1:2), NaN, y(4:7)], eps) +%!assert (triinv ([x, NaN], 1, 2, 1.5), [y, NaN], eps) + +## Test class of input preserved +%!assert (triinv (single ([x, NaN]), 1, 2, 1.5), single ([y, NaN]), eps('single')) +%!assert (triinv ([x, NaN], single (1), 2, 1.5), single ([y, NaN]), eps('single')) +%!assert (triinv ([x, NaN], 1, single (2), 1.5), single ([y, NaN]), eps('single')) +%!assert (triinv ([x, NaN], 1, 2, single (1.5)), single ([y, NaN]), eps('single')) + +## Test input validation +%!error triinv () +%!error triinv (1) +%!error triinv (1,2) +%!error triinv (1,2,3) +%!error triinv (1,2,3,4,5) +%!error triinv (1, ones (3), ones (2), ones (2)) +%!error triinv (1, ones (2), ones (3), ones (2)) +%!error triinv (1, ones (2), ones (2), ones (3)) +%!error triinv (i, 2, 2, 2) +%!error triinv (2, i, 2, 2) +%!error triinv (2, 2, i, 2) +%!error triinv (2, 2, 2, i) + diff --git a/inst/trimmean.m b/inst/trimmean.m new file mode 100644 index 0000000..901b00f --- /dev/null +++ b/inst/trimmean.m @@ -0,0 +1,58 @@ +## Copyright (C) 2001 Paul Kienzle +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {@var{a} =} trimmean (@var{x}, @var{p}) +## +## Compute the trimmed mean. +## +## The trimmed mean of @var{x} is defined as the mean of @var{x} excluding the +## highest and lowest @var{p} percent of the data. +## +## For example +## +## @example +## mean ([-inf, 1:9, inf]) +## @end example +## +## is NaN, while +## +## @example +## trimmean ([-inf, 1:9, inf], 10) +## @end example +## +## excludes the infinite values, which make the result 5. +## +## @seealso{mean} +## @end deftypefn + +function a = trimmean(x, p, varargin) + if (nargin != 2 && nargin != 3) + print_usage; + endif + y = sort(x, varargin{:}); + sz = size(x); + if nargin < 3 + dim = min(find(sz>1)); + if isempty(dim), dim=1; endif; + else + dim = varargin{1}; + endif + idx = cell (0); + for i=1:length(sz), idx{i} = 1:sz(i); end; + trim = round(sz(dim)*p*0.01); + idx{dim} = 1+trim : sz(dim)-trim; + a = mean (y (idx{:}), varargin{:}); +endfunction diff --git a/inst/tripdf.m b/inst/tripdf.m new file mode 100644 index 0000000..f870c9f --- /dev/null +++ b/inst/tripdf.m @@ -0,0 +1,111 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1997-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} tripdf (@var{x}, @var{a}, @var{b}, @var{c}) +## Compute the probability density function (PDF) at @var{x} of the triangular +## distribution with parameters @var{a}, @var{b}, and @var{c} on the interval +## [@var{a}, @var{b}]. +## @end deftypefn + +## Author: Dag Lyberg +## Description: PDF of the triangular distribution + +function pdf = tripdf (x, a, b, c) + + if (nargin != 4) + print_usage (); + endif + + if (! isscalar (a) || ! isscalar (b) || ! isscalar (c)) + [retval, x, a, b, c] = common_size (x, a, b, c); + if (retval > 0) + error ("tripdf: X, A, B, and C must be of common size or scalars"); + endif + endif + + if (iscomplex (x) || iscomplex (a) || iscomplex (b) || iscomplex (c)) + error ("tripdf: X, A, B, and C must not be complex"); + endif + + if (isa (x, "single") || isa (a, "single") ... + || isa (b, "single") || isa (c, "single")) + pdf = zeros (size (x), "single"); + else + pdf = zeros (size (x)); + endif + + k = isnan (x) | !(a < b) | !(c >= a) | !(c <= b) ; + pdf(k) = NaN; + + k = (x >= a) & (x <= b) & (a < b) & (a <= c) & (c <= b); + h = 2 ./ (b-a); + if (isscalar (a) && isscalar (b) && isscalar (c)) + j = k & (a <= x) & (x < c); + pdf(j) = h * (x(j)-a) / (c-a); + j = k & (x == c); + pdf(j) = h; + j = k & (c < x) & (x <= b); + pdf(j) = h * (b-x(j)) / (b-c); + else + j = k & (a <= x) & (x < c); + pdf(j) = h(j) .* (x(j)-a(j)) ./ (c(j)-a(j)); + j = k & (x == c); + pdf(j) = h(j); + j = k & (c < x) & (x <= b); + pdf(j) = h(j) .* (b(j)-x(j)) ./ (b(j)-c(j)); + endif + +endfunction + + +%!shared x,y,deps +%! x = [-1, 0, 0.1, 0.5, 0.9, 1, 2] + 1; +%! y = [0, 0, 0.4, 2, 0.4, 0, 0]; +%! deps = 2*eps; +%!assert (tripdf (x, ones (1,7), 2*ones (1,7), 1.5*ones (1,7)), y, deps) +%!assert (tripdf (x, 1*ones (1,7), 2, 1.5), y, deps) +%!assert (tripdf (x, 1, 2*ones (1,7), 1.5), y, deps) +%!assert (tripdf (x, 1, 2, 1.5*ones (1,7)), y, deps) +%!assert (tripdf (x, 1, 2, 1.5), y, deps) +%!assert (tripdf (x, [1, 1, NaN, 1, 1, 1, 1], 2, 1.5), [y(1:2), NaN, y(4:7)], deps) +%!assert (tripdf (x, 1, 2*[1, 1, NaN, 1, 1, 1, 1], 1.5), [y(1:2), NaN, y(4:7)], deps) +%!assert (tripdf (x, 1, 2, 1.5*[1, 1, NaN, 1, 1, 1, 1]), [y(1:2), NaN, y(4:7)], deps) +%!assert (tripdf ([x, NaN], 1, 2, 1.5), [y, NaN], deps) + +## Test class of input preserved +%!assert (tripdf (single ([x, NaN]), 1, 2, 1.5), single ([y, NaN]), eps('single')) +%!assert (tripdf ([x, NaN], single (1), 2, 1.5), single ([y, NaN]), eps('single')) +%!assert (tripdf ([x, NaN], 1, single (2), 1.5), single ([y, NaN]), eps('single')) +%!assert (tripdf ([x, NaN], 1, 2, single (1.5)), single ([y, NaN]), eps('single')) + +## Test input validation +%!error tripdf () +%!error tripdf (1) +%!error tripdf (1,2) +%!error tripdf (1,2,3) +%!error tripdf (1,2,3,4,5) +%!error tripdf (1, ones (3), ones (2), ones (2)) +%!error tripdf (1, ones (2), ones (3), ones (2)) +%!error tripdf (1, ones (2), ones (2), ones (3)) +%!error tripdf (i, 2, 2, 2) +%!error tripdf (2, i, 2, 2) +%!error tripdf (2, 2, i, 2) +%!error tripdf (2, 2, 2, i) + diff --git a/inst/trirnd.m b/inst/trirnd.m new file mode 100644 index 0000000..156967e --- /dev/null +++ b/inst/trirnd.m @@ -0,0 +1,153 @@ +## Copyright (C) 2016 Dag Lyberg +## Copyright (C) 1995-2015 Kurt Hornik +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or (at +## your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} trirnd (@var{a}, @var{b}, @var{c}) +## @deftypefnx {} {} trirnd (@var{a}, @var{b}, @var{c}, @var{r}) +## @deftypefnx {} {} trirnd (@var{a}, @var{b}, @var{c}, @var{r}, @var{c}, @dots{}) +## @deftypefnx {} {} trirnd (@var{a}, @var{b}, @var{c}, [@var{sz}]) +## Return a matrix of random samples from the rectangular distribution with +## parameters @var{a}, @var{b}, and @var{c} on the interval [@var{a}, @var{b}]. +## +## When called with a single size argument, return a square matrix with +## the dimension specified. When called with more than one scalar argument the +## first two arguments are taken as the number of rows and columns and any +## further arguments specify additional matrix dimensions. The size may also +## be specified with a vector of dimensions @var{sz}. +## +## If no size arguments are given then the result matrix is the common size of +## @var{a}, @var{b} and @var{c}. +## @end deftypefn + +## Author: Dag Lyberg +## Description: Random deviates from the triangular distribution + +function rnd = trirnd (a, b, c, varargin) + + if (nargin < 3) + print_usage (); + endif + + if (! isscalar (a) || ! isscalar (b) || ! isscalar (c)) + [retval, a, b, c] = common_size (a, b, c); + if (retval > 0) + error ("trirnd: A, B, and C must be of common size or scalars"); + endif + endif + + if (iscomplex (a) || iscomplex (b) || iscomplex (c)) + error ("trirnd: A, B, and C must not be complex"); + endif + + if (nargin == 3) + sz = size (a); + elseif (nargin == 4) + if (isscalar (varargin{1}) && varargin{1} >= 0) + sz = [varargin{1}, varargin{1}]; + elseif (isrow (varargin{1}) && all (varargin{1} >= 0)) + sz = varargin{1}; + else + error ("trirnd: dimension vector must be row vector of non-negative integers"); + endif + elseif (nargin > 4) + if (any (cellfun (@(x) (! isscalar (x) || x < 0), varargin))) + error ("trirnd: dimensions must be non-negative integers"); + endif + sz = [varargin{:}]; + endif + + if (! isscalar (a) && ! isequal (size (b), sz)) + error ("trirnd: A, B, and C must be scalar or of size SZ"); + endif + + if (isa (a, "single") || isa (b, "single") || isa (c, "single")) + cls = "single"; + else + cls = "double"; + endif + + if (isscalar (a) && isscalar (b) && isscalar (c)) + if ((-Inf < a) && (a < b) && (a <= c) && (c <= b) && (b < Inf)) + w = b-a; + left_width = c-a; + right_width = b-c; + h = 2 / w; + left_area = h * left_width / 2; + rnd = rand (sz, cls); + idx = rnd < left_area; + rnd(idx) = a + (rnd(idx) * w * left_width).^0.5; + rnd(~idx) = b - ((1-rnd(~idx)) * w * right_width).^0.5; + else + rnd = NaN (sz, cls); + endif + else + w = b-a; + left_width = c-a; + right_width = b-c; + h = 2 ./ w; + left_area = h .* left_width / 2; + rnd = rand (sz, cls); + k = rnd < left_area; + rnd(k) = a(k) + (rnd(k) .* w(k) .* left_width(k)).^0.5; + rnd(~k) = b(~k) - ((1-rnd(~k)) .* w(~k) .* right_width(~k)).^0.5; + + k = ! (-Inf < a) | ! (a < b) | ! (a <= c) | ! (c <= b) | ! (b < Inf); + rnd(k) = NaN; + endif + +endfunction + + +%!assert (size (trirnd (1,2,1.5)), [1, 1]) +%!assert (size (trirnd (1*ones (2,1), 2,1.5)), [2, 1]) +%!assert (size (trirnd (1*ones (2,2), 2,1.5)), [2, 2]) +%!assert (size (trirnd (1, 2*ones (2,1), 1.5)), [2, 1]) +%!assert (size (trirnd (1, 2*ones (2,2), 1.5)), [2, 2]) +%!assert (size (trirnd (1, 2, 1.5*ones (2,1))), [2, 1]) +%!assert (size (trirnd (1, 2, 1.5*ones (2,2))), [2, 2]) +%!assert (size (trirnd (1, 2, 1.5, 3)), [3, 3]) +%!assert (size (trirnd (1, 2, 1.5, [4 1])), [4, 1]) +%!assert (size (trirnd (1, 2, 1.5, 4, 1)), [4, 1]) + +## Test class of input preserved +%!assert (class (trirnd (1,2,1.5)), "double") +%!assert (class (trirnd (single (1),2,1.5)), "single") +%!assert (class (trirnd (single ([1 1]),2,1.5)), "single") +%!assert (class (trirnd (1,single (2),1.5)), "single") +%!assert (class (trirnd (1,single ([2 2]),1.5)), "single") +%!assert (class (trirnd (1,2,single (1.5))), "single") +%!assert (class (trirnd (1,2,single ([1.5 1.5]))), "single") + +## Test input validation +%!error trirnd () +%!error trirnd (1) +%!error trirnd (1,2) +%!error trirnd (ones (3), 2*ones (2), 1.5*ones (2), 2) +%!error trirnd (ones (2), 2*ones (3), 1.5*ones (2), 2) +%!error trirnd (ones (2), 2*ones (2), 1.5*ones (3), 2) +%!error trirnd (i, 2, 1.5) +%!error trirnd (1, i, 1.5) +%!error trirnd (1, 2, i) +%!error trirnd (1,2,1.5, -1) +%!error trirnd (1,2,1.5, ones (2)) +%!error trirnd (1,2,1.5, [2 -1 2]) +%!error trirnd (1*ones (2),2,1.5, 3) +%!error trirnd (1*ones (2),2,1.5, [3, 2]) +%!error trirnd (1*ones (2),2,1.5, 3, 2) + diff --git a/inst/tstat.m b/inst/tstat.m new file mode 100644 index 0000000..b5d7ac6 --- /dev/null +++ b/inst/tstat.m @@ -0,0 +1,98 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} tstat (@var{n}) +## Compute mean and variance of the t (Student) distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{n} is the parameter of the t (Student) distribution. The elements +## of @var{n} must be positive +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the t (Student) distribution +## +## @item +## @var{v} is the variance of the t (Student) distribution +## @end itemize +## +## @subheading Example +## +## @example +## @group +## n = 3:8; +## [m, v] = tstat (n) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the t (Student) distribution + +function [m, v] = tstat (n) + + # Check arguments + if (nargin != 1) + print_usage (); + endif + + if (! isempty (n) && ! ismatrix (n)) + error ("tstat: n must be a numeric matrix"); + endif + + # Calculate moments + m = zeros (size (n)); + v = n ./ (n - 2); + + # Continue argument check + k = find (! (n > 1) | ! (n < Inf)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + k = find (! (n > 2) & (n < Inf)); + if (any (k)) + v(k) = Inf; + endif + +endfunction + +%!test +%! n = 3:8; +%! [m, v] = tstat (n); +%! expected_m = [0, 0, 0, 0, 0, 0]; +%! expected_v = [3.0000, 2.0000, 1.6667, 1.5000, 1.4000, 1.3333]; +%! assert (m, expected_m); +%! assert (v, expected_v, 0.001); diff --git a/inst/ttest.m b/inst/ttest.m new file mode 100644 index 0000000..525420d --- /dev/null +++ b/inst/ttest.m @@ -0,0 +1,161 @@ +## Copyright (C) 2014 Tony Richardson +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} ttest (@var{x}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} ttest (@var{x}, @var{m}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} ttest (@var{x}, @var{y}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} ttest (@var{x}, @var{m}, @var{Name}, @var{Value}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} ttest (@var{x}, @var{y}, @var{Name}, @var{Value}) +## Test for mean of a normal sample with unknown variance. +## +## Perform a T-test of the null hypothesis @code{mean (@var{x}) == +## @var{m}} for a sample @var{x} from a normal distribution with unknown +## mean and unknown std deviation. Under the null, the test statistic +## @var{t} has a Student's t distribution. The default value of +## @var{m} is 0. +## +## If the second argument @var{y} is a vector, a paired-t test of the +## hypothesis @code{mean (@var{x}) = mean (@var{y})} is performed. +## +## Name-Value pair arguments can be used to set various options. +## @qcode{"alpha"} can be used to specify the significance level +## of the test (the default value is 0.05). @qcode{"tail"}, can be used +## to select the desired alternative hypotheses. If the value is +## @qcode{"both"} (default) the null is tested against the two-sided +## alternative @code{mean (@var{x}) != @var{m}}. +## If it is @qcode{"right"} the one-sided alternative @code{mean (@var{x}) +## > @var{m}} is considered. Similarly for @qcode{"left"}, the one-sided +## alternative @code{mean (@var{x}) < @var{m}} is considered. +## When argument @var{x} is a matrix, @qcode{"dim"} can be used to selection +## the dimension over which to perform the test. (The default is the +## first non-singleton dimension). +## +## If @var{h} is 0 the null hypothesis is accepted, if it is 1 the null +## hypothesis is rejected. The p-value of the test is returned in @var{pval}. +## A 100(1-alpha)% confidence interval is returned in @var{ci}. @var{stats} +## is a structure containing the value of the test statistic (@var{tstat}), +## the degrees of freedom (@var{df}) and the sample standard deviation +## (@var{sd}). +## +## @end deftypefn + +## Author: Tony Richardson + +function [h, p, ci, stats] = ttest(x, my, varargin) + + % Set default arguments + my_default = 0; + alpha = 0.05; + tail = 'both'; + + % Find the first non-singleton dimension of x + dim = min(find(size(x)~=1)); + if isempty(dim), dim = 1; end + + if (nargin == 1) + my = my_default; + end + + i = 1; + while ( i <= length(varargin) ) + switch lower(varargin{i}) + case 'alpha' + i = i + 1; + alpha = varargin{i}; + case 'tail' + i = i + 1; + tail = varargin{i}; + case 'dim' + i = i + 1; + dim = varargin{i}; + otherwise + error('Invalid Name argument.',[]); + end + i = i + 1; + end + + if ~isa(tail, 'char') + error('tail argument to ttest must be a string\n',[]); + end + + if any(and(~isscalar(my),size(x)~=size(my))) + error('Arrays in paired test must be the same size.'); + end + + % Set default values if arguments are present but empty + if isempty(my) + my = my_default; + end + + % This adjustment allows everything else to remain the + % same for both the one-sample t test and paired tests. + x = x - my; + + % Calculate the test statistic value (tval) + n = size(x, dim); + x_bar = mean(x, dim); + stats.tstat = 0; + stats.df = n-1; + stats.sd = std(x, 0, dim); + x_bar_std = stats.sd/sqrt(n); + tval = (x_bar)./x_bar_std; + stats.tstat = tval; + + % Based on the "tail" argument determine the P-value, the critical values, + % and the confidence interval. + switch lower(tail) + case 'both' + p = 2*(1 - tcdf(abs(tval),n-1)); + tcrit = -tinv(alpha/2,n-1); + ci = [x_bar-tcrit*x_bar_std; x_bar+tcrit*x_bar_std] + my; + case 'left' + p = tcdf(tval,n-1); + tcrit = -tinv(alpha,n-1); + ci = [-inf*ones(size(x_bar)); my+x_bar+tcrit*x_bar_std]; + case 'right' + p = 1 - tcdf(tval,n-1); + tcrit = -tinv(alpha,n-1); + ci = [my+x_bar-tcrit*x_bar_std; inf*ones(size(x_bar))]; + otherwise + error('Invalid tail argument to ttest\n',[]); + end + + % Reshape the ci array to match MATLAB shaping + if and(isscalar(x_bar), dim==2) + ci = ci(:)'; + elseif size(x_bar,2). + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} ttest2 (@var{x}, @var{y}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} ttest2 (@var{x}, @var{y}, @var{Name}, @var{Value}) +## Test for mean of a normal sample with known variance. +## +## Perform a T-test of the null hypothesis @code{mean (@var{x}) == +## @var{m}} for a sample @var{x} from a normal distribution with unknown +## mean and unknown std deviation. Under the null, the test statistic +## @var{t} has a Student's t distribution. +## +## If the second argument @var{y} is a vector, a paired-t test of the +## hypothesis @code{mean (@var{x}) = mean (@var{y})} is performed. +## +## The argument @qcode{"alpha"} can be used to specify the significance level +## of the test (the default value is 0.05). The string +## argument @qcode{"tail"}, can be used to select the desired alternative +## hypotheses. If @qcode{"alt"} is @qcode{"both"} (default) the null is +## tested against the two-sided alternative @code{mean (@var{x}) != @var{m}}. +## If @qcode{"alt"} is @qcode{"right"} the one-sided +## alternative @code{mean (@var{x}) > @var{m}} is considered. +## Similarly for @qcode{"left"}, the one-sided alternative @code{mean +## (@var{x}) < @var{m}} is considered. When @qcode{"vartype"} is @qcode{"equal"} +## the variances are assumed to be equal (this is the default). When +## @qcode{"vartype"} is @qcode{"unequal"} the variances are not assumed equal. +## When argument @var{x} is a matrix the @qcode{"dim"} argument can be +## used to selection the dimension over which to perform the test. +## (The default is the first non-singleton dimension.) +## +## If @var{h} is 0 the null hypothesis is accepted, if it is 1 the null +## hypothesis is rejected. The p-value of the test is returned in @var{pval}. +## A 100(1-alpha)% confidence interval is returned in @var{ci}. @var{stats} +## is a structure containing the value of the test statistic (@var{tstat}), +## the degrees of freedom (@var{df}) and the sample standard deviation +## (@var{sd}). +## +## @end deftypefn + +## Author: Tony Richardson + +function [h, p, ci, stats] = ttest2(x, y, varargin) + + alpha = 0.05; + tail = 'both'; + vartype = 'equal'; + + % Find the first non-singleton dimension of x + dim = min(find(size(x)~=1)); + if isempty(dim), dim = 1; end + + i = 1; + while ( i <= length(varargin) ) + switch lower(varargin{i}) + case 'alpha' + i = i + 1; + alpha = varargin{i}; + case 'tail' + i = i + 1; + tail = varargin{i}; + case 'vartype' + i = i + 1; + vartype = varargin{i}; + case 'dim' + i = i + 1; + dim = varargin{i}; + otherwise + error('Invalid Name argument.',[]); + end + i = i + 1; + end + + if ~isa(tail, 'char') + error('Tail argument to ttest2 must be a string\n',[]); + end + + m = size(x, dim); + n = size(y, dim); + x_bar = mean(x,dim)-mean(y,dim); + s1_var = var(x, 0, dim); + s2_var = var(y, 0, dim); + + switch lower(vartype) + case 'equal' + stats.tstat = 0; + stats.df = (m + n - 2)*ones(size(x_bar)); + sp_var = ((m-1)*s1_var + (n-1)*s2_var)./stats.df; + stats.sd = sqrt(sp_var); + x_bar_std = sqrt(sp_var*(1/m+1/n)); + case 'unequal' + stats.tstat = 0; + se1 = sqrt(s1_var/m); + se2 = sqrt(s2_var/n); + sp_var = s1_var/m + s2_var/n; + stats.df = ((se1.^2+se2.^2).^2 ./ (se1.^4/(m-1) + se2.^4/(n-1))); + stats.sd = [sqrt(s1_var); sqrt(s2_var)]; + x_bar_std = sqrt(sp_var); + otherwise + error('Invalid fifth (vartype) argument to ttest2\n',[]); + end + + stats.tstat = x_bar./x_bar_std; + + % Based on the "tail" argument determine the P-value, the critical values, + % and the confidence interval. + switch lower(tail) + case 'both' + p = 2*(1 - tcdf(abs(stats.tstat),stats.df)); + tcrit = -tinv(alpha/2,stats.df); + %ci = [x_bar-tcrit*stats.sd; x_bar+tcrit*stats.sd]; + ci = [x_bar-tcrit.*x_bar_std; x_bar+tcrit.*x_bar_std]; + case 'left' + p = tcdf(stats.tstat,stats.df); + tcrit = -tinv(alpha,stats.df); + ci = [-inf*ones(size(x_bar)); x_bar+tcrit.*x_bar_std]; + case 'right' + p = 1 - tcdf(stats.tstat,stats.df); + tcrit = -tinv(alpha,stats.df); + ci = [x_bar-tcrit.*x_bar_std; inf*ones(size(x_bar))]; + otherwise + error('Invalid fourth (tail) argument to ttest2\n',[]); + end + + % Reshape the ci array to match MATLAB shaping + if and(isscalar(x_bar), dim==2) + ci = ci(:)'; + stats.sd = stats.sd(:)'; + elseif size(x_bar,2) +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} unidstat (@var{n}) +## Compute mean and variance of the discrete uniform distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{n} is the parameter of the discrete uniform distribution. The elements +## of @var{n} must be positive natural numbers +## @end itemize +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the discrete uniform distribution +## +## @item +## @var{v} is the variance of the discrete uniform distribution +## @end itemize +## +## @subheading Example +## +## @example +## @group +## n = 1:6; +## [m, v] = unidstat (n) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the discrete uniform distribution + +function [m, v] = unidstat (n) + + # Check arguments + if (nargin != 1) + print_usage (); + endif + + if (! isempty (n) && ! ismatrix (n)) + error ("unidstat: n must be a numeric matrix"); + endif + + # Calculate moments + m = (n + 1) ./ 2; + v = ((n .^ 2) - 1) ./ 12; + + # Continue argument check + k = find (! (n > 0) | ! (n < Inf) | ! (n == round (n))); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! n = 1:6; +%! [m, v] = unidstat (n); +%! expected_m = [1.0000, 1.5000, 2.0000, 2.5000, 3.0000, 3.5000]; +%! expected_v = [0.0000, 0.2500, 0.6667, 1.2500, 2.0000, 2.9167]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/unifstat.m b/inst/unifstat.m new file mode 100644 index 0000000..6023d87 --- /dev/null +++ b/inst/unifstat.m @@ -0,0 +1,122 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} unifstat (@var{a}, @var{b}) +## Compute mean and variance of the continuous uniform distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{a} is the first parameter of the continuous uniform distribution +## +## @item +## @var{b} is the second parameter of the continuous uniform distribution +## @end itemize +## @var{a} and @var{b} must be of common size or one of them must be scalar +## and @var{a} must be less than @var{b} +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the continuous uniform distribution +## +## @item +## @var{v} is the variance of the continuous uniform distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## a = 1:6; +## b = 2:2:12; +## [m, v] = unifstat (a, b) +## @end group +## +## @group +## [m, v] = unifstat (a, 10) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the continuous uniform distribution + +function [m, v] = unifstat (a, b) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (a) && ! ismatrix (a)) + error ("unifstat: a must be a numeric matrix"); + endif + if (! isempty (b) && ! ismatrix (b)) + error ("unifstat: b must be a numeric matrix"); + endif + + if (! isscalar (a) || ! isscalar (b)) + [retval, a, b] = common_size (a, b); + if (retval > 0) + error ("unifstat: a and b must be of common size or scalar"); + endif + endif + + # Calculate moments + m = (a + b) ./ 2; + v = ((b - a) .^ 2) ./ 12; + + # Continue argument check + k = find (! (-Inf < a) | ! (a < b) | ! (b < Inf)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! a = 1:6; +%! b = 2:2:12; +%! [m, v] = unifstat (a, b); +%! expected_m = [1.5000, 3.0000, 4.5000, 6.0000, 7.5000, 9.0000]; +%! expected_v = [0.0833, 0.3333, 0.7500, 1.3333, 2.0833, 3.0000]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! a = 1:6; +%! [m, v] = unifstat (a, 10); +%! expected_m = [5.5000, 6.0000, 6.5000, 7.0000, 7.5000, 8.0000]; +%! expected_v = [6.7500, 5.3333, 4.0833, 3.0000, 2.0833, 1.3333]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/vartest.m b/inst/vartest.m new file mode 100644 index 0000000..a81fb89 --- /dev/null +++ b/inst/vartest.m @@ -0,0 +1,117 @@ +## Copyright (C) 2014 Tony Richardson +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} vartest (@var{x}, @var{y}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} vartest (@var{x}, @var{y}, @var{Name}, @var{Value}) +## Perform a F-test for equal variances. +## +## If the second argument @var{y} is a vector, a paired-t test of the +## hypothesis @code{mean (@var{x}) = mean (@var{y})} is performed. +## +## The argument @qcode{"alpha"} can be used to specify the significance level +## of the test (the default value is 0.05). The string +## argument @qcode{"tail"}, can be used to select the desired alternative +## hypotheses. If @qcode{"alt"} is @qcode{"both"} (default) the null is +## tested against the two-sided alternative @code{mean (@var{x}) != @var{m}}. +## If @qcode{"alt"} is @qcode{"right"} the one-sided +## alternative @code{mean (@var{x}) > @var{m}} is considered. +## Similarly for @qcode{"left"}, the one-sided alternative @code{mean +## (@var{x}) < @var{m}} is considered. When @qcode{"vartype"} is @qcode{"equal"} +## the variances are assumed to be equal (this is the default). When +## @qcode{"vartype"} is @qcode{"unequal"} the variances are not assumed equal. +## When argument @var{x} is a matrix the @qcode{"dim"} argument can be +## used to selection the dimension over which to perform the test. +## (The default is the first non-singleton dimension.) +## +## If @var{h} is 0 the null hypothesis is accepted, if it is 1 the null +## hypothesis is rejected. The p-value of the test is returned in @var{pval}. +## A 100(1-alpha)% confidence interval is returned in @var{ci}. @var{stats} +## is a structure containing the value of the test statistic (@var{tstat}), +## the degrees of freedom (@var{df}) and the sample standard deviation +## (@var{sd}). +## +## @end deftypefn + +## Author: Tony Richardson +## Description: Test for mean of a normal sample with known variance + +function [h, p, ci, stats] = vartest(x, v, varargin) + + % Set default arguments + alpha = 0.05; + tail = 'both'; + % Find the first non-singleton dimension of x + dim = min(find(size(x)~=1)); + if isempty(dim), dim = 1; end + + i = 1; + while ( i <= length(varargin) ) + switch lower(varargin{i}) + case 'alpha' + i = i + 1; + alpha = varargin{i}; + case 'tail' + i = i + 1; + tail = varargin{i}; + case 'dim' + i = i + 1; + dim = varargin{i}; + otherwise + error('Invalid Name argument.',[]); + end + i = i + 1; + end + + if ~isa(tail, 'char') + error('tail argument to vartest must be a string\n',[]); + end + + s_var = var(x, 0, dim); + + df = size(x, dim) - 1; + stats.chisqstat = df*s_var/v; + + % Based on the "tail" argument determine the P-value, the critical values, + % and the confidence interval. + switch lower(tail) + case 'both' + p = 2*min(chi2cdf(stats.chisqstat,df),1-chi2cdf(stats.chisqstat,df)); + ci = [df*s_var ./ (chi2inv(1-alpha/2,df)); df*s_var ./ (chi2inv(alpha/2,df))]; + case 'left' + p = chi2cdf(stats.chisqstat,df); + chi2crit = chi2inv(alpha,df); + ci = [zeros(size(stats.chisqstat)); df*s_var ./ (chi2inv(alpha,df))]; + case 'right' + p = 1 - chi2cdf(stats.chisqstat,df); + chi2crit = chi2inv(1-alpha,df); + ci = [df*s_var ./ (chi2inv(1-alpha,df)); inf*ones(size(stats.chisqstat))]; + otherwise + error('Invalid fourth (tail) argument to vartest\n',[]); + end + + % Reshape the ci array to match MATLAB shaping + if and(isscalar(stats.chisqstat), dim==2) + ci = ci(:)'; + elseif size(stats.chisqstat,2). + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} vartest2 (@var{x}, @var{y}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{stats}] =} vartest2 (@var{x}, @var{y}, @var{Name}, @var{Value}) +## Perform a F-test for equal variances. +## +## If the second argument @var{y} is a vector, a paired-t test of the +## hypothesis @code{mean (@var{x}) = mean (@var{y})} is performed. +## +## The argument @qcode{"alpha"} can be used to specify the significance level +## of the test (the default value is 0.05). The string +## argument @qcode{"tail"}, can be used to select the desired alternative +## hypotheses. If @qcode{"alt"} is @qcode{"both"} (default) the null is +## tested against the two-sided alternative @code{mean (@var{x}) != @var{m}}. +## If @qcode{"alt"} is @qcode{"right"} the one-sided +## alternative @code{mean (@var{x}) > @var{m}} is considered. +## Similarly for @qcode{"left"}, the one-sided alternative @code{mean +## (@var{x}) < @var{m}} is considered. When @qcode{"vartype"} is @qcode{"equal"} +## the variances are assumed to be equal (this is the default). When +## @qcode{"vartype"} is @qcode{"unequal"} the variances are not assumed equal. +## When argument @var{x} is a matrix the @qcode{"dim"} argument can be +## used to selection the dimension over which to perform the test. +## (The default is the first non-singleton dimension.) +## +## If @var{h} is 0 the null hypothesis is accepted, if it is 1 the null +## hypothesis is rejected. The p-value of the test is returned in @var{pval}. +## A 100(1-alpha)% confidence interval is returned in @var{ci}. @var{stats} +## is a structure containing the value of the test statistic (@var{tstat}), +## the degrees of freedom (@var{df}) and the sample standard deviation +## (@var{sd}). +## +## @end deftypefn + +## Author: Tony Richardson +## Description: Test for mean of a normal sample with known variance + +function [h, p, ci, stats] = vartest2(x, y, varargin) + + % Set default arguments + alpha = 0.05; + tail = 'both'; + % Find the first non-singleton dimension of x + dim = min(find(size(x)~=1)); + if isempty(dim), dim = 1; end + + i = 1; + while ( i <= length(varargin) ) + switch lower(varargin{i}) + case 'alpha' + i = i + 1; + alpha = varargin{i}; + case 'tail' + i = i + 1; + tail = varargin{i}; + case 'dim' + i = i + 1; + dim = varargin{i}; + otherwise + error('Invalid Name argument.',[]); + end + i = i + 1; + end + + if ~isa(tail, 'char') + error('tail argument to vartest2 must be a string\n',[]); + end + + s1_var = var(x, 0, dim); + s2_var = var(y, 0, dim); + + stats.fstat = s1_var ./ s2_var; + df1= size(x, dim) - 1; + df2 = size(y, dim) - 1; + + % Based on the "tail" argument determine the P-value, the critical values, + % and the confidence interval. + switch lower(tail) + case 'both' + p = 2*min(fcdf(stats.fstat,df1,df2),1 - fcdf(stats.fstat,df1,df2)); + fcrit = finv(1-alpha/2,df1,df2); + ci = [s1_var ./ (fcrit*s2_var); fcrit*s1_var ./ s2_var]; + case 'left' + p = fcdf(stats.fstat,df1,df2); + fcrit = finv(alpha,df1,df2); + ci = [zeros(size(stats.fstat)); s1_var ./ (fcrit*s2_var)]; + case 'right' + p = 1 - fcdf(stats.fstat,df1,df2); + fcrit = finv(1-alpha,df1,df2); + ci = [s1_var ./ (fcrit*s2_var); inf*ones(size(stats.fstat))]; + otherwise + error('Invalid fourth (tail) argument to vartest2\n',[]); + end + + % Reshape the ci array to match MATLAB shaping + if and(isscalar(stats.fstat), dim==2) + ci = ci(:)'; + elseif size(stats.fstat,2). + +## Author: Juan Pablo Carbajal + +## -*- texinfo -*- +## @defun {@var{h} =} violin (@var{x}) +## @defunx {@var{h} =} violin (@dots{}, @var{property}, @var{value}, @dots{}) +## @defunx {@var{h} =} violin (@var{hax}, @dots{}) +## @defunx {@var{h} =} violin (@dots{}, @asis{"horizontal"}) +## Produce a Violin plot of the data @var{x}. +## +## The input data @var{x} can be a N-by-m array containg N observations of m variables. +## It can also be a cell with m elements, for the case in which the varibales +## are not uniformly sampled. +## +## The following @var{property} can be set using @var{property}/@var{value} pairs +## (default values in parenthesis). +## The value of the property can be a scalar indicating that it applies +## to all the variables in the data. +## It can also be a cell/array, indicating the property for each variable. +## In this case it should have m columns (as many as variables). +## +## @table @asis +## +## @item Color +## (@asis{"y"}) Indicates the filling color of the violins. +## +## @item Nbins +## (50) Internally, the function calls @command{hist} to compute the histogram of the data. +## This property indicates how many bins to use. See @command{help hist} +## for more details. +## +## @item SmoothFactor +## (4) The fuction performs simple kernel density estimation and automatically +## finds the bandwith of the kernel function that best approximates the histogram +## using optimization (@command{sqp}). +## The result is in general very noisy. To smooth the result the bandwidth is +## multiplied by the value of this property. The higher the value the smoother +## the violings, but values too high might remove features from the data distribution. +## +## @item Bandwidth +## (NA) If this property is given a value other than NA, it sets the bandwith of the +## kernel function. No optimization is peformed and the property @asis{SmoothFactor} +## is ignored. +## +## @item Width +## (0.5) Sets the maximum width of the violins. Violins are centered at integer axis +## values. The distance between two violin middle axis is 1. Setting a value +## higher thna 1 in this property will cause the violins to overlap. +## @end table +## +## If the string @asis{"Horizontal"} is among the input arguments, the violin +## plot is rendered along the x axis with the variables in the y axis. +## +## The returned structure @var{h} has handles to the plot elements, allowing +## customization of the visualization using set/get functions. +## +## Example: +## +## @example +## title ("Grade 3 heights"); +## axis ([0,3]); +## set (gca, "xtick", 1:2, "xticklabel", @{"girls"; "boys"@}); +## h = violin (@{randn(100,1)*5+140, randn(130,1)*8+135@}, "Nbins", 10); +## set (h.violin, "linewidth", 2) +## @end example +## +## @seealso{boxplot, hist} +## @end defun + +function h = violin (ax, varargin) + + old_hold = ishold (); + # First argument is not an axis + if (~ishandle (ax) || ~isscalar (ax)) + x = ax; + ax = gca (); + else + x = varargin{1}; + varargin(1) = []; + endif + + ###################### + ## Parse parameters ## + parser = inputParser (); + parser.CaseSensitive = false; + parser.FunctionName = 'violin'; + + parser.addParamValue ('Nbins', 50); + parser.addParamValue ('SmoothFactor', 4); + parser.addParamValue ('Bandwidth', NA); + parser.addParamValue ('Width', 0.5); + parser.addParamValue ('Color', "y"); + parser.addSwitch ('Horizontal'); + + parser.parse (varargin{:}); + res = parser.Results; + + c = res.Color; # Color of violins + if (ischar (c)) c = c(:); endif + nb = res.Nbins; # Number of bins in histogram + sf = res.SmoothFactor; # Smoothing factor for kernel estimation + r0 = res.Bandwidth; # User value for KDE bandwth to prevent optimization + is_horiz = res.Horizontal; # Whether the plot must be rotated + width = res.Width; # Width of the violins + clear parser res + ###################### + + ## Make everything a cell for code simplicity + if (~iscell (x)) + [N Nc] = size (x); + x = mat2cell (x, N, ones (1, Nc)); + else + Nc = numel (x); + endif + + try + [nb, c, sf, r0, width] = to_cell (nb, c, sf, r0, width, Nc); + catch err + if strcmp (err.identifier, "to_cell:element_idx") + n = str2num (err.message); + txt = {"Nbins", "Color", "SmoothFactor", "Bandwidth", "Width"}; + error ("Octave:invaid-input-arg", ... + ["options should be scalars or call/array with as many values as" ... + " numbers of variables in the data (wrong size of %s)."], txt{n}); + else + rethrow (lasterror()) + endif + end + + ## Build violins + [px py mx] = cellfun (@(y,n,s,r)build_polygon(y, n, s, r), ... + x, nb, sf, r0, "unif", 0); + + Nc = 1:numel (px); + Ncc = mat2cell (Nc, 1, ones (1, Nc(end))); + + # get hold state + old_hold = ishold (); + + # Draw plain violins + tmp = cellfun (@(x,y,n,u, w)patch(ax, (w * x + n)(:), y(:) ,u), ... + px, py, Ncc, c, width); + h.violin = tmp; + + hold on + # Overlay mean value + tmp = cellfun (@(z,y)plot(ax, z, y,'.k', "markersize", 6), Ncc, mx); + h.mean = tmp; + + # Overlay median + Mx = cellfun (@median, x, "unif", 0); + tmp = cellfun (@(z,y)plot(ax, z, y, 'ok'), Ncc, Mx); + h.median = tmp; + + # Overlay 1nd and 3th quartiles + LUBU = cellfun (@(x,y)abs(quantile(x,[0.25 0.75])-y), x, Mx, "unif", 0); + tmp = cellfun (@(x,y,z)errorbar(ax, x, y, z(1),z(2)), Ncc, Mx, LUBU)(:); + # Flatten errorbar output handles + tmp2 = allchild (tmp); + if (~iscell (tmp2)) + tmp2 = mat2cell (tmp2, ones(length (tmp2), 1), 1); + endif + tmp = mat2cell (tmp, ones (length (tmp), 1), 1); + tmp = cellfun (@vertcat, tmp, tmp2, "unif", 0); + h.quartile = cell2mat (tmp); + + hold off + + # Rotate the plot if it is horizontal + if (is_horiz) + structfun (@swap_axes, h); + set (ax, "ytick", Nc); + else + set (ax, "xtick", Nc); + endif + + if (nargout < 1); + clear h; + endif + + # restore hold state + if (old_hold) + hold on + endif +endfunction + +function k = kde(x,r) + k = mean (stdnormal_pdf (x / r)) / r; + k /= max (k); +endfunction + +function [px py mx] = build_polygon (x, nb, sf, r) + N = size (x, 1); + mx = mean (x); + sx = std (x); + X = (x - mx ) / sx; + + [count bin] = hist (X, nb); + count /= max (count); + + Y = X - bin; + if isna (r) + r0 = 1.06 * N^(1/5); + r = sqp (r0, @(r)sumsq (kde(Y,r) - count), [], [], 1e-3, 1e2); + else + sf = 1; + endif + sig = sf * r; + + ## Create violin polygon + # smooth tails: extend to 1.83 sigmas, i.e. ~99% of data. + xx = linspace (0, 1.83 * sig, 5); + bin = [bin(1)-fliplr(xx) bin bin(end)+xx]; + py = [bin; fliplr(bin)].' * sx + mx; + + v = kde (X-bin, sig).'; + px = [v -flipud(v)]; + +endfunction + +function tf = swap_axes (h) + tmp = mat2cell (h(:), ones (length (h),1), 1); +% tmp = cellfun (@(x)[x; allchild(x)], tmp, "unif", 0); + tmpy = cellfun(@(x)get(x, "ydata"), tmp, "unif", 0); + tmpx = cellfun(@(x)get(x, "xdata"), tmp, "unif", 0); + cellfun (@(h,x,y)set (h, "xdata", y, "ydata", x), tmp, tmpx, tmpy); + tf = true; +endfunction + +function varargout = to_cell (varargin) + + m = varargin{end}; + varargin(end) = []; + + for i = 1:numel(varargin) + x = varargin{i}; + if (isscalar (x)) x = repmat (x, m, 1); endif + + if (iscell (x)) + if (numel(x) ~= m) # no dimension equals m + error ("to_cell:element_idx", "%d\n",i); + endif + varargout{i} = x; + continue + endif + + sz = size (x); + d = find (sz == m); + if (isempty (d)) # no dimension equals m + error ("to_cell:element_idx", "%d\n",i); + elseif (length (d) == 2) + #both dims are m, choose 1st + elseif (d == 1) # 2nd dimension is m --> transpose + x = x.'; + sz = fliplr (sz); + endif + + varargout{i} = mat2cell (x, sz(1), ones (m,1)); + + endfor + +endfunction + +%!demo +%! clf +%! x = zeros (9e2, 10); +%! for i=1:10 +%! x(:,i) = (0.1 * randn (3e2, 3) * (randn (3,1) + 1) + ... +%! 2 * randn (1,3))(:); +%! endfor +%! h = violin (x, "color", "c"); +%! axis tight +%! set (h.violin, "linewidth", 2); +%! set (gca, "xgrid", "on"); +%! xlabel ("Variables") +%! ylabel ("Values") + +%!demo +%! clf +%! data = {randn(100,1)*5+140, randn(130,1)*8+135}; +%! subplot (1,2,1) +%! title ("Grade 3 heights - vertical"); +%! set (gca, "xtick", 1:2, "xticklabel", {"girls"; "boys"}); +%! violin (data, "Nbins", 10); +%! axis tight +%! +%! subplot(1,2,2) +%! title ("Grade 3 heights - horizontal"); +%! set (gca, "ytick", 1:2, "yticklabel", {"girls"; "boys"}); +%! violin (data, "horizontal", "Nbins", 10); +%! axis tight + +%!demo +%! clf +%! data = exprnd (0.1, 500,4); +%! violin (data, "nbins", {5,10,50,100}); +%! axis ([0 5 0 max(data(:))]) + +%!demo +%! clf +%! data = exprnd (0.1, 500,4); +%! violin (data, "color", jet(4)); +%! axis ([0 5 0 max(data(:))]) + +%!demo +%! clf +%! data = repmat(exprnd (0.1, 500,1), 1, 4); +%! violin (data, "width", linspace (0.1,0.5,4)); +%! axis ([0 5 0 max(data(:))]) + +%!demo +%! clf +%! data = repmat(exprnd (0.1, 500,1), 1, 4); +%! violin (data, "nbins", [5,10,50,100], "smoothfactor", [4 4 8 10]); +%! axis ([0 5 0 max(data(:))]) diff --git a/inst/vmpdf.m b/inst/vmpdf.m new file mode 100644 index 0000000..ef42f70 --- /dev/null +++ b/inst/vmpdf.m @@ -0,0 +1,46 @@ +## Copyright (C) 2009 Soren Hauberg +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} @var{theta} = vmpdf (@var{x}, @var{mu}, @var{k}) +## Evaluates the Von Mises probability density function. +## +## The Von Mises distribution has probability density function +## @example +## f (@var{x}) = exp (@var{k} * cos (@var{x} - @var{mu})) / @var{Z} , +## @end example +## where @var{Z} is a normalisation constant. By default, @var{mu} is 0 and +## @var{k} is 1. +## @seealso{vmrnd} +## @end deftypefn + +function p = vmpdf (x, mu = 0, k = 1) + ## Check input + if (!isreal (x)) + error ("vmpdf: first input must be real"); + endif + + if (!isreal (mu)) + error ("vmpdf: second input must be a scalar"); + endif + + if (!isreal (k) || k <= 0) + error ("vmpdf: third input must be a real positive scalar"); + endif + + ## Evaluate PDF + Z = 2 * pi * besseli (0, k); + p = exp (k * cos (x-mu)) / Z; +endfunction diff --git a/inst/vmrnd.m b/inst/vmrnd.m new file mode 100644 index 0000000..94d68b2 --- /dev/null +++ b/inst/vmrnd.m @@ -0,0 +1,76 @@ +## Copyright (C) 2009 Soren Hauberg +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} @var{theta} = vmrnd (@var{mu}, @var{k}) +## @deftypefnx{Function File} @var{theta} = vmrnd (@var{mu}, @var{k}, @var{sz}) +## Draw random angles from a Von Mises distribution with mean @var{mu} and +## concentration @var{k}. +## +## The Von Mises distribution has probability density function +## @example +## f (@var{x}) = exp (@var{k} * cos (@var{x} - @var{mu})) / @var{Z} , +## @end example +## where @var{Z} is a normalisation constant. +## +## The output, @var{theta}, is a matrix of size @var{sz} containing random angles +## drawn from the given Von Mises distribution. By default, @var{mu} is 0 +## and @var{k} is 1. +## @seealso{vmpdf} +## @end deftypefn + +function theta = vmrnd (mu = 0, k = 1, sz = 1) + ## Check input + if (!isreal (mu)) + error ("vmrnd: first input must be a scalar"); + endif + + if (!isreal (k) || k <= 0) + error ("vmrnd: second input must be a real positive scalar"); + endif + + if (isscalar (sz)) + sz = [sz, sz]; + elseif (!isvector (sz)) + error ("vmrnd: third input must be a scalar or a vector"); + endif + + ## Simulate! + if (k < 1e-6) + ## k is small: sample uniformly on circle + theta = 2 * pi * rand (sz) - pi; + + else + a = 1 + sqrt (1 + 4 * k.^2); + b = (a - sqrt (2 * a)) / (2 * k); + r = (1 + b^2) / (2 * b); + + N = prod (sz); + notdone = true (N, 1); + while (any (notdone)) + u (:, notdone) = rand (3, N); + + z (notdone) = cos (pi * u (1, notdone)); + f (notdone) = (1 + r * z (notdone)) ./ (r + z (notdone)); + c (notdone) = k * (r - f (notdone)); + + notdone = (u (2, :) >= c .* (2 - c)) & (log (c) - log (u (2, :)) + 1 - c < 0); + N = sum (notdone); + endwhile + + theta = mu + sign (u (3, :) - 0.5) .* acos (f); + theta = reshape (theta, sz); + endif +endfunction diff --git a/inst/wblplot.m b/inst/wblplot.m new file mode 100644 index 0000000..b3166bb --- /dev/null +++ b/inst/wblplot.m @@ -0,0 +1,380 @@ +## Copyright (C) 2014 Bj{\"o}rn Vennberg +## +## This program is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +## -*- texinfo -*- +## @deftypefn {wblplot.m} wblplot (@var{data},...) +## @deftypefnx {wblplot.m} {@var{handle} =} wblplot (@var{data},...) +## @deftypefnx {wblplot.m} {[@var{handle} @var{param}] =} wblplot (@var{data}) +## @deftypefnx {wblplot.m} {[@var{handle} @var{param}] =} wblplot (@var{data} , @var{censor}) +## @deftypefnx {wblplot.m} {[@var{handle} @var{param}] =} wblplot (@var{data} , @var{censor}, @var{freq}) +## @deftypefnx {wblplot.m} {[@var{handle} @var{param}] =} wblplot (@var{data} , @var{censor}, @var{freq}, @var{confint}) +## @deftypefnx {wblplot.m} {[@var{handle} @var{param}] =} wblplot (@var{data} , @var{censor}, @var{freq}, @var{confint}, @var{fancygrid}) +## @deftypefnx {wblplot.m} {[@var{handle} @var{param}] =} wblplot (@var{data} , @var{censor}, @var{freq}, @var{confint}, @var{fancygrid}, @var{showlegend}) +## +## +## @noindent +## Plot a column vector @var{data} on a Weibull probability plot using rank regression. +## +## @var{censor}: optional parameter is a column vector of same size as @var{data} +## with 1 for right censored data and 0 for exact observation. +## Pass [] when no censor data are available. +## +## @var{freq}: optional vector same size as data with the number of occurences for corresponding data. +## Pass [] when no frequency data are available. +## +## @var{confint}: optional confidence limits for ploting upper and lower +## confidence bands using beta binomial confidence bounds. If a single +## value is given this will be used such as LOW = a and HIGH = 1 - a. +## Pass [] if confidence bounds is not requested. +## +## @var{fancygrid}: optional parameter which if set to anything but 1 will turn of the the fancy gridlines. +## +## @var{showlegend}: optional parameter that when set to zero(0) turns off the legend. +## +## If one output argument is given, a @var{handle} for the data marker and plotlines are returned +## which can be used for further modification of line and marker style. +## +## If a second output argument is specified, a @var{param} vector with scale, +## shape and correlation factor is returned. +## +## @seealso{normplot, wblpdf} +## @end deftypefn + +## Author: Bj{\"o}rn Vennberg +## Created: 2014-11-11 +## 2014-11-22 Updated argin check +## 2014-11-22 Code clean up, error checking +## 2014-11-23 Turn off legend box, check for zero and negative values in data. +## 2014-11-23 Censored data check and inclusion of demo samples. Help info updates. + +function [handle param] = wblplot (data , censor=[], freq=[], confint=[], fancygrid=1, showlegend=1) + [mm nn] = size(data); + if mm > 1 && nn > 1 + error ("wblplot: can only handle a single data vector") + elseif mm == 1 && nn > 1 + data=data(:); + mm = nn; + end + if any(data<=0) + error("wblplot: data vector must be positive and non zero") + end + + if isempty(freq) + freq = ones(mm,1); + N = mm; + else + [mmf nnf]=size(freq); + if (mmf == mm && nnf == 1) || (mmf == 1 && nnf == mm) + freq = freq(:); + N=sum(freq); ## Total number of samples + if any(freq<=0) + error("wblplot: frequency vector must be positive non zero integers") + end + else + error("wblplot: frequency must be vector of same length as data") + end + end + + if isempty(censor) + censor = zeros(mm,1); + else + [mmc nnc]=size(censor); + if (mmc == mm && nnc == 1) || (mmc == 1 && nnc == mm) + censor = censor(:); + else + error("wblplot: censor must be a vector of same length as data") + end + ## Make sure censored data is sorted corectly so that no censored samples + ## are processed before failures if they have the same time. + if any(censor>0) + ind=find(censor>0); + ind2=find(data(1:end-1)==data(2:end)); + if ~isempty(ind) && ~isempty(ind2) + if any(ind==ind2) + tmp=censor(ind2); + censor(ind2)=censor(ind2+1); + censor(ind2+1)=tmp; + tmp=freq(ind2); + freq(ind2)=freq(ind2+1); + freq(ind2+1)=tmp; + end + end + end + end + + ## Determine the order number + wbdat=zeros(length(find(censor==0)),3); + Op = 0; + Oi = 0; + c = N; + nf = 0; + for k = 1 : mm + if censor(k, 1) == 0 + nf = nf + 1; + wbdat(nf, 1) = data(k, 1); + for s = 1 : freq(k, 1); + Oi = Op + ((N + 1) - Op) / (1 + c); + Op = Oi; + c = c - 1; + end + wbdat(nf, 3) = Oi; + else + c = c - freq(k, 1); + endif + end + ## Compute median rank + a=wbdat(:, 3)./(N-wbdat(:, 3)+1); + f=finv(0.5,2*(N-wbdat(:, 3)+1),2*wbdat(:, 3)); + + wbdat(:, 2) = a./(f+a); + + datx = log(wbdat(:,1)); + daty = log(log(1 ./ (1 - wbdat(:,2)))); + + ## Rank regression + poly = polyfit(datx, daty, 1); + ## Shape factor + beta_rry = poly(1); + ## Scale factor + eta_rry = exp(-(poly(2) / beta_rry)); + + ## Determin min-max values of view port + aa=ceil(log10(max(wbdat(:,1)))); + bb=log10(max(wbdat(:,1))); + if aa-bb<0.2 + aa=ceil(log10(max(wbdat(:,1))))+1; + end + xmax= 10^aa; + + if log10(min(wbdat(:,1)))-floor(log10(min(wbdat(:,1))))<0.2 + xmin=10^(floor(log10(min(wbdat(:,1))))-1); + else + xmin=10^floor(log10(min(wbdat(:,1)))); + end + + if min(wbdat(:,2))>0.20 + ymin = log(log(1/(1-0.1))); + elseif min(wbdat(:,2))>0.02 + ymin = log(log(1/(1-0.01))); + elseif min(wbdat(:,2))>0.002 + ymin = log(log(1/(1-0.001))); + else + ymin = log(log(1/(1-0.0001))); + end + + ymax= log(log(1/(1-0.999))); + x=[0;0]; + y=[0;0]; + + label = char('0.10','1.00','10.00','99.00'); + prob = [0.001 0.01 0.1 0.99]; + tick = log(log(1./(1-prob))); + xbf = [xmin;xmax]; + ybf = polyval(poly, log(xbf)); + + newplot(); + x(1, 1) = xmin; + x(2, 1) = xmax; + if fancygrid==1 + for k = 1 : 4 + + ## Y major grids + x(1, 1) = xmin; + x(2, 1) = xmax*10; + y(1, 1) = log(log(1 / (1 - 10 ^ (-k)))); + y(2, 1) = y(1, 1); + ymajorgrid(k) = line(x,y,'LineStyle','-','Marker','none','Color',[1 0.75 0.75],'LineWidth',0.1); + end + + ## Y Minor grids 2 - 9 + x(1, 1) = xmin; + x(2, 1) = xmax*10; + for m = 1 : 4 + for k = 1 : 8 + y(1, 1) = log(log(1 / (1 - ((k + 1) / (10 ^ m))))); + y(2, 1) = y(1, 1); + yminorgrid(k) = line(x,y,'LineStyle','-','Marker','none','Color',[0.75 1 0.75],'LineWidth',0.1); + end + end + ## X-axis grid + y(1, 1) = ymin; + y(2, 1) = ymax; + for m = log10(xmin) : log10(xmax) + x(1, 1) = 10 ^ m; + x(2, 1) = x(1, 1); + y(1, 1) = ymin; % + y(2, 1) = ymax; % + xmajorgrid(k) = line(x,y,'LineStyle','-','Marker','none','Color',[1 0.75 0.75]); + + for k = 1 : 8 + ## X Minor grids - 2 - 9 + x(1, 1) = (k + 1) * (10 ^ m); + x(2, 1) = (k + 1) * (10 ^ m); + xminorgrid(k) = line(x,y,'LineStyle','-','Marker','none','Color',[0.75 1 0.75],'LineWidth',0.1); + end + end + end + + set(gca,'XScale','log'); + set(gca,'YTick',tick,'YTickLabel',label); + + + xlabel('Data','FontSize',12); + ylabel('Unreliability, F(t)=1-R(t)','FontSize',12); + title('Weibull Probability Plot','FontSize',12); + set(gcf,'Color',[0.9,0.9,0.9]) + set(gcf,'name','WblPlot') + hold on + + h=plot(wbdat(:,1),daty,'o'); + set(h,'markerfacecolor',[0,0,1]) + set(h,'markersize',8) + h2 = line(xbf,ybf,'LineStyle','-','Marker','none','Color',[0.25 0.25 1],'LineWidth',1); + ## If requested plot beta binomial confidens bounds + if ~isempty(confint) + cb_high=[]; + cb_low=[]; + if length(confint)==1 + if confint >0.5 + cb_high=confint; + cb_low=1-confint; + else + cb_high=1-confint; + cb_low=confint; + end + else + cb_high=confint(2); + cb_low=confint(1); + end + conf=zeros(N+4,3); + betainv = 1 / beta_rry; + N2 = [1:N]'; + N2=[0.3;0.7;N2;N2(end)+0.5;N2(end)+0.8]; ## Extend the ends a bit + ypos = medianranks(0.5, N, N2); + conf(:, 1) = eta_rry * log(1./ (1 - ypos)).^ betainv; + conf(:, 2) = medianranks(cb_low, N, N2); + conf(:, 3) = medianranks(cb_high, N, N2); + confy=log(log(1./(1-conf(:,2:3)))); + + confu=[conf(:,1) confy]; + + if conf(1,1)>xmin ## Not totally correct but it looks better to extend the lines. + p1=polyfit(log(conf(1:2,1)),confy(1:2,1),1); + y1=polyval(p1,log(xmin)); + p2=polyfit(log(conf(1:2,1)),confy(1:2,2),1); + y2=polyval(p2,log(xmin)); + confu=[xmin y1 y2;confu]; + end + + if conf(end,1)= 2 + param = [eta_rry beta_rry rsq]; + if ~isempty(confint) + handle = [h; h2; h3]; + else + handle = [h; h2]; + end + end + if nargout == 1 + if ~isempty(confint) + handle = [h; h2; h3]; + else + handle = [h; h2]; + end + end +endfunction + + + + +function [ ret ] = medianranks (alpha, n, ii) + a=ii./(n-ii+1); + f=finv(alpha,2*(n-ii+1),2*ii); + ret=a./(f+a); +endfunction + + +%!demo +%! x=[16 34 53 75 93 120]; +%! wblplot(x); + + +%!demo +%! x=[2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67]'; +%! c=[0 1 0 1 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1]'; +%! [h p]=wblplot(x,c) + + + +%!demo +%! x=[16, 34, 53, 75, 93, 120, 150, 191, 240 ,339]; +%! [h p]=wblplot(x,[],[],0.05) +%! ## Benchmark Reliasoft eta = 146.2545 beta 1.1973 rho = 0.9999 + + + +%!demo +%! x=[46 64 83 105 123 150 150]; +%! c=[0 0 0 0 0 0 1]; +%! f=[1 1 1 1 1 1 4]; +%! wblplot(x,c,f,0.05); + + +%!demo +%! x=[46 64 83 105 123 150 150]; +%! c=[0 0 0 0 0 0 1]; +%! f=[1 1 1 1 1 1 4]; +%! ## Subtract 30.92 from x to simulate a 3 parameter wbl with gamma = 30.92 +%! wblplot(x-30.92,c,f,0.05); + + +## Get current figure visibility so it can be restored after tests +%!shared visibility_setting +%! visibility_setting = get (0, "DefaultFigureVisible"); + +%!test +%! set (0, "DefaultFigureVisible", "off"); +%! x=[16, 34, 53, 75, 93, 120, 150, 191, 240 ,339]; +%! [h p]=wblplot(x,[],[],0.05); +%! assert(numel(h), 4) +%! assert(p(1), 146.2545, 1E-4) +%! assert(p(2), 1.1973, 1E-4) +%! assert(p(3), 0.9999, 5E-5) +%! set (0, "DefaultFigureVisible", visibility_setting); diff --git a/inst/wblstat.m b/inst/wblstat.m new file mode 100644 index 0000000..d8ac426 --- /dev/null +++ b/inst/wblstat.m @@ -0,0 +1,124 @@ +## Copyright (C) 2006, 2007 Arno Onken +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{m}, @var{v}] =} wblstat (@var{scale}, @var{shape}) +## Compute mean and variance of the Weibull distribution. +## +## @subheading Arguments +## +## @itemize @bullet +## @item +## @var{scale} is the scale parameter of the Weibull distribution. +## @var{scale} must be positive +## +## @item +## @var{shape} is the shape parameter of the Weibull distribution. +## @var{shape} must be positive +## @end itemize +## @var{scale} and @var{shape} must be of common size or one of them must be +## scalar +## +## @subheading Return values +## +## @itemize @bullet +## @item +## @var{m} is the mean of the Weibull distribution +## +## @item +## @var{v} is the variance of the Weibull distribution +## @end itemize +## +## @subheading Examples +## +## @example +## @group +## scale = 3:8; +## shape = 1:6; +## [m, v] = wblstat (scale, shape) +## @end group +## +## @group +## [m, v] = wblstat (6, shape) +## @end group +## @end example +## +## @subheading References +## +## @enumerate +## @item +## Wendy L. Martinez and Angel R. Martinez. @cite{Computational Statistics +## Handbook with MATLAB}. Appendix E, pages 547-557, Chapman & Hall/CRC, +## 2001. +## +## @item +## Athanasios Papoulis. @cite{Probability, Random Variables, and Stochastic +## Processes}. McGraw-Hill, New York, second edition, 1984. +## @end enumerate +## @end deftypefn + +## Author: Arno Onken +## Description: Moments of the Weibull distribution + +function [m, v] = wblstat (scale, shape) + + # Check arguments + if (nargin != 2) + print_usage (); + endif + + if (! isempty (scale) && ! ismatrix (scale)) + error ("wblstat: scale must be a numeric matrix"); + endif + if (! isempty (shape) && ! ismatrix (shape)) + error ("wblstat: shape must be a numeric matrix"); + endif + + if (! isscalar (scale) || ! isscalar (shape)) + [retval, scale, shape] = common_size (scale, shape); + if (retval > 0) + error ("wblstat: scale and shape must be of common size or scalar"); + endif + endif + + # Calculate moments + m = scale .* gamma (1 + 1 ./ shape); + v = (scale .^ 2) .* gamma (1 + 2 ./ shape) - m .^ 2; + + # Continue argument check + k = find (! (scale > 0) | ! (scale < Inf) | ! (shape > 0) | ! (shape < Inf)); + if (any (k)) + m(k) = NaN; + v(k) = NaN; + endif + +endfunction + +%!test +%! scale = 3:8; +%! shape = 1:6; +%! [m, v] = wblstat (scale, shape); +%! expected_m = [3.0000, 3.5449, 4.4649, 5.4384, 6.4272, 7.4218]; +%! expected_v = [9.0000, 3.4336, 2.6333, 2.3278, 2.1673, 2.0682]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); + +%!test +%! shape = 1:6; +%! [m, v] = wblstat (6, shape); +%! expected_m = [ 6.0000, 5.3174, 5.3579, 5.4384, 5.5090, 5.5663]; +%! expected_v = [36.0000, 7.7257, 3.7920, 2.3278, 1.5923, 1.1634]; +%! assert (m, expected_m, 0.001); +%! assert (v, expected_v, 0.001); diff --git a/inst/wishpdf.m b/inst/wishpdf.m new file mode 100644 index 0000000..8337657 --- /dev/null +++ b/inst/wishpdf.m @@ -0,0 +1,67 @@ +## Copyright (C) 2013 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with Octave; see the file COPYING. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} @var{y} = wishpdf (@var{W}, @var{Sigma}, @var{df}, @var{log_y}=false) +## Compute the probability density function of the Wishart distribution +## +## Inputs: A @var{p} x @var{p} matrix @var{W} where to find the PDF. The @var{p} x @var{p} positive definite matrix @var{Sigma} and scalar degrees of freedom parameter @var{df} characterizing the Wishart distribution. (For the density to be finite, need @var{df} > (@var{p} - 1).) +## If the flag @var{log_y} is set, return the log probability density -- this helps avoid underflow when the numerical value of the density is very small +## +## Output: @var{y} is the probability density of Wishart(@var{Sigma}, @var{df}) at @var{W}. +## +## @seealso{wishrnd, iwishpdf} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Compute the probability density function of the Wishart distribution + +function [y] = wishpdf(W, Sigma, df, log_y=false) + +if (nargin < 3) + print_usage (); +endif + +p = size(Sigma, 1); + +if (df <= (p - 1)) + error('df too small, no finite densities exist') +endif + +#calculate the logarithm of G_d(df/2), the multivariate gamma function +g = (p * (p-1) / 4) * log(pi); +for i = 1:p + g = g + log(gamma((df + (1 - i))/2)); #using lngamma_gsl(.) from the gsl package instead of log(gamma(.)) might help avoid underflow/overflow +endfor + +C = chol(Sigma); + +#use formulas for determinant of positive definite matrix for better efficiency and numerical accuracy +logdet_W = 2*sum(log(diag(chol(W)))); +logdet_Sigma = 2*sum(log(diag(C))); + +y = -(df*p)/2 * log(2) - (df/2)*logdet_Sigma - g + ((df - p - 1)/2)*logdet_W - trace(chol2inv(C)*W)/2; + +if ~log_y + y = exp(y); +endif + + +endfunction + +##test results cross-checked against dwish function in R MCMCpack library +%!assert(wishpdf(4, 3, 3.1), 0.07702496, 1E-7); +%!assert(wishpdf([2 -0.3;-0.3 4], [1 0.3;0.3 1], 4), 0.004529741, 1E-7); +%!assert(wishpdf([6 2 5; 2 10 -5; 5 -5 25], [9 5 5; 5 10 -8; 5 -8 22], 5.1), 4.474865e-10, 1E-15); + +%% Test input validation +%!error wishpdf () +%!error wishpdf (1, 2) +%!error wishpdf (1, 2, 0) + +%!error wishpdf (1, 2) diff --git a/inst/wishrnd.m b/inst/wishrnd.m new file mode 100644 index 0000000..d8de7cc --- /dev/null +++ b/inst/wishrnd.m @@ -0,0 +1,85 @@ +## Copyright (C) 2013-2019 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along with Octave; see the file COPYING. If not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {} [@var{W}[, @var{D}]] = wishrnd (@var{Sigma}, @var{df}[, @var{D}][, @var{n}=1]) +## Return a random matrix sampled from the Wishart distribution with given parameters +## +## Inputs: the @var{p} x @var{p} positive definite matrix @var{Sigma} (or the lower-triangular Cholesky factor @var{D} of @var{Sigma}) and scalar degrees of freedom parameter @var{df}. +## @var{df} can be non-integer as long as @var{df} > @var{p} +## +## Output: a random @var{p} x @var{p} matrix @var{W} from the Wishart(@var{Sigma}, @var{df}) distribution. If @var{n} > 1, then @var{W} is @var{p} x @var{p} x @var{n} and holds @var{n} such random matrices. (Optionally, the lower-triangular Cholesky factor @var{D} of @var{Sigma} is also returned.) +## +## Averaged across many samples, the mean of @var{W} should approach @var{df}*@var{Sigma}, and the variance of each element @var{W}_ij should approach @var{df}*(@var{Sigma}_ij^2 + @var{Sigma}_ii*@var{Sigma}_jj) +## +## Reference: Yu-Cheng Ku and Peter Bloomfield (2010), Generating Random Wishart Matrices with Fractional Degrees of Freedom in OX, http://www.gwu.edu/~forcpgm/YuChengKu-030510final-WishartYu-ChengKu.pdf +## +## @seealso{iwishrnd, wishpdf} +## @end deftypefn + +## Author: Nir Krakauer +## Description: Sample from the Wishart distribution + +function [W, D] = wishrnd(Sigma, df, D, n=1) + +if (nargin < 2) + print_usage (); +endif + +if nargin < 3 || isempty(D) + try + D = chol(Sigma, 'lower'); + catch + error('wishrnd: Cholesky decomposition failed; Sigma probably not positive definite') + end_try_catch +endif + +p = size(D, 1); + +if df < p + df = floor(df); #distribution not defined for small noninteger df + df_isint = 1; +else +#check for integer degrees of freedom + df_isint = (df == floor(df)); +endif + +if ~df_isint + [ii, jj] = ind2sub([p, p], 1:(p*p)); +endif + +if n > 1 + W = nan(p, p, n); +endif + +for i = 1:n + if df_isint + Z = D * randn(p, df); + else + Z = diag(sqrt(chi2rnd(df - (0:(p-1))))); #fill diagonal + #note: chi2rnd(x) is equivalent to 2*randg(x/2), but the latter seems to offer no performance advantage + Z(ii > jj) = randn(p*(p-1)/2, 1); #fill lower triangle with normally distributed variates + Z = D * Z; + endif + W(:, :, i) = Z*Z'; +endfor + +endfunction + + +%!assert(size (wishrnd (1,2)), [1, 1]); +%!assert(size (wishrnd (1,2,[])), [1, 1]); +%!assert(size (wishrnd (1,2,1)), [1, 1]); +%!assert(size (wishrnd ([],2,1)), [1, 1]); +%!assert(size (wishrnd ([3 1; 1 3], 2.00001, [], 1)), [2, 2]); +%!assert(size (wishrnd (eye(2), 2, [], 3)), [2, 2, 3]); + +%% Test input validation +%!error wishrnd () +%!error wishrnd (1) +%!error wishrnd ([1; 1], 2) diff --git a/inst/ztest.m b/inst/ztest.m new file mode 100644 index 0000000..d0d02e1 --- /dev/null +++ b/inst/ztest.m @@ -0,0 +1,115 @@ +## Copyright (C) 2014 Tony Richardson +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{z}, @var{zcrit}] =} ztest (@var{x}, @var{m}, @var{s}) +## @deftypefnx {Function File} {[@var{h}, @var{pval}, @var{ci}, @var{z}, @var{zcrit}] =} ztest (@var{x}, @var{m}, @var{s}, @var{Name}, @var{Value}) +## Test for mean of a normal sample with known variance. +## +## Perform a Z-test of the null hypothesis @code{mean (@var{x}) == @var{m}} +## for a sample @var{x} from a normal distribution with unknown +## mean and known std deviation @var{s}. Under the null, the test statistic +## @var{z} follows a standard normal distribution. +## +## Name-Value pair arguments can be used to set various options. +## @qcode{"alpha"} can be used to specify the significance level +## of the test (the default value is 0.05). @qcode{"tail"}, can be used +## to select the desired alternative hypotheses. If the value is +## @qcode{"both"} (default) the null is tested against the two-sided +## alternative @code{mean (@var{x}) != @var{m}}. +## If it is @qcode{"right"} the one-sided alternative @code{mean (@var{x}) +## > @var{m}} is considered. Similarly for @qcode{"left"}, the one-sided +## alternative @code{mean (@var{x}) < @var{m}} is considered. +## When argument @var{x} is a matrix, @qcode{"dim"} can be used to selection +## the dimension over which to perform the test. (The default is the +## first non-singleton dimension.) +## +## If @var{h} is 0 the null hypothesis is accepted, if it is 1 the null +## hypothesis is rejected. The p-value of the test is returned in @var{pval}. +## A 100(1-alpha)% confidence interval is returned in @var{ci}. The test statistic +## value is returned in @var{z} and the z critical value in @var{zcrit}. +## +## @end deftypefn + +## Author: Tony Richardson + +function [h, p, ci, zval, zcrit] = ztest(x, m, sigma, varargin) + + alpha = 0.05; + tail = 'both'; + + % Find the first non-singleton dimension of x + dim = min(find(size(x)~=1)); + if isempty(dim), dim = 1; end + + i = 1; + while ( i <= length(varargin) ) + switch lower(varargin{i}) + case 'alpha' + i = i + 1; + alpha = varargin{i}; + case 'tail' + i = i + 1; + tail = varargin{i}; + case 'dim' + i = i + 1; + dim = varargin{i}; + otherwise + error('Invalid Name argument.',[]); + end + i = i + 1; + end + + if ~isa(tail, 'char') + error('tail argument to ztest must be a string\n',[]); + end + + % Calculate the test statistic value (zval) + n = size(x, dim); + x_bar = mean(x, dim); + x_bar_std = sigma/sqrt(n); + zval = (x_bar - m)./x_bar_std; + + % Based on the "tail" argument determine the P-value, the critical values, + % and the confidence interval. + switch lower(tail) + case 'both' + p = 2*(1 - normcdf(abs(zval))); + zcrit = -norminv(alpha/2); + ci = [x_bar-zcrit*x_bar_std; x_bar+zcrit*x_bar_std]; + case 'left' + p = normcdf(zval); + zcrit = -norminv(alpha); + ci = [-inf*ones(size(x_bar)); x_bar+zcrit*x_bar_std]; + case 'right' + p = 1 - normcdf(zval); + zcrit = -norminv(alpha); + ci = [x_bar-zcrit*x_bar_std; inf*ones(size(x_bar))]; + otherwise + error('Invalid fifth (tail) argument to ztest\n',[]); + end + + % Reshape the ci array to match MATLAB shaping + if and(isscalar(x_bar), dim==2) + ci = ci(:)'; + elseif size(x_bar,2)> Statistics +Distributions + stdnormal_rnd + hygecdf + hygeinv + cauchy_inv + discrete_rnd + wienrnd + nbinpdf + unidrnd + wblrnd + geoinv + hygepdf + logncdf + cauchy_pdf + finv + chi2pdf + empirical_pdf + geornd + unidinv + betacdf + fpdf + fcdf + unifcdf + discrete_inv + stdnormal_inv + stdnormal_pdf + exprnd + normcdf + discrete_cdf + poissinv + empirical_rnd + wblcdf + laplace_cdf + stdnormal_cdf + gampdf + laplace_pdf + laplace_rnd + expcdf + unifinv + nbinrnd + betapdf + trnd + geocdf + laplace_inv + tcdf + unidpdf + normrnd + binocdf + poisscdf + tinv + discrete_pdf + empirical_inv + norminv + gamcdf + unifpdf + empirical_cdf + nbincdf + logistic_inv + frnd + chi2cdf + cauchy_cdf + lognpdf + normpdf + logistic_cdf + betarnd + wblinv + chi2inv + hygernd + poisspdf + kolmogorov_smirnov_cdf + logistic_rnd + exppdf + binopdf + lognrnd + logninv + binoinv + geopdf + betainv + cauchy_rnd + poissrnd + gamrnd + nbininv + expinv + chi2rnd + wblpdf + tpdf + logistic_pdf + binornd + unidcdf + gaminv + unifrnd +Models + logistic_regression +Hypothesis testing + kolmogorov_smirnov_test_2 + t_test_2 + hotelling_test + wilcoxon_test + chisquare_test_independence + hotelling_test_2 + bartlett_test + run_test + t_test + welch_test + anova + var_test + manova + sign_test + u_test + cor_test + mcnemar_test + chisquare_test_homogeneity + z_test + prop_test_2 + f_test_regression + kolmogorov_smirnov_test + kruskal_wallis_test + z_test_2 + t_test_regression +Descriptive statistics + mad + corr + probit + corrcoef + kendall + var + lscov + quantile + logit + mean + range + ranks + statistics + cloglog + mode + std + prctile + crosstab + moment + cov + median + spearman + kurtosis + zscore + run_count + skewness + iqr + ols + runlength +Other + center + ismissing + rmmissing +Plots + qqplot + ppplot + histc +Clustering + meansq + gls diff --git a/install-conditionally/README b/install-conditionally/README new file mode 100644 index 0000000..4a996ab --- /dev/null +++ b/install-conditionally/README @@ -0,0 +1,14 @@ +This directory contains functions currently or previously available in +core Octave. Of these functions, only those not present in the local +Octave version will be installed by the package. Under this directory: + +Only statistics functions available in some core Octave versions +should be added. (Functions with different names than (former) core +Octave functions should be placed under inst/ .) + +If functions are added, they should also be added to the file +utils/functions_to_install. + +Newer function versions than contained in core Octave can be +developed, but this should only be done for functions removed (or +scheduled for removal) from core Octave. diff --git a/install-conditionally/base/center.m b/install-conditionally/base/center.m new file mode 100644 index 0000000..7f14d61 --- /dev/null +++ b/install-conditionally/base/center.m @@ -0,0 +1,94 @@ +## Copyright (C) 1995-2017 Kurt Hornik +## Copyright (C) 2009 VZLU Prague +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} center (@var{x}) +## @deftypefnx {} {} center (@var{x}, @var{dim}) +## Center data by subtracting its mean. +## +## If @var{x} is a vector, subtract its mean. +## +## If @var{x} is a matrix, do the above for each column. +## +## If the optional argument @var{dim} is given, operate along this dimension. +## +## Programming Note: @code{center} has obvious application for normalizing +## statistical data. It is also useful for improving the precision of general +## numerical calculations. Whenever there is a large value that is common +## to a batch of data, the mean can be subtracted off, the calculation +## performed, and then the mean added back to obtain the final answer. +## @seealso{zscore} +## @end deftypefn + +## Author: KH +## Description: Center by subtracting means + +function retval = center (x, dim) + + if (nargin != 1 && nargin != 2) + print_usage (); + endif + + if (! (isnumeric (x) || islogical (x))) + error ("center: X must be a numeric vector or matrix"); + endif + + if (isinteger (x)) + x = double (x); + endif + + nd = ndims (x); + sz = size (x); + if (nargin != 2) + ## Find the first non-singleton dimension. + (dim = find (sz > 1, 1)) || (dim = 1); + else + if (! (isscalar (dim) && dim == fix (dim) && dim > 0)) + error ("center: DIM must be an integer and a valid dimension"); + endif + endif + + n = size (x, dim); + + if (n == 0) + retval = x; + else + ## FIXME: Use bsxfun, rather than broadcasting, until broadcasting + ## supports diagonal and sparse matrices (Bugs #41441, #35787). + retval = bsxfun (@minus, x, mean (x, dim)); + ## retval = x - mean (x, dim); # automatic broadcasting + endif + +endfunction + + +%!assert (center ([1,2,3]), [-1,0,1]) +%!assert (center (single ([1,2,3])), single ([-1,0,1])) +%!assert (center (int8 ([1,2,3])), [-1,0,1]) +%!assert (center (logical ([1, 0, 0, 1])), [0.5, -0.5, -0.5, 0.5]) +%!assert (center (ones (3,2,0,2)), zeros (3,2,0,2)) +%!assert (center (ones (3,2,0,2, "single")), zeros (3,2,0,2, "single")) +%!assert (center (magic (3)), [3,-4,1;-2,0,2;-1,4,-3]) +%!assert (center ([1 2 3; 6 5 4], 2), [-1 0 1; 1 0 -1]) +%!assert (center (1, 3), 0) + +## Test input validation +%!error center () +%!error center (1, 2, 3) +%!error center (1, ones (2,2)) +%!error center (1, 1.5) +%!error center (1, 0) diff --git a/install-conditionally/base/cloglog.m b/install-conditionally/base/cloglog.m new file mode 100644 index 0000000..3ab7c12 --- /dev/null +++ b/install-conditionally/base/cloglog.m @@ -0,0 +1,56 @@ +## Copyright (C) 1995-2017 Kurt Hornik +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} cloglog (@var{x}) +## Return the complementary log-log function of @var{x}. +## +## The complementary log-log function is defined as +## @tex +## $$ +## {\rm cloglog}(x) = - \log (- \log (x)) +## $$ +## @end tex +## @ifnottex +## +## @example +## cloglog (x) = - log (- log (@var{x})) +## @end example +## +## @end ifnottex +## @end deftypefn + +## Author: KH +## Description: Complementary log-log function + +function y = cloglog (x) + + if (nargin != 1) + print_usage (); + endif + + y = - log (- log (x)); + +endfunction + + +%!assert (cloglog (0), -Inf) +%!assert (cloglog (1), Inf) +%!assert (cloglog (1/e), 0) + +## Test input validation +%!error cloglog () +%!error cloglog (1, 2) diff --git a/install-conditionally/base/corr.m b/install-conditionally/base/corr.m new file mode 100644 index 0000000..2047411 --- /dev/null +++ b/install-conditionally/base/corr.m @@ -0,0 +1,109 @@ +## Copyright (C) 1996-2017 John W. Eaton +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} corr (@var{x}) +## @deftypefnx {} {} corr (@var{x}, @var{y}) +## Compute matrix of correlation coefficients. +## +## If each row of @var{x} and @var{y} is an observation and each column is +## a variable, then the @w{(@var{i}, @var{j})-th} entry of +## @code{corr (@var{x}, @var{y})} is the correlation between the +## @var{i}-th variable in @var{x} and the @var{j}-th variable in @var{y}. +## @tex +## $$ +## {\rm corr}(x,y) = {{\rm cov}(x,y) \over {\rm std}(x) \, {\rm std}(y)} +## $$ +## @end tex +## @ifnottex +## +## @example +## corr (@var{x},@var{y}) = cov (@var{x},@var{y}) / (std (@var{x}) * std (@var{y})) +## @end example +## +## @end ifnottex +## If called with one argument, compute @code{corr (@var{x}, @var{x})}, +## the correlation between the columns of @var{x}. +## @seealso{cov} +## @end deftypefn + +## Author: Kurt Hornik +## Created: March 1993 +## Adapted-By: jwe + +function retval = corr (x, y = []) + + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + ## Input validation is done by cov.m. Don't repeat tests here + + ## Special case, scalar is always 100% correlated with itself + if (isscalar (x)) + if (isa (x, "single")) + retval = single (1); + else + retval = 1; + endif + return; + endif + + ## No check for division by zero error, which happens only when + ## there is a constant vector and should be rare. + if (nargin == 2) + c = cov (x, y); + s = std (x)' * std (y); + retval = c ./ s; + else + c = cov (x); + s = sqrt (diag (c)); + retval = c ./ (s * s'); + endif + +endfunction + + +%!test +%! x = rand (10); +%! cc1 = corr (x); +%! cc2 = corr (x, x); +%! assert (size (cc1) == [10, 10] && size (cc2) == [10, 10]); +%! assert (cc1, cc2, sqrt (eps)); + +%!test +%! x = [1:3]'; +%! y = [3:-1:1]'; +%! assert (corr (x, y), -1, 5*eps); +%! assert (corr (x, flipud (y)), 1, 5*eps); +%! assert (corr ([x, y]), [1 -1; -1 1], 5*eps); + +%!test +%! x = single ([1:3]'); +%! y = single ([3:-1:1]'); +%! assert (corr (x, y), single (-1), 5*eps); +%! assert (corr (x, flipud (y)), single (1), 5*eps); +%! assert (corr ([x, y]), single ([1 -1; -1 1]), 5*eps); + +%!assert (corr (5), 1) +%!assert (corr (single (5)), single (1)) + +## Test input validation +%!error corr () +%!error corr (1, 2, 3) +%!error corr ([1; 2], ["A", "B"]) +%!error corr (ones (2,2,2)) +%!error corr (ones (2,2), ones (2,2,2)) diff --git a/install-conditionally/base/corrcoef.m b/install-conditionally/base/corrcoef.m new file mode 100644 index 0000000..fe6d4be --- /dev/null +++ b/install-conditionally/base/corrcoef.m @@ -0,0 +1,250 @@ +## Copyright (C) 2016-2017 Guillaume Flandin +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {@var{r} =} corrcoef (@var{x}) +## @deftypefnx {} {@var{r} =} corrcoef (@var{x}, @var{y}) +## @deftypefnx {} {[@var{r}, @var{p}] =} corrcoef (@dots{}) +## @deftypefnx {} {[@var{r}, @var{p}, @var{lci}, @var{hci}] =} corrcoef (@dots{}) +## @deftypefnx {} {[@dots{}] =} corrcoef (@dots{}, @var{param}, @var{value}, @dots{}) +## Compute a matrix of correlation coefficients. +## +## @var{x} is an array where each column contains a variable and each row is +## an observation. +## +## If a second input @var{y} (of the same size as @var{x}) is given then +## calculate the correlation coefficients between @var{x} and @var{y}. +## +## @var{r} is a matrix of Pearson's product moment correlation coefficients for +## each pair of variables. +## +## @var{p} is a matrix of pair-wise p-values testing for the null hypothesis of +## a correlation coefficient of zero. +## +## @var{lci} and @var{hci} are matrices containing, respectively, the lower and +## higher bounds of the 95% confidence interval of each correlation +## coefficient. +## +## @var{param}, @var{value} are pairs of optional parameters and values. +## Valid options are: +## +## @table @asis +## @item @qcode{"alpha"} +## Confidence level used for the definition of the bounds of the confidence +## interval, @var{lci} and @var{hci}. Default is 0.05, i.e., 95% confidence +## interval. +## +## @item @qcode{"rows"} +## Determine processing of NaN values. Acceptable values are @qcode{"all"}, +## @qcode{"complete"}, and @qcode{"pairwise"}. Default is @qcode{"all"}. +## With @qcode{"complete"}, only the rows without NaN values are considered. +## With @qcode{"pairwise"}, the selection of NaN-free rows is made for each +## pair of variables. +## +## @end table +## +## @seealso{corr, cov, cor_test} +## @end deftypefn + +## FIXME: It would be good to add a definition of the calculation method +## for a Pearson product moment correlation to the documentation. + +function [r, p, lci, hci] = corrcoef (x, varargin) + + if (nargin == 0) + print_usage (); + endif + + alpha = 0.05; + rows = "all"; + + if (nargin > 1) + + ## Check for numeric y argument + if (isnumeric (varargin{1})) + x = [x(:), varargin{1}(:)]; + varargin(1) = []; + endif + + ## Check for Parameter/Value arguments + for i = 1:2:numel (varargin) + + if (! ischar (varargin{i})) + error ("corrcoef: parameter %d must be a string", i); + endif + parameter = varargin{i}; + if (numel (varargin) < i+1) + error ('corrcoef: parameter "%s" missing value', parameter); + endif + value = varargin{i+1}; + + switch (tolower (parameter)) + case "alpha" + if (isnumeric (value) && isscalar (value) + && value >= 0 && value <= 1) + alpha = value; + else + error ('corrcoef: "alpha" must be a number between 0 and 1'); + endif + + case "rows" + if (! ischar (value)) + error ('corrcoef: "rows" value must be a string'); + endif + value = tolower (value); + switch (value) + case {"all", "complete", "pairwise"} + rows = value; + otherwise + error ('corrcoef: "rows" must be "all", "complete", or "pairwise".'); + endswitch + + otherwise + error ('corrcoef: Unknown option "%s"', parameter); + + endswitch + endfor + endif + + if (strcmp (rows, "complete")) + x(any (isnan (x), 2), :) = []; + endif + + if (isempty (x) || isscalar (x)) + r = p = lci = hci = NaN; + return; + endif + + ## Flags for calculation + pairwise = strcmp (rows, "pairwise"); + calc_pval = nargout > 1; + + if (isrow (x)) + x = x(:); + endif + [m, n] = size (x); + r = eye (n); + if (calc_pval) + p = eye (n); + endif + if (strcmp (rows, "pairwise")) + mpw = m * ones (n); + endif + for i = 1:n + if (! pairwise && any (isnan (x(:,i)))) + r(i,i) = NaN; + if (nargout > 1) + p(i,i) = NaN; + endif + endif + for j = i+1:n + xi = x(:,i); + xj = x(:,j); + if (pairwise) + idx = any (isnan ([xi xj]), 2); + xi(idx) = xj(idx) = []; + mpw(i,j) = mpw(j,i) = m - nnz (idx); + endif + r(i,j) = r(j,i) = corr (xi, xj); + if (calc_pval) + T = cor_test (xi, xj, "!=", "pearson"); + p(i,j) = p(j,i) = T.pval; + endif + endfor + endfor + + if (nargout > 2) + if (pairwise) + m = mpw; + endif + CI = sqrt (2) * erfinv (1-alpha) ./ sqrt (m-3); + lci = tanh (atanh (r) - CI); + hci = tanh (atanh (r) + CI); + endif + +endfunction + + +%!test +%! x = rand (5); +%! r = corrcoef (x); +%! assert (size (r) == [5, 5]); + +%!test +%! x = [1 2 3]; +%! r = corrcoef (x); +%! assert (size (r) == [1, 1]); + +%!test +%! x = []; +%! r = corrcoef (x); +%! assert (isnan (r)); + +%!test +%! x = [NaN]; +%! r = corrcoef (x); +%! assert (isnan (r)); + +%!test +%! x = [1]; +%! r = corrcoef (x); +%! assert (isnan (r)); + +%!test +%! x = [NaN NaN]; +%! r = corrcoef (x); +%! assert (size(r) == [1, 1] && isnan (r)); + +%!test +%! x = rand (5); +%! [r, p] = corrcoef (x); +%! assert (size (r) == [5, 5] && size (p) == [5 5]); + +%!test +%! x = rand (5,1); +%! y = rand (5,1); +%! R1 = corrcoef (x, y); +%! R2 = corrcoef ([x, y]); +%! assert (R1, R2); + +%!test +%! x = [1;2;3]; +%! y = [1;2;3]; +%! r = corrcoef (x, y); +%! assert (r, ones (2,2)); + +%!test +%! x = [1;2;3]; +%! y = [3;2;1]; +%! r = corrcoef (x, y); +%! assert (r, [1, -1; -1, 1]); + +%!test +%! x = [1;2;3]; +%! y = [1;1;1]; +%! r = corrcoef (x, y); +%! assert (r, [1, NaN; NaN, 1]); + +%!test +%!error corrcoef () +%!error corrcoef (1, 2, 3) +%!error corrcoef (1, 2, "alpha") +%!error <"alpha" must be a number> corrcoef (1,2, "alpha", "1") +%!error <"alpha" must be a number> corrcoef (1,2, "alpha", ones (2,2)) +%!error <"alpha" must be a number between 0 and 1> corrcoef (1,2, "alpha", -1) +%!error <"alpha" must be a number between 0 and 1> corrcoef (1,2, "alpha", 2) +%!error <"rows" must be "all"...> corrcoef (1,2, "rows", "foobar") +%!error corrcoef (1,2, "foobar", 1) diff --git a/install-conditionally/base/cov.m b/install-conditionally/base/cov.m new file mode 100644 index 0000000..0deeb0b --- /dev/null +++ b/install-conditionally/base/cov.m @@ -0,0 +1,565 @@ +## Copyright (C) 1995-2017 Kurt Hornik +## Copyright (C) 2017 Nicholas R. Jankowski +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} cov (@var{x}) +## @deftypefnx {} {} cov (@var{x}, @var{opt}) +## @deftypefnx {} {} cov (@var{x}, @var{y}) +## @deftypefnx {} {} cov (@var{x}, @var{y}, @var{opt}) +## @deftypefnx {} {} cov (@var{x}, @var{y}, @var{opt}, @var{NaN-option}) +## Compute the covariance matrix. +## +## If each row of @var{x} and @var{y} is an observation, and each column is +## a variable, then the @w{(@var{i}, @var{j})-th} entry of +## @code{cov (@var{x}, @var{y})} is the covariance between the @var{i}-th +## variable in @var{x} and the @var{j}-th variable in @var{y}. +## @tex +## $$ +## \sigma_{ij} = {1 \over N-1} \sum_{i=1}^N (x_i - \bar{x})(y_i - \bar{y}) +## $$ +## where $\bar{x}$ and $\bar{y}$ are the mean values of @var{x} and @var{y}. +## @end tex +## @ifnottex +## +## @example +## cov (@var{x}) = 1/(N-1) * SUM_i (@var{x}(i) - mean(@var{x})) * (@var{y}(i) - mean(@var{y})) +## @end example +## +## where @math{N} is the length of the @var{x} and @var{y} vectors. +## +## @end ifnottex +## +## If called with one argument, compute @code{cov (@var{x}, @var{x})}, the +## covariance between the columns of @var{x}. +## +## If called with two vector arguments, compute +## @code{cov (@var{x}, @var{y})}, the covariance between two random variables +## @var{x} and @var{y}. The output will be the 2 by 2 covariance matrix. +## +## If called with two matrix arguments, the matrices are treated as vectors and +## covariance is computed as @code{cov (@var{x}(:), @var{y}(:))}. The output +## will be the 2 by 2 covariance matrix. +## +## The optional argument @var{opt} determines the type of normalization to use. +## Valid values are +## +## @table @asis +## @item 0: +## normalize with @math{N-1}, provides the best unbiased estimator of the +## covariance [default] +## +## @item 1: +## normalize with @math{N}, this provides the second moment around the mean +## @end table +## +## The optional argument @var{NaN-option} controls how @code{cov} deals with NaN +## values in the data. The three valid values are +## +## @table @asis +## @item includenan: +## leave NaN values in @var{x} and @var{y}. Output will follow the normal +## rules for handling NaN values in arithemtic operations [default] +## +## @item omitnans: +## rows containing NaN values are trimmed from both @var{x} and @var{y} prior +## to calculating the covariance. (A NaN in one variable will that row from +## both @var{x} and @var{y}.) +## +## @item partialnans: +## rows containing NaN values are ignored from both @var{x} and @var{y} +## independently when for each @var{i}-th and @var{j}-th covariance +## calculation. This may result in a different number of observations, +## @math{N}, being used to calculated each element of the covariance matrix. +## @end table +## +## Compatibility Note: This version of @code{cov} attempts to maintain full +## compatibility with @sc{matlab}'s cov function by treating @var{x} and +## @var{y} as two univariate distributions regardless of shape, resulting in +## a 2x2 output matrix. Previous versions of cov in Octave treated rows +## of @var{x} and @var{y} as multivariate random variables. Code relying on +## Octave's previous definition will need to be changed when running this newer +## version of @code{cov}. +## @seealso{corr} +## @end deftypefn + +## Author: KH +## Author: Nicholas Jankowski +## Description: Compute covariances + +function c = cov (x, varargin) + + %%input sorting + + switch nargin + case 1 + [y, opt, handlenan] = deal ({"no_y", 0, "includenan"}{:}); + + case 4 + [y, opt, handlenan] = deal (varargin{:}); + + case {2,3} + [y, opt, handlenan] = deal ({"no_y", 0, "includenan"}{:}); + + for vararg_idx = 1 : (nargin-1) + v = varargin{vararg_idx}; + if ischar (v) + if (vararg_idx == 1 && nargin == 3) + error ('cov: NaN handling string must be the last input'); + else + handlenan = v; + + endif + + else + if (isscalar(v) && (v == 0 || v == 1)) + opt = v; + + elseif (vararg_idx ~= 2) + y = v; + + else + print_usage(); + + endif + endif + + endfor + + otherwise + print_usage(); + + endswitch + + ## check sorted X + if ~((isnumeric (x) || islogical (x)) && (ndims (x) == 2)) + error ("cov: X must be a numeric 2-D matrix or vector"); + endif + + ##vector x needs to be column for calulations, flip before any nantrimming + if (isrow (x)) + x = x'; + endif + + if ~(strcmp (y, "no_y")) + ## check sorted Y assuming one is given + if ~((isnumeric (y) || islogical (y)) && (ndims (y) == 2)) + error ("cov: Y must be a numeric 2-D matrix or vector"); + endif + + if (numel (x) ~= numel (y)) + error ("cov: X and Y must have the same number of elements"); + endif + endif + + ## check sorted opt + if (opt ~= 0 && opt ~= 1) + error ("cov: normalization factor OPT must be either 0 or 1"); + endif + + ## check sorted NaN handling switch, correct for spelling, adjust x and y + switch handlenan + case {"includenan"} + ## okay, do nothing + case {"omitrows", "omitrow"} + handlenan = "omitrows"; + if (strcmp (y, "no_y")) + #trim out rows with nans from x + x = x(~any (isnan (x), 2), :); + else + nan_locs = any (isnan ([x(:), y(:)]), 2); + x = x(~nan_locs); + y = y(~nan_locs); + + endif + + case {"partialrows", "partialrow"} + handlenan = "partialrows"; + if ~(strcmp (y, "no_y")) + ##no need to handle anything differently for single input + x_nan_locs = any (isnan (x(:)), 2); + y_nan_locs = any (isnan (y(:)), 2); + both_nan_locs = any (isnan ([x(:), y(:)]), 2); + + x_xytrim = x(~both_nan_locs); + y_xytrim = y(~both_nan_locs); + + x = x(~x_nan_locs); + y = y(~y_nan_locs); + + endif + + otherwise + error (["cov: unknown NaN handling parameter, '", handlenan, "'"]); + endswitch + + ## opt being single shouldn't affect output + if (isa (opt, "single")) + opt = double (opt); + endif + + ## end input sorting/checking + + ## Primary handling difference is whether there are one or two inputs. + if (strcmp (y, "no_y")) + + ## Special case, scalar has zero covariance + if (isscalar (x)) + if isnan (x) + c = NaN; + else + c = 0; + endif + + if (isa (x, "single")) + c = single (c); + endif + + return; + + elseif (isempty (x)) %not scalar x, check if empty + sx = size (x); + + if all (sx == 0) + c = NaN; + elseif (sx(1) > 0) + c = []; + else + c = NaN (sx(2)); + endif + + if (isa (x, "single")) + c = single (c); + endif + + return; + + else %not scalar x, not empty, no y, generate covariance matrix + + if strcmp (handlenan, 'partialrows') + ## if 'partialrows', need to calc each element separately with 'omitrows' + ## c(i,j) is cov(x(:,i),x(:,j),'omitrows) + ## TODO: find more efficient method. maybe can flatten recursion + + szx = size (x); + + for rw = 1:szx(1) + for cl = 1:szx(2) + c(rw,cl) = (cov (x(:,rw), x(:,cl), 'omitrows'))(2); + endfor + endfor + return + + else + ## if some elements are NaN, they propagate through calc as needed + + n = rows (x); + x = center (x, 1); + + if n == 1 + c = x' * x; %% to handle case of omitrows trimming to 1 row + else + c = x' * x / (n - 1 + opt); %will preserve single type + endif + + return; + endif + + endif + + else %there is a y + + if (isscalar (x)) + if (isnan (x) || isnan (y)) + c = NaN (2, 2); + if ~isnan (x) + c(1,1) = 0; + elseif ~isnan (y) + c(2,2) = 0; + endif + + if (isa (x, "single") || isa (y, "single") ) + c = single (c); + endif + + return; + + else %scalar but neither a nan... both should be numbers... + + if (isa (x, "single") || isa (y, "single") ) + c = single ([0 0; 0 0]); + else + c = [0 0; 0 0]; + endif + + return; + + endif + + else % matrix or vector handled the same way, generate 2x2 covariance matrix + + if (isempty (x) || isempty (y)) + if (isa (x, "single") || isa (y, "single") ) + c = single (NaN (2, 2)); + else + c = NaN (2, 2); + endif + + return; + endif + + if (~strcmp (handlenan, 'partialrows')) + + denom = numel(x) - 1 + opt; + x = center (x(:), 1); + y = center (y(:), 1); + + c1 = x' * x; + c23 = x' * y; + c4 = y' * y; + + c = [c1, c23; c23, c4] ./ denom; + + return; + + else + ## 'partialrows': handle each element separatley + + denom_xy = numel (x_xytrim) - 1 + opt; + x_xytrim = center (x_xytrim(:), 1); + y_xytrim = center (y_xytrim(:), 1); + c23 = (x_xytrim' * y_xytrim) ./ denom_xy; + + denom_x = numel(x) - 1 + opt; + x = center (x(:), 1); + c1 = (x' * x) ./ denom_x; + + denom_y = numel(y) - 1 + opt; + y = center (y(:), 1); + c4 = (y' * y) ./ denom_y; + + c = [c1, c23; c23, c4]; + + return; + + endif + + endif + + endif + +endfunction + + +%!test +%! x = rand (10); +%! cx1 = cov (x); +%! cx2 = cov (x, x); +%! assert (size (cx1) == [10, 10] && size (cx2) == [2, 2]); +%! assert (cx2 - cx2(1), zeros (2, 2), eps); + +%!test +%! x = [1:3]'; +%! y = [3:-1:1]'; +%! assert (cov (x, y), [1 -1; -1 1]); +%! assert (cov (x, flipud (y)), ones (2, 2)); +%! assert (cov ([x, y]), [1 -1; -1 1]); + +%!test +%! x = single ([1:3]'); +%! y = single ([3:-1:1]'); +%! assert (cov (x, y), single ([1 -1; -1 1])); +%! assert (cov (x, flipud (y)), single (ones (2, 2))); +%! assert (cov ([x, y]), single ([1 -1; -1 1])); + +%!test +%! x = [0 2 4]; +%! y = [3 2 1]; +%! z = [4 -2; -2 1]; +%! assert (cov (x, y), z); +%! assert (cov (single (x), y), single (z)); +%! assert (cov (x, single (y)), single (z)); +%! assert (cov (single (x), single (y)), single (z)); + +%!test +%! x = [1:5]; +%! c = cov (x); +%! assert (c, 2.5); + +%!test +%! x = [1:5]; +%! c = cov (x, 0); +%! assert (c, 2.5); +%! c = cov (x, 1); +%! assert (c, 2); +%! c = cov (x, single (1)); +%! assert (c, double (2)); + +%!test +%! x = [5 0 3 7; 1 -5 7 3; 4 9 8 10]; +%! b = [13/3 53/6 -3 17/3; 53/6 151/3 6.5 145/6; -3 6.5 7 1; 17/3 145/6 1 37/3]; +%! assert (cov (x), b, 50*eps); + +%!test +%! x = [3 6 4]; +%! y = [7 12 -9]; +%! assert (cov (x, y), [7, 20.5; 20.5, 361]./3, 50*eps); + +%!test +%! x = [2 0 -9; 3 4 1]; +%! y = [5 2 6; -4 4 9]; +%! assert (cov (x, y), [66.5, -20.8; -20.8, 58.4]./3, 50*eps); + +%!test +%! x = [1 3 -7; 3 9 2; -5 4 6]; +%! assert (cov (x, 1), [104 46 -92; 46 62 47; -92 47 266]./9, 50*eps); + +%!test +%! x = [1 0; 1 0]; +%! y = [1 2; 1 1]; +%! assert (cov (x, y), [1/3 -1/6; -1/6 0.25], 50*eps); +%! assert (cov (x, y(:)), [1/3 -1/6; -1/6 0.25], 50*eps); +%! assert (cov (x, y(:)'), [1/3 -1/6; -1/6 0.25], 50*eps); +%! assert (cov (x', y(:)), [1/3 1/6; 1/6 0.25], 50*eps); +%! assert (cov (x(:), y), [1/3 -1/6; -1/6 0.25], 50*eps); +%! assert (cov (x(:)', y), [1/3 -1/6; -1/6 0.25], 50*eps); + +%!assert (cov (5), 0) +%!assert (cov (single (5)), single (0)) +%!assert (cov (1, 3), zeros (2, 2)) +%!assert (cov (5, 0), 0) +%!assert (cov (5, 1), 0) +%!assert (cov (5, 2), zeros (2, 2)) +%!assert (cov (5, 99), zeros (2, 2)) +%!assert (cov (logical(0), logical(0)), double(0)) +%!assert (cov (0, logical(0)), double(0)) +%!assert (cov (logical(0), 0), double(0)) +%!assert (cov (logical([0 1; 1 0]), logical([0 1; 1 0])), double ([1 1;1 1]./3)) + +## Test empty and NaN handling (bug #48690) +## TODO: verify compatibily for matlab > 2016b +!assert (cov ([]), NaN) +%!assert (cov (single ([])), single (NaN)) +%!assert (cov ([], []), NaN (2, 2)) +%!assert (cov (single ([]), single([])), single (NaN (2, 2))) +%!assert (cov ([], single ([])), single (NaN (2, 2))) +%!assert (cov (single ([]), []), single (NaN (2, 2))) +%!assert (cov (ones(2, 0)), []) +%!assert (cov (ones(0, 2)), NaN (2, 2)) +%!assert (cov (ones(0, 6)), NaN (6, 6)) +%!assert (cov (ones(2, 0), []), NaN (2, 2)) +%!assert (cov (NaN), NaN) +%!assert (cov (NaN, NaN), NaN (2, 2)) +%!assert (cov (5, NaN), [0, NaN; NaN, NaN]) +%!assert (cov (NaN, 5), [NaN, NaN; NaN, 0]) +%!assert (cov (single (NaN)), single (NaN)) +%!assert (cov (NaN (2, 2)), NaN (2, 2)) +%!assert (cov (single (NaN (2, 2))), single (NaN (2, 2))) +%!assert (cov (NaN(2, 9)), NaN(9, 9)) +%!assert (cov (NaN(9, 2)), NaN(2, 2)) +%!assert (cov ([NaN, 1, 2, NaN]), NaN) +%!assert (cov ([1, NaN, 2, NaN]), NaN) + +## Test nan handling parameter, 1 input +%!test +%! x = [1 3 -7; NaN 9 NaN; -5 4 6]; +%! y1 = [NaN NaN NaN;NaN 31/3 NaN;NaN NaN NaN]; +%! y2 = [28 NaN -15;NaN NaN NaN;-15 NaN 103/3]; +%! y3 = [18 -3 -39; -3 0.5 6.5; -39 6.5 84.5]; +%! assert (cov (x), y1, 50*eps); +%! assert (cov (x'), y2, 50*eps); +%! assert (cov (x, 'includenan'), y1, 50*eps); +%! assert (cov (x', 'includenan'), y2, 50*eps); +%! assert (cov (x, 'omitrows'), y3, 50*eps); +%! assert (cov (x', 'omitrows'), zeros(3, 3), 50*eps); +%! y3(2,2) = 31/3; +%! assert (cov (x, 'partialrows'), y3, 50*eps); +%! y2(isnan (y2)) = 0; +%! assert (cov (x', 'partialrows'), y2, 50*eps); + +## Test nan handling parameter, 2 inputs +%!test +%! x = magic (3); +%! x(1) = NaN; +%! y = magic (3)'; +%! assert (cov (x, y), [NaN, NaN; NaN, 7.5]); +%! assert (cov (x', y), [NaN, NaN; NaN, 7.5]); +%! assert (cov (x, y'), [NaN, NaN; NaN, 7.5]); +%! assert (cov (x', y'), [NaN, NaN; NaN, 7.5]); +%! assert (cov (x, y, 'omitrows'), [57/8 303/56; 303/56 57/8]); +%! assert (cov (x', y, 'omitrows'), [57/8 57/8; 57/8 57/8]); +%! assert (cov (x, y', 'omitrows'), [57/8 57/8; 57/8 57/8]); +%! assert (cov (x', y', 'omitrows'), [57/8 303/56; 303/56 57/8]); +%! assert (cov (x, y, 'partialrows'), [57/8 303/56; 303/56 7.5]); +%! assert (cov (x', y, 'partialrows'), [57/8 57/8; 57/8 7.5]); +%! assert (cov (x, y', 'partialrows'), [57/8 57/8; 57/8 7.5]); +%! assert (cov (x', y', 'partialrows'), [57/8 303/56; 303/56 7.5]); +%! assert (cov (y, x), [7.5, NaN; NaN, NaN]); +%! assert (cov (y', x), [7.5, NaN; NaN, NaN]); +%! assert (cov (y, x'), [7.5, NaN; NaN, NaN]); +%! assert (cov (y', x'), [7.5, NaN; NaN, NaN]); +%! assert (cov (y, x, 'omitrows'), [57/8 303/56; 303/56 57/8]); +%! assert (cov (y', x, 'omitrows'), [57/8 57/8; 57/8 57/8]); +%! assert (cov (y, x', 'omitrows'), [57/8 57/8; 57/8 57/8]); +%! assert (cov (y', x', 'omitrows'), [57/8 303/56; 303/56 57/8]); +%! assert (cov (y, x, 'partialrows'), [7.5 303/56; 303/56 57/8]); +%! assert (cov (y', x, 'partialrows'), [7.5 57/8; 57/8 57/8]); +%! assert (cov (y, x', 'partialrows'), [7.5 57/8; 57/8 57/8]); +%! assert (cov (y', x', 'partialrows'), [7.5 303/56; 303/56 57/8]); + +## Test nan handling parameter, 2 inputs, vectors +%!test +%! x = [1:5]; +%! y = [10:-2:2]; +%! assert (cov (x, y), [2.5 -5; -5 10]); +%! assert (cov ([x NaN], [y 1]), [NaN NaN; NaN, 73/6], 50*eps); +%! assert (cov ([x NaN], [y 1], 'omitrows'), [2.5 -5; -5 10],50*eps); +%! assert (cov ([x NaN], [y 1], 'partialrows'), [2.5 -5; -5 73/6],50*eps); +%! assert (cov ([x 1], [y NaN]), [8/3 NaN; NaN, NaN],50*eps); +%! assert (cov ([x 1], [y NaN], 'omitrows'), [2.5 -5; -5 10],50*eps); +%! assert (cov ([x 1], [y NaN], 'partialrows'), [8/3 -5; -5 10],50*eps); +%! assert (cov ([NaN x], [y NaN], 'omitrows'), [5/3 -10/3; -10/3 20/3],50*eps); +%! assert (cov ([NaN x], [y NaN], 'partialrows'), [2.5 -10/3; -10/3 10],50*eps); + +## Test nan handling parameter, one matrix trimmed to vector +%!test +%! x = magic(3); +%! y = magic (3) - 2; +%! assert (cov (x, y), [7.5 7.5; 7.5 7.5]); +%! x(3:4) = NaN; +%! assert (cov (x), [NaN, NaN, NaN; NaN, NaN, NaN; NaN, NaN, 7]); +%! assert (cov (x, 'omitrows'), zeros (3, 3)); +%! assert (cov (x, 'partialrows'), [12.5 0 -2.5; 0 8 -10; -2.5 -10 7]); +%! assert (cov (x, y), [NaN, NaN; NaN, 7.5]); +%! assert (cov (x, y, 'omitrows'), [46/7, 46/7; 46/7, 46/7], 50*eps); +%! assert (cov (x, y, 'partialrows'), [46/7, 46/7; 46/7, 7.5], 50*eps); +%! assert (cov (y, x), [7.5, NaN; NaN, NaN]); +%! assert (cov (y, x, 'omitrows'), [46/7, 46/7; 46/7, 46/7], 50*eps); +%! assert (cov (y, x, 'partialrows'), [7.5, 46/7; 46/7, 46/7], 50*eps); + +## Test input validation +%!error cov () +%!error cov (1, 2, 3, 4) +%!error cov (5,[1 2]) +%!error cov ([1; 2], ["A", "B"]) +%!error cov (ones (2, 2, 2)) +%!error cov (ones (2, 2), ones (2, 2, 2)) +%!error cov (ones (2, 2), ones (3, 2)) +%!error cov (1, []) +%!error cov (ones (1, 0, 2)) +%!error cov ([1, 2],ones(1, 0, 2)) +%!error cov (1, 2, []) +%!error cov (1, 2, 1, []) diff --git a/install-conditionally/base/crosstab.m b/install-conditionally/base/crosstab.m new file mode 100644 index 0000000..9e9faeb --- /dev/null +++ b/install-conditionally/base/crosstab.m @@ -0,0 +1,131 @@ +## Copyright (C) 2021 Stefano Guidoni +## Copyright (C) 2018 John Donoghue +## Copyright (C) 1995-2017 Kurt Hornik +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {@var{t} =} crosstab (@var{x1}, @var{x2}) +## @deftypefnx {Function File} @ +## {@var{t} =} crosstab (@var{x1}, ..., @var{xn}) +## @deftypefnx {Function File} @ +## {[@var{t}, @var{chi-2}, @var{p}, @var{labels}] =} crosstab (...) +## Create a cross-tabulation (contingency table) @var{t} from data vectors. +## +## The inputs @var{x1}, @var{x2}, ... @var{xn} must be vectors of equal length +## with a data type of numeric, logical, char, or string (cell array). +## +## As additional return values @code{crosstab} returns the chi-square statistics +## @var{chi-2}, its p-value @var{p} and a cell array @var{labels}, containing +## the labels of each input argument. +## +## Currently @var{chi-2} and @var{p} are available only for 1 or 2-dimensional +## @var{t}, with @code{crosstab} returning a NaN value for both @var{chi-2} and +## @var{p} for 3-dimensional, or more, @var{t}. +## @end deftypefn +## +## @seealso{grp2idx,tabulate} + +function [t, chi2, p, labels] = crosstab (varargin) + + ## check input + if (nargin < 2) + print_usage (); + endif + + v_length = length (varargin{1}); + + for i = 1 : nargin + if ((! isvector (varargin{i})) || (v_length != length (varargin{i}))) + error ("crosstab: x1, x2 ... xn must be vectors of the same length"); + endif + endfor + + + ## main - begin + v_reshape = []; # vector of the dimensions of t + X = []; # matrix of the indexed input values + labels = {}; # cell array of labels + + for k = 1 : nargin + [g, gn] = grp2idx (varargin{k}); + + X = [X, g]; + for h = 1 : length (gn) + labels{h, k} = gn{h, 1}; + endfor + v_reshape(k) = length (unique (varargin{k})); + endfor + + v = unique (X(:, nargin)); + t = []; + + ## core logic, this employs a recursive function "crosstab_recursive" + ## given (x1, x2, x3, ... xn) as inputs + ## t(i,j,k,...) = sum (x1(:) == v1(i) & x2(:) == v2(j) & ...) + for i = 1 : length (v) + t = [t, (crosstab_recursive (nargin - 1,... + (X(:, nargin) == v(i) | isnan (v(i)) * isnan (X(:, nargin)))))]; + endfor + + t = reshape(t, v_reshape); # build the nargin-dimensional matrix + + ## additional statistics + if (length (v_reshape) == 2) + [p, chi2] = chisquare_test_independence(t); + elseif (length (v_reshape) > 2) + ## FIXME! + ## chisquare_test_independence works with 2D matrices only + warning ("crosstab: chi-square test only available for 2D results"); + p = NaN; # placeholder + chi2 = NaN; # placeholder + endif + ## main - end + + + ## function: crosstab_recursive + ## while there are input vectors, let's do iterations over them + function t_partial = crosstab_recursive (x_idx, t_parent) + y = X(:, x_idx); + w = unique (y); + + t_partial = []; + if (x_idx == 1) + ## we have reached the last vector, + ## let the computation begin + for j = 1 : length (w) + t_partial = [t_partial, ... + sum(t_parent & (y == w(j) | isnan (w(j)) * isnan (y)));]; + endfor + else + ## if there are more vectors, + ## just add data and pass it through to the next iteration + for j = 1 : length (w) + t_partial = [t_partial, ... + (crosstab_recursive (x_idx - 1, ... + (t_parent & (y == w(j) | isnan (w(j)) * isnan (y)))))]; + endfor + endif + endfunction +endfunction + + +## Test input validation +%!error crosstab () +%!error crosstab (1) +%!error crosstab (ones (2), [1 1]) +%!error crosstab ([1 1], ones (2)) +%!error crosstab ([1], [1 1]) +%!error crosstab ([1 1], [1]) diff --git a/install-conditionally/base/gls.m b/install-conditionally/base/gls.m new file mode 100644 index 0000000..c9ff60b --- /dev/null +++ b/install-conditionally/base/gls.m @@ -0,0 +1,144 @@ +## Copyright (C) 1996-2017 John W. Eaton +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {[@var{beta}, @var{v}, @var{r}] =} gls (@var{y}, @var{x}, @var{o}) +## Generalized least squares model. +## +## Perform a generalized least squares estimation for the multivariate model +## @tex +## $y = x b + e$ +## with $\bar{e} = 0$ and cov(vec($e$)) = $(s^2)o$, +## @end tex +## @ifnottex +## @w{@math{y = x*b + e}} with @math{mean (e) = 0} and +## @math{cov (vec (e)) = (s^2) o}, +## @end ifnottex +## where +## @tex +## $y$ is a $t \times p$ matrix, $x$ is a $t \times k$ matrix, $b$ is a $k +## \times p$ matrix, $e$ is a $t \times p$ matrix, and $o$ is a $tp \times +## tp$ matrix. +## @end tex +## @ifnottex +## @math{y} is a @math{t} by @math{p} matrix, @math{x} is a @math{t} by +## @math{k} matrix, @math{b} is a @math{k} by @math{p} matrix, @math{e} +## is a @math{t} by @math{p} matrix, and @math{o} is a @math{t*p} by +## @math{t*p} matrix. +## @end ifnottex +## +## @noindent +## Each row of @var{y} and @var{x} is an observation and each column a +## variable. The return values @var{beta}, @var{v}, and @var{r} are +## defined as follows. +## +## @table @var +## @item beta +## The GLS estimator for @math{b}. +## +## @item v +## The GLS estimator for @math{s^2}. +## +## @item r +## The matrix of GLS residuals, @math{r = y - x*beta}. +## @end table +## @seealso{ols} +## @end deftypefn + +## Author: Teresa Twaroch +## Created: May 1993 +## Adapted-By: jwe + +function [beta, v, r] = gls (y, x, o) + + if (nargin != 3) + print_usage (); + endif + + if (! (isnumeric (x) && isnumeric (y) && isnumeric (o))) + error ("gls: X, Y, and O must be numeric matrices or vectors"); + endif + + if (ndims (x) != 2 || ndims (y) != 2 || ndims (o) != 2) + error ("gls: X, Y and O must be 2-D matrices or vectors"); + endif + + [rx, cx] = size (x); + [ry, cy] = size (y); + [ro, co] = size (o); + if (rx != ry) + error ("gls: number of rows of X and Y must be equal"); + endif + if (! issquare (o) || ro != ry*cy) + error ("gls: matrix O must be square matrix with rows = rows (Y) * cols (Y)"); + endif + + if (isinteger (x)) + x = double (x); + endif + if (isinteger (y)) + y = double (y); + endif + if (isinteger (o)) + o = double (o); + endif + + ## Start of algorithm + o ^= -1/2; + z = kron (eye (cy), x); + z = o * z; + y1 = o * reshape (y, ry*cy, 1); + u = z' * z; + r = rank (u); + + if (r == cx*cy) + b = inv (u) * z' * y1; + else + b = pinv (z) * y1; + endif + + beta = reshape (b, cx, cy); + + if (isargout (2) || isargout (3)) + r = y - x * beta; + if (isargout (2)) + v = (reshape (r, ry*cy, 1))' * (o^2) * reshape (r, ry*cy, 1) / (rx*cy - r); + endif + endif + +endfunction + + +%!test +%! x = [1:5]'; +%! y = 3*x + 2; +%! x = [x, ones(5,1)]; +%! o = diag (ones (5,1)); +%! assert (gls (y,x,o), [3; 2], 50*eps); + +## Test input validation +%!error gls () +%!error gls (1) +%!error gls (1, 2) +%!error gls (1, 2, 3, 4) +%!error gls ([true, true], [1, 2], ones (2)) +%!error gls ([1, 2], [true, true], ones (2)) +%!error gls ([1, 2], [1, 2], true (2)) +%!error gls (ones (2,2,2), ones (2,2), ones (4,4)) +%!error gls (ones (2,2), ones (2,2,2), ones (4,4)) +%!error gls (ones (2,2), ones (2,2), ones (4,4,4)) +%!error gls (ones (1,2), ones (2,2), ones (2,2)) +%!error gls (ones (2,2), ones (2,2), ones (2,2)) diff --git a/install-conditionally/base/histc.m b/install-conditionally/base/histc.m new file mode 100644 index 0000000..4b1ccc8 --- /dev/null +++ b/install-conditionally/base/histc.m @@ -0,0 +1,188 @@ +## Copyright (C) 2009-2017 Søren Hauberg +## Copyright (C) 2009 VZLU Prague +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {@var{n} =} histc (@var{x}, @var{edges}) +## @deftypefnx {} {@var{n} =} histc (@var{x}, @var{edges}, @var{dim}) +## @deftypefnx {} {[@var{n}, @var{idx}] =} histc (@dots{}) +## Compute histogram counts. +## +## When @var{x} is a vector, the function counts the number of elements of +## @var{x} that fall in the histogram bins defined by @var{edges}. This +## must be a vector of monotonically increasing values that define the edges +## of the histogram bins. +## @tex +## $n(k)$ +## @end tex +## @ifnottex +## @code{@var{n}(k)} +## @end ifnottex +## contains the number of elements in @var{x} for which +## @tex +## $@var{edges}(k) <= @var{x} < @var{edges}(k+1)$. +## @end tex +## @ifnottex +## @code{@var{edges}(k) <= @var{x} < @var{edges}(k+1)}. +## @end ifnottex +## The final element of @var{n} contains the number of elements of @var{x} +## exactly equal to the last element of @var{edges}. +## +## When @var{x} is an @math{N}-dimensional array, the computation is carried +## out along dimension @var{dim}. If not specified @var{dim} defaults to the +## first non-singleton dimension. +## +## When a second output argument is requested an index matrix is also returned. +## The @var{idx} matrix has the same size as @var{x}. Each element of +## @var{idx} contains the index of the histogram bin in which the +## corresponding element of @var{x} was counted. +## @seealso{hist} +## @end deftypefn + +function [n, idx] = histc (x, edges, dim) + + if (nargin < 2 || nargin > 3) + print_usage (); + endif + + if (! isreal (x)) + error ("histc: X argument must be real-valued, not complex"); + endif + + num_edges = numel (edges); + if (num_edges == 0) + warning ("histc: empty EDGES specified\n"); + n = idx = []; + return; + endif + + if (! isreal (edges)) + error ("histc: EDGES must be real-valued, not complex"); + else + ## Make sure 'edges' is sorted + edges = edges(:); + if (! issorted (edges) || edges(1) > edges(end)) + warning ("histc: edge values not sorted on input"); + edges = sort (edges); + endif + endif + + nd = ndims (x); + sz = size (x); + if (nargin < 3) + ## Find the first non-singleton dimension. + (dim = find (sz > 1, 1)) || (dim = 1); + else + if (!(isscalar (dim) && dim == fix (dim)) + || !(1 <= dim && dim <= nd)) + error ("histc: DIM must be an integer and a valid dimension"); + endif + endif + + nsz = sz; + nsz(dim) = num_edges; + + ## the splitting point is 3 bins + + if (num_edges <= 3) + + ## This is the O(M*N) algorithm. + + ## Allocate the histogram + n = zeros (nsz); + + ## Allocate 'idx' + if (nargout > 1) + idx = zeros (sz); + endif + + ## Prepare indices + idx1 = cell (1, dim-1); + for k = 1:length (idx1) + idx1{k} = 1:sz(k); + endfor + idx2 = cell (length (sz) - dim); + for k = 1:length (idx2) + idx2{k} = 1:sz(k+dim); + endfor + + ## Compute the histograms + for k = 1:num_edges-1 + b = (edges(k) <= x & x < edges(k+1)); + n(idx1{:}, k, idx2{:}) = sum (b, dim); + if (nargout > 1) + idx(b) = k; + endif + endfor + b = (x == edges(end)); + n(idx1{:}, num_edges, idx2{:}) = sum (b, dim); + if (nargout > 1) + idx(b) = num_edges; + endif + + else + + ## This is the O(M*log(N) + N) algorithm. + + ## Look-up indices. + idx = lookup (edges, x); + ## Zero invalid ones (including NaNs). x < edges(1) are already zero. + idx(! (x <= edges(end))) = 0; + + iidx = idx; + + ## In case of matrix input, we adjust the indices. + if (! isvector (x)) + nl = prod (sz(1:dim-1)); + nn = sz(dim); + nu = prod (sz(dim+1:end)); + if (nl != 1) + iidx = (iidx-1) * nl; + iidx += reshape (kron (ones (1, nn*nu), 1:nl), sz); + endif + if (nu != 1) + ne =length (edges); + iidx += reshape (kron (nl*ne*(0:nu-1), ones (1, nl*nn)), sz); + endif + endif + + ## Select valid elements. + iidx = iidx(idx != 0); + + ## Call accumarray to sum the indexed elements. + n = accumarray (iidx(:), 1, nsz); + + endif + +endfunction + + +%!test +%! x = linspace (0, 10, 1001); +%! n = histc (x, 0:10); +%! assert (n, [repmat(100, 1, 10), 1]); + +%!test +%! x = repmat (linspace (0, 10, 1001), [2, 1, 3]); +%! n = histc (x, 0:10, 2); +%! assert (n, repmat ([repmat(100, 1, 10), 1], [2, 1, 3])); + +%!error histc () +%!error histc (1) +%!error histc (1, 2, 3, 4) +%!error histc ([1:10 1+i], 2) +%!warning histc (1:10, []); +%!error histc (1, 1, 3) diff --git a/install-conditionally/base/iqr.m b/install-conditionally/base/iqr.m new file mode 100644 index 0000000..1089759 --- /dev/null +++ b/install-conditionally/base/iqr.m @@ -0,0 +1,98 @@ +## Copyright (C) 1995-2017 Kurt Hornik +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} iqr (@var{x}) +## @deftypefnx {} {} iqr (@var{x}, @var{dim}) +## Return the interquartile range, i.e., the difference between the upper +## and lower quartile of the input data. +## +## If @var{x} is a matrix, do the above for first non-singleton dimension of +## @var{x}. +## +## If the optional argument @var{dim} is given, operate along this dimension. +## +## As a measure of dispersion, the interquartile range is less affected by +## outliers than either @code{range} or @code{std}. +## @seealso{range, std} +## @end deftypefn + +## Author KH +## Description: Interquartile range + +function y = iqr (x, dim) + + if (nargin != 1 && nargin != 2) + print_usage (); + endif + + if (! (isnumeric (x) || islogical (x))) + error ("iqr: X must be a numeric vector or matrix"); + endif + + nd = ndims (x); + sz = size (x); + nel = numel (x); + if (nargin != 2) + ## Find the first non-singleton dimension. + (dim = find (sz > 1, 1)) || (dim = 1); + else + if (!(isscalar (dim) && dim == fix (dim)) + || !(1 <= dim && dim <= nd)) + error ("iqr: DIM must be an integer and a valid dimension"); + endif + endif + + ## This code is a bit heavy, but is needed until empirical_inv + ## can take a matrix, rather than just a vector argument. + n = sz(dim); + sz(dim) = 1; + if (isa (x, "single")) + y = zeros (sz, "single"); + else + y = zeros (sz); + endif + stride = prod (sz(1:dim-1)); + for i = 1 : nel / n; + offset = i; + offset2 = 0; + while (offset > stride) + offset -= stride; + offset2 += 1; + endwhile + offset += offset2 * stride * n; + rng = [0 : n-1] * stride + offset; + + y(i) = diff (empirical_inv ([1/4, 3/4], x(rng))); + endfor + +endfunction + + +%!assert (iqr (1:101), 50) +%!assert (iqr (single (1:101)), single (50)) + +## FIXME: iqr throws horrible error when running across a dimension that is 1. +%!test +%! x = [1:100]'; +%! assert (iqr (x, 1), 50); +%! assert (iqr (x', 2), 50); + +%!error iqr () +%!error iqr (1, 2, 3) +%!error iqr (1) +%!error iqr (['A'; 'B']) +%!error iqr (1:10, 3) diff --git a/install-conditionally/base/ismissing.m b/install-conditionally/base/ismissing.m new file mode 100644 index 0000000..4eed349 --- /dev/null +++ b/install-conditionally/base/ismissing.m @@ -0,0 +1,136 @@ +######################################################################## +## +## Copyright (C) 1995-2021 The Octave Project Developers +## +## See the file COPYRIGHT.md in the top-level directory of this +## distribution or . +## +## This file is part of Octave. +## +## Octave is free software: you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . +## +######################################################################## + +## -*- texinfo -*- +## @deftypefn {} {@var{TF} =} ismissing (@var{A}) +## @deftypefnx {} {@var{TF} =} ismissing (@var{A}, @var{indicator}) +## +## Find missing data in a matrix or a string array. +## +## Given an input vector, matrix or array of cell strings @var{A}, +## @code{ismissing} returns a logical vector or matrix @var{TF} with the same +## dimensions as @var{A}, where @code{true} values match missing values in the +## input data. +## +## The optional input @var{indicator} is an array of values, which represent +## missing values in the input data. The values which represent missing data by +## default depend on the data type of @var{A}: +## +## @itemize +## @item +## @code{NaN}: @code{single}, @code{double}. +## +## @item +## @code{' '} (white space): @code{char}. +## +## @item +## @code{@{''@}}: string cells. +## @end itemize +## +## @end deftypefn +## +## @seealso{all, any, isempty, isnan, rmmissing} + +function TF = ismissing (A, indicator) + + ## check "indicator" + if (nargin != 2) + indicator = []; + else + if (! isvector (indicator)) + error ("ismissing: invalid format for 'indicator'"); + endif + + if ((isnumeric (A) && ! isnumeric (indicator)) || + (ischar (A) && ! ischar (indicator)) || + (iscellstr (A) && ! (iscellstr (indicator) || ischar (indicator)))) + error ("ismissing: 'indicator' and 'A' must have the same data type"); + endif + + ## if A is an array of cell strings and indicator just a string, + ## convert indicator to a cell string with one element + if (iscellstr (A) && ischar (indicator) && ! iscellstr (indicator)) + tmpstr = indicator; + indicator = {}; + + indicator = {tmpstr}; + endif + endif + + ## main logic + if (iscellstr (A) && isempty (indicator)) + TF = false (1, length (A)); + ## remove all empty strings + for iter = 1 : length (A) + if (isempty (A{iter})) + TF(iter) = true; + endif + endfor + elseif (ismatrix (A) && isempty (indicator)) + if (isnumeric (A)) + ## numeric matrix: just remove the NaNs + TF = isnan (A); + elseif (ischar (A)) + ## char matrix: remove the white spaces + TF = isspace (A); + else + error ("ismissing: unsupported data type"); + endif + + TF = logical (TF); + elseif (! isempty (indicator)) + ## special cases with custom values for missing data + [r, c] = size (A); + TF = false (r, c); + + if (iscellstr (A)) + for iter = 1 : length (indicator) + TF(find (strcmp (A, indicator(iter)))) = true; + endfor + elseif (ismatrix (A)) + for iter = 1 : length (indicator) + TF(find (A == indicator(iter))) = true; + endfor + else + error ("ismissing: unsupported data format"); + endif + else + error ("ismissing: unsupported data format"); + endif +endfunction + + +%!assert (ismissing ([1,NaN,3]), [false,true,false]) +%!assert (ismissing ('abcd f'), [false,false,false,false,true,false]) +%!assert (ismissing ({'xxx','','xyz'}), [false,true,false]) +%!assert (ismissing ([1,2;NaN,2]), [false,false;true,false]) + +## Test input validation +%!error ismissing (); +%!error ismissing ({1, 2, 3}); +%!error ismissing ([1 2; 3 4], [5 1; 2 0]); +%!error ismissing ([1 2; 3 4], "abc"); +%!error ismissing ({"", "", ""}, 1); +%!error ismissing ({1, 2, 3}); diff --git a/install-conditionally/base/kendall.m b/install-conditionally/base/kendall.m new file mode 100644 index 0000000..f09eb0c --- /dev/null +++ b/install-conditionally/base/kendall.m @@ -0,0 +1,151 @@ +## Copyright (C) 1995-2017 Kurt Hornik +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} kendall (@var{x}) +## @deftypefnx {} {} kendall (@var{x}, @var{y}) +## @cindex Kendall's Tau +## Compute Kendall's +## @tex +## $\tau$. +## @end tex +## @ifnottex +## @var{tau}. +## @end ifnottex +## +## For two data vectors @var{x}, @var{y} of common length @math{N}, Kendall's +## @tex +## $\tau$ +## @end tex +## @ifnottex +## @var{tau} +## @end ifnottex +## is the correlation of the signs of all rank differences of +## @var{x} and @var{y}; i.e., if both @var{x} and @var{y} have distinct +## entries, then +## +## @tex +## $$ \tau = {1 \over N(N-1)} \sum_{i,j} {\rm sign}(q_i-q_j) \, {\rm sign}(r_i-r_j) $$ +## @end tex +## @ifnottex +## +## @example +## @group +## 1 +## @var{tau} = ------- SUM sign (@var{q}(i) - @var{q}(j)) * sign (@var{r}(i) - @var{r}(j)) +## N (N-1) i,j +## @end group +## @end example +## +## @end ifnottex +## @noindent +## in which the +## @tex +## $q_i$ and $r_i$ +## @end tex +## @ifnottex +## @var{q}(i) and @var{r}(i) +## @end ifnottex +## are the ranks of @var{x} and @var{y}, respectively. +## +## If @var{x} and @var{y} are drawn from independent distributions, +## Kendall's +## @tex +## $\tau$ +## @end tex +## @ifnottex +## @var{tau} +## @end ifnottex +## is asymptotically normal with mean 0 and variance +## @tex +## ${2 (2N+5) \over 9N(N-1)}$. +## @end tex +## @ifnottex +## @code{(2 * (2N+5)) / (9 * N * (N-1))}. +## @end ifnottex +## +## @code{kendall (@var{x})} is equivalent to @code{kendall (@var{x}, +## @var{x})}. +## @seealso{ranks, spearman} +## @end deftypefn + +## Author: KH +## Description: Kendall's rank correlation tau + +function tau = kendall (x, y = []) + + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + if ( ! (isnumeric (x) || islogical (x)) + || ! (isnumeric (y) || islogical (y))) + error ("kendall: X and Y must be numeric matrices or vectors"); + endif + + if (ndims (x) != 2 || ndims (y) != 2) + error ("kendall: X and Y must be 2-D matrices or vectors"); + endif + + if (isrow (x)) + x = x.'; + endif + [n, c] = size (x); + + if (nargin == 2) + if (isrow (y)) + y = y.'; + endif + if (rows (y) != n) + error ("kendall: X and Y must have the same number of observations"); + else + x = [x, y]; + endif + endif + + if (isa (x, "single") || isa (y, "single")) + cls = "single"; + else + cls = "double"; + endif + r = ranks (x); + m = sign (kron (r, ones (n, 1, cls)) - kron (ones (n, 1, cls), r)); + tau = corr (m); + + if (nargin == 2) + tau = tau(1 : c, (c + 1) : columns (x)); + endif + +endfunction + + +%!test +%! x = [1:2:10]; +%! y = [100:10:149]; +%! assert (kendall (x,y), 1, 5*eps); +%! assert (kendall (x,fliplr (y)), -1, 5*eps); + +%!assert (kendall (logical (1)), 1) +%!assert (kendall (single (1)), single (1)) + +## Test input validation +%!error kendall () +%!error kendall (1, 2, 3) +%!error kendall (['A'; 'B']) +%!error kendall (ones (2,1), ['A'; 'B']) +%!error kendall (ones (2,2,2)) +%!error kendall (ones (2,2), ones (2,2,2)) +%!error kendall (ones (2,2), ones (3,2)) diff --git a/install-conditionally/base/kurtosis.m b/install-conditionally/base/kurtosis.m new file mode 100644 index 0000000..81ee04a --- /dev/null +++ b/install-conditionally/base/kurtosis.m @@ -0,0 +1,168 @@ +## Copyright (C) 2013-2017 Julien Bect +## Copyright (C) 1996-2016 John W. Eaton +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} kurtosis (@var{x}) +## @deftypefnx {} {} kurtosis (@var{x}, @var{flag}) +## @deftypefnx {} {} kurtosis (@var{x}, @var{flag}, @var{dim}) +## Compute the sample kurtosis of the elements of @var{x}. +## +## The sample kurtosis is defined as +## @tex +## $$ +## \kappa_1 = {{{1\over N}\, +## \sum_{i=1}^N (x_i - \bar{x})^4} \over \sigma^4}, +## $$ +## where $N$ is the length of @var{x}, $\bar{x}$ its mean, and $\sigma$ +## its (uncorrected) standard deviation. +## @end tex +## @ifnottex +## +## @example +## @group +## mean ((@var{x} - mean (@var{x})).^4) +## k1 = ------------------------ +## std (@var{x}).^4 +## @end group +## @end example +## +## @end ifnottex +## +## @noindent +## The optional argument @var{flag} controls which normalization is used. +## If @var{flag} is equal to 1 (default value, used when @var{flag} is omitted +## or empty), return the sample kurtosis as defined above. If @var{flag} is +## equal to 0, return the @w{"bias-corrected"} kurtosis coefficient instead: +## @tex +## $$ +## \kappa_0 = 3 + {\scriptstyle N - 1 \over \scriptstyle (N - 2)(N - 3)} \, +## \left( (N + 1)\, \kappa_1 - 3 (N - 1) \right) +## $$ +## @end tex +## @ifnottex +## +## @example +## @group +## N - 1 +## k0 = 3 + -------------- * ((N + 1) * k1 - 3 * (N - 1)) +## (N - 2)(N - 3) +## @end group +## @end example +## +## where @math{N} is the length of the @var{x} vector. +## +## @end ifnottex +## The bias-corrected kurtosis coefficient is obtained by replacing the sample +## second and fourth central moments by their unbiased versions. It is an +## unbiased estimate of the population kurtosis for normal populations. +## +## If @var{x} is a matrix, or more generally a multi-dimensional array, return +## the kurtosis along the first non-singleton dimension. If the optional +## @var{dim} argument is given, operate along this dimension. +## +## @seealso{var, skewness, moment} +## @end deftypefn + +## Author: KH +## Created: 29 July 1994 +## Adapted-By: jwe + +function y = kurtosis (x, flag, dim) + + if (nargin < 1) || (nargin > 3) + print_usage (); + endif + + if (! (isnumeric (x) || islogical (x))) + error ("kurtosis: X must be a numeric vector or matrix"); + endif + + if (nargin < 2 || isempty (flag)) + flag = 1; # default: do not use the "bias corrected" version + else + if (! isscalar (flag) || (flag != 0 && flag != 1)) + error ("kurtosis: FLAG must be 0 or 1"); + endif + endif + + nd = ndims (x); + sz = size (x); + if (nargin < 3) + ## Find the first non-singleton dimension. + (dim = find (sz > 1, 1)) || (dim = 1); + else + if (! (isscalar (dim) && dim == fix (dim) && dim > 0)) + error ("kurtosis: DIM must be an integer and a valid dimension"); + endif + endif + + n = size (x, dim); + sz(dim) = 1; + + x = center (x, dim); # center also promotes integer, logical to double + v = var (x, 1, dim); # normalize with 1/N + y = sum (x .^ 4, dim); + idx = (v != 0); + y(idx) = y(idx) ./ (n * v(idx) .^ 2); + y(! idx) = NaN; + + ## Apply bias correction to the second and fourth central sample moment + if (flag == 0) + if (n > 3) + C = (n - 1) / ((n - 2) * (n - 3)); + y = 3 + C * ((n + 1) * y - 3 * (n - 1)); + else + y(:) = NaN; + endif + endif + +endfunction + + +%!test +%! x = [-1; 0; 0; 0; 1]; +%! y = [x, 2*x]; +%! assert (kurtosis (y), [2.5, 2.5], sqrt (eps)); + +%!assert (kurtosis ([-3, 0, 1]) == kurtosis ([-1, 0, 3])) +%!assert (kurtosis (ones (3, 5)), NaN (1, 5)) +%!assert (kurtosis (1, [], 3), NaN) + +%!assert (kurtosis ([1:5 10; 1:5 10], 0, 2), 5.4377317925288901 * [1; 1], 8 * eps) +%!assert (kurtosis ([1:5 10; 1:5 10], 1, 2), 2.9786509002956195 * [1; 1], 8 * eps) +%!assert (kurtosis ([1:5 10; 1:5 10], [], 2), 2.9786509002956195 * [1; 1], 8 * eps) + +## Test behavior on single input +%!assert (kurtosis (single ([1:5 10])), single (2.9786513), eps ("single")) +%!assert (kurtosis (single ([1 2]), 0), single (NaN)) + +## Verify no "divide-by-zero" warnings +%!test +%! warning ("on", "Octave:divide-by-zero", "local"); +%! lastwarn (""); # clear last warning +%! kurtosis (1); +%! assert (lastwarn (), ""); + +## Test input validation +%!error kurtosis () +%!error kurtosis (1, 2, 3) +%!error kurtosis (['A'; 'B']) +%!error kurtosis (1, 2) +%!error kurtosis (1, [1 0]) +%!error kurtosis (1, [], ones (2,2)) +%!error kurtosis (1, [], 1.5) +%!error kurtosis (1, [], 0) diff --git a/install-conditionally/base/logit.m b/install-conditionally/base/logit.m new file mode 100644 index 0000000..f512f49 --- /dev/null +++ b/install-conditionally/base/logit.m @@ -0,0 +1,59 @@ +## Copyright (C) 1995-2017 Kurt Hornik +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} logit (@var{p}) +## Compute the logit for each value of @var{p} +## +## The logit is defined as +## @tex +## $$ +## {\rm logit}(p) = \log\Big({p \over 1-p}\Big) +## $$ +## @end tex +## @ifnottex +## +## @example +## logit (@var{p}) = log (@var{p} / (1-@var{p})) +## @end example +## +## @end ifnottex +## @seealso{probit, logistic_cdf} +## @end deftypefn + +## Author: KH +## Description: Logit transformation + +function y = logit (p) + + if (nargin != 1) + print_usage (); + endif + + y = logistic_inv (p); + +endfunction + + +%!test +%! p = [0.01:0.01:0.99]; +%! assert (logit (p), log (p ./ (1-p)), 25*eps); + +%!assert (logit ([-1, 0, 0.5, 1, 2]), [NaN, -Inf, 0, +Inf, NaN]) + +## Test input validation +%!error logit () +%!error logit (1, 2) diff --git a/install-conditionally/base/lscov.m b/install-conditionally/base/lscov.m new file mode 100644 index 0000000..e2df3b2 --- /dev/null +++ b/install-conditionally/base/lscov.m @@ -0,0 +1,191 @@ +## Copyright (C) 2014-2017 Nir Krakauer +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {@var{x} =} lscov (@var{A}, @var{b}) +## @deftypefnx {} {@var{x} =} lscov (@var{A}, @var{b}, @var{V}) +## @deftypefnx {} {@var{x} =} lscov (@var{A}, @var{b}, @var{V}, @var{alg}) +## @deftypefnx {} {[@var{x}, @var{stdx}, @var{mse}, @var{S}] =} lscov (@dots{}) +## +## Compute a generalized linear least squares fit. +## +## Estimate @var{x} under the model @var{b} = @var{A}@var{x} + @var{w}, +## where the noise @var{w} is assumed to follow a normal distribution +## with covariance matrix @math{{\sigma^2} V}. +## +## If the size of the coefficient matrix @var{A} is n-by-p, the +## size of the vector/array of constant terms @var{b} must be n-by-k. +## +## The optional input argument @var{V} may be a n-by-1 vector of positive +## weights (inverse variances), or a n-by-n symmetric positive semidefinite +## matrix representing the covariance of @var{b}. If @var{V} is not +## supplied, the ordinary least squares solution is returned. +## +## The @var{alg} input argument, a guidance on solution method to use, is +## currently ignored. +## +## Besides the least-squares estimate matrix @var{x} (p-by-k), the function +## also returns @var{stdx} (p-by-k), the error standard deviation of +## estimated @var{x}; @var{mse} (k-by-1), the estimated data error covariance +## scale factors (@math{\sigma^2}); and @var{S} (p-by-p, or p-by-p-by-k if k +## > 1), the error covariance of @var{x}. +## +## Reference: @nospell{Golub and Van Loan} (1996), +## @cite{Matrix Computations (3rd Ed.)}, Johns Hopkins, Section 5.6.3 +## +## @seealso{ols, gls, lsqnonneg} +## @end deftypefn + +## Author: Nir Krakauer + +function [x, stdx, mse, S] = lscov (A, b, V = [], alg) + + if (nargin < 2 || (rows (A) != rows (b))) + print_usage (); + endif + + n = rows (A); + p = columns (A); + k = columns (b); + + if (! isempty (V)) + if (rows (V) != n || ! any (columns (V) == [1 n])) + error ("lscov: V should be a square matrix or a vector with the same number of rows as A"); + endif + + if (isvector (V)) + ## n-by-1 vector of inverse variances + v = diag (sqrt (V)); + A = v * A; + b = v * b; + else + ## n-by-n covariance matrix + try + ## ordinarily V will be positive definite + B = chol (V)'; + catch + ## if V is only positive semidefinite, use its + ## eigendecomposition to find a factor B such that V = B*B' + [B, lambda] = eig (V); + image_dims = (diag (lambda) > 0); + B = B(:, image_dims) * sqrt (lambda(image_dims, image_dims)); + end_try_catch + A = B \ A; + b = B \ b; + endif + endif + + pinv_A = pinv (A); #pseudoinverse + + x = pinv_A * b; + + if (isargout (3)) + dof = n - p; #degrees of freedom remaining after fit + SSE = sumsq (b - A * x); + mse = SSE / dof; + endif + + s = pinv_A * pinv_A'; + + stdx = sqrt (diag (s) * mse); + + if (isargout (4)) + if (k == 1) + S = mse * s; + else + S = NaN (p, p, k); + for i = 1:k + S(:, :, i) = mse(i) * s; + endfor + endif + endif + +endfunction + + +%!test <49040> +%! ## Longley data from the NIST Statistical Reference Dataset +%! Z = [ 60323 83.0 234289 2356 1590 107608 1947 +%! 61122 88.5 259426 2325 1456 108632 1948 +%! 60171 88.2 258054 3682 1616 109773 1949 +%! 61187 89.5 284599 3351 1650 110929 1950 +%! 63221 96.2 328975 2099 3099 112075 1951 +%! 63639 98.1 346999 1932 3594 113270 1952 +%! 64989 99.0 365385 1870 3547 115094 1953 +%! 63761 100.0 363112 3578 3350 116219 1954 +%! 66019 101.2 397469 2904 3048 117388 1955 +%! 67857 104.6 419180 2822 2857 118734 1956 +%! 68169 108.4 442769 2936 2798 120445 1957 +%! 66513 110.8 444546 4681 2637 121950 1958 +%! 68655 112.6 482704 3813 2552 123366 1959 +%! 69564 114.2 502601 3931 2514 125368 1960 +%! 69331 115.7 518173 4806 2572 127852 1961 +%! 70551 116.9 554894 4007 2827 130081 1962 ]; +%! ## Results certified by NIST using 500 digit arithmetic +%! ## b and standard error in b +%! V = [ -3482258.63459582 890420.383607373 +%! 15.0618722713733 84.9149257747669 +%! -0.358191792925910E-01 0.334910077722432E-01 +%! -2.02022980381683 0.488399681651699 +%! -1.03322686717359 0.214274163161675 +%! -0.511041056535807E-01 0.226073200069370 +%! 1829.15146461355 455.478499142212 ]; +%! rsd = 304.854073561965; +%! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)]; +%! alpha = 0.05; +%! [b, stdb, mse] = lscov (X, y); +%! assert(b, V(:,1), 3e-6); +%! assert(stdb, V(:,2), -1.e-5); +%! assert(sqrt (mse), rsd, -1E-6); + +%!test +%! ## Adapted from example in Matlab documentation +%! x1 = [.2 .5 .6 .8 1.0 1.1]'; +%! x2 = [.1 .3 .4 .9 1.1 1.4]'; +%! X = [ones(size(x1)) x1 x2]; +%! y = [.17 .26 .28 .23 .27 .34]'; +%! [b, se_b, mse, S] = lscov(X, y); +%! assert(b, [0.1203 0.3284 -0.1312]', 1E-4); +%! assert(se_b, [0.0643 0.2267 0.1488]', 1E-4); +%! assert(mse, 0.0015, 1E-4); +%! assert(S, [0.0041 -0.0130 0.0075; -0.0130 0.0514 -0.0328; 0.0075 -0.0328 0.0221], 1E-4); +%! w = [1 1 1 1 1 .1]'; +%! [bw, sew_b, msew] = lscov (X, y, w); +%! assert(bw, [0.1046 0.4614 -0.2621]', 1E-4); +%! assert(sew_b, [0.0309 0.1152 0.0814]', 1E-4); +%! assert(msew, 3.4741e-004, -1E-4); +%! V = .2*ones(length(x1)) + .8*diag(ones(size(x1))); +%! [bg, sew_b, mseg] = lscov (X, y, V); +%! assert(bg, [0.1203 0.3284 -0.1312]', 1E-4); +%! assert(sew_b, [0.0672 0.2267 0.1488]', 1E-4); +%! assert(mseg, 0.0019, 1E-4); +%! y2 = [y 2*y]; +%! [b2, se_b2, mse2, S2] = lscov (X, y2); +%! assert(b2, [b 2*b], 2*eps); +%! assert(se_b2, [se_b 2*se_b], eps); +%! assert(mse2, [mse 4*mse], eps); +%! assert(S2(:, :, 1), S, eps); +%! assert(S2(:, :, 2), 4*S, eps); + +%!test +%! ## Artificial example with positive semidefinite weight matrix +%! x = (0:0.2:2)'; +%! y = round(100*sin(x) + 200*cos(x)); +%! X = [ones(size(x)) sin(x) cos(x)]; +%! V = eye(numel(x)); +%! V(end, end-1) = V(end-1, end) = 1; +%! [b, seb, mseb, S] = lscov (X, y, V); +%! assert(b, [0 100 200]', 0.2); diff --git a/install-conditionally/base/mad.m b/install-conditionally/base/mad.m new file mode 100644 index 0000000..5f4295a --- /dev/null +++ b/install-conditionally/base/mad.m @@ -0,0 +1,107 @@ +## Copyright (C) 2017-2018 Rik Wehbring +## +## This file is part of Octave. +## +## Octave is free software: you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} mad (@var{x}) +## @deftypefnx {} {} mad (@var{x}, @var{opt}) +## @deftypefnx {} {} mad (@var{x}, @var{opt}, @var{dim}) +## Compute the mean or median absolute deviation of the elements of @var{x}. +## +## The mean absolute deviation is defined as +## +## @example +## @var{mad} = mean (abs (@var{x} - mean (@var{x}))) +## @end example +## +## The median absolute deviation is defined as +## +## @example +## @var{mad} = median (abs (@var{x} - median (@var{x}))) +## @end example +## +## If @var{x} is a matrix, compute @code{mad} for each column and return +## results in a row vector. For a multi-dimensional array, the calculation is +## done over the first non-singleton dimension. +## +## The optional argument @var{opt} determines whether mean or median absolute +## deviation is calculated. The default is 0 which corresponds to mean +## absolute deviation; A value of 1 corresponds to median absolute deviation. +## +## If the optional argument @var{dim} is given, operate along this dimension. +## +## As a measure of dispersion, @code{mad} is less affected by outliers than +## @code{std}. +## @seealso{bounds, range, iqr, std, mean, median} +## @end deftypefn + +function retval = mad (x, opt = 0, dim) + + if (nargin < 1 || nargin > 3) + print_usage (); + endif + + if (! (isnumeric (x) || islogical (x))) + error ("mad: X must be a numeric vector or matrix"); + endif + + if (isempty (opt)) + opt = 0; + elseif (! isscalar (opt) || (opt != 0 && opt != 1)) + error ("mad: OPT must be 0 or 1"); + endif + + sz = size (x); + if (nargin < 3) + ## Find the first non-singleton dimension. + (dim = find (sz > 1, 1)) || (dim = 1); + else + if (! (isscalar (dim) && dim == fix (dim) && dim > 0)) + error ("mad: DIM must be an integer and a valid dimension"); + endif + endif + + if (opt == 0) + fcn = @mean; + else + fcn = @median; + endif + + retval = fcn (abs (x - fcn (x, dim)), dim); + +endfunction + + +%!assert (mad ([0 0 1 2 100]), 31.76) +%!assert (mad (single ([0 0 1 2 100])), single (31.76)) +%!assert (mad ([0 0 1 2 100]'), 31.76) +%!assert (mad ([0 0 1 2 100], 1), 1) +%!assert (mad (single ([0 0 1 2 100]), 1), single (1)) +%!assert (mad ([0 0 1 2 100]', 1), 1) +%!assert (mad (magic (4)), [4, 4, 4, 4]) +%!assert (mad (magic (4), [], 2), [6; 2; 2; 6]) +%!assert (mad (magic (4), 1), [2.5, 3.5, 3.5, 2.5]) +%!assert (mad (magic (4), 1, 2), [5.5; 1.5; 1.5; 5.5]) + +## Test input validation +%!error mad () +%!error mad (1, 2, 3, 4) +%!error mad (['A'; 'B']) +%!error mad (1, 2) +%!error mad (1, [], ones (2,2)) +%!error mad (1, [], 1.5) +%!error mad (1, [], 0) diff --git a/install-conditionally/base/mean.m b/install-conditionally/base/mean.m new file mode 100644 index 0000000..369dd26 --- /dev/null +++ b/install-conditionally/base/mean.m @@ -0,0 +1,265 @@ +## Copyright (C) 1995-2017 Kurt Hornik +## +## This program is free software: you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation, either version 3 of the +## License, or (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; see the file COPYING. If not, see +## . + +## -*- texinfo -*- +## @deftypefn {} {} mean (@var{x}) +## @deftypefnx {} {} mean (@var{x}, @var{dim}) +## @deftypefnx {} {} mean (@var{x}, @var{opt}) +## @deftypefnx {} {} mean (@var{x}, @var{dim}, @var{opt}) +## Compute the mean of the elements of the vector @var{x}. +## +## The mean is defined as +## +## @tex +## $$ {\rm mean}(x) = \bar{x} = {1\over N} \sum_{i=1}^N x_i $$ +## where $N$ is the number of elements of @var{x}. +## +## @end tex +## @ifnottex +## +## @example +## mean (@var{x}) = SUM_i @var{x}(i) / N +## @end example +## +## where @math{N} is the length of the @var{x} vector. +## +## @end ifnottex +## If @var{x} is a matrix, compute the mean for each column and return them +## in a row vector. +## +## If the optional argument @var{dim} is given, operate along this dimension. +## +## The optional argument @var{opt} selects the type of mean to compute. +## The following options are recognized: +## +## @table @asis +## @item @qcode{"a"} +## Compute the (ordinary) arithmetic mean. [default] +## +## @item @qcode{"g"} +## Compute the geometric mean. +## +## @item @qcode{"h"} +## Compute the harmonic mean. +## @end table +## +## Both @var{dim} and @var{opt} are optional. If both are supplied, either +## may appear first. +## @seealso{median, mode} +## @end deftypefn + +## Author: KH +## Description: Compute arithmetic, geometric, and harmonic mean + +function retval = mean (x, opt1, opt2) + + if (nargin < 1 || nargin > 3) + print_usage (); + endif + + if (! (isnumeric (x) || islogical (x))) + error ("mean: X must be a numeric vector or matrix"); + endif + + need_dim = false; + + if (nargin == 1) + opt = "a"; + need_dim = true; + elseif (nargin == 2) + if (ischar (opt1)) + opt = opt1; + need_dim = true; + else + dim = opt1; + opt = "a"; + endif + elseif (nargin == 3) + if (ischar (opt1)) + opt = opt1; + dim = opt2; + elseif (ischar (opt2)) + opt = opt2; + dim = opt1; + else + error ("mean: OPT must be a string"); + endif + else + print_usage (); + endif + + nd = ndims (x); + sz = size (x); + if (need_dim) + ## Find the first non-singleton dimension. + (dim = find (sz > 1, 1)) || (dim = 1); + else + if (! (isscalar (dim) && dim == fix (dim) && dim > 0)) + error ("mean: DIM must be an integer and a valid dimension"); + endif + endif + + n = size (x, dim); + + if (isempty (x)) + %% codepath for Matlab compatibility. empty x produces NaN output, but + %% for ndim > 2, output depends on size of x and whether DIM is set. + if ((nd == 2) && (max (sz) < 2) && need_dim) + retval = NaN; + else + if (~need_dim) + sz(dim) = 1; + else + sz (find ((sz ~= 1), 1)) = 1; + endif + retval = NaN (sz); + endif + + if (isa (x, "single")) + retval = single (retval); + endif + + elseif (strcmp (opt, "a")) + retval = sum (x, dim) / n; + + elseif (strcmp (opt, "g")) + if (all (x(:) >= 0)) + retval = exp (sum (log (x), dim) ./ n); + else + error ("mean: X must not contain any negative values"); + endif + elseif (strcmp (opt, "h")) + retval = n ./ sum (1 ./ x, dim); + else + error ("mean: option '%s' not recognized", opt); + endif + +endfunction + + +%!test +%! x = -10:10; +%! y = x'; +%! z = [y, y+10]; +%! assert (mean (x), 0); +%! assert (mean (y), 0); +%! assert (mean (z), [0, 10]); + +## Test small numbers +%!assert (mean (repmat (0.1, 1, 1000), "g"), 0.1, 20*eps) + +%!assert (mean (magic (3), 1), [5, 5, 5]) +%!assert (mean (magic (3), 2), [5; 5; 5]) +%!assert (mean ([2 8], "g"), 4) +%!assert (mean ([4 4 2], "h"), 3) +%!assert (mean (logical ([1 0 1 1])), 0.75) +%!assert (mean (single ([1 0 1 1])), single (0.75)) +%!assert (mean ([1 2], 3), [1 2]) + +##tests for empty input Matlab compatibility (bug #48690) +%!assert (mean ([]), NaN) +%!assert (mean (single([])), single(NaN)) +%!assert (mean (ones (0, 0, 0, 0)), NaN (1, 0, 0, 0)) +%!assert (mean (ones (0, 0, 0, 1)), NaN (1, 0, 0, 1)) +%!assert (mean (ones (0, 0, 0, 2)), NaN (1, 0, 0, 2)) +%!assert (mean (ones (0, 0, 1, 0)), NaN (1, 0, 1, 0)) +%!assert (mean (ones (0, 0, 1, 1)), NaN (1, 1, 1, 1)) +%!assert (mean (ones (0, 0, 1, 2)), NaN (1, 0, 1, 2)) +%!assert (mean (ones (0, 0, 2, 0)), NaN (1, 0, 2, 0)) +%!assert (mean (ones (0, 0, 2, 1)), NaN (1, 0, 2, 1)) +%!assert (mean (ones (0, 0, 2, 2)), NaN (1, 0, 2, 2)) +%!assert (mean (ones (0, 1, 0, 0)), NaN (1, 1, 0, 0)) +%!assert (mean (ones (0, 1, 0, 1)), NaN (1, 1, 0, 1)) +%!assert (mean (ones (0, 1, 0, 2)), NaN (1, 1, 0, 2)) +%!assert (mean (ones (0, 1, 1, 0)), NaN (1, 1, 1, 0)) +%!assert (mean (ones (0, 1, 1, 1)), NaN (1, 1, 1, 1)) +%!assert (mean (ones (0, 1, 1, 2)), NaN (1, 1, 1, 2)) +%!assert (mean (ones (0, 1, 2, 0)), NaN (1, 1, 2, 0)) +%!assert (mean (ones (0, 1, 2, 1)), NaN (1, 1, 2, 1)) +%!assert (mean (ones (0, 1, 2, 2)), NaN (1, 1, 2, 2)) +%!assert (mean (ones (0, 2, 0, 0)), NaN (1, 2, 0, 0)) +%!assert (mean (ones (0, 2, 0, 1)), NaN (1, 2, 0, 1)) +%!assert (mean (ones (0, 2, 0, 2)), NaN (1, 2, 0, 2)) +%!assert (mean (ones (0, 2, 1, 0)), NaN (1, 2, 1, 0)) +%!assert (mean (ones (0, 2, 1, 1)), NaN (1, 2, 1, 1)) +%!assert (mean (ones (0, 2, 1, 2)), NaN (1, 2, 1, 2)) +%!assert (mean (ones (0, 2, 2, 0)), NaN (1, 2, 2, 0)) +%!assert (mean (ones (0, 2, 2, 1)), NaN (1, 2, 2, 1)) +%!assert (mean (ones (0, 2, 2, 2)), NaN (1, 2, 2, 2)) +%!assert (mean (ones (1, 0, 0, 0)), NaN (1, 1, 0, 0)) +%!assert (mean (ones (1, 0, 0, 1)), NaN (1, 1, 0, 1)) +%!assert (mean (ones (1, 0, 0, 2)), NaN (1, 1, 0, 2)) +%!assert (mean (ones (1, 0, 1, 0)), NaN (1, 1, 1, 0)) +%!assert (mean (ones (1, 0, 1, 1)), NaN (1, 1, 1, 1)) +%!assert (mean (ones (1, 0, 1, 2)), NaN (1, 1, 1, 2)) +%!assert (mean (ones (1, 0, 2, 0)), NaN (1, 1, 2, 0)) +%!assert (mean (ones (1, 0, 2, 1)), NaN (1, 1, 2, 1)) +%!assert (mean (ones (1, 0, 2, 2)), NaN (1, 1, 2, 2)) +%!assert (mean (ones (1, 1, 0, 0)), NaN (1, 1, 1, 0)) +%!assert (mean (ones (1, 1, 0, 1)), NaN (1, 1, 1, 1)) +%!assert (mean (ones (1, 1, 0, 2)), NaN (1, 1, 1, 2)) +%!assert (mean (ones (1, 1, 1, 0)), NaN (1, 1, 1, 1)) +%!assert (mean (ones (1, 1, 2, 0)), NaN (1, 1, 1, 0)) +%!assert (mean (ones (1, 2, 0, 0)), NaN (1, 1, 0, 0)) +%!assert (mean (ones (1, 2, 0, 1)), NaN (1, 1, 0, 1)) +%!assert (mean (ones (1, 2, 0, 2)), NaN (1, 1, 0, 2)) +%!assert (mean (ones (1, 2, 1, 0)), NaN (1, 1, 1, 0)) +%!assert (mean (ones (1, 2, 2, 0)), NaN (1, 1, 2, 0)) +%!assert (mean (ones (2, 0, 0, 0)), NaN (1, 0, 0, 0)) +%!assert (mean (ones (2, 0, 0, 1)), NaN (1, 0, 0, 1)) +%!assert (mean (ones (2, 0, 0, 2)), NaN (1, 0, 0, 2)) +%!assert (mean (ones (2, 0, 1, 0)), NaN (1, 0, 1, 0)) +%!assert (mean (ones (2, 0, 1, 1)), NaN (1, 0, 1, 1)) +%!assert (mean (ones (2, 0, 1, 2)), NaN (1, 0, 1, 2)) +%!assert (mean (ones (2, 0, 2, 0)), NaN (1, 0, 2, 0)) +%!assert (mean (ones (2, 0, 2, 1)), NaN (1, 0, 2, 1)) +%!assert (mean (ones (2, 0, 2, 2)), NaN (1, 0, 2, 2)) +%!assert (mean (ones (2, 1, 0, 0)), NaN (1, 1, 0, 0)) +%!assert (mean (ones (2, 1, 0, 1)), NaN (1, 1, 0, 1)) +%!assert (mean (ones (2, 1, 0, 2)), NaN (1, 1, 0, 2)) +%!assert (mean (ones (2, 1, 1, 0)), NaN (1, 1, 1, 0)) +%!assert (mean (ones (2, 1, 2, 0)), NaN (1, 1, 2, 0)) +%!assert (mean (ones (2, 2, 0, 0)), NaN (1, 2, 0, 0)) +%!assert (mean (ones (2, 2, 0, 1)), NaN (1, 2, 0, 1)) +%!assert (mean (ones (2, 2, 0, 2)), NaN (1, 2, 0, 2)) +%!assert (mean (ones (2, 2, 1, 0)), NaN (1, 2, 1, 0)) +%!assert (mean (ones (2, 2, 2, 0)), NaN (1, 2, 2, 0)) +%!assert (mean (ones (1, 1, 0, 0, 0)), NaN (1, 1, 1, 0, 0)) +%!assert (mean (ones (1, 1, 1, 1, 0)), NaN (1, 1, 1, 1, 1)) +%!assert (mean (ones (2, 1, 1, 1, 0)), NaN (1, 1, 1, 1, 0)) +%!assert (mean (ones (1, 2, 1, 1, 0)), NaN (1, 1, 1, 1, 0)) +%!assert (mean (ones (1, 3, 0, 2)), NaN (1, 1, 0, 2)) +%!assert (mean (single (ones (1, 3, 0, 2))), single (NaN (1, 1, 0, 2))) +%!assert (mean ([], 1), NaN (1, 0)) +%!assert (mean ([], 2), NaN (0, 1)) +%!assert (mean ([], 3), []) +%!assert (mean (ones (1, 0), 1), NaN (1, 0)) +%!assert (mean (ones (1, 0), 2), NaN) +%!assert (mean (ones (1, 0), 3), NaN (1, 0)) +%!assert (mean (ones (0, 1), 1), NaN) +%!assert (mean (ones (0, 1), 2), NaN (0, 1)) +%!assert (mean (ones (0, 1), 3), NaN (0, 1)) + +## Test input validation +%!error mean () +%!error mean (1, 2, 3, 4) +%!error mean ({1:5}) +%!error mean (1, 2, 3) +%!error mean (1, ones (2, 2)) +%!error mean (1, 1.5) +%!error mean (1, 0) +%!error mean ([1 -1], "g") +%!error