diff options
author | Joey Hess <joeyh@debian.org> | 2010-02-05 13:42:32 -0800 |
---|---|---|
committer | Joey Hess <joeyh@debian.org> | 2010-02-05 13:42:32 -0800 |
commit | 474dd0dbd852f84e5b7507e01636df0c6e90c412 (patch) | |
tree | 293ccce7717ad8bbb62c932d58fcde03f749b851 |
liblingua-en-words2nums-perl (0.18) unstable; urgency=low
* Remove the PM_FILTER to support Windows. (rt.cpan.org #38101)
# imported from the archive
-rw-r--r-- | MANIFEST | 12 | ||||
-rwxr-xr-x | Makefile.PL | 22 | ||||
-rw-r--r-- | README | 11 | ||||
-rw-r--r-- | TODO | 7 | ||||
-rw-r--r-- | Words2Nums.pm | 280 | ||||
-rw-r--r-- | debian/changelog | 128 | ||||
-rw-r--r-- | debian/compat | 1 | ||||
-rw-r--r-- | debian/control | 17 | ||||
-rw-r--r-- | debian/copyright | 10 | ||||
-rw-r--r-- | debian/docs | 1 | ||||
-rw-r--r-- | debian/examples | 1 | ||||
-rwxr-xr-x | debian/rules | 7 | ||||
-rw-r--r-- | samples | 78 | ||||
-rw-r--r-- | t/samples.t | 27 | ||||
-rwxr-xr-x | testnum | 14 |
15 files changed, 616 insertions, 0 deletions
diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..4a4c1ae --- /dev/null +++ b/MANIFEST @@ -0,0 +1,12 @@ +MANIFEST +Makefile.PL +README +TODO +debian/changelog +debian/control +debian/copyright +debian/rules +samples +t/samples.t +testnum +Words2Nums.pm diff --git a/Makefile.PL b/Makefile.PL new file mode 100755 index 0000000..97778b2 --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,22 @@ +#!/usr/bin/perl +use ExtUtils::MakeMaker; + +WriteMakefile( + NAME => 'Lingua::EN::Words2Nums', + VERSION => getversion(), + $] < 5.005 ? () : ( + AUTHOR => 'Joey Hess <joey@kitenet.net>', + ABSTRACT_FROM => 'Words2Nums.pm', + ) +); + +# The version is pulled from the debian changelog, to avoid having to keep +# two copies synced. +sub getversion { + open(DCH, "debian/changelog") or die "debian/changelog: $!"; + $_=<DCH>; + my ($vers) = m/\((.+)\)/; + die "no version" unless length $vers; + return $vers; +} + @@ -0,0 +1,11 @@ +Lingua::EN::Words2Nums parses English representations of numbers like +"two thousand and one". Some samples of the range of inputs it can handle +can be found in the file "samples", which is used by the regression test +suite. + +Web site is http://kitenet.net/~joey/code/words2nums/ + +Copyright 2001, 2003 Joey Hess <joey@kitenet.net> + +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. @@ -0,0 +1,7 @@ +* spelled out numbers, ie, "nine one one" => 911, + (currently, it returns 9 + 1 + 1) + (getting this right is a pre-req for decimals..) +* fractions and decimals +* support strings that start with a number and end with non-numbers, + returning the remaining part. HARD -- numbers are currently parsed + backwards! diff --git a/Words2Nums.pm b/Words2Nums.pm new file mode 100644 index 0000000..1c7f13b --- /dev/null +++ b/Words2Nums.pm @@ -0,0 +1,280 @@ +#!/usr/bin/perl + +=head1 NAME + +Lingua::EN::Words2Nums - convert English text to numbers + +=cut + +package Lingua::EN::Words2Nums; +use warnings; +use strict; +require Exporter; +our @ISA=qw(Exporter); +our @EXPORT=qw(&words2nums); + +=head1 SYNOPSIS + + use Lingua::EN::Words2Nums; + $num=words2nums("two thousand and one"); + $num=words2nums("twenty-second"); + $num=words2nums("15 billion, 6 million, and ninteen"); + +=head1 DESCRIPTION + +This module converts English text into numbers. It supports both ordinal and +cardinal numbers, negative numbers, and very large numbers. + +The main subroutine, which is exported by default, is words2nums(). This +subroutine, when fed a string, will attempt to convert it into a number. +If it succeeds, the number will be returned. If it fails, it returns undef. + +=head1 VARIABLES + +There are a number of variables that can be used to tweak the behavior of this +module. For example, debugging can be be enabled by setting +$Lingua::EN::Words2Nums::debug=1 + +=over 4 + +=cut + +# Public global variables. +our $debug = 0; +our $billion = 10 ** 9; + +=item $Lingua::EN::Words2Nums::debug + +Default: 0. If set to a true value, outputs on standard error some useful +messages if parsing fails for some reason. + +=item $Lingua::EN::Words2Nums::billion + +Default: 10 ** 9. This is the number that will be returned for "one billion". +It defaults to the American version; the English will want to set it to +10 ** 12. Setting this number automatically changes all the larger numbers +(trillion, quadrillion, etc) to match. + +=back + +=head1 NOTES + +It does not understand decimals or fractions, yet. + +Scores are supported, eg: "four score and ten". So are dozens. So is a baker's +dozen. And a gross. + +Various mispellings of numbers are understood. + +While it handles googol correctly, googolplex is too large to fit in perl's +standard scalar type, and "inf" will be returned. + +=cut + +our %nametosub = ( + naught => [ \&num, 0 ], # Cardinal numbers, leaving out the a + nought => [ \&num, 0 ], + zero => [ \&num, 0 ], # ones that just add "th". + one => [ \&num, 1 ], first => [ \&num, 1 ], + two => [ \&num, 2 ], second => [ \&num, 2 ], + three => [ \&num, 3 ], third => [ \&num, 3 ], + four => [ \&num, 4 ], fourth => [ \&num, 4 ], + five => [ \&num, 5 ], fifth => [ \&num, 5 ], + six => [ \&num, 6 ], + seven => [ \&num, 7 ], seven => [ \&num, 7 ], + eight => [ \&num, 8 ], eighth => [ \&num, 8 ], + nine => [ \&num, 9 ], ninth => [ \&num, 9 ], + ten => [ \&num, 10 ], + eleven => [ \&num, 11 ], + twelve => [ \&num, 12 ], twelfth => [ \&num, 12 ], + thirteen => [ \&num, 13 ], + fifteen => [ \&num, 15 ], + eighteen => [ \&num, 18 ], + ninteen => [ \&num, 19 ], # common(?) mispelling + teen => [ \&suffix, 10 ], # takes care of the regular teens + twenty => [ \&num, 20 ], twentieth => [ \&num, 20 ], + thirty => [ \&num, 30 ], thirtieth => [ \&num, 30 ], + forty => [ \&num, 40 ], fortieth => [ \&num, 40 ], + fourty => [ \&num, 40 ], fourtieth => [ \&num, 40 ], # at least I mispell it like this + fifty => [ \&num, 50 ], fiftieth => [ \&num, 50 ], + sixty => [ \&num, 60 ], sixtieth => [ \&num, 60 ], + seventy => [ \&num, 70 ], seventieth => [ \&num, 70 ], + eighty => [ \&num, 80 ], eightieth => [ \&num, 80 ], + ninety => [ \&num, 90 ], ninetieth => [ \&num, 90 ], + ninty => [ \&num, 90 ], # common mispelling + hundred => [ \&prefix, 100 ], + thousand => [ \&prefix, 1000 ], + million => [ \&prefix, 10 ** 6 ], + milion => [ \&prefix, 10 ** 6 ], # common(?) mispelling + milliard => [ \&prefix, 10 ** 9 ], + billion => [ \&powprefix, 2 ], # These vary depending on country. + billiard => [ \&prefix, 10 ** 15 ], + trillion => [ \&powprefix, 3 ], + trilliard => [ \&prefix, 10 ** 21 ], + quadrillion => [ \&powprefix, 4 ], + quadrilliard => [ \&prefix, 10 ** 27 ], + quintillion => [ \&powprefix, 5 ], + quintilliard => [ \&prefix, 10 ** 33 ], + sextillion => [ \&powprefix, 6 ], + sextilliard => [ \&prefix, 10 ** 39 ], + septillion => [ \&powprefix, 7 ], + septilliard => [ \&prefix, 10 ** 45 ], + octillion => [ \&powprefix, 8 ], + octilliard => [ \&prefix, 10 ** 51 ], + nonillion => [ \&powprefix, 9 ], + nonilliard => [ \&prefix, 10 ** 57 ], + decillion => [ \&powprefix, 10 ], + decilliard => [ \&prefix, 10 ** 63 ], + undecillion => [ \&powprefix, 11 ], + undecilliard => [ \&prefix, 10 ** 69 ], + duodecillion => [ \&powprefix, 12 ], + duodecilliard => [ \&prefix, 10 ** 75 ], + tredecillion => [ \&powprefix, 13 ], + tredecilliard => [ \&prefix, 10 ** 81 ], + quattuordecillion => [ \&powprefix, 14 ], + quattuordecilliard => [ \&prefix, 10 ** 87 ], + quindecillion => [ \&powprefix, 15 ], + quindecilliard => [ \&prefix, 10 ** 93 ], + sexdecillion => [ \&powprefix, 16 ], + septendecillion => [ \&powprefix, 17 ], + octodecillion => [ \&powprefix, 18 ], + novemdecillion => [ \&powprefix, 19 ], + vigintillion => [ \&powprefix, 20 ], + unvigintillion => [ \&powprefix, 21 ], + duovigintillion => [ \&powprefix, 22 ], + duvigintillion => [ \&powprefix, 22 ], # some use this spelling + trevigintillion => [ \&powprefix, 23 ], + quattuorvigintillion => [ \&powprefix, 24 ], + quinvigintillion => [ \&powprefix, 25 ], + sexvigintillion => [ \&powprefix, 26 ], + septenvigintillion => [ \&powprefix, 27 ], + octovigintillion => [ \&powprefix, 28 ], + novemvigintillion => [ \&powprefix, 29 ], + trigintillion => [ \&powprefix, 30 ], + # This process can be continued indefinitely, but one has to stop + # somewhere. -- A Dictionary of Units of Measurement + centillion => [ \&powprefix, 100 ], + googol => [ \&googol ], + googolplex => [ \&googolplex ], + negative => [ \&invert ], + minus => [ \&invert ], + score => [ \&prefix, 20 ], + gross => [ \&prefix, 12 * 12 ], + dozen => [ \&prefix, 12 ], + bakersdozen => [ \&prefix, 13 ], + bakerdozen => [ \&prefix, 13 ], + eleventyone => [ \&num, 111 ], # This nprogram written on the day + eleventyfirst =>[ \&num, 111 ], # FOTR released. + s => [ sub {} ], # ignore 's', at the end of a word, + # easy pluralization of dozens, etc. + es => [ sub {} ], # same for 'es'; for googolplexes, etc. + th => [ sub {} ], # ignore 'th', for cardinal nums +); + +# Note the ordering, so that eg, ninety has a chance to match before nine. +my $numregexp = join("|", reverse sort keys %nametosub); +$numregexp=qr/($numregexp)/; + +my ($total, $mult, $oldpre, $newmult, $suffix, $val); + +sub num ($) { + $val = shift; + if ($suffix) { + $val += $suffix; + $suffix = 0; + } + $total += $val * $mult; + $newmult = 0; +} + +sub prefix ($) { + my $pre = shift; + if ($pre > $oldpre) { # end of a prefix chain + $total += $mult if $newmult; # special case for lone "thousand", etc. + $mult = 1; + } + $mult *= $pre; + $oldpre = $pre; + $newmult = 1; +} + +sub powprefix { + my $power = shift; + if ($billion == 10 ** 9) { # EN + prefix(10 ** (($power + 1) * 3)); + } + elsif ($billion == 10 ** 12) { # GB + prefix(10 ** ($power * 6)); + } + else { + failure("\$billion is set to odd value: $billion"); + } +} + + +sub suffix ($) { + $suffix = shift; +} + +sub invert () { + $total *= -1; +} + +sub googol () { + prefix(10 ** 100); +} + +sub googolplex () { + prefix(10 ** (10 ** 100)); +} + +sub failure ($) { + print STDERR shift()."\n" if $debug; + return; # undef on failure +} + +sub words2nums ($) { + local $_=lc(shift); + chomp $_; + + s/,//; # ignore comma, even if it's in a plain number + return $_ if /^[-+]?[.0-9\s]+$/; # short circuit for plain number + + if (/^[-+0-9.]+$/) { + return failure("+ or - not at beginning") if length $_; + } + + s/\b(and|a|of)\b//g; # ignore some common words + s/[^A-Za-z0-9.]//g; # ignore spaces and punctuation, except period. + return failure("not a number") unless length $_; + + $total=$oldpre=$suffix=$newmult=0; + $mult=1; + + # Work backwards up the string. + while (length $_) { + $nametosub{$1}[0]->($nametosub{$1}[1]) while s/$numregexp$//; + if (length $_) { + if (s/(\d+)(?:st|nd|rd|th)?$//) { + num($1); + } + else { + last; + } + } + } + return failure("error at $_") if length $_; + $total += $mult if $newmult; # special case for lone "thousand", etc. + return $total; +} + +=head1 AUTHOR + +Copyright 2001-2003 Joey Hess <joey@kitenet.net> + +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + +1 diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..44ca0b5 --- /dev/null +++ b/debian/changelog @@ -0,0 +1,128 @@ +liblingua-en-words2nums-perl (0.18) unstable; urgency=low + + * Remove the PM_FILTER to support Windows. (rt.cpan.org #38101) + + -- Joey Hess <joeyh@debian.org> Fri, 05 Feb 2010 16:42:32 -0500 + +liblingua-en-words2nums-perl (0.17) unstable; urgency=low + + * Use debhelper v7; rules file minimisation. + + -- Joey Hess <joeyh@debian.org> Tue, 22 Jul 2008 00:29:12 -0400 + +liblingua-en-words2nums-perl (0.16) unstable; urgency=low + + * The repository has moved from svn to git. + * Minor improvement to debian/rules clean. + + -- Joey Hess <joeyh@debian.org> Fri, 19 Oct 2007 22:21:04 -0400 + +liblingua-en-words2nums-perl (0.15) unstable; urgency=low + + * Update url to the web site. + * Current standards-version (no real changes). + * Fix unicode error in man page. + * Fix lintian warning about rules file. + + -- Joey Hess <joeyh@debian.org> Mon, 04 Jun 2007 16:49:19 -0400 + +liblingua-en-words2nums-perl (0.14) unstable; urgency=low + + * Remove the tests that involve exponentents, as they may not on 64 bit + machines, and will cause false test failures. Closes: #250610 + + -- Joey Hess <joeyh@debian.org> Fri, 4 Jun 2004 15:09:13 -0300 + +liblingua-en-words2nums-perl (0.13) unstable; urgency=low + + * Remove quoting in Makefile.PL so it will build under 5.8.1 (this breaks + building under earlier versions of perl though). Closes: #213928 + + -- Joey Hess <joeyh@debian.org> Mon, 6 Oct 2003 19:49:19 -0400 + +liblingua-en-words2nums-perl (0.12) unstable; urgency=low + + * Move from build-depends-indep to build-depends to meet current policy. + + -- Joey Hess <joeyh@debian.org> Wed, 3 Sep 2003 12:14:45 -0400 + +liblingua-en-words2nums-perl (0.11) unstable; urgency=low + + * Do not pass through things of the form "10-11", since they're note really + numbers. + * Fix testnum to work with library in same directory. + + -- Joey Hess <joeyh@debian.org> Mon, 26 May 2003 15:48:37 -0400 + +liblingua-en-words2nums-perl (0.10) unstable; urgency=low + + * Add proper spelling of "forty", and alternate "nought" spelling. + * Make regression test work on win32, with exponents with leading zeroes. + + -- Joey Hess <joeyh@debian.org> Wed, 7 May 2003 01:34:11 -0400 + +liblingua-en-words2nums-perl (0.09) unstable; urgency=low + + * Corrected parsing of "fourth". Oops! + * Moved pm file out of deep directory in source tarball, which was + unnecessary for such a small package. + * Added AUTHOR and ABSTRACT_FROM to Makefile.PL. + * Accept douvigintillion, as well as dovigintillion; I don't know which is + right. + * Thanks to Erick Calder for his help. + + -- Joey Hess <joeyh@debian.org> Mon, 3 Feb 2003 12:16:02 -0500 + +liblingua-en-words2nums-perl (0.08) unstable; urgency=low + + * Localize $_. + + -- Joey Hess <joeyh@debian.org> Fri, 18 Oct 2002 16:09:59 -0400 + +liblingua-en-words2nums-perl (0.07) unstable; urgency=low + + * Use debhelper v4. + + -- Joey Hess <joeyh@debian.org> Sat, 1 Jun 2002 18:15:26 -0400 + +liblingua-en-words2nums-perl (0.06) unstable; urgency=low + + * Don't try to test for inf, since it seems "Infinity" is the string on some + platforms. + + -- Joey Hess <joeyh@debian.org> Sat, 1 Jun 2002 10:01:09 -0400 + +liblingua-en-words2nums-perl (0.05) unstable; urgency=low + + * Added big numbers between undecillion and trigintillion. Also + centillion, and billiard through quindecilliard. + + -- Joey Hess <joeyh@debian.org> Tue, 26 Feb 2002 23:09:29 -0500 + +liblingua-en-words2nums-perl (0.04) unstable; urgency=low + + * Corrected parsing of otherwise plain numbers that have commas in them + (123,456.789) + + -- Joey Hess <joeyh@debian.org> Sat, 12 Jan 2002 17:33:22 -0500 + +liblingua-en-words2nums-perl (0.03) unstable; urgency=low + + * If the entire string is ignorables ("and", "a", punctuation), don't + return 0, but undef. + + -- Joey Hess <joeyh@debian.org> Sat, 12 Jan 2002 14:17:16 -0500 + +liblingua-en-words2nums-perl (0.02) unstable; urgency=low + + * Added support for trillion through googolplex, and added localization + code for the different billions and other numbers. + * Lots of bugfixes, including getting the ordinals right (I hope). + + -- Joey Hess <joeyh@debian.org> Wed, 19 Dec 2001 23:08:03 -0500 + +liblingua-en-words2nums-perl (0.01) unstable; urgency=low + + * First release. + + -- Joey Hess <joeyh@debian.org> Wed, 19 Dec 2001 14:23:03 -0500 diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..7f8f011 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +7 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..67adea7 --- /dev/null +++ b/debian/control @@ -0,0 +1,17 @@ +Source: liblingua-en-words2nums-perl +Section: perl +Priority: optional +Build-Depends: debhelper (>= 7), perl5, dpkg-dev (>= 1.9.0) +Maintainer: Joey Hess <joeyh@debian.org> +Standards-Version: 3.8.4 +Vcs-Git: git://git.kitenet.net/words2nums +Homepage: http://kitenet.net/~joey/code/words2nums/ + +Package: liblingua-en-words2nums-perl +Architecture: all +Depends: ${perl:Depends}, ${misc:Depends} +Description: convert English text to numbers + A perl module that can parse a wide variety of English text + and deduce the number it represents. For example, it can convert + "five million, one thousand and sixteen" to 5001016, and + "twenty-seventh" to 27. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..b08d379 --- /dev/null +++ b/debian/copyright @@ -0,0 +1,10 @@ +Lingua::EN::Words2Nums is a Debian native package. + +Copyright 2001-2003 Joey Hess <joey@kitenet.net> + +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +That means it's dual licensed under the GPL +(/usr/share/common-licenses/GPL) and Artistic +(/usr/share/common-licenses/Artistic) licenses. diff --git a/debian/docs b/debian/docs new file mode 100644 index 0000000..2a6769e --- /dev/null +++ b/debian/docs @@ -0,0 +1 @@ +README TODO diff --git a/debian/examples b/debian/examples new file mode 100644 index 0000000..81154dd --- /dev/null +++ b/debian/examples @@ -0,0 +1 @@ +samples diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..f6db6c6 --- /dev/null +++ b/debian/rules @@ -0,0 +1,7 @@ +#!/usr/bin/make -f +%: + dh $@ + +# Not intended for use by anyone except the author. +announcedir: + @echo ${HOME}/src/joeywiki/code/words2nums/news @@ -0,0 +1,78 @@ +# Sample conversions. The result is on the left, and an input is on the +# right. This file is used for regression testing. Note that this file +# assumes that a billion is 10^9, but you can configure it otherwise when +# you use the module. +2001 two thousand one +3424 three thousand four hundred twenty four +3424 3 thousand 4 hundred 24 +798681 seven hundred ninety eight thousand six hundred eighty-one +798000 798 thousand +306172 three hundred six thousand, one hundred seventy two +306172 3 hundred and six thousand, one hundred and seventy-2 +42524 fourty-two thousand five hundred twenty-four +0 zero +1 one +1 first +2 second +3 three +3 third +9 ninth +59 fifty-ninth +1000 thousand +1000 one thousand +16 sixteen +1000524 1000,524 +999.3333333 999.3333333 +30303.30303 30303.30303 +65569565609 65569565609 +-1 -1 +-12211.1133 -12,211.1133 +153 one hundred fifty three +88 eighty-eight +42 fourtytwo +1000000 millionth +3424 thirty-four hundred twenty-four +11059 eleven thousand and fifty-nine +9622000 nine million, six hundred and twenty-two thousand +5600000 fifty-six hundred thousand +167 one hundred and sixty-seventh +25300 two hundred and fifty three hundred +65065065065 sixty-five thousand sixty-five million sixty-five thousand and sixty-five +11011011011 eleven billion eleven million eleven thousand eleven +90 four score and ten +501000000 five hundred and one million +12 dozen +48 four dozen +13 baker's dozen +13 bakers dozen +13 baker dozen +39 three baker's dozens +4000 four thousands +1001 thousand one +4603 four thousand six hundred and three +4103 four thousand, hundred and three +288 two gross +288 two grosses +1000000 a million +1000000 million +1000000000 billion +1000000000000 trillion +1 1st +2 2nd +3 3rd +4 4th +4023 4023rd +# Perl's numbers don't go this high. +# Also, the return for infinity varies with platforms, so this is not a +# good test. +#inf googolplex +#inf seven googolplexes +# Maybe when we get Bignum support.. +#10314424798490535546171949056 Ten octillion, three hundred fourteen septillion, four hundred twenty-four sextillion, seven hundred ninety-eight quintillion, four hundred ninety quadrillion, five hundred thirty-five trillion, five hundred forty-six billion, one hundred seventy-one million, nine hundred forty-nine thousand, and fifty six +# Some things that should not parse to a number: +undef and +undef , +undef +undef now is the time for all good men to come to the aid of their country +undef gazillion +undef hexillion diff --git a/t/samples.t b/t/samples.t new file mode 100644 index 0000000..a049ec8 --- /dev/null +++ b/t/samples.t @@ -0,0 +1,27 @@ +#!/usr/bin/perl +use strict; +use Test; + +our @samples; +BEGIN { + open(SAMPLES, "samples") || die "samples: $!"; + @samples=grep { ! /^#/ } <SAMPLES>; + plan tests => (scalar @samples); +} + +use Lingua::EN::Words2Nums; + +foreach (@samples) { + chomp $_; + my ($num, $text)=split(' ', $_, 2); + if ($num eq 'undef') { + ok(! defined words2nums($text)); + } + else { + my $w2n = words2nums($text); + # On win32 platform, exponents semm to have leading zero. + # This makes it work either way. + $w2n =~ s/e+0(\d+)/e+$1/; + ok($w2n, $num); + } +} @@ -0,0 +1,14 @@ +#!/usr/bin/perl +use blib; # work on uninstalled package +use Words2Nums; +import Lingua::EN::Words2Nums; + +$Lingua::EN::Words2Nums::debug=1; +if (@ARGV) { + print "$ARGV[0] => ".words2nums(shift)."\n"; +} +else { + while (<>) { + print "$_ => ".words2nums($_)."\n"; + } +} |