summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMason James <mtj@kohaaloha.com>2023-02-08 15:15:50 +1300
committerMason James <mtj@kohaaloha.com>2023-02-08 15:15:50 +1300
commit156fa19e4f698959a02d7f29db4cf7b4a0d2629d (patch)
treedb4c051b030aa487736c8cc027892badc43f1713
Import original source of Biblio-Citation-Compare 0.57
-rw-r--r--Changes25
-rw-r--r--MANIFEST12
-rw-r--r--META.json59
-rw-r--r--META.yml33
-rw-r--r--Makefile.PL30
-rw-r--r--README32
-rw-r--r--lib/Biblio/Citation/Compare.pm520
-rw-r--r--t/01-sameWork.t319
-rw-r--r--t/02-editions.t24
-rw-r--r--t/sameTitle.t33
10 files changed, 1087 insertions, 0 deletions
diff --git a/Changes b/Changes
new file mode 100644
index 0000000..bca3fad
--- /dev/null
+++ b/Changes
@@ -0,0 +1,25 @@
+Revision history for Perl extension Biblio::Citation::Compare.
+
+0.01 Mon Feb 7 20:12:19 2011
+ - original version; created by h2xs 1.23 with options
+ -AX Biblio::Citation::Compare
+0.02 Feb 8
+ - Fixed dependency issues
+0.03 Feb 8
+ - Improved doc
+0.21 Nov 24 2012
+ - Tons of improvements, stable version
+0.22 Dec 27 2014
+ - Fix for "_" in titles
+0.23 Dec 31 2014
+ - Removed debug prints
+0.24 Dec 31 2014
+ - Removed unnecessary "use" statements in Makefile.PL, fixing bug RT 101163
+0.4 Jun 11 2015
+ - Improved detection and handling of editions; minor bug fixes
+0.55 Feb 4 2023
+ - Misc and fix for failing test contributed by KohaAloha
+0.56 Feb 5 2023
+ - Fixed missing dependency for Test::Most
+0.57 Feb 6 2023
+ - Incremented Text::Names version to resolve problems with it
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..c083f2b
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,12 @@
+Biblio-Citation-Compare-0.55/META.json
+Biblio-Citation-Compare-0.55/META.yml
+Changes
+lib/Biblio/Citation/Compare.pm
+Makefile.PL
+MANIFEST This list of files
+README
+t/01-sameWork.t
+t/02-editions.t
+t/sameTitle.t
+META.yml Module YAML meta-data (added by MakeMaker)
+META.json Module JSON meta-data (added by MakeMaker)
diff --git a/META.json b/META.json
new file mode 100644
index 0000000..85a60bd
--- /dev/null
+++ b/META.json
@@ -0,0 +1,59 @@
+{
+ "abstract" : "Perl extension for performing fuzzy comparisons between bibliographic citations",
+ "author" : [
+ "David Bourget"
+ ],
+ "dynamic_config" : 1,
+ "generated_by" : "ExtUtils::MakeMaker version 7.62, CPAN::Meta::Converter version 2.150010",
+ "license" : [
+ "unknown"
+ ],
+ "meta-spec" : {
+ "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+ "version" : 2
+ },
+ "name" : "Biblio-Citation-Compare",
+ "no_index" : {
+ "directory" : [
+ "t",
+ "inc"
+ ]
+ },
+ "prereqs" : {
+ "build" : {
+ "requires" : {
+ "ExtUtils::MakeMaker" : "0"
+ }
+ },
+ "configure" : {
+ "requires" : {
+ "ExtUtils::MakeMaker" : "0"
+ }
+ },
+ "runtime" : {
+ "requires" : {
+ "HTML::Parser" : "2.25",
+ "Test::More" : "0.96",
+ "Test::Most" : "0.38",
+ "Text::LevenshteinXS" : "0.03",
+ "Text::Names" : "0.46",
+ "Text::Roman" : "3.5"
+ }
+ }
+ },
+ "release_status" : "stable",
+ "resources" : {
+ "bugtracker" : {
+ "web" : "https://github.com/dbourget/Biblio-Citation-Compare"
+ },
+ "homepage" : "https://metacpan.org/dist/Biblio-Citation-Compare",
+ "license" : [
+ "https://dev.perl.org/licenses/"
+ ],
+ "repository" : {
+ "url" : "https://github.com/dbourget/Biblio-Citation-Compare.git"
+ }
+ },
+ "version" : "0.57",
+ "x_serialization_backend" : "JSON::PP version 4.06"
+}
diff --git a/META.yml b/META.yml
new file mode 100644
index 0000000..a765005
--- /dev/null
+++ b/META.yml
@@ -0,0 +1,33 @@
+---
+abstract: 'Perl extension for performing fuzzy comparisons between bibliographic citations'
+author:
+ - 'David Bourget'
+build_requires:
+ ExtUtils::MakeMaker: '0'
+configure_requires:
+ ExtUtils::MakeMaker: '0'
+dynamic_config: 1
+generated_by: 'ExtUtils::MakeMaker version 7.62, CPAN::Meta::Converter version 2.150010'
+license: unknown
+meta-spec:
+ url: http://module-build.sourceforge.net/META-spec-v1.4.html
+ version: '1.4'
+name: Biblio-Citation-Compare
+no_index:
+ directory:
+ - t
+ - inc
+requires:
+ HTML::Parser: '2.25'
+ Test::More: '0.96'
+ Test::Most: '0.38'
+ Text::LevenshteinXS: '0.03'
+ Text::Names: '0.46'
+ Text::Roman: '3.5'
+resources:
+ bugtracker: https://github.com/dbourget/Biblio-Citation-Compare
+ homepage: https://metacpan.org/dist/Biblio-Citation-Compare
+ license: https://dev.perl.org/licenses/
+ repository: https://github.com/dbourget/Biblio-Citation-Compare.git
+version: '0.57'
+x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..f3f799c
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,30 @@
+use 5.0;
+use ExtUtils::MakeMaker;
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+
+WriteMakefile(
+ NAME => 'Biblio::Citation::Compare',
+ VERSION_FROM => 'lib/Biblio/Citation/Compare.pm', # finds $VERSION
+ PREREQ_PM => {
+ 'Text::Names' => 0.46,
+ 'Text::LevenshteinXS' => 0.03,
+ 'HTML::Parser' => 2.25,
+ 'Test::More' => 0.96,
+ 'Text::Roman' => 3.5,
+ 'Test::Most' => '0.38'
+ }, # e.g., Module::Name => 1.1
+ ($] >= 5.005 ? ## Add these new keywords supported since 5.005
+ (ABSTRACT_FROM => 'lib/Biblio/Citation/Compare.pm', # retrieve abstract from module
+ AUTHOR => 'David Bourget') : ()),
+
+ META_MERGE => {
+ resources => {
+ license => 'https://dev.perl.org/licenses/',
+ homepage => 'https://metacpan.org/dist/Biblio-Citation-Compare',
+ bugtracker => 'https://github.com/dbourget/Biblio-Citation-Compare',
+ repository => 'https://github.com/dbourget/Biblio-Citation-Compare.git',
+ },
+ },
+
+);
diff --git a/README b/README
new file mode 100644
index 0000000..c481d47
--- /dev/null
+++ b/README
@@ -0,0 +1,32 @@
+Biblio-Citation-Compare version 0.11
+====================================
+
+A perl extension for to perform fuzzy comparisons of citations.
+
+INSTALLATION
+
+To install this module type the following:
+
+ perl Makefile.PL
+ make
+ make test
+ make install
+
+DEPENDENCIES
+
+This module requires these other modules and libraries:
+
+ Text::Names
+ Text::LevenshteinXS
+ HTML::Entities
+ various commonly used modules such as Test::More
+
+COPYRIGHT AND LICENCE
+
+Copyright (C) 2011 by David Bourget
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.10.1 or,
+at your option, any later version of Perl 5 you may have available.
+
+
diff --git a/lib/Biblio/Citation/Compare.pm b/lib/Biblio/Citation/Compare.pm
new file mode 100644
index 0000000..56b4083
--- /dev/null
+++ b/lib/Biblio/Citation/Compare.pm
@@ -0,0 +1,520 @@
+package Biblio::Citation::Compare;
+
+use 5.0;
+use strict;
+use warnings;
+use Text::LevenshteinXS qw(distance);
+use HTML::Entities;
+use Text::Names qw/samePerson cleanName parseName parseName2/;
+use Text::Roman qw/isroman roman2int/;
+use utf8;
+
+require Exporter;
+
+our @ISA = qw(Exporter);
+
+our %EXPORT_TAGS = ( 'all' => [ qw(
+ sameWork sameAuthors toString extractEdition sameAuthorBits sameTitle sameAuthorsLoose
+) ] );
+
+our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
+
+our @EXPORT = qw( );
+
+our $VERSION = '0.57';
+
+# to correct bogus windows entities. unfixable ones are converted to spaces.
+my %WIN2UTF = (
+ hex('80')=> hex('20AC'),# #EURO SIGN
+ hex('81')=> hex('0020'), #UNDEFINED
+ hex('82')=> hex('201A'),# #SINGLE LOW-9 QUOTATION MARK
+ hex('83')=> hex('0192'),# #LATIN SMALL LETTER F WITH HOOK
+ hex('84')=> hex('201E'),# #DOUBLE LOW-9 QUOTATION MARK
+ hex('85')=> hex('2026'),# #HORIZONTAL ELLIPSIS
+ hex('86')=> hex('2020'),# #DAGGER
+ hex('87')=> hex('2021'),# #DOUBLE DAGGER
+ hex('88')=> hex('02C6'),# #MODIFIER LETTER CIRCUMFLEX ACCENT
+ hex('89')=> hex('2030'),# #PER MILLE SIGN
+ hex('8A')=> hex('0160'),# #LATIN CAPITAL LETTER S WITH CARON
+ hex('8B')=> hex('2039'),# #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ hex('8C')=> hex('0152'),# #LATIN CAPITAL LIGATURE OE
+ hex('8D')=> hex('0020'),# #UNDEFINED
+ hex('8E')=> hex('017D'),# #LATIN CAPITAL LETTER Z WITH CARON
+ hex('8F')=> hex('0020'),# #UNDEFINED
+ hex('90')=> hex('0020'),# #UNDEFINED
+ hex('91')=> hex('2018'),# #LEFT SINGLE QUOTATION MARK
+ hex('92')=> hex('2019'),# #RIGHT SINGLE QUOTATION MARK
+ hex('93')=> hex('201C'),# #LEFT DOUBLE QUOTATION MARK
+ hex('94')=> hex('201D'),# #RIGHT DOUBLE QUOTATION MARK
+ hex('95')=> hex('2022'),# #BULLET
+ hex('96')=> hex('2013'),# #EN DASH
+ hex('97')=> hex('2014'),# #EM DASH
+ hex('98')=> hex('02DC'),# #SMALL TILDE
+ hex('99')=> hex('2122'),# #TRADE MARK SIGN
+ hex('9A')=> hex('0161'),# #LATIN SMALL LETTER S WITH CARON
+ hex('9B')=> hex('203A'),# #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ hex('9C')=> hex('0153'),# #LATIN SMALL LIGATURE OE
+ hex('9D')=> hex('0020'),# #UNDEFINED
+ hex('9E')=> hex('017E'),# #LATIN SMALL LETTER Z WITH CARON
+ hex('9F')=> hex('0178')# #LATIN CAPITAL LETTER Y WITH DIAERESIS
+);
+my $PARENS = '\s*[\[\(](.+?)[\]\)]\s*';
+my $QUOTE = '"“”`¨´‘’‛“”‟„′″‴‵‶‷⁗❛❜❝❞';
+my @ED_RES = (
+ '(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth)',
+ '([1-9])\s?\w{2,5}\s[ée]d',
+ '\bv\.?(?:ersion)?\s?([0-9IXV]+)',
+ '\s([IXV0-9]+)(?:$|:)'
+);
+
+#die "no" unless "2nd edition" =~ /$EDITION/i;
+
+#my $TITLE_SPLIT = '(?:\?|\:|\.|!|\&quot;|[$QUOTE]\b)';
+my $TITLE_SPLIT = '(?:\?|\:|\.|!)';
+
+sub sameAuthors {
+ my ($list1, $list2, %opts) = @_;
+ return 0 if $#$list1 != $#$list2 and $opts{strict};
+ if ($#$list2 > $#$list1) {
+ my $t = $list1;
+ $list1 = $list2;
+ $list2 = $t;
+ }
+ for (my $i = 0; $i <= $#$list2; $i++) {
+ return 0 unless grep { samePerson($list2->[$i],$_, %opts) } @$list1;
+ }
+ return 1;
+}
+
+sub firstAuthor {
+ my $e = shift;
+ my $a = $e->{authors};
+ if ($#$a > -1) {
+ return $a->[0];
+ } else {
+ return undef;
+ }
+}
+
+sub sameWork {
+
+ my ($e, $c, $threshold,$loose,$nolinks,%opts) = @_;
+
+ my $debug = $opts{debug} || 0;
+
+ $loose = 0 unless defined $loose;
+ $threshold = 0.15 unless $threshold;
+ $opts{loose} = 1 if $loose;
+
+ if ($debug) {
+ warn "sameWork 1: " . toString($e);
+ warn "sameWork 2: " . toString($c);
+ }
+
+ if (defined $e->{doi} and length $e->{doi} and defined $c->{doi} and length $c->{doi}) {
+ if ($e->{doi} eq $c->{doi}) {
+ # we don't use doi to say 1 because often we have dois that are for a whole issue
+ # however same doi lowers the threshold
+ $threshold /= 2 if $e->{doi} eq $c->{doi};
+ $loose = 1;
+ $opts{loose} = 1;
+ } else {
+ return 0;
+ }
+ }
+
+ return 0 if (!$c);
+
+ # normalize encoding of relevant fields
+ local $e->{title} = decodeHTMLEntities($e->{title});
+ local $c->{title} = decodeHTMLEntities($c->{title});
+
+ # first check if authors,date, and title are almost literally the same
+ my $tsame = (lc $e->{title} eq lc $c->{title}) ? 1 : 0;
+ my $asame = sameAuthors($e->{authors},$c->{authors},strict=>1);
+ my $asame_loose = $asame || sameAuthors($e->{authors},$c->{authors},strict=>0); #asame_loose will be 1 while same is 0 when there are extra authors in one paper but all overlap authors match
+ my $asame_bits = $asame_loose || sameAuthorBits($e->{authors},$c->{authors});
+ my $dsame = (defined $e->{date} and defined $c->{date} and $e->{date} eq $c->{date}) ? 1 : 0;
+
+ if ($debug) {
+ warn "tsame: $tsame";
+ warn "asame: $asame";
+ warn "asame_loose: $asame_loose";
+ warn "asame_bits: $asame_bits";
+ warn "dsame: $dsame";
+ }
+
+ return 1 if ($tsame and $asame and $dsame);
+
+ # if authors quite different, not same
+ if (!$asame_bits) {
+ warn "authors too different" if $debug;
+ return 0;
+ }
+ # at this point the authors are plausibly the same
+
+ # check dates
+ my $date_wildcards = '^forthcoming|in press|manuscript|unknown|web$';
+ my $compat_dates = ($dsame or ($e->{date} && $e->{date} =~ /$date_wildcards/) or ($c->{date} && $c->{date} =~ /$date_wildcards/));
+ if (!$dsame and !$compat_dates) {
+
+ #disabled for most cases because we want to conflate editions and republications for now.
+ if ($e->{title} =~ /^Introduction.?$/ or $e->{title} =~ /^Preface.?$/) {
+ return 0 if ($e->{source} and $e->{source} ne $c->{source}) or
+ ($e->{volume} and $e->{volume} ne $c->{volume});
+ }
+
+ # numeric dates
+ if ($e->{date} and $e->{date} =~ /^\d\d\d\d$/ and $c->{date} and $c->{date} =~ /^\d\d\d\d$/) {
+ my $date_diff = $e->{date} - $c->{date};
+ # quite often people misremember dates so we permit some slack
+ # we will consider the dates compat if they close in time
+ # if dates are far apart, we know they are not exactly the same publicatoins.
+ # but they might be reprints of the same thing, which we want to conflate.
+ if ($date_diff > 3 or $date_diff < -3) {
+ if ($asame_bits) {
+ $threshold /= 2;
+ warn "dates different, lowering similarity threshold" if $debug;
+ } else {
+ warn "dates+authors too different" if $debug;
+ return 0;
+ }
+
+ } else {
+ # nearby date
+ $threshold /= 2;
+ }
+
+ } else {
+ #messed up dates, assume the worst
+ $threshold /=2;
+ }
+
+ } else {
+ $loose = 1 if $asame_loose or $asame_bits;
+ }
+
+
+
+
+ warn "pre title length" if $debug;
+ # if title very different in lengths and do not contain ":" or brackets, not the same
+ return 0 if !$tsame and (
+ abs(length($e->{title}) - length($c->{title})) > 20
+ and
+ ($e->{title} !~ /$TITLE_SPLIT/ and $c->{title} !~ /$TITLE_SPLIT/)
+ and
+ ($e->{title} !~ /$PARENS/ and $c->{title} !~ /$PARENS/)
+ );
+
+ # Compare links
+# if (!$nolinks) {
+# foreach my $l (@{$e->{links}}) {
+# print "Links e:\n" . join("\n",$e->getLinks);
+# print "Links c:\n" . join("\n",$c->getLinks);
+# return 1 if grep { $l eq $_} @{$c->{links}};
+# }
+# }
+
+ warn "pre loose mode: loose = $loose" if $debug;
+
+ #print "threshold $lname1,$lname2: $threshold\n";
+ # ok if distance short enough without doing anything
+ #print "distance: " . distance(lc $e->{title},lc $c->{title}) / (length($e->{title}) +1) . "\n";
+
+ # perform fuzzy matching
+ #my $str1 = "$e->{date}|$e->{title}";
+ my $str1 = lc _strip_non_word($e->{title});
+ my $str2 = lc _strip_non_word($c->{title});
+
+ # check for edition strings
+ my $ed1 = extractEdition($str1);
+ my $ed2 = extractEdition($str2);
+ warn "ed1: $ed1" if $debug;
+ warn "ed2: $ed2" if $debug;
+ $loose =1 if $ed1 and $ed2 and $ed1 == $ed2 and !$dsame and $asame_loose;
+
+ return 0 if ($ed1 and !$ed2) or ($ed2 and !$ed1) or ($ed1 && $ed1 != $ed2);
+ warn "not diff editions" if $debug;
+
+ # remove brackets
+ my ($parens1,$parens2);
+ $str1 =~ s/$PARENS//g;
+ $parens1 = $1;
+ $str2 =~ s/$PARENS//g;
+ $parens2 = $1;
+ return 0 if $parens1 && $parens2 && numdiff($parens1,$parens2);
+
+ warn "the text comparison is: '$str1' vs '$str2'" if $debug;
+
+ warn "pre number check" if $debug;
+ # if titles differ by a number, not the same
+ return 0 if numdiff($str1,$str2);
+
+ # ultimate test
+ #dbg("$str1\n$str2\n");
+ #dbg(my_dist_text($str1,$str2));
+ my $score = (my_dist_text($str1,$str2) / (length($str1) +1));
+
+ warn "score: $score (threshold: $threshold)" if $debug;
+ #print $score . "<br>\n";
+ return 1 if ( $score < $threshold);
+
+ # now if loose mode and only one of the titles has a ":" or other punctuation, compare the part before the punc with the other title instead
+ if ($loose) {
+
+ warn "loose: $str1 -- $str2" if $debug;
+
+ if ($e->{title} =~ /(.+?)\s*$TITLE_SPLIT\s*(.+)/) {
+
+ my $str1 = _strip_non_word($1);
+ if ($c->{title} =~ /(.+?)\s*$TITLE_SPLIT\s*(.+)/) {
+ return 0;
+ } else {
+ if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) {
+ return 1;
+ }
+ }
+
+ } elsif ($c->{title} =~ /(.+?)\s*$TITLE_SPLIT\s*(.+)/) {
+
+ my $str2 = _strip_non_word($1);
+ if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) {
+ return 1;
+ }
+
+ } else {
+
+ return 0;
+
+ }
+ }
+
+ return 0;
+}
+
+sub sameAuthorsLoose {
+ my ($a, $b) = @_;
+ my $asame = sameAuthors($a,$b,strict=>1);
+ my $asame_loose = $asame || sameAuthors($a,$b,strict=>0);
+ return $asame_loose || sameAuthorBits($a,$b);
+}
+
+sub sameAuthorBits {
+ my ($a, $b) = @_;
+ my (@alist, @blist);
+ for (@$a) {
+ my $v = lc $_; # we copy so we don't modify the original
+ $v =~ s/[,\.]//g;
+ #$v =~ s/(\p{Ll})(\p{Lu})/$1 $2/g;
+ push @alist, split(/\s+/, $v);
+ }
+ for (@$b) {
+ my $v = lc $_;
+ $v =~ s/[,\.]//g;
+ #$v =~ s/(\p{Ll})(\p{Lu})/$1 $2/g;
+ push @blist, split(/\s+/, $v);
+ }
+ #use Data::Dumper;
+ @alist = sort @alist;
+ @blist = sort @blist;
+ #print Dumper(\@alist);
+ #print Dumper(\@blist);
+ return 0 if $#alist != $#blist;
+ for (my $i=0; $i<= $#alist; $i++) {
+ return 0 if lc $alist[$i] ne lc $blist[$i];
+ }
+ return 1;
+}
+
+#wip
+#sub author_bits {
+# my $list_ref = shift;
+# my @new;
+# for (@$list_ref) {
+# my $v = $_; # we copy so we don't modify the original
+# $v =~ s/,//;
+# $v =~ s/(\p{Ll}\p
+# push @alist, split(/\s+/, $v);
+# }
+#}
+
+sub _strip_non_word {
+ my $str = shift;
+ #abbreviation "volume" v
+ $str =~ s/\bvolume\b/v/gi;
+ $str =~ s/\bvol\.?\b/v/gi;
+ $str =~ s/\bv\.\b/v/gi;
+
+ $str =~ s/[^[0-9a-zA-Z\)\]\(\[]+/ /g;
+ $str =~ s/\s+/ /g;
+ $str =~ s/^\s+//;
+ $str =~ s/\s+$//;
+ $str;
+}
+
+my %nums = (
+ first => 1,
+ second => 2,
+ third => 3,
+ fourth => 4,
+ fifth => 5,
+ sixth => 6,
+ seventh => 7,
+ eighth => 8,
+ ninth => 9,
+ tenth => 10,
+);
+sub extract_num {
+ my $s = shift;
+ if ($s =~ /\b(\d+)/) {
+ return $1;
+ }
+ if (isroman($s)) {
+ return roman2int($s);
+ }
+
+ for my $n (keys %nums) {
+ if ($s =~ /\b$n\b/i) {
+ return $nums{$n};
+ }
+
+ }
+ return $s;
+}
+
+sub extractEdition {
+ my $s = shift;
+ for my $re (@ED_RES) {
+ if ($s =~ /$re/i) {
+ return extract_num($1);
+ }
+ }
+ return undef;
+}
+
+sub numdiff {
+ my ($s1,$s2) = @_;
+ my @n1 = ($s1 =~ /\b([IXV0-9]{1,4}|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelveth|1st|2nd|3rd|4th|5th|6th|7th|8th|9th|10th|11th|12th)\b/ig);
+ my @n2 = ($s2 =~ /\b([IXV0-9]{1,4}|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelveth|1st|2nd|3rd|4th|5th|6th|7th|8th|9th|10th|11th|12th)\b/ig);
+ #print "In s1:" . join(",",@n1) . "\n";
+ #print "In s2:" . join(",",@n2) . "\n";
+ return 0 if $#n1 ne $#n2;
+ for (0..$#n1) {
+ return 1 if lc $n1[$_] ne lc $n2[$_];
+ }
+ return 0;
+}
+
+
+sub my_dist_text {
+ my $a = lc shift;
+ my $b = lc shift;
+ $a =~ s/_/ /g;
+ $b =~ s/_/ /g;
+ return distance($a, $b);
+
+}
+sub decodeHTMLEntities {
+ my $in = shift;
+ $in =~ s/&([\d\w\#]+);/&safe_decode($1)/gei;
+ return $in;
+}
+
+sub safe_decode {
+ my $in = shift;
+ if (substr($in,0,1) eq '#') {
+ my $num = substr($in,1,1) eq 'x' ? hex(substr($in,1)) : substr($in,1);
+ # we check and fix cp1232 entities
+ return ($num < 127 or $num > 159) ?
+ HTML::Entities::decode_entities("&$in;") :
+ HTML::Entities::decode_entities("&#" . $WIN2UTF{$num} . ";");
+ } else {
+ HTML::Entities::decode_entities("&$in;")
+ }
+}
+
+sub toString {
+ my $h = shift;
+ return join("; ",@{$h->{authors}}) . " ($h->{date}) $h->{title}\n";
+}
+
+sub sameTitle {
+ my ($a, $b, $threshold,$loose,$nolinks,%opts) = @_;
+ return sameWork({ title => $a }, { title => $b }, $threshold,$loose,$nolinks,%opts);
+}
+
+1;
+__END__
+
+=head1 NAME
+
+Biblio::Citation::Compare - Perl extension for performing fuzzy comparisons between bibliographic citations
+
+=head1 SYNOPSIS
+
+ use Biblio::Citation::Compare 'sameWork','sameAuthors';
+
+ sameWork(
+ # first item
+ {
+ authors => ['Bourget, D','Lukasiak, Zbigniew'],
+ title => "A paper with such and such a title",
+ date => 2010
+ },
+ # second item
+ {
+ authors => ['Bourget, David J. R.','Lukasiak, Zbigniew'],
+ title => "A paper with such nd such a tlitle",
+ date => undef
+ }
+ );
+
+ # true!
+
+ sameAuthors(
+ ['Dave Bourget','Z Lukasiak'],
+ ['Bourget DJR','Zbigniew Z. Lukasiak']
+ );
+
+ # true!
+
+
+=head1 DESCRIPTION
+
+This module exports two subroutines which perform fuzzy comparisons between citations (described using Perl hashes) and author lists. The subroutine attempt to determine if the citations and author lists refer to the same works and ordered lists of authors, respectively.
+
+=head1 SUBROUTINES
+
+=head2 sameWork(hashref citation1, hashref citation2): boolean
+
+Takes as input two citations in a simple format illustrated in the synopsis. Returns true iff the two citations plausibly refer to the same work. A number of factors are taken into account to make the evaluation resistant to random variations. Among them: names are normalized and compared fuzzily using L<Text::Names>, allowances are made for random typos, allowances are made for short and long versions of titles (esp with titles containing a colon), small but important variations as in "Paper title part 1" and "Paper title part 2" are taken into account. The algorithm has been used to merge multiple data sources on L<PhilPapers.org>.
+
+Some advanced additional parameters are not explained here; they can only be explained by pointing to the source code. Their use should not normally be necessary.
+
+=head2 sameAuthors(arrayref list1, arrayref list2): boolean
+
+Returns true if the two lists are plausibly lists of the same authors. This is merely a convenient wrapper over L<Text::Names>::samePerson.
+
+=head2 EXPORT
+
+None by default.
+
+=head1 SEE ALSO
+
+See also L<Text::Names> for name normalization.
+
+=head1 AUTHOR
+
+David Bourget, http://www.dbourget.com
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2011 by David Bourget
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.10.1 or,
+at your option, any later version of Perl 5 you may have available.
+
+
+=cut
diff --git a/t/01-sameWork.t b/t/01-sameWork.t
new file mode 100644
index 0000000..63b77e7
--- /dev/null
+++ b/t/01-sameWork.t
@@ -0,0 +1,319 @@
+use lib '../lib';
+use Biblio::Citation::Compare 'sameWork','sameAuthors','toString', 'sameAuthorBits';
+use Test::Most;
+die_on_fail;
+
+my @samePersonYes = (
+ [ [ 'D. Bourget', 'Chalmers D' ], ['David J. R. Bourget','David C Chalmers'] ],
+ [ [ 'J wilson' ], ['Jessica WILSON'] ],
+ [ [ 'J. Wilson' ], ['Jessica M. WILSON'] ],
+ [ [ 'D. Bourget', 'J Wilson' ], ['J Wilson'] ]
+);
+
+my @samePersonNo = (
+ [ [ 'Hunter, David' ], ['Hunter, Daniel'] ],
+ [ [ 'D. Bourget', 'J Wilson' ], ['D. Chalmers', 'J Wilson'] ]
+);
+
+ok( sameAuthorBits(['Bourget, David','Joseph, richard'], ['David Joseph Richard Bourget']), 'same authors bits works');
+ok(!sameAuthorBits(['A, A'],['B, B']), 'same authors bits does not overgenerates');
+
+ok( sameAuthors($_->[0],$_->[1]), join(";",@{$_->[0]}) . " = " . join(";",@{$_->[1]})) for @samePersonYes;
+ok( !sameAuthors($_->[0],$_->[1]), join(";",@{$_->[0]}) . " != " . join(";",@{$_->[1]})) for @samePersonNo;
+
+my $e1 = {};
+my $e2 = {};
+my @authors = ('Bourget, David');
+$e1->{authors} = \@authors;
+$e2->{authors} = \@authors;
+$e1->{date} = 2009;
+$e2->{date} = 2009;
+
+# Test numeric difference
+$e1->{title} = "Chapter 1 of xyz";
+$e2->{title} = "Chapter 2 of xyz";
+same($e1,$e2,0);
+
+# Test numeric difference
+$e1->{title} = "Chapter one of xyz";
+$e2->{title} = "Chapter two of xyz";
+same($e1,$e2,0);
+
+
+$e1->{title} = "IV- The first pakladjs lkasdjf ";
+$e2->{title} = "X- The first pakladjs lkasdjft ";
+same($e1,$e2,0);
+
+$e1->{title} = "Theories of consciousness I";
+$e2->{title} = "Theories of consciousness 2";
+same($e1,$e2,0);
+
+$e1->{title} = "Theories of consciousness:part I";
+$e2->{title} = "Theories of consciousness:part 2";
+same($e1,$e2,0);
+
+$e1->{title} = "A book with a bracket (yes? !)";
+$e2->{title} = "A book with a bracket";
+same($e1,$e2,1);
+
+$e1->{title} = "Coyer and the Enlightenment (Studies on Voltaire)";
+$e2->{title} = "Coyer and the Enlightenment";
+same($e1,$e2,1);
+
+$e1 = {};
+$e2 = {};
+@authors = ('Abernethy, George L.');
+$e1->{authors} = \@authors;
+$e2->{authors} = [@authors,'Langford, Thomas A.'];
+$e1->{date} = 1968;
+$e2->{date} = 1968;
+
+$e1->{title} = "Philosophy of Religion";
+$e2->{title} = "Philosophy of Religion: A Book of Readings";
+same($e1,$e2,1);
+
+$e1 = {};
+$e2 = {};
+@authors = ('Abbot, Francis Ellingwood');
+$e1->{authors} = \@authors;
+$e2->{authors} = \@authors;
+$e1->{date} = 1890;
+$e2->{date} = 2010;
+
+$e1->{title} = "The Way Out of Agnosticism: Or, the Philosophy of Free Religion";
+$e2->{title} = "The Way Out of Agnosticism, Or, the Philosophy of Free Religion [Microform]";
+same($e1,$e2,1);
+
+$e1->{date} = 2008;
+$e2->{date} = 2008;
+$e1->{title} = "Market Versus Nature: The Social Phiosophy [I.E. Philosophy] of Friedrich Hayek";
+$e2->{title} = "Market Versus Nature: the Social Philosophy of Friedrich Hayek";
+same($e1,$e2,1);
+
+$e1->{title} = "The Philosophy of John Norris of Bemerton: (1657-1712)";
+$e2->{title} = "The philosophy of John Norris of Bemerton: (1657-1712) (Studien und Materialien zur Geschichte der Philosophie : Kleine Reihe ; Bd. 6)";
+same($e1,$e2,1);
+
+$e1->{title} = "The Philosophy of John Norris of Bemerton: (1657-1712)";
+$e2->{title} = "The philosophy of John Norris of Bemerton: (1657-1712)";
+same($e1,$e2,1);
+
+$e1->{title} = "The Philosophy of John Norris of Bemerton: (1657-1712)";
+$e2->{title} = "The philosophy of John Norris of Bemerton: (1657-2000)";
+same($e1,$e2,0);
+
+$e1->{title} = "Communitarian International Relations: The Epistemic Foundations of International Relations";
+$e2->{title} = "Communitarian International Relations: The Epistemic Foundations of International Relations (New International Relations)";
+same($e1,$e2,1);
+
+$e1->{title} = '"What is an Apparatus?" and Other Essays';
+$e2->{title} = '"What Is an Apparatus?" and Other Essays (Meridian: Crossing Aesthetics)';
+same($e1,$e2,1);
+
+$e1->{title} = 'Clearly not the same kalsdfjl;sdfajdfsa lfdkasjfadslkajsdf lasdfkjaf';
+$e2->{title} = 'Clearny same the not .x,zcmnvcx zm,xcvnxvc ,mxcvzn xcvxm,zcvnvxc zvv';
+same($e1,$e2,0);
+
+$e1->{title} = "Much Ado About 'Something': Critical Notice of Chalmers, Manley, Wasserman, Metametaphysics.";
+$e2->{title} = "Much Ado About 'Something'.";
+
+$e1->{authors} = ['Wilson, Jessica M.'];
+$e2->{authors} = ['Wilson, J.'];
+same($e1,$e2,1);
+
+check(
+ ['Dummett, Michael'],
+ 1973,
+ 'Frege',
+ ['Dummett, Michael'],
+ 1991,
+ 'Frege (2nd edition)',
+ 0
+);
+
+
+check(
+ ['Dummett, Michael'],
+ 1973,
+ 'Frege',
+ ['Dummett, Michael'],
+ 1991,
+ 'Frege: Philosophy of Mathematics',
+ 0
+);
+
+check(
+ ['Russell, Bertrand'],
+ "2009",
+ "Bertrand Russell's Best",
+ ['Russell, Bertrand'],
+ "2009",
+ "Bertrand Russell's Best",
+ 1
+);
+
+#
+# Common cases of degraded metadata
+#
+
+# missing firstname
+#check(
+# ['Russell, '],
+# "2009",
+# "Short",
+# ['Russell, B'],
+# "2009",
+# "Short",
+# 1
+#);
+check(
+ ['Bourget, David'],
+ 2008,
+ "The title of the work",
+ ['Other, Person'],
+ 2008,
+ "The title of the work",
+ 0
+);
+
+check(
+ ['John Doe, By'],
+ 2009,
+ 'The same title',
+ ['Doe, John'],
+ 2009,
+ 'The same title',
+ 1
+);
+
+#unsplit name
+check(
+ ['John Doe'],
+ 2009,
+ 'The same title',
+ ['Doe, John'],
+ 2009,
+ 'The same title',
+ 1
+);
+
+# missing authors
+check(
+ ['Henry Allison'],2002,"Debating Allison on Transcendental Idealism",
+ ['John Doe','Bob Dylan','Henry Allison'],2002,"Debating Allison on Transcendental Idealism",
+ 1
+);
+
+# missing authors with slight typo
+check(
+ ['H Allison'],2002,"Debating Allison on Transcendental Idealsm",
+ ['John Doe','Bob Dylan','Henry Allison'],2002,"Debating Allison on Transcendental Idealism",
+ 1
+);
+
+# missing authors not clear due to date difference
+check(
+ ['Henry Allison'],2000,"Debating Allison on Transcendental Idealsm",
+ ['John Doe','Bob Dylan','Henry Allison'],2002,"Debating Allison on Transcendental Idealism",
+ 1
+);
+
+# missing authors not clear due to date difference
+check(
+ ['Henry Allison'],2000,"Debating Allison on Idealsm",
+ ['John Doe','Bob Dylan','Henry Allison'],2002,"Debating Allison on Transcendental Idealism",
+ 0
+);
+
+check(
+ ['J. Nagel'],1974, "What is It Like to Be a Bat? Philosophical Review",
+ ['J. Nagel'],1974, "What is It Like to Be a Bat?",
+ 1
+);
+
+check(
+ ['J. Nagel'],1974, "What is It Like to Be a Bat: Philosophical Review",
+ ['J. Nagel'],1974, "What is It Like to Be a Bat:",
+ 1
+);
+
+check(
+ ['J. Nagel'],1974, "What is It Like to Be a Bat. Philosophical Review",
+ ['J. Nagel'],1974, "What is It Like to Be a Bat.",
+ 1
+);
+
+check(
+ ['J. Nagel'],1974, "What is It Like to Be a Bat. Philosophical Review",
+ ['J. Nagel'],1974, "What is It Like to Be a Bat",
+ 1
+);
+
+check(
+ ['Fredrik Björklund', 'Gunnar Björnsson', 'John Eriksson', 'Ragnar Francén Olinder', 'Caj Strandberg'],2012,"Recent Work on Motivational Internalism",
+ ['F. Bjorklund', 'G. Bjornsson', 'J. Eriksson', 'R. Francen Olinder', 'C. Strandberg'],2012,"Recent Work on Motivational Internalism",
+ 1
+);
+
+check(
+ ['Gunnar Björnsson','Fredrik Björklund'],2012,"Recent Work on Motivational Internalism",
+ ['F. Bjorklund', 'G. Bjornsson'],2012,"Recent Work on Motivational Internalism!",
+ 1
+);
+
+check(
+ ['Gunnar Björnsson'],2012,"Recent Work on Motivational Internalism",
+ ['F. Bjorklund', 'G. Bjornsson'],2012,"Recent Work on Motivational Internalism!",
+ 1
+);
+
+check(
+ ['Uriah Kriegel'],2011,"The Sources of Intentionality",
+ ['Uriah Kriegel'],2014,"_The Sources of Intentionality_",
+ 1
+);
+
+check(
+ ['Uriah Kriegel'],2005,"Real Intentionality V.2: Why intentionality entails consciousness",
+ ['Uriah Kriegel'],2008,"Real Intentionality 3: Why intentionality entails consciousness",
+ 0
+);
+
+
+
+
+ok(
+ sameWork(
+ # first item
+ {
+ authors => ['Bourget, D','Lukasiak, Zbigniew'],
+ title => "A paper with such and such a title",
+ date => 2010
+ },
+ # second item
+ {
+ authors => ['Bourget, David J. R.','Lukasiak, Zbigniew'],
+ title => "A paper with such nd such a tlitle",
+ date => undef
+ }
+ ),
+
+ 'Documentation example'
+);
+
+
+sub same {
+ my ($e1,$e2,$same) = @_;
+ is(sameWork($e1,$e2),$same, toString($e1) . ' ' . ($same ? ' == ' : ' != ') . ' ' . toString($e2));
+}
+
+sub check {
+ my ($authors1, $date1, $title1, $authors2, $date2, $title2, $yes) = @_;
+ my $e1 = {title=>$title1,date=>$date1};
+ $e1->{authors} = $authors1;
+ my $e2 = {title=>$title2,date=>$date2};
+ $e2->{authors} = $authors2;
+ return same($e1,$e2,$yes);
+}
+
+done_testing();
diff --git a/t/02-editions.t b/t/02-editions.t
new file mode 100644
index 0000000..8f5ca00
--- /dev/null
+++ b/t/02-editions.t
@@ -0,0 +1,24 @@
+use Test::More;
+use Biblio::Citation::Compare;
+use utf8;
+
+my %tests = (
+ "the second edition" => 2,
+ "the 2nd edition" => 2,
+ "the 3rd edition" => 3,
+ "the 2ieme edition" => 2,
+ "title V. 2" => 2,
+ "title V.2: bla" => 2,
+ "title 2: bla" => 2,
+ "title V X" => 10,
+ "3ieme éd." => 3,
+ "totle xxvi" => 26,
+ "A title with I in the middle" => undef,
+ "no edition" => undef,
+);
+
+for my $k (sort keys %tests) {
+ is(Biblio::Citation::Compare::extractEdition($k),$tests{$k},"$k => $tests{$k}");
+}
+
+done_testing
diff --git a/t/sameTitle.t b/t/sameTitle.t
new file mode 100644
index 0000000..8023a9c
--- /dev/null
+++ b/t/sameTitle.t
@@ -0,0 +1,33 @@
+use lib '../lib';
+use Biblio::Citation::Compare 'sameTitle';
+use Test::More;
+
+my @sameTitleYes = (
+ ['A History of Philosophy. Vol. I: Greece and Rome', 'A History of Philosophy. Vol. I : Greece and Rome'],
+ ['A book with a bracket (yes? !)', 'A book with a bracket'],
+ ['Coyer and the Enlightenment (Studies on Voltaire)', 'Coyer and the Enlightenment'],
+ ['The Way Out of Agnosticism: Or, the Philosophy of Free Religion', 'The Way Out of Agnosticism: Or, the Philosophy of Free Religion [Microform]'],
+ ['Market Versus Nature: The Social Phiosophy [I.E. Philosophy] of Friedrich Hayek', 'Market Versus Nature: the Social Philosophy of Friedrich Hayek'],
+ ['The Philosophy of John Norris of Bemerton: (1657-1712)', 'The philosophy of John Norris of Bemerton: (1657-1712) (Studien und Materialien zur Geschichte der Philosophie : Kleine Reihe ; Bd. 6)'],
+ ['Communitarian International Relations: The Epistemic Foundations of International Relations', 'Communitarian International Relations: The Epistemic Foundations of International Relations (New International Relations)'],
+ ['"What is an Apparatus?" and Other Essays', '"What Is an Apparatus?" and Other Essays (Meridian: Crossing Aesthetics)'],
+
+
+
+);
+
+my @sameTitleNo = (
+ ['A History of Philosophy. Vol. I: Greece and Rome', 'A History of Philosophy. Vol. IV: Descartes to Leibniz'],
+ ['Book Review of: "Do We Really Understand Quantum Mechanics?" by Franck Laloë', 'Do We Really Understand Quantum Mechanics?'],
+ ['Chapter 1 of xyz', 'Chapter 2 of xyz'],
+ ['IV- The first pakladjs lkasdjf', 'X- The first pakladjs lkasdjft'],
+ ['Theories of consciousness I', 'Theories of consciousness 2'],
+ ['Theories of consciousness:part I', 'Theories of consciousness:part 2'],
+ ['The Philosophy of John Norris of Bemerton: (1657-1712)', 'The philosophy of John Norris of Bemerton: (1657-2000)'],
+ ['Clearly not the same kalsdfjl;sdfajdfsa lfdkasjfadslkajsdf lasdfkjaf', 'Clearny same the not .x,zcmnvcx zm,xcvnxvc ,mxcvzn xcvxm,zcvnvxc zvv']
+);
+
+ok(sameTitle($_->[0], $_->[1]), "$_->[0] == $_->[1]") for @sameTitleYes;
+ok(!sameTitle($_->[0], $_->[1]), "$_->[0] != $_->[1]") for @sameTitleNo;
+
+done_testing();