diff options
author | Russ Allbery <rra@cpan.org> | 2021-09-10 00:03:17 -0700 |
---|---|---|
committer | Russ Allbery <rra@cpan.org> | 2021-09-10 00:03:17 -0700 |
commit | 3166452e6dd23225aaf4a6bd78f59613f6664115 (patch) | |
tree | 7351ca3117f0794101db4f1b582141666b03b5fc /lib/App/DocKnot/Spin/RSS.pm | |
parent | 5ef33789b16f7f0a8049394d348f19377358da67 (diff) |
Begin refactoring App::DocKnot::Spin::RSS
Starting to modernize the Perl in preparation for the next release.
Diffstat (limited to 'lib/App/DocKnot/Spin/RSS.pm')
-rw-r--r-- | lib/App/DocKnot/Spin/RSS.pm | 290 |
1 files changed, 189 insertions, 101 deletions
diff --git a/lib/App/DocKnot/Spin/RSS.pm b/lib/App/DocKnot/Spin/RSS.pm index 9664d42..8b94d3e 100644 --- a/lib/App/DocKnot/Spin/RSS.pm +++ b/lib/App/DocKnot/Spin/RSS.pm @@ -26,122 +26,211 @@ use POSIX qw(strftime); # Utility functions ############################################################################## -# Returns the intersection of two arrays as a list. -sub intersect { - my ($a, $b) = @_; - my %a = map { $_ => 1 } @$a; - return grep { $a{$_} } @$b; +# List intersection. +# +# $one - First list +# $two - Second list +# +# Returns: Common elements of both lists as a list +sub _intersect { + my ($one, $two) = @_; + my %one = map { $_ => 1 } $one->@*; + return grep { $one{$_} } $two->@*; } -# Construct an absolute URL from a relative URL and a base URL. -sub absolute_url { +# Construct an absolute URL from a relative URL and a base URL. This plays +# fairly fast and loose with schemes and the like, since we don't need to be +# precise for our purposes. +# +# $url - Relative URL +# $base - Base URL to which it is relative +# +# Returns: Absolute URL +sub _absolute_url { my ($url, $base) = @_; - $base =~ s,[^/]+$,,; - if ($url =~ /^[a-z]+:/) { - return $url; - } elsif ($url =~ m,^/,) { - $base =~ s,^(https?://[^/]+).*,$1,; - return "$base$url"; - } else { - while ($url =~ s,^\.\./+,,) { - $base =~ s,[^/]+/+$,,; - } - return "$base$url"; + + # If $url is already absolute, return it. + return $url if $url =~ m{ \A [[:lower:]]+ : }xms; + + # If $url starts with /, take only the scheme and host from the base URL. + if ($url =~ m{ \A / }xms) { + $base =~ s{ \A ( [[:lower:]]+ :// [^/]+ ) .* }{$1}xms; + return $base . $url; + } + + # Otherwise, strip the last component off the base URL, and then strip + # more trailing components off the base URL for every ../ element in the + # relative URL. Then glue them together. This does not deal with the + # case where there are more ../ elements than there are elements in the + # base URL. + $base =~ s{ [^/]+ \z }{}xms; + while ($url =~ s{ \A [.][.]/+ }{}xms) { + $base =~ s{ [^/]+ /+ \z }{}xms; } + return $base . $url; } -# Construct a relative URL from an absolute URL and a base URL. -sub relative_url { +# Construct a relative URL from an absolute URL and a base URL. If there is +# no base URL or if the URLs cannot be made relative to each other, return the +# relative URL unchanged. +# +# $url - Absolute URL +# $base - URL to which it should be relative +# +# Returns: Relative URL +sub _relative_url { my ($url, $base) = @_; - return $url unless $base; - $base =~ s,^(https?://[^/]+/)/*,,; - my $host = $1; - if (!$host || index ($url, $host) != 0) { + return $url if !$base; + + # Remove the protocol and host portion from the base URL and ensure that + # portion matches. + if ($base =~ s{ \A ( https?:// [^/]+ ) /* }{}xms) { + my $host = $1; + if ($url !~ s{ \A \Q$host\E /* }{}xms) { + return $url; + } + } else { return $url; } - $url =~ s,^\Q$host\E/*,,; - my @base = split (m,/+,, $base); - while ($url and @base) { - my $segment = shift @base; - unless ($url =~ s,^\Q$segment\E/*,,) { - return ('../' x (@base + 1)) . $url; + + # Split the base URL into path segments. While the input URL starts with + # a matching segment, remove it. When we run out of matching segments, + # the relative URL is a number of ../ strings equal to the number of + # remaining base segments, plus the remaining input URL. + my @base = split(m{ /+ }xms, $base); + while ($url && @base) { + my $segment = shift(@base); + if ($url !~ s{ \A \Q$segment\E (?: /+ | \z ) }{}xms) { + return ('../' x (scalar(@base) + 1)) . $url; } } - return ('../' x @base) . $url; + return ('../' x scalar(@base)) . $url; } ############################################################################## # Parsing ############################################################################## -# Parse a change file. Save the metadata into the provided hash reference and -# the changes into the provided array reference. Each element of the array -# will be a hash with keys title, date, link, and description. -sub parse_changes { - my ($file, $metadata, $changes) = @_; - open (my $fh, '<', $file) or die "$0: cannot open $file: $!\n"; - local $_; - my ($last, @blocks); - push (@blocks, {}); +# Read key/value blocks in an RFC-2822-style file. +# +# $file - File to read +# +# Returns: List of hashes corresponding to the blocks in the file. +sub _read_rfc2822_file { + my ($self, $file) = @_; + my $key; + my @blocks = ({}); my $current = $blocks[0]; - while (<$fh>) { - if (/^\s*$/) { - push (@blocks, {}); - $current = $blocks[$#blocks]; - undef $last; - } elsif (/^(\S+):[ \t]+(.+)/) { - my ($key, $value) = ($1, $2); - die "$0: cannot parse $file at line $.\n" unless $value; - $value =~ s/\s+$//; - $last = lc $key; - $current->{$last} = $value; - } elsif (/^(\S+):/) { - $last = lc $1; - $current->{$last} ||= ''; - } elsif (/^\s/) { - die "$0: cannot parse $file at line $.\n" unless $last; - my $value = $_; - $value =~ s/^\s//; - $value = "\n" if $value =~ /^\.\s*\n/; - if ($current->{$last} and $current->{$last} !~ /\n\z/) { - $current->{$last} .= "\n"; + + # Parse the file. $key holds the last key seen, used to append + # continuation values to the previous key. $current holds the current + # block being parsed and @blocks all blocks seen so far. + open(my $fh, '<', $file); + while (defined(my $line = <$fh>)) { + if ($line =~ m{ \A \s* \z }xms) { + if ($key) { + $current = {}; + push(@blocks, $current); + undef $key; + } + } elsif ($line =~ m{ \A (\S+): [ \t]+ ([^\n]+) \Z }xms) { + my ($new_key, $value) = ($1, $2); + $value =~ s{ \s+ \z }{}xms; + $key = lc($new_key); + $current->{$key} = $value; + } elsif ($line =~ m{ \A (\S+): \s* \z }xms) { + my $new_key = $1; + $key = lc($new_key); + $current->{$key} //= q{}; + } elsif ($line =~ m{ \A \s }xms) { + if (!$key) { + die "$file:$.: invalid continuation line\n"; + } + my $value = $line; + $value =~ s{ \A \s }{}xms; + if ($value =~ m{ \A [.] \s* \Z }xms) { + $value = "\n"; } - $current->{$last} .= $value; + if ($current->{$key} && $current->{$key} !~ m{ \n\z }xms) { + $current->{$key} .= "\n"; + } + $current->{$key} .= $value; + } else { + die "$file:$.: cannot parse line\n"; } } close($fh); - pop @blocks unless $last; - %$metadata = %{ shift @blocks }; - $metadata->{recent} = 15 unless defined $metadata->{recent}; + + # If the file ends in a blank line, we'll have a stray empty block. + # Remove it. + if (!$key) { + pop(@blocks); + } + + # Return the parsed blocks. + return \@blocks; +} + +# Parse a change file. Save the metadata into the provided hash reference and +# the changes into the provided array reference. Each element of the array +# will be a hash with keys title, date, link, and description. +# +# $file - File to read +# +# Returns: List of reference to metadata hash and reference to a list of +# hashes of changes +sub _parse_changes { + my ($self, $file) = @_; + my $blocks_ref = $self->_read_rfc2822_file($file); + + # The first block is our metadata. recent defaults to 15. + my $metadata_ref = shift($blocks_ref->@*); + if (!defined($metadata_ref->{recent})) { + $metadata_ref->{recent} = 15; + } + + # Canonicalize the data for the rest of the blocks, and check for + # duplicate GUIDs. my %guids; - for my $block (@blocks) { - $block->{date} = str2time ($block->{date}) - or die "$0: cannot parse date $block->{date}\n"; - if ($block->{link} && $block->{link} !~ /^http/) { - if ($block->{link} eq '/') { - $block->{link} = $metadata->{base}; + my $base = $metadata_ref->{base}; + for my $block_ref ($blocks_ref->@*) { + $block_ref->{date} = str2time($block_ref->{date}) + or die qq{cannot parse date "$block_ref->{date}"\n}; + + # Relative links are relative to the base URL in the metadata. + if ($block_ref->{link} && $base) { + if ($block_ref->{link} eq q{/}) { + $block_ref->{link} = $base; } else { - $block->{link} = $metadata->{base} . $block->{link}; + $block_ref->{link} = $base . $block_ref->{link}; } } - unless ($block->{guid}) { - my $guid; - if ($block->{journal} || $block->{review}) { - $guid = $block->{link}; + + # If no GUID was given, take it from the link for journal and review + # entries, and otherwise from the date. Then ensure it's unique. + my $guid = $block_ref->{guid}; + if (!$guid) { + if ($block_ref->{journal} || $block_ref->{review}) { + $guid = $block_ref->{link}; } else { - $guid = $block->{date}; + $guid = $block_ref->{date}; } - die "$0: duplicate GUID for entry $guid\n" if $guids{$guid}; - $block->{guid} = $guid; } - if ($block->{tags}) { - $block->{tags} = [ split (' ', $block->{tags}) ]; - } else { - $block->{tags} = []; + if ($guids{$guid}) { + die "duplicate GUID for entry $guid\n"; } - push (@{ $block->{tags} }, 'review') if $block->{review}; + $block_ref->{guid} = $guid; + + # Determine the tags. + my @tags = $block_ref->{tags} ? split(q{ }, $block_ref->{tags}) : (); + if ($block_ref->{review}) { + push(@tags, 'review'); + } + $block_ref->{tags} = \@tags; } - @$changes = @blocks; + + # Return the results. + return ($metadata_ref, $blocks_ref); } ############################################################################## @@ -281,7 +370,7 @@ EOC $description = rss_review ($entry->{review}, $metadata); } $description =~ s{(<(?:a href|img src)=\")(?!http:)([./\w][^\"]+)\"} - {$1 . absolute_url ($2, $entry->{link}) . '"'}ge; + {$1 . _absolute_url ($2, $entry->{link}) . '"'}ge; my $perma = ($entry->{guid} =~ /^http/) ? '' : ' isPermaLink="false"'; print $file " <item>\n"; print $file " <title>$title</title>\n"; @@ -422,9 +511,9 @@ sub index_output { $text = index_review ($entry->{review}, $entry->{link}); } $text =~ s{(\\(?:link|image)\s*)\[([^\]]+)\]} - {"${1}[" . absolute_url ($2, $entry->{link}) . ']'}ge; + {"${1}[" . _absolute_url ($2, $entry->{link}) . ']'}ge; $text =~ s{(\\image\s*)\[([^\]]+)\]} - {"${1}[" . relative_url ($2, $metadata->{'index-base'}) . ']'}ge; + {"${1}[" . _relative_url ($2, $metadata->{'index-base'}) . ']'}ge; print $file $text; print $file "\\class(footer)[$date \\entity[mdash]\n"; print $file " \\link[$entry->{link}]\n"; @@ -473,13 +562,12 @@ sub generate { $base //= $self->{base}; # Read in the changes. - my (%metadata, @changes); - parse_changes ($source, \%metadata, \@changes); + my ($metadata_ref, $changes_ref) = $self->_parse_changes($source); # Now, the output key tells us what files to write out. my @output; - if ($metadata{output}) { - @output = split (' ', $metadata{output}); + if ($metadata_ref->{output}) { + @output = split (' ', $metadata_ref->{output}); } else { @output = ('*:rss:index.rss'); } @@ -496,31 +584,31 @@ sub generate { next if (-f $path && -M $path <= -M $source); my @tags = split (',', $tags); my @interest; - for my $change (@changes) { - if ($tags eq '*' || intersect ($change->{tags}, \@tags)) { + for my $change ($changes_ref->@*) { + if ($tags eq '*' || _intersect($change->{tags}, \@tags)) { push (@interest, $change); } } if ($format eq 'thread') { print "Generating thread file $prettyfile\n"; open (THREAD, '>', $path) or die "$0: cannot create $path: $!\n"; - thread_output (\*THREAD, \%metadata, \@interest); + thread_output (\*THREAD, $metadata_ref, \@interest); close THREAD; } elsif ($format eq 'rss') { - if (@interest > $metadata{recent}) { - splice (@interest, $metadata{recent}); + if (@interest > $metadata_ref->{recent}) { + splice (@interest, $metadata_ref->{recent}); } print "Generating RSS file $prettyfile\n"; open (RSS, '>', $path) or die "$0: cannot create $path: $!\n"; - rss_output (\*RSS, $file, \%metadata, \@interest); + rss_output (\*RSS, $file, $metadata_ref, \@interest); close RSS; } elsif ($format eq 'index') { - if (@interest > $metadata{recent}) { - splice (@interest, $metadata{recent}); + if (@interest > $metadata_ref->{recent}) { + splice (@interest, $metadata_ref->{recent}); } print "Generating index file $prettyfile\n"; open (INDEX, '>', $path) or die "$0: cannot create $path: $!\n"; - index_output (\*INDEX, \%metadata, \@interest); + index_output (\*INDEX, $metadata_ref, \@interest); close INDEX; } } |