summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRuss Allbery <rra@cpan.org>2021-09-09 16:17:46 -0700
committerRuss Allbery <rra@cpan.org>2021-09-09 16:17:46 -0700
commitf43e7f41e58cb096d0f275fb8ae6940d359d699e (patch)
tree26e36d054222217d118fb2c4c1dbd09be00146ad
parent99933cd7ba67347a5043ffdfc4b369d35f6a06f8 (diff)
Read entire thread files when parsing
Rather than trying to carefully read thread files paragraph by paragraph, read the entire file at once and reuse _split_paragraphs to separate it into parseable chunks. Add line number tracking to the input file stack and use that to improve error reporting.
-rw-r--r--lib/App/DocKnot/Spin/Thread.pm149
-rw-r--r--t/data/spin/errors/errors.th3
-rwxr-xr-xt/spin/errors.t22
3 files changed, 113 insertions, 61 deletions
diff --git a/lib/App/DocKnot/Spin/Thread.pm b/lib/App/DocKnot/Spin/Thread.pm
index 4b9e366..d7cca5e 100644
--- a/lib/App/DocKnot/Spin/Thread.pm
+++ b/lib/App/DocKnot/Spin/Thread.pm
@@ -20,6 +20,7 @@ use File::Basename qw(fileparse);
use File::Spec ();
use Git::Repository ();
use Image::Size qw(html_imgsize);
+use Perl6::Slurp qw(slurp);
use POSIX qw(strftime);
use Text::Balanced qw(extract_bracketed);
@@ -81,9 +82,34 @@ my %COMMANDS = (
);
##############################################################################
-# Output
+# Input and output
##############################################################################
+# Read a file and return its data in a form suitable for the processing stack.
+#
+# $fh - Input file handle
+# $path - Input path
+#
+# Returns: List suitable for the processing stack with the following elements:
+# $paragraphs_ref - The input text split into paragraphs and
+# reversed so that it can be used as a stack
+# $in_path - Path to the input file
+# $lineno - Current processing line number
+sub _read_file {
+ my ($self, $fh, $path) = @_;
+ my $text = slurp($fh);
+
+ # Check for broken line endings.
+ if ("\n" !~ m{ \015 }xms && $text =~ m{ \015 }xms) {
+ my $m = 'found CR characters; are your line endings correct?';
+ $self->_warning($m);
+ }
+
+ # Parse the text into paragraphs and return the data for the files stack.
+ my @paragraphs = reverse($self->_split_paragraphs($text));
+ return [\@paragraphs, $path, 1];
+}
+
# print with error checking and an explicit file handle. autodie
# unfortunately can't help us because print can't be prototyped and hence
# can't be overridden.
@@ -150,7 +176,8 @@ sub _output {
# Throws: Text exception with the provided error
sub _fatal {
my ($self, $problem) = @_;
- die "$self->{file}:$.: $problem\n";
+ my (undef, $file, $lineno) = $self->{input}[-1]->@*;
+ die "$file:$lineno: $problem\n";
}
# Warn about a problem with the current file and line.
@@ -158,7 +185,8 @@ sub _fatal {
# $problem - Warning message to report
sub _warning {
my ($self, $problem) = @_;
- warn "$0 spin:$self->{file}:$.: $problem\n";
+ my (undef, $file, $lineno) = $self->{input}[-1]->@*;
+ warn "$0 spin:$file:$lineno: $problem\n";
return;
}
@@ -386,8 +414,21 @@ sub _expand {
# use an inline context.
$block &&= $text =~ m{ \A \s* \z }xms;
- # Return the macro expansion.
- return ($self->_macro($definition, $block, @args), $text);
+ # Expand the macro.
+ my ($result, $blocktag) = $self->_macro($definition, $block, @args);
+
+ # We have now double-counted all of the lines in the macro body
+ # itself, so we need to subtract the line count in the macro
+ # definition from the line number.
+ #
+ # This unfortunately means that the line number of errors that happen
+ # inside macro arguments will be somewhat off if the macro definition
+ # itself contains newlines. I don't see a way to avoid that without
+ # much more complex parsing and state tracking.
+ $self->{input}[-1][2] -= $definition =~ tr{\n}{};
+
+ # Return the macro results.
+ return ($result, $blocktag, $text);
}
# The normal command-handling case. Ensure it is a valid command.
@@ -413,6 +454,13 @@ sub _expand {
# the commands in it, and format the results as HTML. This function is
# mutually recursive with _expand and _macro.
#
+# This function is responsible for maintaining the line number in the file
+# currently being processed, for error reporting. The strategy used is to
+# increment the line number whenever a newline is seen in processed text.
+# This means that newlines are not seen until the text containing them is
+# parsed, which in turn means that every argument that may contain a newline
+# must be parsed or must update the line number.
+#
# $text - Input text to parse
# $block - True if the parse is done in a block context
#
@@ -429,6 +477,9 @@ sub _parse_context {
if (index($text, q{\\}) == -1) {
my $output = $text;
+ # Update the line number.
+ $self->{input}[-1][2] += $text =~ tr{\n}{};
+
# If we are at block context, we need to make the text into a block
# element, which means wrapping it in <p> tags. Since that is a
# top-level block construct, also close any open block structure.
@@ -479,6 +530,9 @@ sub _parse_context {
$self->_fatal(qq(unable to parse near "$context"));
}
+ # Update the line number.
+ $self->{input}[-1][2] += $string =~ tr{\n}{};
+
# If this is not a command, and we're not at the block level, just add
# it verbatim to the output.
#
@@ -537,11 +591,17 @@ sub _parse_context {
}
# If the next bit of unparsed text starts with a newline, extract it
- # and any following whitespace now. Add it to our paragraph if we're
- # accumulating one; otherwise, add it to the output, but only add the
- # newline if we saw inline elements or there is remaining text. This
- # suppresses some useless black lines.
+ # and any following whitespace now.
if ($text =~ s{ \A \n (\s*) }{}xms) {
+ my $spaces = $1;
+
+ # Update the line number.
+ $self->{input}[-1][2] += 1 + $spaces =~ tr{\n}{};
+
+ # Add it to our paragraph if we're accumulating one; otherwise,
+ # add it to the output, but only add the newline if we saw inline
+ # elements or there is remaining text. This suppresses some
+ # useless black lines.
if ($paragraph ne q{}) {
$paragraph .= "\n$1";
} else {
@@ -604,8 +664,9 @@ sub _format_attr {
# Split a block of text apart at paired newlines so that it can be reparsed as
# paragraphs, but combine a paragraph with the next one if it has an
-# unbalanced number of open brackets. Used by containiners like \block that
-# can contain multiple paragraphs.
+# unbalanced number of open brackets. Used to parse the top-level structure
+# of a file and by containiners like \block that can contain multiple
+# paragraphs.
#
# $text - Text to split
#
@@ -836,6 +897,10 @@ sub _define_macro {
}
}
+ # We don't parse the macro definition now but we need to update the line
+ # number for accurate error reporting.
+ $self->{input}[-1][2] += $definition =~ tr{\n}{};
+
# Define the macro.
$self->{macro}{$name} = [$args, $definition];
return (1, q{});
@@ -1004,9 +1069,9 @@ sub _cmd_heading {
# Add some generator comments.
my $date = strftime('%Y-%m-%d %T -0000', gmtime());
my $from
- = $self->{file} eq q{-}
+ = $self->{input}[-1][1] eq q{-}
? q{}
- : ' from ' . fileparse($self->{file});
+ : ' from ' . fileparse($self->{input}[-1][1]);
$output .= "<!-- Spun$from by spin 1.80 on $date -->\n";
if ($self->{id}) {
$output .= "<!-- $self->{id} -->\n";
@@ -1052,12 +1117,13 @@ sub _cmd_image {
}
# Include a file. Note that this includes a file after the current paragraph,
-# not immediately at the current point, which may be a bit surprising.
+# not immediately, which may be a bit surprising.
sub _cmd_include {
my ($self, $file) = @_;
- $file = $self->_parse($file);
+ $file = realpath($self->_parse($file));
open(my $fh, '<', $file);
- push($self->{files}->@*, [$fh, $file]);
+ push($self->{input}->@*, $self->_read_file($fh, $file));
+ close($fh);
return (1, q{});
}
@@ -1186,7 +1252,7 @@ sub _cmd_signature {
# If we're spinning from standard input, don't add any of the standard
# footer, just close the HTML tags.
- if ($self->{file} eq q{-}) {
+ if ($self->{input}[-1][1] eq q{-}) {
$output .= "</body>\n</html>\n";
return (1, $output);
}
@@ -1194,7 +1260,7 @@ sub _cmd_signature {
# Otherwise, _footer does most of the work and we just add the tags.
my $link = '<a href="%URL%">spun</a>';
$output .= $self->_footer(
- $self->{file}, $self->{out_path}, $self->{id},
+ $self->{input}[-1][1], $self->{out_path}, $self->{id},
"Last modified and\n $link %MOD%",
"Last $link\n %NOW% from thread modified %MOD%",
);
@@ -1345,7 +1411,7 @@ sub spin_fh {
my ($self, $in_fh, $in_path, $out_fh, $out_path) = @_;
# Initialize object state for a new document.
- $self->{files} = [[$in_fh, $in_path]];
+ $self->{input} = [$self->_read_file($in_fh, $in_path)];
$self->{id} = undef;
$self->{macro} = {};
$self->{out_fh} = $out_fh;
@@ -1355,44 +1421,27 @@ sub spin_fh {
$self->{state} = ['BLOCK'];
$self->{variable} = {};
- # Parse the thread file a paragraph at a time (but pick up macro contents
- # that are continued across paragraphs.
+ # Read the entirety of the input file, split into paragraphs, and add it
+ # to the processing stack.
+ push($self->{input}->@*, $self->_read_file($in_fh, $in_path));
+
+ # Parse the thread file a paragraph at a time. _split_paragraphs takes
+ # care of ensuring that each paragraph contains the complete value of a
+ # command argument.
#
- # We maintain the stack of files that we're parsing in $self->{files}, and
- # _cmd_include will unshift new file handle and filename pairs onto that
- # stack. That means that the top of the stack may change any time we call
- # _parse, so we have to update the input file handle and $self->{file}
- # each time through the loop.
- local $/ = q{};
- while ($self->{files}->@*) {
- ($in_fh, $self->{file}) = $self->{files}[-1]->@*;
- while (defined(my $para = <$in_fh>)) {
- if ("\n" !~ m{ \015 }xms && $para =~ m{ \015 }xms) {
- my $m = 'found CR characters; are your line endings correct?';
- $self->_warning($m);
- }
- my $open_count = ($para =~ tr{\[}{});
- my $close_count = ($para =~ tr{\]}{});
- while (!eof && $open_count > $close_count) {
- my $extra = <$in_fh>;
- $open_count += ($extra =~ tr{\[}{});
- $close_count += ($extra =~ tr{\]}{});
- $para .= $extra;
- }
+ # The stack of parsed input is maintained in $self->{input} and the file
+ # being parsed at any given point is $self->{input}[-1]. _cmd_include
+ # will push new file information into this stack, and we pop off the top
+ # element of the stack when we exhaust its paragraphs.
+ while ($self->{input}->@*) {
+ while (defined(my $para = pop($self->{input}[-1][0]->@*))) {
my $result = $self->_parse(_escape($para), 1);
$result =~ s{ \A (?:\s*\n)+ }{}xms;
if ($result !~ m{ \A \s* \z }xms) {
$self->_output($result);
}
- ($in_fh, $self->{file}) = $self->{files}[-1]->@*;
- }
- pop($self->{files}->@*);
-
- # Close the input file handle if it was one we opened, which is all of
- # them except the last one in the stack.
- if ($self->{files}->@*) {
- close($in_fh);
}
+ pop($self->{input}->@*);
}
# Close open tags and print any deferred whitespace.
diff --git a/t/data/spin/errors/errors.th b/t/data/spin/errors/errors.th
index 3af991e..6f08cd9 100644
--- a/t/data/spin/errors/errors.th
+++ b/t/data/spin/errors/errors.th
@@ -1,6 +1,7 @@
\heading[Errors]
-\==[macro] [1] [\1 \2]
+\==[macro] [1] [\1
+ \2]
\==[badcount] [foo] [bar]
\macro[
diff --git a/t/spin/errors.t b/t/spin/errors.t
index 6c27a87..a9069ae 100755
--- a/t/spin/errors.t
+++ b/t/spin/errors.t
@@ -16,18 +16,20 @@ use File::Temp;
use Test::More tests => 2;
-# Expected errors from spinning the error file.
+# Expected errors from spinning the error file. The line numbers are still
+# not entirely correct because line tracking is very complicated and still not
+# entirely correct.
my $EXPECTED_ERRORS = <<'ERRORS';
errors.th:1: cannot find argument 2: Did not find opening bracket after prefix: "\s*", detected at offset 2
-errors.th:2: invalid macro placeholder \2 (greater than 1)
-errors.th:2: invalid macro argument count for \badcount
-errors.th:4: unknown variable \=UNKNOWN
-errors.th:4: unknown command or macro \unknown
-errors.th:5: space in anchor "#foo bar"
-errors.th:5: no package release information available
-errors.th:5: no sitemap file found
-errors.th:5: no package version information available
-errors.th:5: cannot stat file nonexistent-file
+errors.th:3: invalid macro placeholder \2 (greater than 1)
+errors.th:5: invalid macro argument count for \badcount
+errors.th:9: unknown variable \=UNKNOWN
+errors.th:11: unknown command or macro \unknown
+errors.th:14: space in anchor "#foo bar"
+errors.th:15: no package release information available
+errors.th:16: no sitemap file found
+errors.th:17: no package version information available
+errors.th:17: cannot stat file nonexistent-file
ERRORS
require_ok('App::DocKnot::Spin::Thread');