Read entire thread files when parsing

Rather than trying to carefully read thread files paragraph by paragraph, read the entire file at once and reuse _split_paragraphs to separate it into parseable chunks. Add line number tracking to the input file stack and use that to improve error reporting.
author: Russ Allbery <rra@cpan.org> 2021-09-09 16:17:46 -0700
committer: Russ Allbery <rra@cpan.org> 2021-09-09 16:17:46 -0700
commit: f43e7f41e58cb096d0f275fb8ae6940d359d699e (patch)
tree: 26e36d054222217d118fb2c4c1dbd09be00146ad
parent: 99933cd7ba67347a5043ffdfc4b369d35f6a06f8 (diff)
3 files changed, 113 insertions, 61 deletions
diff --git a/lib/App/DocKnot/Spin/Thread.pm b/lib/App/DocKnot/Spin/Thread.pm
index 4b9e366..d7cca5e 100644
--- a/lib/App/DocKnot/Spin/Thread.pm
+++ b/lib/App/DocKnot/Spin/Thread.pm
@@ -20,6 +20,7 @@ use File::Basename qw(fileparse);
 use File::Spec      ();
 use Git::Repository ();
 use Image::Size qw(html_imgsize);
+use Perl6::Slurp qw(slurp);
 use POSIX qw(strftime);
 use Text::Balanced qw(extract_bracketed);
 
@@ -81,9 +82,34 @@ my %COMMANDS = (
 );
 
 ##############################################################################
-# Output
+# Input and output
 ##############################################################################
 
+# Read a file and return its data in a form suitable for the processing stack.
+#
+# $fh   - Input file handle
+# $path - Input path
+#
+# Returns: List suitable for the processing stack with the following elements:
+#            $paragraphs_ref - The input text split into paragraphs and
+#                              reversed so that it can be used as a stack
+#            $in_path        - Path to the input file
+#            $lineno         - Current processing line number
+sub _read_file {
+    my ($self, $fh, $path) = @_;
+    my $text = slurp($fh);
+
+    # Check for broken line endings.
+    if ("\n" !~ m{ \015 }xms && $text =~ m{ \015 }xms) {
+        my $m = 'found CR characters; are your line endings correct?';
+        $self->_warning($m);
+    }
+
+    # Parse the text into paragraphs and return the data for the files stack.
+    my @paragraphs = reverse($self->_split_paragraphs($text));
+    return [\@paragraphs, $path, 1];
+}
+
 # print with error checking and an explicit file handle.  autodie
 # unfortunately can't help us because print can't be prototyped and hence
 # can't be overridden.
@@ -150,7 +176,8 @@ sub _output {
 # Throws: Text exception with the provided error
 sub _fatal {
     my ($self, $problem) = @_;
-    die "$self->{file}:$.: $problem\n";
+    my (undef, $file, $lineno) = $self->{input}[-1]->@*;
+    die "$file:$lineno: $problem\n";
 }
 
 # Warn about a problem with the current file and line.
@@ -158,7 +185,8 @@ sub _fatal {
 # $problem - Warning message to report
 sub _warning {
     my ($self, $problem) = @_;
-    warn "$0 spin:$self->{file}:$.: $problem\n";
+    my (undef, $file, $lineno) = $self->{input}[-1]->@*;
+    warn "$0 spin:$file:$lineno: $problem\n";
     return;
 }
 
@@ -386,8 +414,21 @@ sub _expand {
         # use an inline context.
         $block &&= $text =~ m{ \A \s* \z }xms;
 
-        # Return the macro expansion.
-        return ($self->_macro($definition, $block, @args), $text);
+        # Expand the macro.
+        my ($result, $blocktag) = $self->_macro($definition, $block, @args);
+
+        # We have now double-counted all of the lines in the macro body
+        # itself, so we need to subtract the line count in the macro
+        # definition from the line number.
+        #
+        # This unfortunately means that the line number of errors that happen
+        # inside macro arguments will be somewhat off if the macro definition
+        # itself contains newlines.  I don't see a way to avoid that without
+        # much more complex parsing and state tracking.
+        $self->{input}[-1][2] -= $definition =~ tr{\n}{};
+
+        # Return the macro results.
+        return ($result, $blocktag, $text);
     }
 
     # The normal command-handling case.  Ensure it is a valid command.
@@ -413,6 +454,13 @@ sub _expand {
 # the commands in it, and format the results as HTML.  This function is
 # mutually recursive with _expand and _macro.
 #
+# This function is responsible for maintaining the line number in the file
+# currently being processed, for error reporting.  The strategy used is to
+# increment the line number whenever a newline is seen in processed text.
+# This means that newlines are not seen until the text containing them is
+# parsed, which in turn means that every argument that may contain a newline
+# must be parsed or must update the line number.
+#
 # $text  - Input text to parse
 # $block - True if the parse is done in a block context
 #
@@ -429,6 +477,9 @@ sub _parse_context {
     if (index($text, q{\\}) == -1) {
         my $output = $text;
 
+        # Update the line number.
+        $self->{input}[-1][2] += $text =~ tr{\n}{};
+
         # If we are at block context, we need to make the text into a block
         # element, which means wrapping it in <p> tags.  Since that is a
         # top-level block construct, also close any open block structure.
@@ -479,6 +530,9 @@ sub _parse_context {
             $self->_fatal(qq(unable to parse near "$context"));
         }
 
+        # Update the line number.
+        $self->{input}[-1][2] += $string =~ tr{\n}{};
+
         # If this is not a command, and we're not at the block level, just add
         # it verbatim to the output.
         #
@@ -537,11 +591,17 @@ sub _parse_context {
         }
 
         # If the next bit of unparsed text starts with a newline, extract it
-        # and any following whitespace now.  Add it to our paragraph if we're
-        # accumulating one; otherwise, add it to the output, but only add the
-        # newline if we saw inline elements or there is remaining text.  This
-        # suppresses some useless black lines.
+        # and any following whitespace now.
         if ($text =~ s{ \A \n (\s*) }{}xms) {
+            my $spaces = $1;
+
+            # Update the line number.
+            $self->{input}[-1][2] += 1 + $spaces =~ tr{\n}{};
+
+            # Add it to our paragraph if we're accumulating one; otherwise,
+            # add it to the output, but only add the newline if we saw inline
+            # elements or there is remaining text.  This suppresses some
+            # useless black lines.
             if ($paragraph ne q{}) {
                 $paragraph .= "\n$1";
             } else {
@@ -604,8 +664,9 @@ sub _format_attr {
 
 # Split a block of text apart at paired newlines so that it can be reparsed as
 # paragraphs, but combine a paragraph with the next one if it has an
-# unbalanced number of open brackets.  Used by containiners like \block that
-# can contain multiple paragraphs.
+# unbalanced number of open brackets.  Used to parse the top-level structure
+# of a file and by containiners like \block that can contain multiple
+# paragraphs.
 #
 # $text - Text to split
 #
@@ -836,6 +897,10 @@ sub _define_macro {
         }
     }
 
+    # We don't parse the macro definition now but we need to update the line
+    # number for accurate error reporting.
+    $self->{input}[-1][2] += $definition =~ tr{\n}{};
+
     # Define the macro.
     $self->{macro}{$name} = [$args, $definition];
     return (1, q{});
@@ -1004,9 +1069,9 @@ sub _cmd_heading {
     # Add some generator comments.
     my $date = strftime('%Y-%m-%d %T -0000', gmtime());
     my $from
-      = $self->{file} eq q{-}
+      = $self->{input}[-1][1] eq q{-}
       ? q{}
-      : ' from ' . fileparse($self->{file});
+      : ' from ' . fileparse($self->{input}[-1][1]);
     $output .= "<!-- Spun$from by spin 1.80 on $date -->\n";
     if ($self->{id}) {
         $output .= "<!-- $self->{id} -->\n";
@@ -1052,12 +1117,13 @@ sub _cmd_image {
 }
 
 # Include a file.  Note that this includes a file after the current paragraph,
-# not immediately at the current point, which may be a bit surprising.
+# not immediately, which may be a bit surprising.
 sub _cmd_include {
     my ($self, $file) = @_;
-    $file = $self->_parse($file);
+    $file = realpath($self->_parse($file));
     open(my $fh, '<', $file);
-    push($self->{files}->@*, [$fh, $file]);
+    push($self->{input}->@*, $self->_read_file($fh, $file));
+    close($fh);
     return (1, q{});
 }
 
@@ -1186,7 +1252,7 @@ sub _cmd_signature {
 
     # If we're spinning from standard input, don't add any of the standard
     # footer, just close the HTML tags.
-    if ($self->{file} eq q{-}) {
+    if ($self->{input}[-1][1] eq q{-}) {
         $output .= "</body>\n</html>\n";
         return (1, $output);
     }
@@ -1194,7 +1260,7 @@ sub _cmd_signature {
     # Otherwise, _footer does most of the work and we just add the tags.
     my $link = '<a href="%URL%">spun</a>';
     $output .= $self->_footer(
-        $self->{file}, $self->{out_path}, $self->{id},
+        $self->{input}[-1][1], $self->{out_path}, $self->{id},
         "Last modified and\n    $link %MOD%",
         "Last $link\n    %NOW% from thread modified %MOD%",
     );
@@ -1345,7 +1411,7 @@ sub spin_fh {
     my ($self, $in_fh, $in_path, $out_fh, $out_path) = @_;
 
     # Initialize object state for a new document.
-    $self->{files}    = [[$in_fh, $in_path]];
+    $self->{input}    = [$self->_read_file($in_fh, $in_path)];
     $self->{id}       = undef;
     $self->{macro}    = {};
     $self->{out_fh}   = $out_fh;
@@ -1355,44 +1421,27 @@ sub spin_fh {
     $self->{state}    = ['BLOCK'];
     $self->{variable} = {};
 
-    # Parse the thread file a paragraph at a time (but pick up macro contents
-    # that are continued across paragraphs.
+    # Read the entirety of the input file, split into paragraphs, and add it
+    # to the processing stack.
+    push($self->{input}->@*, $self->_read_file($in_fh, $in_path));
+
+    # Parse the thread file a paragraph at a time.  _split_paragraphs takes
+    # care of ensuring that each paragraph contains the complete value of a
+    # command argument.
     #
-    # We maintain the stack of files that we're parsing in $self->{files}, and
-    # _cmd_include will unshift new file handle and filename pairs onto that
-    # stack.  That means that the top of the stack may change any time we call
-    # _parse, so we have to update the input file handle and $self->{file}
-    # each time through the loop.
-    local $/ = q{};
-    while ($self->{files}->@*) {
-        ($in_fh, $self->{file}) = $self->{files}[-1]->@*;
-        while (defined(my $para = <$in_fh>)) {
-            if ("\n" !~ m{ \015 }xms && $para =~ m{ \015 }xms) {
-                my $m = 'found CR characters; are your line endings correct?';
-                $self->_warning($m);
-            }
-            my $open_count  = ($para =~ tr{\[}{});
-            my $close_count = ($para =~ tr{\]}{});
-            while (!eof && $open_count > $close_count) {
-                my $extra = <$in_fh>;
-                $open_count  += ($extra =~ tr{\[}{});
-                $close_count += ($extra =~ tr{\]}{});
-                $para .= $extra;
-            }
+    # The stack of parsed input is maintained in $self->{input} and the file
+    # being parsed at any given point is $self->{input}[-1].  _cmd_include
+    # will push new file information into this stack, and we pop off the top
+    # element of the stack when we exhaust its paragraphs.
+    while ($self->{input}->@*) {
+        while (defined(my $para = pop($self->{input}[-1][0]->@*))) {
             my $result = $self->_parse(_escape($para), 1);
             $result =~ s{ \A (?:\s*\n)+ }{}xms;
             if ($result !~ m{ \A \s* \z }xms) {
                 $self->_output($result);
             }
-            ($in_fh, $self->{file}) = $self->{files}[-1]->@*;
-        }
-        pop($self->{files}->@*);
-
-        # Close the input file handle if it was one we opened, which is all of
-        # them except the last one in the stack.
-        if ($self->{files}->@*) {
-            close($in_fh);
         }
+        pop($self->{input}->@*);
     }
 
     # Close open tags and print any deferred whitespace.
diff --git a/t/data/spin/errors/errors.th b/t/data/spin/errors/errors.th
index 3af991e..6f08cd9 100644
--- a/t/data/spin/errors/errors.th
+++ b/t/data/spin/errors/errors.th
@@ -1,6 +1,7 @@
 \heading[Errors]
 
-\==[macro]    [1]   [\1 \2]
+\==[macro]    [1]   [\1
+                     \2]
 \==[badcount] [foo] [bar]
 
 \macro[
diff --git a/t/spin/errors.t b/t/spin/errors.t
index 6c27a87..a9069ae 100755
--- a/t/spin/errors.t
+++ b/t/spin/errors.t
@@ -16,18 +16,20 @@ use File::Temp;
 
 use Test::More tests => 2;
 
-# Expected errors from spinning the error file.
+# Expected errors from spinning the error file.  The line numbers are still
+# not entirely correct because line tracking is very complicated and still not
+# entirely correct.
 my $EXPECTED_ERRORS = <<'ERRORS';
 errors.th:1: cannot find argument 2: Did not find opening bracket after prefix: "\s*", detected at offset 2
-errors.th:2: invalid macro placeholder \2 (greater than 1)
-errors.th:2: invalid macro argument count for \badcount
-errors.th:4: unknown variable \=UNKNOWN
-errors.th:4: unknown command or macro \unknown
-errors.th:5: space in anchor "#foo bar"
-errors.th:5: no package release information available
-errors.th:5: no sitemap file found
-errors.th:5: no package version information available
-errors.th:5: cannot stat file nonexistent-file
+errors.th:3: invalid macro placeholder \2 (greater than 1)
+errors.th:5: invalid macro argument count for \badcount
+errors.th:9: unknown variable \=UNKNOWN
+errors.th:11: unknown command or macro \unknown
+errors.th:14: space in anchor "#foo bar"
+errors.th:15: no package release information available
+errors.th:16: no sitemap file found
+errors.th:17: no package version information available
+errors.th:17: cannot stat file nonexistent-file
 ERRORS
 
 require_ok('App::DocKnot::Spin::Thread');
author	Russ Allbery <rra@cpan.org>	2021-09-09 16:17:46 -0700
committer	Russ Allbery <rra@cpan.org>	2021-09-09 16:17:46 -0700
commit	f43e7f41e58cb096d0f275fb8ae6940d359d699e (patch)
tree	26e36d054222217d118fb2c4c1dbd09be00146ad
parent	99933cd7ba67347a5043ffdfc4b369d35f6a06f8 (diff)