diff options
author | Andrew Shadura <andrewsh@debian.org> | 2017-08-30 15:33:58 +0300 |
---|---|---|
committer | Andrew Shadura <andrewsh@debian.org> | 2017-08-30 15:33:58 +0300 |
commit | ec06787d2bdda608b8b24ca8b29d9edbea81ac6a (patch) | |
tree | b535394603d7bf1b87e2c2dc83883ae61c449e91 |
Import original source of HTML-Restrict 2.2.4
-rw-r--r-- | CONTRIBUTORS | 21 | ||||
-rw-r--r-- | Changes | 117 | ||||
-rw-r--r-- | INSTALL | 43 | ||||
-rw-r--r-- | LICENSE | 379 | ||||
-rw-r--r-- | MANIFEST | 36 | ||||
-rw-r--r-- | META.json | 95 | ||||
-rw-r--r-- | META.yml | 56 | ||||
-rw-r--r-- | Makefile.PL | 73 | ||||
-rw-r--r-- | README.md | 349 | ||||
-rw-r--r-- | cpanfile | 35 | ||||
-rw-r--r-- | dist.ini | 9 | ||||
-rwxr-xr-x | examples/sanitize_file.pl | 23 | ||||
-rw-r--r-- | lib/HTML/Restrict.pm | 686 | ||||
-rw-r--r-- | perlcriticrc | 86 | ||||
-rw-r--r-- | perltidyrc | 12 | ||||
-rw-r--r-- | t/00-load.t | 125 | ||||
-rw-r--r-- | t/attribute_constraints.t | 66 | ||||
-rw-r--r-- | t/author-pod-coverage.t | 15 | ||||
-rw-r--r-- | t/author-pod-spell.t | 74 | ||||
-rw-r--r-- | t/author-synopsis.t | 13 | ||||
-rw-r--r-- | t/author-tidyall.t | 19 | ||||
-rw-r--r-- | t/comments.t | 33 | ||||
-rw-r--r-- | t/declaration.t | 22 | ||||
-rw-r--r-- | t/js.t | 24 | ||||
-rw-r--r-- | t/lowercase.t | 29 | ||||
-rw-r--r-- | t/memory-leak.t | 24 | ||||
-rw-r--r-- | t/perlcriticrc | 4 | ||||
-rw-r--r-- | t/pod.t | 12 | ||||
-rw-r--r-- | t/release-cpan-changes.t | 18 | ||||
-rw-r--r-- | t/replace_img.t | 62 | ||||
-rw-r--r-- | t/scheme.t | 70 | ||||
-rw-r--r-- | t/stack.t | 19 | ||||
-rw-r--r-- | t/style.t | 24 | ||||
-rwxr-xr-x | t/xss.t | 35 | ||||
-rw-r--r-- | tidyall.ini | 26 |
35 files changed, 2734 insertions, 0 deletions
diff --git a/CONTRIBUTORS b/CONTRIBUTORS new file mode 100644 index 0000000..285239f --- /dev/null +++ b/CONTRIBUTORS @@ -0,0 +1,21 @@ + +# HTML-RESTRICT CONTRIBUTORS # + +This is the (likely incomplete) list of people who have helped +make this distribution what it is, either via code contributions, +patches, bug reports, help with troubleshooting, etc. A huge +'thank you' to all of them. + + * Arthur Axel 'fREW' Schmidt + * Ben Bullock + * David Golden + * Fitz Elliott + * Graham Knop + * Graham TerMarsch + * Karen Etheridge + * Mark Jubenville + * Olaf Alders + * perlpong + * skaufman + + @@ -0,0 +1,117 @@ +Revision history for HTML-Restrict + +2.2.4 2017-03-01 14:22:13-05:00 America/Toronto + - Updates SEE ALSO pod section (Ben Bullock) + +2.2.3 2016-07-26 18:00:55-04:00 America/Toronto + - Now with fewer non-core prerequisites (Karen Etheridge) + +2.2.2 2014-10-16 22:16:29-04:00 America/Toronto + - Fixes attribute value encoding. (Graham Knop) + +2.2.1 2014-08-18 17:22:05EDT-0400 America/Toronto + - Replace MooX::Types::MooseLike with Type::Tiny. (Samuel Kaufman) + +2.2.0 2014-08-15 23:01:22EDT-0400 America/Toronto + - Fix Changes file + - Bump to next minor version rather than changing the format of the + version numbers + +2.1.9 2013-12-10 + - Swaps namespace::clean for namespace::autoclean. (Fitz Elliott) + +2.1.8 2013-06-06 + - Unlike 2.1.7, This release doesn't contain a local::lib. + +2.1.7 2013-06-05 + - Removes Perl6::Junction in order to squash smartmatch warnings. + +2.1.6 2013-05-06 + - Fixes incorrect use of Moo's weak_ref. (Thanks to Carwyn Ellis, + Dagfinn Ilmari Mannsåker and Graham Knop for help with this.) + +2.1.5 2013-05-01 + - Eliminates a circular reference. (Graham TerMarsch) + +2.1.4 2013-04-19 + - Minor version should have been bumped in previous release. To bump it + now would be confusing. + - Require Moo 1.001000 as 1.000008 breaks this module. + +2.1.3 2013-04-18 + - Add replace_img feature (David Golden). + - Add 'cite' to list of attributes for URI scheme checks (David + Golden). + +2.1.2 2013-03-20 + - Fixes broken Pod in example script. + - Adds x_contributors to META.json. + +2.1.1 2013-03-08 + - Updates Pod. + +2.1.0 2013-03-02 + - Allows attributes to be validated against regexes (perlpong). + +2.0.0 2013-02-27 + - Bumps version to 2.0.0 due to new features breaking backwards + compatibility. + - Forces all tag rules to be provided in lower case. Breaks + backcompat, but mixed or upper case rules would have been silently + ignored. So, this is really a bug fix. + - Strips content between script tags by default. Breaks backcompat, + but the default behaviour was likely not what anyone wanted. + - Strips content between style tags by default. Breaks backcompat, + but the default behaviour was likely not what anyone wanted. + - Adds Pod for previously undocumented allow_comments, + allow_declaration and strip_enclosed_content. + - Ensures stripper stack is always reset (perlpong). + - Protects against undef string comparison (perlpong). + - Checks URI scheme for src attributes (perlpong). + - Removes accessor/mutator documentation. Would prefer if all params + were set at object instatiation. + - Adds ridiculously basic example script. + + +1.0.4 2012-05-22 + - Explicity require MooX::Types::MooseLike rather than + MooX::Types::MooseLike::Base (Matt Phillips/Github #2) + +1.0.3 2012-03-27 + - Replaces Moose with Moo for speed in non-persistent apps (Arthur Axel + 'fREW' Schmidt) + - Adds uri scheme checking for href and src attributes + - Adds (undocumented) allow_comments() and allow_declartion() methods + +1.0.2 2011-10-11 + - No longer emit warnings when pure HTML is supplied for processing + Rick Moore (RT #71553) + +1.0.1 2011-06-30 + - Pod updates + +1.0.0 2011-06-30 + - Bumped version to 1.0.0 This is no longer beta software. + +0.06 2010-03-22 + - Fixed RT #55775 "Strings equating to boolean false returned as undef" + (Duncan Forsyth) + +0.05 2010-02-03 + - process( undef ) returns without attempting to do anything. Avoids + warnings about uninitialized vars. + +0.04 2009-11-19 + - MooseX::Params::Validate 0.12 now required. Some earlier versions do + not include pos_validated_list() + +0.03 2009-11-17 + - process() no longer dies when passed an undef -- patch provided by + Mark Jubenville (ioncache) + +0.02 2009-09-17 + - Improved documentation layout and clarity + Removed documentation for parser() method -- seems like a bad idea + +0.01 2009-09-17 + - Beta release @@ -0,0 +1,43 @@ +This is the Perl distribution HTML-Restrict. + +Installing HTML-Restrict is straightforward. + +## Installation with cpanm + +If you have cpanm, you only need one line: + + % cpanm HTML::Restrict + +If it does not have permission to install modules to the current perl, cpanm +will automatically set up and install to a local::lib in your home directory. +See the local::lib documentation (https://metacpan.org/pod/local::lib) for +details on enabling it in your environment. + +## Installing with the CPAN shell + +Alternatively, if your CPAN shell is set up, you should just be able to do: + + % cpan HTML::Restrict + +## Manual installation + +As a last resort, you can manually install it. Download the tarball, untar it, +then build it: + + % perl Makefile.PL + % make && make test + +Then install it: + + % make install + +If your perl is system-managed, you can create a local::lib in your home +directory to install modules to. For details, see the local::lib documentation: +https://metacpan.org/pod/local::lib + +## Documentation + +HTML-Restrict documentation is available as POD. +You can run perldoc from a shell to read the documentation: + + % perldoc HTML::Restrict @@ -0,0 +1,379 @@ +This software is copyright (c) 2013-2017 by Olaf Alders. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. + +Terms of the Perl programming language system itself + +a) the GNU General Public License as published by the Free + Software Foundation; either version 1, or (at your option) any + later version, or +b) the "Artistic License" + +--- The GNU General Public License, Version 1, February 1989 --- + +This software is Copyright (c) 2013-2017 by Olaf Alders. + +This is free software, licensed under: + + The GNU General Public License, Version 1, February 1989 + + GNU GENERAL PUBLIC LICENSE + Version 1, February 1989 + + Copyright (C) 1989 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The license agreements of most software companies try to keep users +at the mercy of those companies. By contrast, our General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. The +General Public License applies to the Free Software Foundation's +software and to any other program whose authors commit to using it. +You can use it for your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Specifically, the General Public License is designed to make +sure that you have the freedom to give away or sell copies of free +software, that you receive source code or can get it if you want it, +that you can change the software or use pieces of it in new free +programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of a such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must tell them their rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any program or other work which +contains a notice placed by the copyright holder saying it may be +distributed under the terms of this General Public License. The +"Program", below, refers to any such program or work, and a "work based +on the Program" means either the Program or any work containing the +Program or a portion of it, either verbatim or with modifications. Each +licensee is addressed as "you". + + 1. You may copy and distribute verbatim copies of the Program's source +code as you receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice and +disclaimer of warranty; keep intact all the notices that refer to this +General Public License and to the absence of any warranty; and give any +other recipients of the Program a copy of this General Public License +along with the Program. You may charge a fee for the physical act of +transferring a copy. + + 2. You may modify your copy or copies of the Program or any portion of +it, and copy and distribute such modifications under the terms of Paragraph +1 above, provided that you also do the following: + + a) cause the modified files to carry prominent notices stating that + you changed the files and the date of any change; and + + b) cause the whole of any work that you distribute or publish, that + in whole or in part contains the Program or any part thereof, either + with or without modifications, to be licensed at no charge to all + third parties under the terms of this General Public License (except + that you may choose to grant warranty protection to some or all + third parties, at your option). + + c) If the modified program normally reads commands interactively when + run, you must cause it, when started running for such interactive use + in the simplest and most usual way, to print or display an + announcement including an appropriate copyright notice and a notice + that there is no warranty (or else, saying that you provide a + warranty) and that users may redistribute the program under these + conditions, and telling the user how to view a copy of this General + Public License. + + d) You may charge a fee for the physical act of transferring a + copy, and you may at your option offer warranty protection in + exchange for a fee. + +Mere aggregation of another independent work with the Program (or its +derivative) on a volume of a storage or distribution medium does not bring +the other work under the scope of these terms. + + 3. You may copy and distribute the Program (or a portion or derivative of +it, under Paragraph 2) in object code or executable form under the terms of +Paragraphs 1 and 2 above provided that you also do one of the following: + + a) accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of + Paragraphs 1 and 2 above; or, + + b) accompany it with a written offer, valid for at least three + years, to give any third party free (except for a nominal charge + for the cost of distribution) a complete machine-readable copy of the + corresponding source code, to be distributed under the terms of + Paragraphs 1 and 2 above; or, + + c) accompany it with the information you received as to where the + corresponding source code may be obtained. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form alone.) + +Source code for a work means the preferred form of the work for making +modifications to it. For an executable file, complete source code means +all the source code for all modules it contains; but, as a special +exception, it need not include source code for modules which are standard +libraries that accompany the operating system on which the executable +file runs, or for standard header files or definitions files that +accompany that operating system. + + 4. You may not copy, modify, sublicense, distribute or transfer the +Program except as expressly provided under this General Public License. +Any attempt otherwise to copy, modify, sublicense, distribute or transfer +the Program is void, and will automatically terminate your rights to use +the Program under this License. However, parties who have received +copies, or rights to use copies, from you under this General Public +License will not have their licenses terminated so long as such parties +remain in full compliance. + + 5. By copying, distributing or modifying the Program (or any work based +on the Program) you indicate your acceptance of this license to do so, +and all its terms and conditions. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the original +licensor to copy, distribute or modify the Program subject to these +terms and conditions. You may not impose any further restrictions on the +recipients' exercise of the rights granted herein. + + 7. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of the license which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +the license, you may choose any version ever published by the Free Software +Foundation. + + 8. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 9. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 10. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to humanity, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + + To do so, attach the following notices to the program. It is safest to +attach them to the start of each source file to most effectively convey +the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 1, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19xx name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the +appropriate parts of the General Public License. Of course, the +commands you use may be called something other than `show w' and `show +c'; they could even be mouse-clicks or menu items--whatever suits your +program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + program `Gnomovision' (a program to direct compilers to make passes + at assemblers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +That's all there is to it! + + +--- The Artistic License 1.0 --- + +This software is Copyright (c) 2013-2017 by Olaf Alders. + +This is free software, licensed under: + + The Artistic License 1.0 + +The Artistic License + +Preamble + +The intent of this document is to state the conditions under which a Package +may be copied, such that the Copyright Holder maintains some semblance of +artistic control over the development of the package, while giving the users of +the package the right to use and distribute the Package in a more-or-less +customary fashion, plus the right to make reasonable modifications. + +Definitions: + + - "Package" refers to the collection of files distributed by the Copyright + Holder, and derivatives of that collection of files created through + textual modification. + - "Standard Version" refers to such a Package if it has not been modified, + or has been modified in accordance with the wishes of the Copyright + Holder. + - "Copyright Holder" is whoever is named in the copyright or copyrights for + the package. + - "You" is you, if you're thinking about copying or distributing this Package. + - "Reasonable copying fee" is whatever you can justify on the basis of media + cost, duplication charges, time of people involved, and so on. (You will + not be required to justify it to the Copyright Holder, but only to the + computing community at large as a market that must bear the fee.) + - "Freely Available" means that no fee is charged for the item itself, though + there may be fees involved in handling the item. It also means that + recipients of the item may redistribute it under the same conditions they + received it. + +1. You may make and give away verbatim copies of the source form of the +Standard Version of this Package without restriction, provided that you +duplicate all of the original copyright notices and associated disclaimers. + +2. You may apply bug fixes, portability fixes and other modifications derived +from the Public Domain or from the Copyright Holder. A Package modified in such +a way shall still be considered the Standard Version. + +3. You may otherwise modify your copy of this Package in any way, provided that +you insert a prominent notice in each changed file stating how and when you +changed that file, and provided that you do at least ONE of the following: + + a) place your modifications in the Public Domain or otherwise make them + Freely Available, such as by posting said modifications to Usenet or an + equivalent medium, or placing the modifications on a major archive site + such as ftp.uu.net, or by allowing the Copyright Holder to include your + modifications in the Standard Version of the Package. + + b) use the modified Package only within your corporation or organization. + + c) rename any non-standard executables so the names do not conflict with + standard executables, which must also be provided, and provide a separate + manual page for each non-standard executable that clearly documents how it + differs from the Standard Version. + + d) make other distribution arrangements with the Copyright Holder. + +4. You may distribute the programs of this Package in object code or executable +form, provided that you do at least ONE of the following: + + a) distribute a Standard Version of the executables and library files, + together with instructions (in the manual page or equivalent) on where to + get the Standard Version. + + b) accompany the distribution with the machine-readable source of the Package + with your modifications. + + c) accompany any non-standard executables with their corresponding Standard + Version executables, giving the non-standard executables non-standard + names, and clearly documenting the differences in manual pages (or + equivalent), together with instructions on where to get the Standard + Version. + + d) make other distribution arrangements with the Copyright Holder. + +5. You may charge a reasonable copying fee for any distribution of this +Package. You may charge any fee you choose for support of this Package. You +may not charge a fee for this Package itself. However, you may distribute this +Package in aggregate with other (possibly commercial) programs as part of a +larger (possibly commercial) software distribution provided that you do not +advertise this Package as a product of your own. + +6. The scripts and library files supplied as input to or produced as output +from the programs of this Package do not automatically fall under the copyright +of this Package, but belong to whomever generated them, and may be sold +commercially, and may be aggregated with this Package. + +7. C or perl subroutines supplied by you and linked into this Package shall not +be considered part of this Package. + +8. The name of the Copyright Holder may not be used to endorse or promote +products derived from this software without specific prior written permission. + +9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + +The End + diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..6cb9a59 --- /dev/null +++ b/MANIFEST @@ -0,0 +1,36 @@ +# This file was automatically generated by Dist::Zilla::Plugin::Manifest v6.008. +CONTRIBUTORS +Changes +INSTALL +LICENSE +MANIFEST +META.json +META.yml +Makefile.PL +README.md +cpanfile +dist.ini +examples/sanitize_file.pl +lib/HTML/Restrict.pm +perlcriticrc +perltidyrc +t/00-load.t +t/attribute_constraints.t +t/author-pod-coverage.t +t/author-pod-spell.t +t/author-synopsis.t +t/author-tidyall.t +t/comments.t +t/declaration.t +t/js.t +t/lowercase.t +t/memory-leak.t +t/perlcriticrc +t/pod.t +t/release-cpan-changes.t +t/replace_img.t +t/scheme.t +t/stack.t +t/style.t +t/xss.t +tidyall.ini diff --git a/META.json b/META.json new file mode 100644 index 0000000..f1fd977 --- /dev/null +++ b/META.json @@ -0,0 +1,95 @@ +{ + "abstract" : "Strip unwanted HTML tags and attributes", + "author" : [ + "Olaf Alders <olaf@wundercounter.com>" + ], + "dynamic_config" : 0, + "generated_by" : "Dist::Zilla version 6.008, CPAN::Meta::Converter version 2.150005", + "license" : [ + "perl_5" + ], + "meta-spec" : { + "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", + "version" : 2 + }, + "name" : "HTML-Restrict", + "no_index" : { + "directory" : [ + "examples", + "t", + "xt" + ] + }, + "prereqs" : { + "configure" : { + "requires" : { + "ExtUtils::MakeMaker" : "0", + "perl" : "5.006" + } + }, + "develop" : { + "requires" : { + "Pod::Coverage::TrustPod" : "0", + "Test::CPAN::Changes" : "0.19", + "Test::Code::TidyAll" : "0.50", + "Test::More" : "0.88", + "Test::Pod::Coverage" : "1.08", + "Test::Spelling" : "0.12", + "Test::Synopsis" : "0" + } + }, + "runtime" : { + "requires" : { + "Carp" : "0", + "Data::Dump" : "0", + "HTML::Entities" : "0", + "HTML::Parser" : "0", + "List::Util" : "1.33", + "Moo" : "1.002000", + "Scalar::Util" : "0", + "Sub::Quote" : "0", + "Types::Standard" : "1.000001", + "URI" : "0", + "namespace::clean" : "0", + "perl" : "5.006", + "strict" : "0" + } + }, + "test" : { + "requires" : { + "Test::Fatal" : "0", + "Test::More" : "0", + "perl" : "5.006", + "warnings" : "0" + } + } + }, + "release_status" : "stable", + "resources" : { + "bugtracker" : { + "web" : "https://github.com/oalders/html-restrict/issues" + }, + "homepage" : "https://github.com/oalders/html-restrict", + "repository" : { + "type" : "git", + "url" : "https://github.com/oalders/html-restrict.git", + "web" : "https://github.com/oalders/html-restrict" + } + }, + "version" : "2.2.4", + "x_contributors" : [ + "Arthur Axel 'fREW' Schmidt <frioux@gmail.com>", + "Ben Bullock <benkasminbullock@gmail.com>", + "David Golden <dagolden@cpan.org>", + "Fitz Elliott <fitz.elliott@gmail.com>", + "Graham Knop <haarg@haarg.org>", + "Graham TerMarsch <graham@howlingfrog.com>", + "Karen Etheridge <ether@cpan.org>", + "Mark Jubenville <mark@raybec.com>", + "Olaf Alders <olaf@wundersolutions.com>", + "perlpong <calyx238@gmail.com>", + "skaufman <sam@socialflow.com>" + ], + "x_serialization_backend" : "Cpanel::JSON::XS version 3.0225" +} + diff --git a/META.yml b/META.yml new file mode 100644 index 0000000..e4f5b68 --- /dev/null +++ b/META.yml @@ -0,0 +1,56 @@ +--- +abstract: 'Strip unwanted HTML tags and attributes' +author: + - 'Olaf Alders <olaf@wundercounter.com>' +build_requires: + Test::Fatal: '0' + Test::More: '0' + perl: '5.006' + warnings: '0' +configure_requires: + ExtUtils::MakeMaker: '0' + perl: '5.006' +dynamic_config: 0 +generated_by: 'Dist::Zilla version 6.008, CPAN::Meta::Converter version 2.150005' +license: perl +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: '1.4' +name: HTML-Restrict +no_index: + directory: + - examples + - t + - xt +requires: + Carp: '0' + Data::Dump: '0' + HTML::Entities: '0' + HTML::Parser: '0' + List::Util: '1.33' + Moo: '1.002000' + Scalar::Util: '0' + Sub::Quote: '0' + Types::Standard: '1.000001' + URI: '0' + namespace::clean: '0' + perl: '5.006' + strict: '0' +resources: + bugtracker: https://github.com/oalders/html-restrict/issues + homepage: https://github.com/oalders/html-restrict + repository: https://github.com/oalders/html-restrict.git +version: 2.2.4 +x_contributors: + - "Arthur Axel 'fREW' Schmidt <frioux@gmail.com>" + - 'Ben Bullock <benkasminbullock@gmail.com>' + - 'David Golden <dagolden@cpan.org>' + - 'Fitz Elliott <fitz.elliott@gmail.com>' + - 'Graham Knop <haarg@haarg.org>' + - 'Graham TerMarsch <graham@howlingfrog.com>' + - 'Karen Etheridge <ether@cpan.org>' + - 'Mark Jubenville <mark@raybec.com>' + - 'Olaf Alders <olaf@wundersolutions.com>' + - 'perlpong <calyx238@gmail.com>' + - 'skaufman <sam@socialflow.com>' +x_serialization_backend: 'YAML::Tiny version 1.69' diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..570d8bf --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,73 @@ +# This file was automatically generated by Dist::Zilla::Plugin::MakeMaker v6.008. +use strict; +use warnings; + +use 5.006; + +use ExtUtils::MakeMaker; + +my %WriteMakefileArgs = ( + "ABSTRACT" => "Strip unwanted HTML tags and attributes", + "AUTHOR" => "Olaf Alders <olaf\@wundercounter.com>", + "CONFIGURE_REQUIRES" => { + "ExtUtils::MakeMaker" => 0 + }, + "DISTNAME" => "HTML-Restrict", + "LICENSE" => "perl", + "MIN_PERL_VERSION" => "5.006", + "NAME" => "HTML::Restrict", + "PREREQ_PM" => { + "Carp" => 0, + "Data::Dump" => 0, + "HTML::Entities" => 0, + "HTML::Parser" => 0, + "List::Util" => "1.33", + "Moo" => "1.002000", + "Scalar::Util" => 0, + "Sub::Quote" => 0, + "Types::Standard" => "1.000001", + "URI" => 0, + "namespace::clean" => 0, + "strict" => 0 + }, + "TEST_REQUIRES" => { + "Test::Fatal" => 0, + "Test::More" => 0, + "warnings" => 0 + }, + "VERSION" => "2.2.4", + "test" => { + "TESTS" => "t/*.t" + } +); + + +my %FallbackPrereqs = ( + "Carp" => 0, + "Data::Dump" => 0, + "HTML::Entities" => 0, + "HTML::Parser" => 0, + "List::Util" => "1.33", + "Moo" => "1.002000", + "Scalar::Util" => 0, + "Sub::Quote" => 0, + "Test::Fatal" => 0, + "Test::More" => 0, + "Types::Standard" => "1.000001", + "URI" => 0, + "namespace::clean" => 0, + "strict" => 0, + "warnings" => 0 +); + + +unless ( eval { ExtUtils::MakeMaker->VERSION(6.63_03) } ) { + delete $WriteMakefileArgs{TEST_REQUIRES}; + delete $WriteMakefileArgs{BUILD_REQUIRES}; + $WriteMakefileArgs{PREREQ_PM} = \%FallbackPrereqs; +} + +delete $WriteMakefileArgs{CONFIGURE_REQUIRES} + unless eval { ExtUtils::MakeMaker->VERSION(6.52) }; + +WriteMakefile(%WriteMakefileArgs); diff --git a/README.md b/README.md new file mode 100644 index 0000000..0796222 --- /dev/null +++ b/README.md @@ -0,0 +1,349 @@ +# NAME + +HTML::Restrict - Strip unwanted HTML tags and attributes + +# VERSION + +version 2.2.4 + +# SYNOPSIS + + use HTML::Restrict; + + my $hr = HTML::Restrict->new(); + + # use default rules to start with (strip away all HTML) + my $processed = $hr->process(' <b>i am bold</b> '); + + # $processed now equals: 'i am bold' + + # Now, a less restrictive example: + use HTML::Restrict; + + my $hr = HTML::Restrict->new( + rules => { + b => [], + img => [qw( src alt / )] + } + ); + + my $html = q[<body><b>hello</b> <img src="pic.jpg" alt="me" id="test" /></body>]; + my $processed = $hr->process( $html ); + + # $processed now equals: <b>hello</b> <img src="pic.jpg" alt="me" /> + +# DESCRIPTION + +This module uses [HTML::Parser](https://metacpan.org/pod/HTML::Parser) to strip HTML from text in a restrictive +manner. By default all HTML is restricted. You may alter the default +behaviour by supplying your own tag rules. + +# CONSTRUCTOR AND STARTUP + +## new() + +Creates and returns a new HTML::Restrict object. + + my $hr = HTML::Restrict->new() + +HTML::Restrict doesn't require any params to be passed to new. If your goal is +to remove all HTML from text, then no further setup is required. Just pass +your text to the process() method and you're done: + + my $plain_text = $hr->process( $html ); + +If you need to set up specific rules, have a look at the params which +HTML::Restrict recognizes: + +- `rules => \%rules` + + Sets the rules which will be used to process your data. By default all HTML + tags are off limits. Use this argument to define the HTML elements and + corresponding attributes you'd like to use. Essentially, consider the default + behaviour to be: + + rules => {} + + Rules should be passed as a HASHREF of allowed tags. Each hash value should + represent the allowed attributes for the listed tag. For example, if you want + to allow a fair amount of HTML, you can try something like this: + + my %rules = ( + a => [qw( href target )], + b => [], + caption => [], + center => [], + em => [], + i => [], + img => [qw( alt border height width src style )], + li => [], + ol => [], + p => [qw(style)], + span => [qw(style)], + strong => [], + sub => [], + sup => [], + table => [qw( style border cellspacing cellpadding align )], + tbody => [], + td => [], + tr => [], + u => [], + ul => [], + ); + + my $hr = HTML::Restrict->new( rules => \%rules ) + + Or, to allow only bolded text: + + my $hr = HTML::Restrict->new( rules => { b => [] } ); + + Allow bolded text, images and some (but not all) image attributes: + + my %rules = ( + b => [ ], + img => [qw( src alt width height border / ) + ); + my $hr = HTML::Restrict->new( rules => \%rules ); + + Since [HTML::Parser](https://metacpan.org/pod/HTML::Parser) treats a closing slash as an attribute, you'll need to + add "/" to your list of allowed attributes if you'd like your tags to retain + closing slashes. For example: + + my $hr = HTML::Restrict->new( rules =>{ hr => [] } ); + $hr->process( "<hr />"); # returns: <hr> + + my $hr = HTML::Restrict->new( rules =>{ hr => [qw( / )] } ); + $hr->process( "<hr />"); # returns: <hr /> + + HTML::Restrict strips away any tags and attributes which are not explicitly + allowed. It also rebuilds your explicitly allowed tags and places their + attributes in the order in which they appear in your rules. + + So, if you define the following rules: + + my %rules = ( + ... + img => [qw( src alt title width height id / )] + ... + ); + + then your image tags will all be built like this: + + <img src=".." alt="..." title="..." width="..." height="..." id=".." /> + + This gives you greater consistency in your tag layout. If you don't care about + element order you don't need to pay any attention to this, but you should be + aware that your elements are being reconstructed rather than just stripped + down. + + As of 2.1.0, you can also specify a regex to be tested against the attribute + value. This feature should be considered experimental for the time being: + + my $hr = HTML::Restrict->new( + rules => { + iframe => [ + qw( width height allowfullscreen ), + { src => qr{^http://www\.youtube\.com}, + frameborder => qr{^(0|1)$}, + } + ], + img => [ qw( alt ), { src => qr{^/my/images/} }, ], + }, + ); + + my $html = '<img src="http://www.example.com/image.jpg" alt="Alt Text">'; + my $processed = $hr->process( $html ); + + # $processed now equals: <img alt="Alt Text"> + +- `trim => [0|1]` + + By default all leading and trailing spaces will be removed when text is + processed. Set this value to 0 in order to disable this behaviour. + +- `uri_schemes => [undef, 'http', 'https', 'irc', ... ]` + + As of version 1.0.3, URI scheme checking is performed on all href and src tag + attributes. The following schemes are allowed out of the box. No action is + required on your part: + + [ undef, 'http', 'https' ] + + (undef represents relative URIs). These restrictions have been put in place to + prevent XSS in the form of: + + <a href="javascript:alert(document.cookie)">click for cookie!</a> + + See [URI](https://metacpan.org/pod/URI) for more detailed info on scheme parsing. If, for example, you + wanted to filter out every scheme barring SSL, you would do it like this: + + uri_schemes => ['https'] + + This feature is new in 1.0.3. Previous to this, there was no schema checking + at all. Moving forward, you'll need to whitelist explicitly all URI schemas + which are not supported by default. This is in keeping with the whitelisting + behaviour of this module and is also the safest possible approach. Keep in + mind that changes to uri\_schemes are not additive, so you'll need to include + the defaults in any changes you make, should you wish to keep them: + + # defaults + irc + mailto + uri_schemes => [ 'undef', 'http', 'https', 'irc', 'mailto' ] + +- allow\_declaration => \[0|1\] + + Set this value to true if you'd like to allow/preserve DOCTYPE declarations in + your content. Useful when cleaning up your own static files or templates. This + feature is off by default. + + my $html = q[<!doctype html><body>foo</body>]; + + my $hr = HTML::Restrict->new( allow_declaration => 1 ); + $html = $hr->process( $html ); + # $html is now: "<!doctype html>foo" + +- allow\_comments => \[0|1\] + + Set this value to true if you'd like to allow/preserve HTML comments in your + content. Useful when cleaning up your own static files or templates. This + feature is off by default. + + my $html = q[<body><!-- comments! -->foo</body>]; + + my $hr = HTML::Restrict->new( allow_comments => 1 ); + $html = $hr->process( $html ); + # $html is now: "<!-- comments! -->foo" + +- replace\_img => \[0|1|CodeRef\] + + Set the value to true if you'd like to have img tags replaced with + `[IMAGE: ...]` containing the alt attribute text. If you set it to a + code reference, you can provide your own replacement (which may + even contain HTML). + + sub replacer { + my ($tagname, $attr, $text) = @_; # from HTML::Parser + return qq{<a href="$attr->{src}">IMAGE: $attr->{alt}</a>}; + } + + my $hr = HTML::Restrict->new( replace_img => \&replacer ); + + This attribute will only take effect if the img tag is not included + in the allowed HTML. + +- strip\_enclosed\_content => \[0|1\] + + The default behaviour up to 1.0.4 was to preserve the content between script + and style tags, even when the tags themselves were being deleted. So, you'd be + left with a bunch of JavaScript or CSS, just with the enclosing tags missing. + This is almost never what you want, so starting at 1.0.5 the default will be to + remove any script or style info which is enclosed in these tags, unless they + have specifically been whitelisted in the rules. This will be a sane default + when cleaning up content submitted via a web form. However, if you're using + HTML::Restrict to purge your own HTML you can be more restrictive. + + # strip the head section, in addition to JS and CSS + my $html = '<html><head>...</head><body>...<script>JS here</script>foo'; + + my $hr = HTML::Restrict->new( + strip_enclosed_content => [ 'script', 'style', 'head' ] + ); + + $html = $hr->process( $html ); + # $html is now '<html><body>...foo'; + + The caveat here is that HTML::Restrict will not try to fix broken HTML. In the + above example, if you have any opening script, style or head tags which don't + also include matching closing tags, all following content will be stripped + away, regardless of any parent tags. + + Keep in mind that changes to strip\_enclosed\_content are not additive, so if you + are adding additional tags you'll need to include the entire list of tags whose + enclosed content you'd like to remove. This feature strips script and style + tags by default. + +# SUBROUTINES/METHODS + +## process( $html ) + +This is the method which does the real work. It parses your data, removes any +tags and attributes which are not specifically allowed and returns the +resulting text. Requires and returns a SCALAR. + +## get\_rules + +Accessor which returns a hash ref of the current rule set. + +## get\_uri\_schemes + +Accessor which returns an array ref of the current valid uri schemes. + +# CAVEATS + +Please note that all tag and attribute names passed via the rules param must be +supplied in lower case. + + # correct + my $hr = HTML::Restrict->new( rules => { body => ['onload'] } ); + + # throws a fatal error + my $hr = HTML::Restrict->new( rules => { Body => ['onLoad'] } ); + +# MOTIVATION + +There are already several modules on the CPAN which accomplish much of the same +thing, but after doing a lot of poking around, I was unable to find a solution +with a simple setup which I was happy with. + +The most common use case might be stripping HTML from user submitted data +completely or allowing just a few tags and attributes to be displayed. With +the exception of URI scheme checking, this module doesn't do any validation on +the actual content of the tags or attributes. If this is a requirement, you +can either mess with the parser object, post-process the text yourself or have +a look at one of the more feature-rich modules in the SEE ALSO section below. + +My aim here is to keep things easy and, hopefully, cover a lot of the less +complex use cases with just a few lines of code and some brief documentation. +The idea is to be up and running quickly. + +# SEE ALSO + +[HTML::TagFilter](https://metacpan.org/pod/HTML::TagFilter), [HTML::Defang](https://metacpan.org/pod/HTML::Defang), [MojoMojo::Declaw](https://metacpan.org/pod/MojoMojo::Declaw), [HTML::StripScripts](https://metacpan.org/pod/HTML::StripScripts), +[HTML::Detoxifier](https://metacpan.org/pod/HTML::Detoxifier), HTML::Sanitizer, [HTML::Scrubber](https://metacpan.org/pod/HTML::Scrubber) + +# ACKNOWLEDGEMENTS + +Thanks to Raybec Communications [http://www.raybec.com](http://www.raybec.com) for funding my +work on this module and for releasing it to the world. + +Thanks also to the following for patches, bug reports and assistance: + +Mark Jubenville (ioncache) + +Duncan Forsyth + +Rick Moore + +Arthur Axel 'fREW' Schmidt + +perlpong + +David Golden + +Graham TerMarsch + +Dagfinn Ilmari Mannsåker + +Graham Knop + +Carwyn Ellis + +# AUTHOR + +Olaf Alders <olaf@wundercounter.com> + +# COPYRIGHT AND LICENSE + +This software is copyright (c) 2013-2017 by Olaf Alders. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. diff --git a/cpanfile b/cpanfile new file mode 100644 index 0000000..5b011ff --- /dev/null +++ b/cpanfile @@ -0,0 +1,35 @@ +requires "Carp" => "0"; +requires "Data::Dump" => "0"; +requires "HTML::Entities" => "0"; +requires "HTML::Parser" => "0"; +requires "List::Util" => "1.33"; +requires "Moo" => "1.002000"; +requires "Scalar::Util" => "0"; +requires "Sub::Quote" => "0"; +requires "Types::Standard" => "1.000001"; +requires "URI" => "0"; +requires "namespace::clean" => "0"; +requires "perl" => "5.006"; +requires "strict" => "0"; + +on 'test' => sub { + requires "Test::Fatal" => "0"; + requires "Test::More" => "0"; + requires "perl" => "5.006"; + requires "warnings" => "0"; +}; + +on 'configure' => sub { + requires "ExtUtils::MakeMaker" => "0"; + requires "perl" => "5.006"; +}; + +on 'develop' => sub { + requires "Pod::Coverage::TrustPod" => "0"; + requires "Test::CPAN::Changes" => "0.19"; + requires "Test::Code::TidyAll" => "0.50"; + requires "Test::More" => "0.88"; + requires "Test::Pod::Coverage" => "1.08"; + requires "Test::Spelling" => "0.12"; + requires "Test::Synopsis" => "0"; +}; diff --git a/dist.ini b/dist.ini new file mode 100644 index 0000000..bc62ed5 --- /dev/null +++ b/dist.ini @@ -0,0 +1,9 @@ +name = HTML-Restrict +author = Olaf Alders <olaf@wundercounter.com> +license = Perl_5 +copyright_holder = Olaf Alders +copyright_year = 2013-2017 +version = 2.2.4 +main_module = lib/HTML/Restrict.pm + +[@Author::OALDERS] diff --git a/examples/sanitize_file.pl b/examples/sanitize_file.pl new file mode 100755 index 0000000..c2c3e31 --- /dev/null +++ b/examples/sanitize_file.pl @@ -0,0 +1,23 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use File::Slurp; +use HTML::Restrict; + +my $filename = shift @ARGV; +die "usage: perl $0 path/to/file > path/to/new/file" if !$filename; + +my $text = read_file($filename); + +my $hr = HTML::Restrict->new; +print $hr->process($text); + +=pod + +=head1 SYNOPSIS + + perl sanitize_file path/to/filename > path/to/sanitized/file + +=cut diff --git a/lib/HTML/Restrict.pm b/lib/HTML/Restrict.pm new file mode 100644 index 0000000..bb299e4 --- /dev/null +++ b/lib/HTML/Restrict.pm @@ -0,0 +1,686 @@ +use strict; +use 5.006; + +package HTML::Restrict; +$HTML::Restrict::VERSION = '2.2.4'; +use Carp qw( croak ); +use Data::Dump qw( dump ); +use HTML::Parser; +use HTML::Entities qw( encode_entities ); +use Types::Standard 1.000001 qw[ Bool HashRef ArrayRef CodeRef ]; +use List::Util 1.33 qw( any none ); +use Scalar::Util qw( reftype weaken ); +use Sub::Quote 'quote_sub'; +use URI; + +use Moo 1.002000; +use namespace::clean; + +has 'allow_comments' => ( + is => 'rw', + isa => Bool, + default => 0, +); + +has 'allow_declaration' => ( + is => 'rw', + isa => Bool, + default => 0, +); + +has 'debug' => ( + is => 'rw', + isa => Bool, + default => 0, +); + +has 'parser' => ( + is => 'ro', + lazy => 1, + builder => '_build_parser', +); + +has 'rules' => ( + is => 'rw', + isa => HashRef, + required => 0, + default => quote_sub(q{ {} }), + trigger => \&_build_parser, + reader => 'get_rules', + writer => 'set_rules', +); + +has 'strip_enclosed_content' => ( + is => 'rw', + isa => ArrayRef, + default => sub { [ 'script', 'style' ] }, +); + +has 'replace_img' => ( + is => 'rw', + isa => Bool | CodeRef, + default => 0, +); + +has 'trim' => ( + is => 'rw', + isa => Bool, + default => 1, +); + +has 'uri_schemes' => ( + is => 'rw', + isa => ArrayRef, + required => 0, + default => sub { [ undef, 'http', 'https' ] }, + reader => 'get_uri_schemes', + writer => 'set_uri_schemes', +); + +has '_processed' => ( + is => 'rw', + isa => quote_sub( + q{ + die "$_[0] is not false or a string!" + unless !defined($_[0]) || $_[0] eq "" || "$_[0]" eq '0' || ref(\$_[0]) eq 'SCALAR' + } + ), + clearer => '_clear_processed', +); + +has '_stripper_stack' => ( + is => 'rw', + isa => ArrayRef, + default => sub { [] }, +); + +sub _build_parser { + + my $self = shift; + my $rules = shift; + + # don't allow any upper case tag or attribute names + # these rules would otherwise silently be ignored + if ($rules) { + foreach my $tag_name ( keys %{$rules} ) { + if ( lc $tag_name ne $tag_name ) { + croak "All tag names must be lower cased"; + } + if ( reftype $rules->{$tag_name} eq 'ARRAY' ) { + my @attr_names; + foreach my $attr_item ( @{ $rules->{$tag_name} } ) { + ref $attr_item eq 'HASH' + ? push( @attr_names, keys(%$attr_item) ) + : push( @attr_names, $attr_item ); + } + for (@attr_names) { + croak "All attribute names must be lower cased" + if lc $_ ne $_; + } + } + } + } + + weaken($self); + return HTML::Parser->new( + + start_h => [ + sub { + my ( $p, $tagname, $attr, $text ) = @_; + print "starting tag: $tagname", "\n" if $self->debug; + my $more = q{}; + + if ( any { $_ eq $tagname } keys %{ $self->get_rules } ) { + print dump $attr if $self->debug; + + foreach my $source_type ( 'href', 'src', 'cite' ) { + + if ( $attr->{$source_type} ) { + my $uri = URI->new( $attr->{$source_type} ); + if ( defined $uri->scheme ) { + delete $attr->{$source_type} + if none { $_ eq $uri->scheme } + grep defined, @{ $self->get_uri_schemes }; + } + else { # relative uri + delete $attr->{$source_type} + unless grep !defined, + @{ $self->get_uri_schemes }; + } + } + } + + foreach + my $attr_item ( @{ $self->get_rules->{$tagname} } ) { + if ( ref $attr_item eq 'HASH' ) { + + # validate against regex contraints + for my $attr_name ( sort keys %$attr_item ) { + if ( exists $attr->{$attr_name} ) { + my $value = encode_entities( + $attr->{$attr_name} ); + $more .= qq[ $attr_name="$value" ] + if $attr->{$attr_name} + =~ $attr_item->{$attr_name}; + } + } + } + else { + my $attr_name = $attr_item; + if ( exists $attr->{$attr_name} ) { + my $value + = encode_entities( $attr->{$attr_name} ); + $more .= qq[ $attr_name="$value" ] + unless $attr_name eq q{/}; + } + } + } + + # closing slash should (naturally) close the tag + if ( exists $attr->{q{/}} && $attr->{q{/}} eq q{/} ) { + $more .= ' /'; + } + + my $elem = "<$tagname $more>"; + $elem =~ s{\s*>}{>}gxms; + $elem =~ s{\s+}{ }gxms; + + $self->_processed( ( $self->_processed || q{} ) . $elem ); + } + elsif ( $tagname eq 'img' && $self->replace_img ) { + my $alt; + if ( ref $self->replace_img ) { + $alt = $self->replace_img->( $tagname, $attr, $text ); + } + else { + $alt + = defined( $attr->{alt} ) ? ": $attr->{alt}" : ""; + $alt = "[IMAGE$alt]"; + } + $self->_processed( ( $self->_processed || q{} ) . $alt ); + } + elsif ( + any { $_ eq $tagname } + @{ $self->strip_enclosed_content } + ) { + print "adding $tagname to strippers" if $self->debug; + push @{ $self->_stripper_stack }, $tagname; + } + + }, + "self,tagname,attr,text" + ], + + end_h => [ + sub { + my ( $p, $tagname, $attr, $text ) = @_; + print "end: $text\n" if $self->debug; + if ( any { $_ eq $tagname } keys %{ $self->get_rules } ) { + $self->_processed( ( $self->_processed || q{} ) . $text ); + } + elsif ( any { $_ eq $tagname } @{ $self->_stripper_stack } ) { + $self->_delete_tag_from_stack($tagname); + } + + }, + "self,tagname,attr,text" + ], + + text_h => [ + sub { + my ( $p, $text ) = @_; + print "text: $text\n" if $self->debug; + if ( !@{ $self->_stripper_stack } ) { + $self->_processed( ( $self->_processed || q{} ) . $text ); + } + }, + "self,text" + ], + + comment_h => [ + sub { + my ( $p, $text ) = @_; + print "comment: $text\n" if $self->debug; + if ( $self->allow_comments ) { + $self->_processed( ( $self->_processed || q{} ) . $text ); + } + }, + "self,text" + ], + + declaration_h => [ + sub { + my ( $p, $text ) = @_; + print "declaration: $text\n" if $self->debug; + if ( $self->allow_declaration ) { + $self->_processed( ( $self->_processed || q{} ) . $text ); + } + }, + "self,text" + ], + + ); + +} + +sub process { + + my $self = shift; + + # returns undef if no value was passed + return if !@_; + return $_[0] if !$_[0]; + + my ($content) = @_; + die 'content must be a string!' + unless ref( \$content ) eq 'SCALAR'; + $self->_clear_processed; + + my $parser = $self->parser; + $parser->parse($content); + $parser->eof; + + my $text = $self->_processed; + + if ( $self->trim && $text ) { + $text =~ s{\A\s*}{}gxms; + $text =~ s{\s*\z}{}gxms; + } + $self->_processed($text); + + # ensure stripper stack is reset in case of broken html + $self->_stripper_stack( [] ); + + return $self->_processed; + +} + +# strip_enclosed_content tags could be nested in the source HTML, so we +# maintain a stack of these tags. + +sub _delete_tag_from_stack { + + my $self = shift; + my $closing_tag = shift; + + my $found = 0; + my @tag_list = (); + + foreach my $tag ( reverse @{ $self->_stripper_stack } ) { + if ( $tag eq $closing_tag && $found == 0 ) { + $found = 1; + next; + } + push @tag_list, $tag; + } + + $self->_stripper_stack( [ reverse @tag_list ] ); + + return; +} + +1; # End of HTML::Restrict + +# ABSTRACT: Strip unwanted HTML tags and attributes + +__END__ + +=pod + +=encoding UTF-8 + +=head1 NAME + +HTML::Restrict - Strip unwanted HTML tags and attributes + +=head1 VERSION + +version 2.2.4 + +=head1 SYNOPSIS + + use HTML::Restrict; + + my $hr = HTML::Restrict->new(); + + # use default rules to start with (strip away all HTML) + my $processed = $hr->process(' <b>i am bold</b> '); + + # $processed now equals: 'i am bold' + + # Now, a less restrictive example: + use HTML::Restrict; + + my $hr = HTML::Restrict->new( + rules => { + b => [], + img => [qw( src alt / )] + } + ); + + my $html = q[<body><b>hello</b> <img src="pic.jpg" alt="me" id="test" /></body>]; + my $processed = $hr->process( $html ); + + # $processed now equals: <b>hello</b> <img src="pic.jpg" alt="me" /> + +=head1 DESCRIPTION + +This module uses L<HTML::Parser> to strip HTML from text in a restrictive +manner. By default all HTML is restricted. You may alter the default +behaviour by supplying your own tag rules. + +=head1 CONSTRUCTOR AND STARTUP + +=head2 new() + +Creates and returns a new HTML::Restrict object. + + my $hr = HTML::Restrict->new() + +HTML::Restrict doesn't require any params to be passed to new. If your goal is +to remove all HTML from text, then no further setup is required. Just pass +your text to the process() method and you're done: + + my $plain_text = $hr->process( $html ); + +If you need to set up specific rules, have a look at the params which +HTML::Restrict recognizes: + +=over 4 + +=item * C<< rules => \%rules >> + +Sets the rules which will be used to process your data. By default all HTML +tags are off limits. Use this argument to define the HTML elements and +corresponding attributes you'd like to use. Essentially, consider the default +behaviour to be: + + rules => {} + +Rules should be passed as a HASHREF of allowed tags. Each hash value should +represent the allowed attributes for the listed tag. For example, if you want +to allow a fair amount of HTML, you can try something like this: + + my %rules = ( + a => [qw( href target )], + b => [], + caption => [], + center => [], + em => [], + i => [], + img => [qw( alt border height width src style )], + li => [], + ol => [], + p => [qw(style)], + span => [qw(style)], + strong => [], + sub => [], + sup => [], + table => [qw( style border cellspacing cellpadding align )], + tbody => [], + td => [], + tr => [], + u => [], + ul => [], + ); + + my $hr = HTML::Restrict->new( rules => \%rules ) + +Or, to allow only bolded text: + + my $hr = HTML::Restrict->new( rules => { b => [] } ); + +Allow bolded text, images and some (but not all) image attributes: + + my %rules = ( + b => [ ], + img => [qw( src alt width height border / ) + ); + my $hr = HTML::Restrict->new( rules => \%rules ); + +Since L<HTML::Parser> treats a closing slash as an attribute, you'll need to +add "/" to your list of allowed attributes if you'd like your tags to retain +closing slashes. For example: + + my $hr = HTML::Restrict->new( rules =>{ hr => [] } ); + $hr->process( "<hr />"); # returns: <hr> + + my $hr = HTML::Restrict->new( rules =>{ hr => [qw( / )] } ); + $hr->process( "<hr />"); # returns: <hr /> + +HTML::Restrict strips away any tags and attributes which are not explicitly +allowed. It also rebuilds your explicitly allowed tags and places their +attributes in the order in which they appear in your rules. + +So, if you define the following rules: + + my %rules = ( + ... + img => [qw( src alt title width height id / )] + ... + ); + +then your image tags will all be built like this: + + <img src=".." alt="..." title="..." width="..." height="..." id=".." /> + +This gives you greater consistency in your tag layout. If you don't care about +element order you don't need to pay any attention to this, but you should be +aware that your elements are being reconstructed rather than just stripped +down. + +As of 2.1.0, you can also specify a regex to be tested against the attribute +value. This feature should be considered experimental for the time being: + + my $hr = HTML::Restrict->new( + rules => { + iframe => [ + qw( width height allowfullscreen ), + { src => qr{^http://www\.youtube\.com}, + frameborder => qr{^(0|1)$}, + } + ], + img => [ qw( alt ), { src => qr{^/my/images/} }, ], + }, + ); + + my $html = '<img src="http://www.example.com/image.jpg" alt="Alt Text">'; + my $processed = $hr->process( $html ); + + # $processed now equals: <img alt="Alt Text"> + +=item * C<< trim => [0|1] >> + +By default all leading and trailing spaces will be removed when text is +processed. Set this value to 0 in order to disable this behaviour. + +=item * C<< uri_schemes => [undef, 'http', 'https', 'irc', ... ] >> + +As of version 1.0.3, URI scheme checking is performed on all href and src tag +attributes. The following schemes are allowed out of the box. No action is +required on your part: + + [ undef, 'http', 'https' ] + +(undef represents relative URIs). These restrictions have been put in place to +prevent XSS in the form of: + + <a href="javascript:alert(document.cookie)">click for cookie!</a> + +See L<URI> for more detailed info on scheme parsing. If, for example, you +wanted to filter out every scheme barring SSL, you would do it like this: + + uri_schemes => ['https'] + +This feature is new in 1.0.3. Previous to this, there was no schema checking +at all. Moving forward, you'll need to whitelist explicitly all URI schemas +which are not supported by default. This is in keeping with the whitelisting +behaviour of this module and is also the safest possible approach. Keep in +mind that changes to uri_schemes are not additive, so you'll need to include +the defaults in any changes you make, should you wish to keep them: + + # defaults + irc + mailto + uri_schemes => [ 'undef', 'http', 'https', 'irc', 'mailto' ] + +=item * allow_declaration => [0|1] + +Set this value to true if you'd like to allow/preserve DOCTYPE declarations in +your content. Useful when cleaning up your own static files or templates. This +feature is off by default. + + my $html = q[<!doctype html><body>foo</body>]; + + my $hr = HTML::Restrict->new( allow_declaration => 1 ); + $html = $hr->process( $html ); + # $html is now: "<!doctype html>foo" + +=item * allow_comments => [0|1] + +Set this value to true if you'd like to allow/preserve HTML comments in your +content. Useful when cleaning up your own static files or templates. This +feature is off by default. + + my $html = q[<body><!-- comments! -->foo</body>]; + + my $hr = HTML::Restrict->new( allow_comments => 1 ); + $html = $hr->process( $html ); + # $html is now: "<!-- comments! -->foo" + +=item * replace_img => [0|1|CodeRef] + +Set the value to true if you'd like to have img tags replaced with +C<[IMAGE: ...]> containing the alt attribute text. If you set it to a +code reference, you can provide your own replacement (which may +even contain HTML). + + sub replacer { + my ($tagname, $attr, $text) = @_; # from HTML::Parser + return qq{<a href="$attr->{src}">IMAGE: $attr->{alt}</a>}; + } + + my $hr = HTML::Restrict->new( replace_img => \&replacer ); + +This attribute will only take effect if the img tag is not included +in the allowed HTML. + +=item * strip_enclosed_content => [0|1] + +The default behaviour up to 1.0.4 was to preserve the content between script +and style tags, even when the tags themselves were being deleted. So, you'd be +left with a bunch of JavaScript or CSS, just with the enclosing tags missing. +This is almost never what you want, so starting at 1.0.5 the default will be to +remove any script or style info which is enclosed in these tags, unless they +have specifically been whitelisted in the rules. This will be a sane default +when cleaning up content submitted via a web form. However, if you're using +HTML::Restrict to purge your own HTML you can be more restrictive. + + # strip the head section, in addition to JS and CSS + my $html = '<html><head>...</head><body>...<script>JS here</script>foo'; + + my $hr = HTML::Restrict->new( + strip_enclosed_content => [ 'script', 'style', 'head' ] + ); + + $html = $hr->process( $html ); + # $html is now '<html><body>...foo'; + +The caveat here is that HTML::Restrict will not try to fix broken HTML. In the +above example, if you have any opening script, style or head tags which don't +also include matching closing tags, all following content will be stripped +away, regardless of any parent tags. + +Keep in mind that changes to strip_enclosed_content are not additive, so if you +are adding additional tags you'll need to include the entire list of tags whose +enclosed content you'd like to remove. This feature strips script and style +tags by default. + +=back + +=head1 SUBROUTINES/METHODS + +=head2 process( $html ) + +This is the method which does the real work. It parses your data, removes any +tags and attributes which are not specifically allowed and returns the +resulting text. Requires and returns a SCALAR. + +=head2 get_rules + +Accessor which returns a hash ref of the current rule set. + +=head2 get_uri_schemes + +Accessor which returns an array ref of the current valid uri schemes. + +=head1 CAVEATS + +Please note that all tag and attribute names passed via the rules param must be +supplied in lower case. + + # correct + my $hr = HTML::Restrict->new( rules => { body => ['onload'] } ); + + # throws a fatal error + my $hr = HTML::Restrict->new( rules => { Body => ['onLoad'] } ); + +=head1 MOTIVATION + +There are already several modules on the CPAN which accomplish much of the same +thing, but after doing a lot of poking around, I was unable to find a solution +with a simple setup which I was happy with. + +The most common use case might be stripping HTML from user submitted data +completely or allowing just a few tags and attributes to be displayed. With +the exception of URI scheme checking, this module doesn't do any validation on +the actual content of the tags or attributes. If this is a requirement, you +can either mess with the parser object, post-process the text yourself or have +a look at one of the more feature-rich modules in the SEE ALSO section below. + +My aim here is to keep things easy and, hopefully, cover a lot of the less +complex use cases with just a few lines of code and some brief documentation. +The idea is to be up and running quickly. + +=head1 SEE ALSO + +L<HTML::TagFilter>, L<HTML::Defang>, L<MojoMojo::Declaw>, L<HTML::StripScripts>, +L<HTML::Detoxifier>, HTML::Sanitizer, L<HTML::Scrubber> + +=head1 ACKNOWLEDGEMENTS + +Thanks to Raybec Communications L<http://www.raybec.com> for funding my +work on this module and for releasing it to the world. + +Thanks also to the following for patches, bug reports and assistance: + +Mark Jubenville (ioncache) + +Duncan Forsyth + +Rick Moore + +Arthur Axel 'fREW' Schmidt + +perlpong + +David Golden + +Graham TerMarsch + +Dagfinn Ilmari Mannsåker + +Graham Knop + +Carwyn Ellis + +=head1 AUTHOR + +Olaf Alders <olaf@wundercounter.com> + +=head1 COPYRIGHT AND LICENSE + +This software is copyright (c) 2013-2017 by Olaf Alders. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. + +=cut diff --git a/perlcriticrc b/perlcriticrc new file mode 100644 index 0000000..7819a28 --- /dev/null +++ b/perlcriticrc @@ -0,0 +1,86 @@ +severity = 3 +verbose = 11 + +theme = core + pbp + bugs + maintenance + cosmetic + complexity + security + tests + moose + +exclude = Subroutines::ProhibitCallsToUndeclaredSubs + +[BuiltinFunctions::ProhibitStringySplit] +severity = 3 + +[CodeLayout::RequireTrailingCommas] +severity = 3 + +[ControlStructures::ProhibitCStyleForLoops] +severity = 3 + +[InputOutput::RequireCheckedSyscalls] +functions = :builtins +exclude_functions = sleep +severity = 3 + +[Moose::RequireCleanNamespace] +modules = Moose Moose::Role MooseX::Role::Parameterized Moose::Util::TypeConstraints +cleaners = namespace::autoclean + +[NamingConventions::Capitalization] +package_exemptions = [A-Z]\w+|minFraud +file_lexical_variables = [A-Z]\w+|[^A-Z]+ +global_variables = :starts_with_upper +scoped_lexical_variables = [A-Z]\w+|[^A-Z]+ +severity = 3 + +# Given our code base, leaving this at 5 would be a huge pain +[Subroutines::ProhibitManyArgs] +max_arguments = 10 + +[RegularExpressions::ProhibitComplexRegexes] +max_characters = 200 + +[RegularExpressions::ProhibitUnusualDelimiters] +severity = 3 + +[Subroutines::ProhibitUnusedPrivateSubroutines] +private_name_regex = _(?!build)\w+ +skip_when_using = Moo::Role Moose::Role MooseX::Role::Parameterized Role::Tiny Test::Class::Moose::Role + +[TestingAndDebugging::ProhibitNoWarnings] +allow = redefine + +[ValuesAndExpressions::ProhibitEmptyQuotes] +severity = 3 + +[ValuesAndExpressions::ProhibitInterpolationOfLiterals] +severity = 3 + +[ValuesAndExpressions::RequireUpperCaseHeredocTerminator] +severity = 3 + +[Variables::ProhibitPackageVars] +add_packages = Test::Builder + +[TestingAndDebugging::RequireUseStrict] + +[TestingAndDebugging::RequireUseWarnings] + +[-ControlStructures::ProhibitCascadingIfElse] + +[-ErrorHandling::RequireCarping] +[-InputOutput::RequireBriefOpen] + +[-ValuesAndExpressions::ProhibitConstantPragma] + +# No need for /xsm everywhere +[-RegularExpressions::RequireDotMatchAnything] +[-RegularExpressions::RequireExtendedFormatting] +[-RegularExpressions::RequireLineBoundaryMatching] + +[-Subroutines::ProhibitExplicitReturnUndef] + +# http://stackoverflow.com/questions/2275317/why-does-perlcritic-dislike-using-shift-to-populate-subroutine-variables +[-Subroutines::RequireArgUnpacking] + +[-Subroutines::RequireFinalReturn] + +# "use v5.14" is more readable than "use 5.014" +[-ValuesAndExpressions::ProhibitVersionStrings] diff --git a/perltidyrc b/perltidyrc new file mode 100644 index 0000000..b7ed624 --- /dev/null +++ b/perltidyrc @@ -0,0 +1,12 @@ +--blank-lines-before-packages=0 +--iterations=2 +--no-outdent-long-comments +-b +-bar +-boc +-ci=4 +-i=4 +-l=78 +-nolq +-se +-wbb="% + - * / x != == >= <= =~ !~ < > | & >= < = **= += *= &= <<= &&= -= /= |= >>= ||= .= %= ^= x=" diff --git a/t/00-load.t b/t/00-load.t new file mode 100644 index 0000000..1c1421c --- /dev/null +++ b/t/00-load.t @@ -0,0 +1,125 @@ +#!perl + +use Test::More; + +use strict; +use warnings; + +use Data::Dump; +use HTML::Restrict; +use Scalar::Util; + +my $version = $HTML::Restrict::VERSION || 'development'; +diag( "Testing HTML::Restrict $version, Perl $], $^X" ); + +my $hr = HTML::Restrict->new( debug => 0 ); +isa_ok( $hr, 'HTML::Restrict' ); + +isa_ok( $hr->parser, 'HTML::Parser' ); + +my $default_rules = $hr->get_rules; + +cmp_ok( Scalar::Util::reftype( $default_rules ), + 'eq', 'HASH', "default rules are empty" ); + +# basic stripping -- all tags strippped +my $bold = '<b>i am bold</b>'; +my $processed = $hr->process( $bold ); +cmp_ok( $processed, 'eq', 'i am bold', "b tag stripped" ); + +# updating rules +my $b_rules = { b => [] }; +$hr->set_rules( $b_rules ); +my $updated_rules = $hr->get_rules; +is_deeply( $b_rules, $updated_rules, "rules update correctly" ); + +# ensure allowed tags aren't stripped +$processed = $hr->process( $bold ); +cmp_ok( $processed, 'eq', $bold, "b tag not stripped" ); + +# more complex set with multiple tags +# ensure allowed tags aren't stripped and others are removed +$hr->set_rules( { a => [qw( href target )] } ); +my $link + = q[<center><a href="http://google.com" target="_blank" id="test">google</a></center>]; +my $processed_link = $hr->process( $link ); +cmp_ok( + $processed_link, 'eq', + q[<a href="http://google.com" target="_blank">google</a>], + "allowed link but not center tag", +); + +# ensure closing slash is maintained for tags +# with no end tag +$hr->set_rules( { img => [qw( src width height /)] } ); +my $img = q[<body><img src="/face.jpg" width="10" height="10" /></body>]; +my $processed_img = $hr->process( $img ); +cmp_ok( + $processed_img, 'eq', + '<img src="/face.jpg" width="10" height="10" />', + "closing slash preserved in image" +); + +# rest rules to default set +$hr->set_rules( {} ); +cmp_ok( $hr->process( $bold ), 'eq', 'i am bold', "back to default rules" ); + +# stripping of comments +cmp_ok( $hr->process( "<!-- comment this -->ok" ), + 'eq', 'ok', "comments are stripped" ); + +# stripping of javascript includes +cmp_ok( + $hr->process( + q{<script type="text/javascript" src="/js/jquery-1.3.2.js"></script>ok} + ), + 'eq', 'ok', + "javascript includes are stripped" +); + +# stripping of css includes +cmp_ok( + $hr->process( + q{<link href="/style.css" media="screen" rel="stylesheet" type="text/css" />ok} + ), + 'eq', 'ok', + "css includes are stripped" +); + +ok( $hr->trim, "trim enabled by default" ); + +# stripping of leading and trailing spaces +cmp_ok( $hr->process( " ok ok ok " ), + 'eq', 'ok ok ok', "leading and trailing spaces trimmed" ); + +# stripping of div tags +cmp_ok( $hr->process( "<div>ok</div>" ), + 'eq', 'ok', "divs are stripped away" ); + +# undef should be returned when no value is passed to the process method +is( $hr->process(), undef, "undef is returned when no value passed" ); + + +# start fresh +# RT #55775 +$hr = HTML::Restrict->new; + +cmp_ok( $hr->process( 0 ), 'eq', '0', "untrue values not processed"); +cmp_ok( $hr->process( '0' ), 'eq', '0', "untrue values not processed"); +cmp_ok( $hr->process( '000' ), 'eq', '000', "untrue values not processed"); + +ok( !$hr->process("<html>"), "process only HTML" ); + + +# bugfix: check URI scheme for "src" attributes +$hr = HTML::Restrict->new( rules => { img => [qw( src )] } ); +$hr->set_uri_schemes( [ undef, 'http', 'https' ] ); +cmp_ok( $hr->process('<img src="file:/some/file">'), 'eq', '<img>' ); + +# bugfix: ensure stripper stack is reset in case of broken html +$hr = HTML::Restrict->new; +$hr->strip_enclosed_content( ['script'] ); +$hr->process('<script < b >'); +cmp_ok($hr->process('some text'), 'eq', 'some text', "stripper stack reset"); + +done_testing(); diff --git a/t/attribute_constraints.t b/t/attribute_constraints.t new file mode 100644 index 0000000..1f58bc8 --- /dev/null +++ b/t/attribute_constraints.t @@ -0,0 +1,66 @@ +use strict; +use warnings; + +use Test::More; +use HTML::Restrict; + +my $hr = HTML::Restrict->new( + rules => { + iframe => [ + qw( width height ), + { + src => qr{^http://www\.youtube\.com}, + frameborder => qr{^(0|1)$}, + } + ], + }, +); + +cmp_ok( + $hr->process( + '<iframe width="560" height="315" frameborder="0" src="http://www.youtube.com/embed/9gKeRZM2Iyc"></iframe>' + ), + 'eq', + '<iframe width="560" height="315" frameborder="0" src="http://www.youtube.com/embed/9gKeRZM2Iyc"></iframe>', + 'all constraints pass', +); + +cmp_ok( + $hr->process( + '<iframe width="560" height="315" src="http://www.hostile.com/" frameborder="0"></iframe>' + ), + 'eq', + '<iframe width="560" height="315" frameborder="0"></iframe>', + 'one constraint fails', +); + +cmp_ok( + $hr->process( + '<iframe width="560" height="315" src="http://www.hostile.com/" frameborder="A"></iframe>' + ), + 'eq', + '<iframe width="560" height="315"></iframe>', + 'two constraints fail', +); + +$hr = HTML::Restrict->new( + rules => { + iframe => [ + { src => qr{^http://www\.youtube\.com} }, + { frameborder => qr{^(0|1)$} }, + { height => qr{^315$} }, + { width => qr{^560$} }, + ], + }, +); + +cmp_ok( + $hr->process( + '<iframe width="560" height="315" frameborder="0" src="http://www.youtube.com/embed/9gKeRZM2Iyc"></iframe>' + ), + 'eq', + '<iframe src="http://www.youtube.com/embed/9gKeRZM2Iyc" frameborder="0" height="315" width="560"></iframe>', + 'possible to maintain order', +); + +done_testing; diff --git a/t/author-pod-coverage.t b/t/author-pod-coverage.t new file mode 100644 index 0000000..243340f --- /dev/null +++ b/t/author-pod-coverage.t @@ -0,0 +1,15 @@ +#!perl + +BEGIN { + unless ($ENV{AUTHOR_TESTING}) { + print qq{1..0 # SKIP these tests are for testing by the author\n}; + exit + } +} + +# This file was automatically generated by Dist::Zilla::Plugin::PodCoverageTests. + +use Test::Pod::Coverage 1.08; +use Pod::Coverage::TrustPod; + +all_pod_coverage_ok({ coverage_class => 'Pod::Coverage::TrustPod' }); diff --git a/t/author-pod-spell.t b/t/author-pod-spell.t new file mode 100644 index 0000000..8b32a99 --- /dev/null +++ b/t/author-pod-spell.t @@ -0,0 +1,74 @@ + +BEGIN { + unless ($ENV{AUTHOR_TESTING}) { + print qq{1..0 # SKIP these tests are for testing by the author\n}; + exit + } +} + +use strict; +use warnings; +use Test::More; + +# generated by Dist::Zilla::Plugin::Test::PodSpelling 2.007004 +use Test::Spelling 0.12; +use Pod::Wordlist; + + +add_stopwords(<DATA>); +all_pod_files_spelling_ok( qw( bin lib ) ); +__DATA__ +Alders +Alders' +Arthur +Axel +Ben +Bullock +Carwyn +DOCTYPE +Dagfinn +David +Elliott +Etheridge +Fitz +Forsyth +Golden +Graham +HTML +Ilmari +Jubenville +Karen +Knop +Mannsåker +Mark +Olaf +Raybec +Restrict +Schmidt +TerMarsch +XSS +benkasminbullock +bolded +calyx238 +dagolden +ether +fREW +fitz +frioux +graham +haarg +href +img +ioncache +lib +mark +olaf +param +params +perlpong +sam +schemas +skaufman +src +whitelisted +whitelisting diff --git a/t/author-synopsis.t b/t/author-synopsis.t new file mode 100644 index 0000000..5d1d4a7 --- /dev/null +++ b/t/author-synopsis.t @@ -0,0 +1,13 @@ +#!perl + +BEGIN { + unless ($ENV{AUTHOR_TESTING}) { + print qq{1..0 # SKIP these tests are for testing by the author\n}; + exit + } +} + + +use Test::Synopsis; + +all_synopsis_ok(); diff --git a/t/author-tidyall.t b/t/author-tidyall.t new file mode 100644 index 0000000..f161a3e --- /dev/null +++ b/t/author-tidyall.t @@ -0,0 +1,19 @@ + +BEGIN { + unless ($ENV{AUTHOR_TESTING}) { + print qq{1..0 # SKIP these tests are for testing by the author\n}; + exit + } +} + +# This file was automatically generated by Dist::Zilla::Plugin::Test::TidyAll v$VERSION + +use Test::More 0.88; +use Test::Code::TidyAll 0.24; + +tidyall_ok( + verbose => ( exists $ENV{TEST_TIDYALL_VERBOSE} ? $ENV{TEST_TIDYALL_VERBOSE} : 0 ), + jobs => ( exists $ENV{TEST_TIDYALL_JOBS} ? $ENV{TEST_TIDYALL_JOBS} : 1 ), +); + +done_testing; diff --git a/t/comments.t b/t/comments.t new file mode 100644 index 0000000..2ceccbb --- /dev/null +++ b/t/comments.t @@ -0,0 +1,33 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use HTML::Restrict; +use Test::More; + +my $hr = HTML::Restrict->new; + +my $text = '<!-- comment here -->stuff'; +$hr->debug(0); + +is $hr->process($text), 'stuff', 'comments allowed'; +$hr->allow_comments(1); +is $hr->process($text), $text, 'comments allowd'; + +$text = 'before<!-- This is a comment -- -- So is this -->after'; +$hr->allow_comments(0); + +is $hr->process($text), 'beforeafter', 'comment allowed'; + +$hr->allow_comments(1); +is $hr->process($text), $text, 'comments allowd'; + +$hr->allow_comments(0); +$text = '<!-- <script> <h1> -->'; +is $hr->process($text), undef, 'tags nested in comments removed'; + +#$hr->set_rules({ script => [], 'h1' => [] }); +#is $hr->process( $text ), $text, 'tags nested in comments not removed when explicitly allowed'; + +done_testing(); diff --git a/t/declaration.t b/t/declaration.t new file mode 100644 index 0000000..2d5e9bf --- /dev/null +++ b/t/declaration.t @@ -0,0 +1,22 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use HTML::Restrict; +use Test::More; + +my $hr = HTML::Restrict->new; + +my $text = '<!DOCTYPE HTML> '; +$hr->debug(1); + +is $hr->process($text), '', 'declaration not preserved'; +$hr->allow_declaration(1); +is $hr->process($text), '<!DOCTYPE HTML>', 'declaration is preserved'; + +$text + = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'; +is $hr->process($text), $text, 'declaration preserved'; + +done_testing(); @@ -0,0 +1,24 @@ +#!perl + +use strict; +use warnings; + +use HTML::Restrict; +use Scalar::Util; +use Test::More; + +my $hr = HTML::Restrict->new( debug => 0 ); + +my $html = q[<script type="text/javascript"> +$(document).ready(function() { + $('a.gallery').fancybox(); +}); +</script>]; + +is( $hr->process($html), undef, "content of script tags removed by default" ); + +$hr->set_rules( { script => ['type'] } ); + +is( $hr->process($html), $html, "content of script preserved" ); + +done_testing(); diff --git a/t/lowercase.t b/t/lowercase.t new file mode 100644 index 0000000..7284884 --- /dev/null +++ b/t/lowercase.t @@ -0,0 +1,29 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use Test::More; +use Test::Fatal; +use HTML::Restrict; + +my $html + = q[<!doctype html><!-- comments go here --><body onLoad="stuff">foo</body>]; + +like( + exception { + my $hr = HTML::Restrict->new( rules => { Body => ['onload'] } ); + }, + qr{tag names must be lower cased}, + "dies on mixed case tag names", +); + +like( + exception { + my $hr = HTML::Restrict->new( rules => { body => ['onLoad'] } ); + }, + qr{attribute names must be lower cased}, + "dies on mixed case attributes", +); + +done_testing(); diff --git a/t/memory-leak.t b/t/memory-leak.t new file mode 100644 index 0000000..7111746 --- /dev/null +++ b/t/memory-leak.t @@ -0,0 +1,24 @@ +use strict; +use warnings; + +use Test::More; +use HTML::Restrict; +use Scalar::Util qw(weaken); + +# Ensure that we don't have any circular references between the HTML::Restrict +# object and its parser. +my $hr = HTML::Restrict->new; +my $p = $hr->parser; + +my $weak_hr = $hr; +my $weak_p = $p; +weaken($weak_hr); +weaken($weak_p); + +undef $hr; +undef $p; + +ok !defined $weak_hr, 'HTML::Restrict freed; no circular reference.'; +ok !defined $weak_p, 'HTML::Parser freed; no circular reference.'; + +done_testing(); diff --git a/t/perlcriticrc b/t/perlcriticrc new file mode 100644 index 0000000..d6574dd --- /dev/null +++ b/t/perlcriticrc @@ -0,0 +1,4 @@ +severity = 3 +[-Subroutines::RequireArgUnpacking] +[-ControlStructures::ProhibitPostfixControls] +[-Miscellanea::RequireRcsKeywords] @@ -0,0 +1,12 @@ +#!perl -T + +use strict; +use warnings; +use Test::More; + +# Ensure a recent version of Test::Pod +my $min_tp = 1.22; +eval "use Test::Pod $min_tp"; +plan skip_all => "Test::Pod $min_tp required for testing POD" if $@; + +all_pod_files_ok(); diff --git a/t/release-cpan-changes.t b/t/release-cpan-changes.t new file mode 100644 index 0000000..08331d3 --- /dev/null +++ b/t/release-cpan-changes.t @@ -0,0 +1,18 @@ + +BEGIN { + unless ($ENV{RELEASE_TESTING}) { + print qq{1..0 # SKIP these tests are for release candidate testing\n}; + exit + } +} + +use strict; +use warnings; + +# this test was generated with Dist::Zilla::Plugin::Test::CPAN::Changes 0.012 + +use Test::More 0.96 tests => 1; +use Test::CPAN::Changes; +subtest 'changes_ok' => sub { + changes_file_ok('Changes'); +}; diff --git a/t/replace_img.t b/t/replace_img.t new file mode 100644 index 0000000..f40aa90 --- /dev/null +++ b/t/replace_img.t @@ -0,0 +1,62 @@ +#!perl + +use strict; +use warnings; + +use HTML::Restrict; +use Scalar::Util; +use Test::More; + +my @texts = ( + { + label => "<img ... />", + html => q{<img alt="foo bar" src="http://example.com/foo.jpg" />}, + }, + { + label => "<img ... ></img>", + html => q{<img alt="foo bar" src="http://example.com/foo.jpg"></img>}, + }, +); + +my @cases = ( + { + label => "default args", + args => {}, + expect => undef, + }, + { + label => "replace_img => 0", + args => { replace_img => 0 }, + expect => undef, + }, + { + label => "replace_img => 1", + args => { replace_img => 1 }, + expect => q{[IMAGE: foo bar]}, + }, + { + label => "replace_img => CODE", + args => { replace_img => \&replacer }, + expect => q{[IMAGE REMOVED: foo bar]}, + }, +); + +sub replacer { + my ( $tag, $attr, $text ) = @_; + return "[IMAGE REMOVED: $attr->{alt}]"; +} + +for my $c (@cases) { + ok( + my $hr = HTML::Restrict->new( debug => 0, %{ $c->{args} } ), + "$c->{label}: HTML::Restrict->new(...)" + ); + for my $t (@texts) { + is( + $hr->process( $t->{html} ), $c->{expect}, + "$c->{label}: $t->{label}" + ); + } +} + +done_testing(); diff --git a/t/scheme.t b/t/scheme.t new file mode 100644 index 0000000..8bebd90 --- /dev/null +++ b/t/scheme.t @@ -0,0 +1,70 @@ +use strict; +use warnings; + +use Test::More; +use HTML::Restrict; + +my $hr = HTML::Restrict->new( + rules => { + a => [qw( href )], + img => [qw( src /)], + blockquote => [qw( cite )], + }, +); + +$hr->set_uri_schemes( [ 'http', 'https', undef, 'ftp' ] ); + +cmp_ok( + $hr->process('<a href="http://example.com">link</a>'), + 'eq', '<a href="http://example.com">link</a>', + 'http scheme preserved', +); + +cmp_ok( + $hr->process('<a href="https://example.com">link</a>'), + 'eq', '<a href="https://example.com">link</a>', + 'https scheme preserved', +); + +cmp_ok( + $hr->process('<a href="/some/file">link</a>'), + 'eq', '<a href="/some/file">link</a>', + 'relative scheme preserved', +); + +cmp_ok( + $hr->process('<a href="ftp://example.com">link</a>'), + 'eq', '<a href="ftp://example.com">link</a>', + 'ftp scheme preserved', +); + +cmp_ok( + $hr->process('<a href="file://example.com">link</a>'), + 'eq', '<a>link</a>', + 'file scheme removed', +); + +cmp_ok( + $hr->process('<img src="javascript:evil_fc()" />'), + 'eq', '<img />', + 'img src with javascript removed', +); + +cmp_ok( + $hr->process( + '<blockquote cite="javascript:evil_fc()">blockquote</blockquote>'), + 'eq', + '<blockquote>blockquote</blockquote>', + 'blockquote cite with javascript removed', +); + +# disable relative schemes +$hr->set_uri_schemes( [ 'http', 'https', 'ftp' ] ); + +cmp_ok( + $hr->process('<a href="/some/file">link</a>'), + 'eq', '<a>link</a>', + 'relative scheme removed', +); + +done_testing(); diff --git a/t/stack.t b/t/stack.t new file mode 100644 index 0000000..b95f0ff --- /dev/null +++ b/t/stack.t @@ -0,0 +1,19 @@ +use strict; +use warnings; + +use Test::More; +use HTML::Restrict; + +my $hr = HTML::Restrict->new; + +ok( !@{ $hr->_stripper_stack }, "stack empty" ); + +push @{ $hr->_stripper_stack }, 'script', 'style', 'pre', 'script'; +$hr->_delete_tag_from_stack('script'); +is_deeply( + $hr->_stripper_stack, + [ 'script', 'style', 'pre' ], + 'deletes from stack in correct order and amount' +); + +done_testing(); diff --git a/t/style.t b/t/style.t new file mode 100644 index 0000000..2491fa5 --- /dev/null +++ b/t/style.t @@ -0,0 +1,24 @@ +#!perl + +use strict; +use warnings; + +use HTML::Restrict; +use Scalar::Util; +use Test::More; + +my $hr = HTML::Restrict->new( debug => 0 ); + +my $html = q[<style type="text/css"> +hr {color:sienna;} +p {margin-left:20px;} +body {background-image:url("images/back40.gif");} +</style>]; + +is( $hr->process($html), undef, "content of style tag removed by default" ); + +$hr->set_rules( { style => ['type'] } ); + +is( $hr->process($html), $html, "content of style tag preserved" ); + +done_testing(); @@ -0,0 +1,35 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Test::More; +use HTML::Restrict; + +my $hr = HTML::Restrict->new; +$hr->debug(0); +$hr->set_rules( { a => [ 'href', 'class' ] } ); + +my $text = '<a href="javascript:alert(1)">oops!</a>'; + +my $clean = $hr->process($text); +is $clean, '<a>oops!</a>', "bad scheme removed"; + +is $hr->process('<a href="javascript:evil_script()">evil</a>'), + '<a>evil</a>', 'bad scheme removed'; + +foreach my $uri ( + 'http://vilerichard.com', 'https://vilerichard.com', + '//vilerichard.com', '/music' + ) { + my $img = qq[<a href="$uri">click</a>]; + is $hr->process($img), $img, "good uri scheme preserved"; +} + +is $hr->process( + '<a class=""><script>alert("oops");</script><a href=""></a>' + ), + '<a class=""><script>alert("oops");</script><a href=""></a>', + 'attribute value filtered'; + +done_testing(); diff --git a/tidyall.ini b/tidyall.ini new file mode 100644 index 0000000..6b3438d --- /dev/null +++ b/tidyall.ini @@ -0,0 +1,26 @@ +[PerlTidy] +select = **/*.{pl,pm,t,psgi} +ignore = t/00-* +ignore = t/author-* +ignore = t/release-* +ignore = blib/**/* +ignore = .build/**/* +ignore = HTML-Restrict-*/**/* +argv = --profile=$ROOT/perltidyrc + +;[PerlCritic] +;select = **/*.{pl,pm,t,psgi} +;ignore = t/00-* +;ignore = t/author-* +;ignore = t/release-* +;ignore = blib/**/* +;ignore = .build/**/* +;ignore = HTML-Restrict-*/**/* +;argv = --profile $ROOT/perlcriticrc --program-extensions .pl --program-extensions .t --program-extensions .psgi + +[SortLines::Naturally] +select = .gitignore +select = .stopwords + +[JSON] +select = t/**/*json |