summary refs log tree commit diff
path: root/licenses/03-cleanup-cc-html-files.py
diff options
context:
space:
mode:
Diffstat (limited to 'licenses/03-cleanup-cc-html-files.py')
-rwxr-xr-x licenses/03-cleanup-cc-html-files.py 137
1 files changed, 0 insertions, 137 deletions
diff --git a/licenses/03-cleanup-cc-html-files.py b/licenses/03-cleanup-cc-html-files.py
deleted file mode 100755
index c81ca7d..0000000
--- a/licenses/03-cleanup-cc-html-files.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python3
-
-from bs4 import BeautifulSoup, Comment
-import os
-import re
-import sys
-
-from license_utils import rewrite_attr
-
def main(argv):
    """Clean up every downloaded legalcode and deed HTML file in place.

    The public domain (CC0) files are processed first, followed by the
    Creative Commons licenses. For each license the legalcode file is
    cleaned once and the deed file is cleaned once per language.

    Args:
        argv: Command-line arguments; accepted but not used.
    """
    locales = ['C', 'ar', 'es', 'fr', 'pt_BR']

    # (source directory, license identifiers) pairs, handled in this order.
    license_groups = (
        ('publicdomain/', (
            'CC0-1.0',
        )),
        ('creativecommons/', (
            'CC-BY-2.0',
            'CC-BY-3.0',
            'CC-BY-4.0',
            'CC-BY-NC-2.0',
            'CC-BY-NC-3.0',
            'CC-BY-NC-SA-2.0',
            'CC-BY-ND-2.0',
            'CC-BY-ND-3.0',
            'CC-BY-SA-2.0',
            'CC-BY-SA-2.5',
            'CC-BY-SA-3.0',
            'CC-BY-SA-4.0',
        )),
    )

    for src_dir, license_ids in license_groups:
        for license_id in license_ids:
            cleanup_legalcode_file(src_dir, license_id)
            for locale in locales:
                cleanup_deed_file(src_dir, license_id, locale)
            # Blank line separating the progress output of each license.
            # NOTE(review): assumed to be printed once per license - confirm.
            print('')
-
def cleanup_legalcode_file(src_dir, license):
    """Clean up one downloaded legalcode HTML file in place.

    The file ``<src_dir>/legalcode/<license>-legalcode.html`` is parsed,
    conditional comments, the errata script and the ``deed-foot`` footer are
    removed, image/stylesheet/script/anchor URLs are rewritten, and the
    prettified markup is written back over the original file.

    Args:
        src_dir: Directory that contains the ``legalcode`` subdirectory.
        license: License identifier used as the file name stem
            (for example ``'CC-BY-4.0'``).

    Raises:
        OSError: If the file cannot be opened for reading and writing.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    f_legalcode = os.path.join(src_dir, 'legalcode', license + '-legalcode.html')
    print('Cleaning up ' + f_legalcode)

    # Decode and encode explicitly as UTF-8 rather than relying on the locale
    # default. BeautifulSoup is handed already-decoded text, so it needs no
    # ``from_encoding`` hint (it would be ignored with a warning).
    with open(f_legalcode, 'r+', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read())

        # Remove IE7 conditional comments
        cleanup_conditional_comments(soup)

        # Remove errata.js script. ``src=True`` skips inline scripts, which
        # have no ``src`` attribute and would otherwise raise KeyError.
        for script in soup.find_all('script', src=True):
            if 'errata' in script['src']:
                script.extract()

        # Make attributes relative
        rewrite_attr(soup, 'img', 'src', '.*creativecommons.org/images/', '../../images/')
        rewrite_attr(soup, 'img', 'src', '^/images/', '../../images/')
        rewrite_attr(soup, 'link', 'href', '.*creativecommons.org/includes/', '../../includes/')
        rewrite_attr(soup, 'link', 'href', '^/includes/', '../../includes/')
        rewrite_attr(soup, 'script', 'src', '.*creativecommons.org/includes/', '../../includes/')
        rewrite_attr(soup, 'a', 'href', '^creativecommons.org/', 'http://creativecommons.org/')
        rewrite_attr(soup, 'a', 'href', '^//creativecommons.org/', 'http://creativecommons.org/')

        # Remove footer
        for foot in soup.find_all('div', {'id': 'deed-foot'}):
            foot.extract()

        # Overwrite legalcode file with clean version of html. The file is
        # rewound and emptied first so no stale tail remains if the cleaned
        # markup is shorter than the original.
        f.seek(0)
        f.truncate()
        f.write(soup.prettify())
-
def cleanup_deed_file(src_dir, license, lang):
    """Clean up one downloaded deed HTML file in place.

    The file ``<src_dir>/<lang>/<license>.html`` is parsed, conditional
    comments, RDF declarations, the remote deed.js script, inline scripts
    and the ``languages`` footer are removed, URLs are rewritten, and the
    prettified markup is written back over the original file.

    Args:
        src_dir: Directory that contains the per-language subdirectories.
        license: License identifier used as the file name stem
            (for example ``'CC-BY-4.0'``).
        lang: Language subdirectory name (for example ``'pt_BR'``).

    Raises:
        OSError: If the file cannot be opened for reading and writing.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    f_deed = os.path.join(src_dir, lang, license + '.html')
    print('Cleaning up ' + f_deed)

    # Decode and encode explicitly as UTF-8 rather than relying on the locale
    # default. BeautifulSoup is handed already-decoded text, so it needs no
    # ``from_encoding`` hint (it would be ignored with a warning).
    with open(f_deed, 'r+', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read())

        # Remove IE7 conditional comments
        cleanup_conditional_comments(soup)

        # Remove RDF declarations
        cleanup_rdf_declarations(soup)

        # Remove deed.js script
        for script in soup.find_all('script', src=True):
            if 'scraper.creativecommons.org/js/deed.js' in script['src']:
                script.extract()

        # Make attributes relative
        rewrite_attr(soup, 'a', 'href', '.*legalcode$', '../legalcode/' + license + '-legalcode.html')
        rewrite_attr(soup, 'a', 'href', '^/choose/', 'http://creativecommons.org/choose/')
        rewrite_attr(soup, 'img', 'src', '.*creativecommons.org/images/', '../../images/')
        rewrite_attr(soup, 'img', 'src', '^/images/', '../../images/')
        rewrite_attr(soup, 'link', 'href', '.*creativecommons.org/includes/', '../../includes/')
        rewrite_attr(soup, 'link', 'href', '^/includes/', '../../includes/')
        rewrite_attr(soup, 'script', 'src', '.*creativecommons.org/includes/', '../../includes/')
        rewrite_attr(soup, 'script', 'src', '^/includes/', '../../includes/')
        # NOTE(review): scripts whose src contains this deed.js URL were
        # already removed above, so this rewrite presumably matches nothing;
        # kept unchanged - confirm before dropping.
        rewrite_attr(soup, 'script', 'src', '^//scraper.creativecommons.org/js/deed.js', '../../includes/deed.js')

        # Remove inline JS (``has_attr`` replaces the deprecated ``has_key``)
        for script in soup.find_all('script'):
            if not script.has_attr('src'):
                script.extract()

        # Remove languages footer
        for lang_footer in soup.find_all('div', {'id': 'languages'}):
            lang_footer.extract()

        # Overwrite deed file with clean version of html. The file is rewound
        # and emptied first so no stale tail remains if the cleaned markup is
        # shorter than the original.
        f.seek(0)
        f.truncate()
        f.write(soup.prettify())
-
def cleanup_conditional_comments(html):
    """Remove every HTML comment whose text contains ``[if`` from *html*.

    Args:
        html: Parsed document (or subtree) to modify in place.
    """
    def is_conditional_comment(node):
        # Only comment nodes qualify; other text nodes are left alone.
        return isinstance(node, Comment) and '[if' in node

    for conditional in html.findAll(text=is_conditional_comment):
        conditional.extract()
-
def cleanup_rdf_declarations(html):
    """Remove RDF comments and ``<link href="rdf">`` elements from *html*.

    Args:
        html: Parsed document (or subtree) to modify in place.
    """
    def is_rdf_comment(node):
        # Only comment nodes that mention RDF are removed.
        return isinstance(node, Comment) and 'RDF' in node

    for rdf_comment in html.findAll(text=is_rdf_comment):
        rdf_comment.extract()

    # Links are matched on an href that is exactly 'rdf'.
    rdf_links = html.findAll('link', {'href': 'rdf'})
    for rdf_link in rdf_links:
        rdf_link.extract()
-
# Run the cleanup only when executed as a script, not when imported.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)