author     Fernando Farfan <ffarfan@gmail.com>  2015-04-16 16:53:29 -0600
committer  Fernando Farfan <ffarfan@gmail.com>  2015-05-05 11:10:58 -0600
commit     e1ea46ae801645ccb85ad38b4c672f1c3c758a44 (patch)
tree       c05e34b2705f1fa16aa362dd7e9c1632a6bd9798 /licenses/03-cleanup-cc-html-files.py
parent     ac48476eadc3a7584468bef143604133ac716e21 (diff)
Scripts to download and organize CCLicense files
These scripts download and prepare all the HTML files that are required to publish Creative Commons licenses. [endlessm/eos-sdk#3025]
Diffstat (limited to 'licenses/03-cleanup-cc-html-files.py')
-rwxr-xr-x  licenses/03-cleanup-cc-html-files.py  130
1 file changed, 130 insertions(+), 0 deletions(-)
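
For context, the script added below expects the HTML fetched by the earlier scripts in this series to live under a creativecommons/ directory, with the legalcode pages in a legalcode/ subdirectory and one deed page per language directory ('C' being the untranslated default). That layout is inferred from the os.path.join() calls in the diff rather than documented in the commit; the snippet here is an illustration only and simply prints the files the cleanup pass would look for:

    # Illustration (not part of the commit): list the files that
    # 03-cleanup-cc-html-files.py expects, based on its path handling.
    import os

    source_dir = 'creativecommons/'
    licenses = ['CC-BY-3.0', 'CC-BY-4.0', 'CC-BY-SA-3.0',
                'CC-BY-SA-4.0', 'CC-BY-ND-2.0', 'CC-BY-ND-3.0']
    langs = ['C', 'ar', 'es', 'fr', 'pt_BR']

    for lic in licenses:
        print(os.path.join(source_dir, 'legalcode', lic + '-legalcode.html'))
        for lang in langs:
            print(os.path.join(source_dir, lang, lic + '.html'))
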
diff --git a/licenses/03-cleanup-cc-html-files.py b/licenses/03-cleanup-cc-html-files.py
new file mode 100755
index 0000000..b67eeac
--- /dev/null
+++ b/licenses/03-cleanup-cc-html-files.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+
+from bs4 import BeautifulSoup, Comment
+import os
+import re
+import sys
+
+def main(argv):
+    source_dir = 'creativecommons/'
+
+    licenses = [
+        'CC-BY-3.0',
+        'CC-BY-4.0',
+        'CC-BY-SA-3.0',
+        'CC-BY-SA-4.0',
+        'CC-BY-ND-2.0',
+        'CC-BY-ND-3.0',
+    ]
+
+    langs = ['C', 'ar', 'es', 'fr', 'pt_BR']
+
+    for license in licenses:
+        cleanup_legalcode_file(source_dir, license)
+        for lang in langs:
+            cleanup_deed_file(source_dir, license, lang)
+        print ''
+
+def cleanup_legalcode_file(src_dir, license):
+    f_legalcode = os.path.join(src_dir, 'legalcode', license + '-legalcode.html')
+    print 'Cleaning up ' + f_legalcode
+
+    with open(f_legalcode, 'r+') as f:
+        html = f.read()
+        soup = BeautifulSoup(html, from_encoding="UTF-8")
+
+        # Remove IE7 conditional comments
+        cleanup_conditional_comments(soup)
+
+        # Remove errata.js script (filter on src= so inline scripts don't raise KeyError)
+        for script in soup.findAll('script', src=True):
+            if 'errata' in script['src']:
+                script.extract()
+
+        # Make attributes relative
+        rewrite_attr(soup, 'img', 'src', '.*creativecommons.org/images/', '../images/')
+        rewrite_attr(soup, 'img', 'src', '^/images/', '../images/')
+        rewrite_attr(soup, 'link', 'href', '.*creativecommons.org/includes/', '../includes/')
+        rewrite_attr(soup, 'link', 'href', '^/includes/', '../includes/')
+        rewrite_attr(soup, 'script', 'src', '.*creativecommons.org/includes/', '../includes/')
+        rewrite_attr(soup, 'a', 'href', '^creativecommons.org/', 'http://creativecommons.org/')
+        rewrite_attr(soup, 'a', 'href', '^//creativecommons.org/', 'http://creativecommons.org/')
+
+        # Remove footer
+        for foot in soup.findAll('div', {'id': 'deed-foot'}):
+            foot.extract()
+
+        # Overwrite legalcode file with clean version of html
+        html = soup.prettify(soup.original_encoding)
+        f.seek(0)
+        f.truncate()
+        f.write(html)
+        f.close()
+
+def cleanup_deed_file(src_dir, license, lang):
+    f_deed = os.path.join(src_dir, lang, license + '.html')
+    print 'Cleaning up ' + f_deed
+
+    with open(f_deed, 'r+') as f:
+        html = f.read()
+        soup = BeautifulSoup(html, from_encoding="UTF-8")
+
+        # Remove IE7 conditional comments
+        cleanup_conditional_comments(soup)
+
+        # Remove RDF declarations
+        cleanup_rdf_declarations(soup)
+
+        # Remove deed.js script
+        for script in soup.findAll('script', src=True):
+            if 'scraper.creativecommons.org/js/deed.js' in script['src']:
+                script.extract()
+
+        # Make attributes relative
+        rewrite_attr(soup, 'a', 'href', '.*legalcode$', '../legalcode/' + license + '-legalcode.html')
+        rewrite_attr(soup, 'a', 'href', '^/choose/', 'http://creativecommons.org/choose/')
+        rewrite_attr(soup, 'img', 'src', '.*creativecommons.org/images/', '../images/')
+        rewrite_attr(soup, 'img', 'src', '^/images/', '../images/')
+        rewrite_attr(soup, 'link', 'href', '.*creativecommons.org/includes/', '../includes/')
+        rewrite_attr(soup, 'link', 'href', '^/includes/', '../includes/')
+        rewrite_attr(soup, 'script', 'src', '.*creativecommons.org/includes/', '../includes/')
+        rewrite_attr(soup, 'script', 'src', '^/includes/', '../includes/')
+        rewrite_attr(soup, 'script', 'src', '^//scraper.creativecommons.org/js/deed.js', '../includes/deed.js')
+
+        # Remove inline JS
+        for script in soup.findAll('script'):
+            if not script.has_key('src'):
+                script.extract()
+
+        # Remove languages footer
+        for lang_footer in soup.findAll('div', {'id': 'languages'}):
+            lang_footer.extract()
+
+        # Overwrite deed file with clean version of html
+        html = soup.prettify(soup.original_encoding)
+        f.seek(0)
+        f.truncate()
+        f.write(html)
+        f.close()
+
+def rewrite_attr(html, elem, attr, source, target):
+    for element in html.findAll(elem):
+        if element.has_key(attr):
+            attr_val = re.sub(source, target, element[attr])
+            element[attr] = attr_val
+
+def cleanup_conditional_comments(html):
+    comments = html.findAll(text=lambda text: isinstance(text, Comment) and '[if' in text)
+    for comment in comments:
+        comment.extract()
+
+def cleanup_rdf_declarations(html):
+    decls = html.findAll(text=lambda text: isinstance(text, Comment) and 'RDF' in text)
+    for decl in decls:
+        decl.extract()
+
+    for rdf in html.findAll('link', {'href': 'rdf'}):
+        rdf.extract()
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
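
As a usage note, every "make attributes relative" step above comes down to the same technique: find all tags of a given name and run a regular-expression substitution over one of their attributes. A minimal standalone sketch of that technique follows; the sample markup is invented for illustration and is not taken from the Creative Commons pages:

    # Sketch of the rewrite_attr() approach: rewrite an absolute
    # creativecommons.org image URL to a relative one in a tiny snippet.
    from bs4 import BeautifulSoup
    import re

    sample = '<p><img src="https://creativecommons.org/images/deed/logo.jpg"/></p>'
    soup = BeautifulSoup(sample, 'html.parser')

    for img in soup.find_all('img'):
        if img.has_attr('src'):
            img['src'] = re.sub('.*creativecommons.org/images/', '../images/', img['src'])

    print(soup.prettify())
    # The img tag now reads src="../images/deed/logo.jpg"
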