summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFernando Farfan <ffarfan@gmail.com>2015-04-16 16:53:29 -0600
committerFernando Farfan <ffarfan@gmail.com>2015-05-05 11:10:58 -0600
commite1ea46ae801645ccb85ad38b4c672f1c3c758a44 (patch)
treec05e34b2705f1fa16aa362dd7e9c1632a6bd9798
parentac48476eadc3a7584468bef143604133ac716e21 (diff)
Scripts to download and organize CCLicense files
These scripts download and prepare all the HTML files that are required to publish Creative Commons licenses. [endlessm/eos-sdk#3025]
-rwxr-xr-xlicenses/01-download-cc-licenses.sh18
-rwxr-xr-xlicenses/02-organize-cc-files.py92
-rwxr-xr-xlicenses/03-cleanup-cc-html-files.py130
3 files changed, 240 insertions, 0 deletions
diff --git a/licenses/01-download-cc-licenses.sh b/licenses/01-download-cc-licenses.sh
new file mode 100755
index 0000000..88e05e7
--- /dev/null
+++ b/licenses/01-download-cc-licenses.sh
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Download the Creative Commons legalcode and deed pages (plus their page
# requisites) for the license types/versions we ship.  wget mirrors them
# under ./creativecommons.org/; per-download progress goes to wget-cc.log.

licenses=("by/3.0" "by/4.0" "by-sa/3.0" "by-sa/4.0" "by-nd/2.0" "by-nd/3.0")
files=("legalcode" "deed.ar" "deed.en" "deed.es" "deed.fr" "deed.pt_BR")

# Remove log file if exists
rm -f wget-cc.log

# Download legalcode and deed files for each license type and version.
# Variable expansions are quoted so the loop stays correct even if an
# entry ever contains whitespace or glob characters.
for license in "${licenses[@]}"
do
    echo "$license"
    for file in "${files[@]}"
    do
        echo "  $file"
        wget --recursive --level=1 --adjust-extension --page-requisites \
             --no-clobber --no-verbose \
             "https://creativecommons.org/licenses/$license/$file" \
             --append-output=wget-cc.log
    done
done
diff --git a/licenses/02-organize-cc-files.py b/licenses/02-organize-cc-files.py
new file mode 100755
index 0000000..457b9a6
--- /dev/null
+++ b/licenses/02-organize-cc-files.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+import os
+import shutil
+import sys
+
def main(argv):
    """Reorganize the wget mirror into the layout the licenses need."""
    mirror_root = 'creativecommons.org/'
    organized_root = 'creativecommons/'

    copy_license_files(mirror_root + 'licenses/', organized_root)
    copy_requisite_files(mirror_root, organized_root)
+
def copy_license_files(source_dir, target_dir):
    """Copy downloaded legalcode and deed pages into the target layout.

    Each license's legalcode page goes to
    <target>/legalcode/<CODE>-legalcode.html and each translated deed goes
    to <target>/<lang>/<CODE>.html.  Finally the 'en' subdirectory is
    renamed to 'C', the default locale.

    source_dir -- the mirrored 'creativecommons.org/licenses/' directory
    target_dir -- root of the organized output tree (created if missing)
    """
    licenses = [
        'by/3.0',
        'by/4.0',
        'by-sa/3.0',
        'by-sa/4.0',
        'by-nd/2.0',
        'by-nd/3.0',
    ]

    langs = ['ar', 'en', 'es', 'fr', 'pt_BR']

    ensure_target_dirs_exist(target_dir, langs)

    for license in licenses:
        license_code = get_code_for_license(license)

        source_license_dir = source_dir + license + '/'
        target_legalcode_dir = target_dir + 'legalcode/'

        # Copy legalcode file for license.  print() calls (instead of the
        # original Python-2-only print statements) work on Python 2 and 3.
        source_legalcode_path = source_license_dir + 'legalcode.html'
        target_legalcode_path = (target_legalcode_dir + license_code +
                                 '-legalcode.html')
        print('cp ' + source_legalcode_path + ' ' + target_legalcode_path)
        shutil.copy(source_legalcode_path, target_legalcode_path)

        # Copy deeds for individual languages
        for lang in langs:
            target_lang_dir = target_dir + lang + '/'
            source_deed_path = source_license_dir + 'deed.' + lang + '.html'
            target_deed_path = target_lang_dir + license_code + '.html'

            print('cp ' + source_deed_path + ' ' + target_deed_path)
            shutil.copy(source_deed_path, target_deed_path)

    # Rename 'en' subdir to default locale 'C'; drop any stale 'C' first so
    # shutil.move cannot fail on an existing destination.
    print('mv -f ' + target_dir + 'en/ ' + target_dir + 'C/')
    shutil.rmtree(target_dir + 'C', ignore_errors=True)
    shutil.move(target_dir + 'en', target_dir + 'C')
+
def copy_requisite_files(source_dir, target_dir):
    """Copy shared page requisites (images/, includes/) into the target tree.

    Any stale copy in the target is removed first so shutil.copytree cannot
    fail on an existing destination directory.
    """
    for req in ['images/', 'includes/']:
        print('cp ' + source_dir + req + ' ' + target_dir + req)
        # ignore_errors already covers the "does not exist" case, so the
        # original try/except IOError wrapper was dead code.
        shutil.rmtree(target_dir + req, ignore_errors=True)
        # BUG FIX: the original called copytree(target, source) after
        # rmtree-ing the SOURCE -- i.e. it wiped the downloaded mirror and
        # copied in the wrong direction.  Requisites must flow from the
        # wget mirror into the organized target, as the log line says.
        shutil.copytree(source_dir + req, target_dir + req)
+
def ensure_target_dirs_exist(target_dir, langs):
    """Create the target directory tree if needed.

    Makes the target root, one subdirectory per language in 'langs', and a
    'legalcode' subdirectory.  Directories that already exist are left
    untouched.
    """
    # One list instead of three copy-pasted try/except stanzas.
    needed = [target_dir]
    needed.extend(os.path.join(target_dir, lang) for lang in langs)
    needed.append(os.path.join(target_dir, 'legalcode'))

    for path in needed:
        try:
            os.makedirs(path)
        except OSError:
            # Only tolerate "already exists".  The original swallowed every
            # OSError, hiding real failures such as permission errors.
            if not os.path.isdir(path):
                raise
+
def get_code_for_license(license):
    """Map a license path like 'by-sa/4.0' to its code 'CC-BY-SA-4.0'."""
    slug = license.replace('/', '-')
    return 'CC-' + slug.upper()
+
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv[1:])
diff --git a/licenses/03-cleanup-cc-html-files.py b/licenses/03-cleanup-cc-html-files.py
new file mode 100755
index 0000000..b67eeac
--- /dev/null
+++ b/licenses/03-cleanup-cc-html-files.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+
+from bs4 import BeautifulSoup, Comment
+import os
+import re
+import sys
+
def main(argv):
    """Clean up every organized legalcode and deed HTML file in place."""
    source_dir = 'creativecommons/'

    licenses = [
        'CC-BY-3.0',
        'CC-BY-4.0',
        'CC-BY-SA-3.0',
        'CC-BY-SA-4.0',
        'CC-BY-ND-2.0',
        'CC-BY-ND-3.0',
    ]

    # 'C' is the default locale ('en' renamed by the organize script).
    langs = ['C', 'ar', 'es', 'fr', 'pt_BR']

    for license in licenses:
        cleanup_legalcode_file(source_dir, license)
        for lang in langs:
            cleanup_deed_file(source_dir, license, lang)
        # print() call (not the Python-2-only statement) so the script also
        # runs under Python 3; blank line separates per-license output.
        print('')
+
def cleanup_legalcode_file(src_dir, license):
    """Strip scraper cruft from a legalcode HTML file and rewrite it in place.

    Removes IE conditional comments, the errata.js script and the page
    footer, and rewrites image/include URLs to relative paths so the page
    renders offline.
    """
    f_legalcode = os.path.join(src_dir, 'legalcode', license + '-legalcode.html')
    print('Cleaning up ' + f_legalcode)

    with open(f_legalcode, 'r+') as f:
        html = f.read()
        soup = BeautifulSoup(html, from_encoding="UTF-8")

        # Remove IE7 conditional comments
        cleanup_conditional_comments(soup)

        # Remove errata.js script.  Filter on src=True: inline <script>
        # tags have no 'src' attribute and the original unconditional
        # script['src'] lookup raised KeyError on them (the deed cleanup
        # below already filtered this way).
        for script in soup.findAll('script', src=True):
            if 'errata' in script['src']:
                script.extract()

        # Make attributes relative
        rewrite_attr(soup, 'img', 'src', '.*creativecommons.org/images/', '../images/')
        rewrite_attr(soup, 'img', 'src', '^/images/', '../images/')
        rewrite_attr(soup, 'link', 'href', '.*creativecommons.org/includes/', '../includes/')
        rewrite_attr(soup, 'link', 'href', '^/includes/', '../includes/')
        rewrite_attr(soup, 'script', 'src', '.*creativecommons.org/includes/', '../includes/')
        rewrite_attr(soup, 'a', 'href', '^creativecommons.org/', 'http://creativecommons.org/')
        rewrite_attr(soup, 'a', 'href', '^//creativecommons.org/', 'http://creativecommons.org/')

        # Remove footer
        for foot in soup.findAll('div', {'id': 'deed-foot'}):
            foot.extract()

        # Overwrite legalcode file with clean version of html.  The 'with'
        # block closes the file; the original's explicit close() inside the
        # context manager was redundant.
        html = soup.prettify(soup.original_encoding)
        f.seek(0)
        f.truncate()
        f.write(html)
+
def cleanup_deed_file(src_dir, license, lang):
    """Strip scraper cruft from one translated deed HTML file, in place.

    Removes IE conditional comments, RDF metadata, the hosted deed.js and
    all inline scripts, and the languages footer; rewrites asset and
    legalcode links to relative paths.
    """
    f_deed = os.path.join(src_dir, lang, license + '.html')
    print('Cleaning up ' + f_deed)

    with open(f_deed, 'r+') as f:
        html = f.read()
        soup = BeautifulSoup(html, from_encoding="UTF-8")

        # Remove IE7 conditional comments
        cleanup_conditional_comments(soup)

        # Remove RDF declarations
        cleanup_rdf_declarations(soup)

        # Remove deed.js script
        for script in soup.findAll('script', src=True):
            if 'scraper.creativecommons.org/js/deed.js' in script['src']:
                script.extract()

        # Make attributes relative
        rewrite_attr(soup, 'a', 'href', '.*legalcode$', '../legalcode/' + license + '-legalcode.html')
        rewrite_attr(soup, 'a', 'href', '^/choose/', 'http://creativecommons.org/choose/')
        rewrite_attr(soup, 'img', 'src', '.*creativecommons.org/images/', '../images/')
        rewrite_attr(soup, 'img', 'src', '^/images/', '../images/')
        rewrite_attr(soup, 'link', 'href', '.*creativecommons.org/includes/', '../includes/')
        rewrite_attr(soup, 'link', 'href', '^/includes/', '../includes/')
        rewrite_attr(soup, 'script', 'src', '.*creativecommons.org/includes/', '../includes/')
        rewrite_attr(soup, 'script', 'src', '^/includes/', '../includes/')
        rewrite_attr(soup, 'script', 'src', '^//scraper.creativecommons.org/js/deed.js', '../includes/deed.js')

        # Remove inline JS.  has_attr() replaces has_key(), which was
        # removed from dicts in Python 3 and deprecated/removed in bs4.
        for script in soup.findAll('script'):
            if not script.has_attr('src'):
                script.extract()

        # Remove languages footer
        for lang_footer in soup.findAll('div', {'id': 'languages'}):
            lang_footer.extract()

        # Overwrite deed file with clean version of html; the 'with' block
        # closes the file, so no explicit close() is needed.
        html = soup.prettify(soup.original_encoding)
        f.seek(0)
        f.truncate()
        f.write(html)
+
def rewrite_attr(html, elem, attr, source, target):
    """Regex-substitute 'source' -> 'target' in attribute 'attr' of every
    'elem' tag in the soup; tags without that attribute are skipped.
    """
    for element in html.findAll(elem):
        # has_attr() is the supported bs4 API; Tag.has_key() was removed
        # along with dict.has_key() in Python 3.
        if element.has_attr(attr):
            element[attr] = re.sub(source, target, element[attr])
+
def cleanup_conditional_comments(html):
    """Drop IE conditional comments ('<!--[if ...]-->') from the soup."""
    def _is_conditional(text):
        return isinstance(text, Comment) and '[if' in text

    for comment in html.findAll(text=_is_conditional):
        comment.extract()
+
def cleanup_rdf_declarations(html):
    """Remove RDF metadata: comment nodes mentioning 'RDF' and any
    <link href="rdf"> elements.
    """
    def _is_rdf_comment(text):
        return isinstance(text, Comment) and 'RDF' in text

    for node in html.findAll(text=_is_rdf_comment):
        node.extract()

    for link in html.findAll('link', {'href': 'rdf'}):
        link.extract()
+
# Run only when executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv[1:])