From e1ea46ae801645ccb85ad38b4c672f1c3c758a44 Mon Sep 17 00:00:00 2001
From: Fernando Farfan
Date: Thu, 16 Apr 2015 16:53:29 -0600
Subject: Scripts to download and organize CC license files

These scripts download and prepare all the HTML files that are required
to publish Creative Commons licenses.

[endlessm/eos-sdk#3025]
---
 licenses/01-download-cc-licenses.sh  |  18 +++++
 licenses/02-organize-cc-files.py     |  92 +++++++++++++++++++++++++
 licenses/03-cleanup-cc-html-files.py | 130 +++++++++++++++++++++++++++++++++++
 3 files changed, 240 insertions(+)
 create mode 100755 licenses/01-download-cc-licenses.sh
 create mode 100755 licenses/02-organize-cc-files.py
 create mode 100755 licenses/03-cleanup-cc-html-files.py

diff --git a/licenses/01-download-cc-licenses.sh b/licenses/01-download-cc-licenses.sh
new file mode 100755
index 0000000..88e05e7
--- /dev/null
+++ b/licenses/01-download-cc-licenses.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+licenses=("by/3.0" "by/4.0" "by-sa/3.0" "by-sa/4.0" "by-nd/2.0" "by-nd/3.0")
+files=("legalcode" "deed.ar" "deed.en" "deed.es" "deed.fr" "deed.pt_BR")
+
+# Remove the log file if it exists
+rm -f wget-cc.log
+
+# Download legalcode and deed files for each license type and version
+for license in "${licenses[@]}"
+do
+    echo $license
+    for file in "${files[@]}"
+    do
+        echo " " $file
+        wget --recursive --level=1 --adjust-extension --page-requisites --no-clobber --no-verbose https://creativecommons.org/licenses/$license/$file --append-output=wget-cc.log
+    done
+done
diff --git a/licenses/02-organize-cc-files.py b/licenses/02-organize-cc-files.py
new file mode 100755
index 0000000..457b9a6
--- /dev/null
+++ b/licenses/02-organize-cc-files.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+import os
+import shutil
+import sys
+
+def main(argv):
+    source_dir = 'creativecommons.org/'
+    target_dir = 'creativecommons/'
+
+    copy_license_files(source_dir + 'licenses/', target_dir)
+    copy_requisite_files(source_dir, target_dir)
+
+def copy_license_files(source_dir, target_dir):
+
+    licenses = [
+        'by/3.0',
+        'by/4.0',
+        'by-sa/3.0',
+        'by-sa/4.0',
+        'by-nd/2.0',
+        'by-nd/3.0',
+    ]
+
+    langs = ['ar', 'en', 'es', 'fr', 'pt_BR']
+
+    ensure_target_dirs_exist(target_dir, langs)
+
+    for license in licenses:
+        license_code = get_code_for_license(license)
+
+        source_license_dir = source_dir + license + '/'
+
+        target_legalcode_dir = target_dir + 'legalcode/'
+
+        # Copy legalcode file for license
+        source_legalcode_path = source_license_dir + 'legalcode.html'
+        target_legalcode_path = target_legalcode_dir + license_code + '-legalcode.html'
+        print 'cp ' + source_legalcode_path + ' ' + target_legalcode_path
+        shutil.copy(source_legalcode_path, target_legalcode_path)
+
+        # Copy deeds for individual languages
+        for lang in langs:
+            target_lang_dir = target_dir + lang + '/'
+            source_deed_path = source_license_dir + 'deed.' + lang + '.html'
+            target_deed_path = target_lang_dir + license_code + '.html'
+
+            # Copy deed file for license/language
+            print 'cp ' + source_deed_path + ' ' + target_deed_path
+            shutil.copy(source_deed_path, target_deed_path)
+
+    # Rename 'en' subdir to default locale 'C'
+    print 'mv -f ' + target_dir + 'en/ ' + target_dir + 'C/'
+    shutil.rmtree(target_dir + 'C', ignore_errors=True)
+    shutil.move(target_dir + 'en', target_dir + 'C')
+
+def copy_requisite_files(source_dir, target_dir):
+    for req in ['images/', 'includes/']:
+        print 'cp ' + source_dir + req + ' ' + target_dir + req
+        try:
+            shutil.rmtree(target_dir + req, ignore_errors=True)
+        except IOError:
+            pass
+        shutil.copytree(source_dir + req, target_dir + req)
+
+def ensure_target_dirs_exist(target_dir, langs):
+    # Ensure target directory exists
+    try:
+        os.makedirs(target_dir)
+    except OSError:
+        pass
+
+    # Create target language directories
+    for lang in langs:
+        lang_path = os.path.join(target_dir, lang)
+        try:
+            os.makedirs(lang_path)
+        except OSError:
+            pass
+
+    # Create target legalcode directory
+    legalcode_path = os.path.join(target_dir, 'legalcode')
+    try:
+        os.makedirs(legalcode_path)
+    except OSError:
+        pass
+
+def get_code_for_license(license):
+    return 'CC-' + license.upper().replace('/', '-')
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/licenses/03-cleanup-cc-html-files.py b/licenses/03-cleanup-cc-html-files.py
new file mode 100755
index 0000000..b67eeac
--- /dev/null
+++ b/licenses/03-cleanup-cc-html-files.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+
+from bs4 import BeautifulSoup, Comment
+import os
+import re
+import sys
+
+def main(argv):
+    source_dir = 'creativecommons/'
+
+    licenses = [
+        'CC-BY-3.0',
+        'CC-BY-4.0',
+        'CC-BY-SA-3.0',
+        'CC-BY-SA-4.0',
+        'CC-BY-ND-2.0',
+        'CC-BY-ND-3.0',
+    ]
+
+    langs = ['C', 'ar', 'es', 'fr', 'pt_BR']
+
+    for license in licenses:
+        cleanup_legalcode_file(source_dir, license)
+        for lang in langs:
+            cleanup_deed_file(source_dir, license, lang)
+        print ''
+
+def cleanup_legalcode_file(src_dir, license):
+    f_legalcode = os.path.join(src_dir, 'legalcode', license + '-legalcode.html')
+    print 'Cleaning up ' + f_legalcode
+
+    with open(f_legalcode, 'r+') as f:
+        html = f.read()
+        soup = BeautifulSoup(html, from_encoding="UTF-8")
+
+        # Remove IE7 conditional comments
+        cleanup_conditional_comments(soup)
+
+        # Remove errata.js script
+        for script in soup.findAll('script', src=True):
+            if 'errata' in script['src']:
+                script.extract()
+
+        # Make attributes relative
+        rewrite_attr(soup, 'img', 'src', '.*creativecommons.org/images/', '../images/')
+        rewrite_attr(soup, 'img', 'src', '^/images/', '../images/')
+        rewrite_attr(soup, 'link', 'href', '.*creativecommons.org/includes/', '../includes/')
+        rewrite_attr(soup, 'link', 'href', '^/includes/', '../includes/')
+        rewrite_attr(soup, 'script', 'src', '.*creativecommons.org/includes/', '../includes/')
+        rewrite_attr(soup, 'a', 'href', '^creativecommons.org/', 'http://creativecommons.org/')
+        rewrite_attr(soup, 'a', 'href', '^//creativecommons.org/', 'http://creativecommons.org/')
+
+        # Remove footer
+        for foot in soup.findAll('div', {'id': 'deed-foot'}):
+            foot.extract()
+
+        # Overwrite legalcode file with clean version of html
+        html = soup.prettify(soup.original_encoding)
+        f.seek(0)
+        f.truncate()
+        f.write(html)
+        f.close()
+
+def cleanup_deed_file(src_dir, license, lang):
+    f_deed = os.path.join(src_dir, lang, license + '.html')
+    print 'Cleaning up ' + f_deed
+
+    with open(f_deed, 'r+') as f:
+        html = f.read()
+        soup = BeautifulSoup(html, from_encoding="UTF-8")
+
+        # Remove IE7 conditional comments
+        cleanup_conditional_comments(soup)
+
+        # Remove RDF declarations
+        cleanup_rdf_declarations(soup)
+
+        # Remove deed.js script
+        for script in soup.findAll('script', src=True):
+            if 'scraper.creativecommons.org/js/deed.js' in script['src']:
+                script.extract()
+
+        # Make attributes relative
+        rewrite_attr(soup, 'a', 'href', '.*legalcode$', '../legalcode/' + license + '-legalcode.html')
+        rewrite_attr(soup, 'a', 'href', '^/choose/', 'http://creativecommons.org/choose/')
+        rewrite_attr(soup, 'img', 'src', '.*creativecommons.org/images/', '../images/')
+        rewrite_attr(soup, 'img', 'src', '^/images/', '../images/')
+        rewrite_attr(soup, 'link', 'href', '.*creativecommons.org/includes/', '../includes/')
+        rewrite_attr(soup, 'link', 'href', '^/includes/', '../includes/')
+        rewrite_attr(soup, 'script', 'src', '.*creativecommons.org/includes/', '../includes/')
+        rewrite_attr(soup, 'script', 'src', '^/includes/', '../includes/')
+        rewrite_attr(soup, 'script', 'src', '^//scraper.creativecommons.org/js/deed.js', '../includes/deed.js')
+
+        # Remove inline JS
+        for script in soup.findAll('script'):
+            if not script.has_key('src'):
+                script.extract()
+
+        # Remove languages footer
+        for lang_footer in soup.findAll('div', {'id': 'languages'}):
+            lang_footer.extract()
+
+        # Overwrite deed file with clean version of html
+        html = soup.prettify(soup.original_encoding)
+        f.seek(0)
+        f.truncate()
+        f.write(html)
+        f.close()
+
+def rewrite_attr(html, elem, attr, source, target):
+    for element in html.findAll(elem):
+        if element.has_key(attr):
+            attr_val = re.sub(source, target, element[attr])
+            element[attr] = attr_val
+
+def cleanup_conditional_comments(html):
+    comments = html.findAll(text=lambda text:isinstance(text, Comment) and '[if' in text)
+    for comment in comments:
+        comment.extract()
+
+def cleanup_rdf_declarations(html):
+    decls = html.findAll(text=lambda text:isinstance(text, Comment) and 'RDF' in text)
+    for decl in decls:
+        decl.extract()
+
+    for rdf in html.findAll('link', {'href': 'rdf'}):
+        rdf.extract()
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
--
cgit v1.2.3
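
The numbered prefixes and the hard-coded directories suggest the scripts are
meant to run in sequence from the licenses/ directory; a minimal sketch of the
assumed workflow, using only the script names and paths that appear in the
patch above:

    # Assumed invocation order, run from the licenses/ directory
    ./01-download-cc-licenses.sh     # mirror the license pages into creativecommons.org/, logging to wget-cc.log
    ./02-organize-cc-files.py        # copy deeds and legalcode into creativecommons/<lang>/ and creativecommons/legalcode/
    ./03-cleanup-cc-html-files.py    # rewrite the copied HTML in place: strip scripts and make asset links relative

Each script reads the previous script's output, so the steps are not
independent: 02 expects the wget mirror under creativecommons.org/, and 03
expects the reorganized tree under creativecommons/.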