diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | LICENSE.txt | 28 | ||||
-rw-r--r-- | README.rst | 145 | ||||
-rw-r--r-- | README.txt | 1 | ||||
-rw-r--r-- | pynzb/__init__.py | 17 | ||||
-rw-r--r-- | pynzb/base.py | 84 | ||||
-rw-r--r-- | pynzb/etree_nzb.py | 19 | ||||
-rw-r--r-- | pynzb/expat_nzb.py | 38 | ||||
-rw-r--r-- | pynzb/lxml_nzb.py | 16 | ||||
-rw-r--r-- | pynzb/tests.py | 58 | ||||
-rw-r--r-- | setup.py | 171 |
11 files changed, 577 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..da5fa03 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,28 @@ +Copyright (c) 2009, Eric Florenzano +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of the author nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..2e36ef9 --- /dev/null +++ b/README.rst @@ -0,0 +1,145 @@ +Introduction +------------ + +NZB is an XML-based file format for retrieving posts from NNTP (Usenet) servers. +Since NZB is XML-based, it's relatively easy to build one-off parsers to parse +NZB files. This project is an attempt to consolidate those many one-off NZB +parsers into one simple interface. + +This package includes three implementations: one based on expat, another based +on ElementTree, and a final implementation based on lxml. The order in which +they were listed is in order of compatibility. The expat version should work on +all versions of Python > 2.0, the ElementTree one will work on all versions +>= 2.5, and the lxml one will only work if you have lxml installed. + + +A Note on Installing lxml +------------------------- + +While lxml is not a requirement, I have had a hard time installing lxml in the +past. I have found this set of commands to work perfectly: + +.. sourcecode:: bash + + STATIC_DEPS=true easy_install 'lxml>=2.2beta4' + STATIC_DEPS=true sudo easy_install 'lxml>=2.2beta4' + + +API Documentation +----------------- + + +Accessing the Default Parser +============================ + +Simply import nzb_parser from the pynzb package. It's an instantiated version +of the fastest available parser that your system can support. + + +Other Parser Locations +====================== + +``ExpatNZBParser``: + Available in the ``pynzb.expat_nzb`` namespace. + +``ETreeNZBParser``: + Available in the ``pynzb.etree_nzb`` namespace. + +``LXMLNZBParser``: + Available in the ``pynzb.lxml_nzb`` namespace. + + +Using the NZB Parser +==================== + +If you're using a specific parser, like the ``ETreeNZBParser``, you will first +have to instantiate it: + +.. sourcecode:: python + + nzb_parser = ETreeNZBParser() + + +Otherwise, you can just import the default parser for your system: + +.. 
sourcecode:: python + + from pynzb import nzb_parser + + +Then, simply call the ``parse`` method, giving it the xml string as the only +argument: + +.. sourcecode:: python + + files = nzb_parser.parse('<?xml ... my nzb file here ... </nzb>') + + +This will return a list of ``NZBFiles`` for you to use. + + +NZBFile Objects +=============== + +All of the parsers return ``NZBFile`` objects, which are objects with the +following properties: + +``poster``: + The name of the user who posted the file to the newsgroup. + +``date``: + A ``datetime.date`` representation of when the server first saw the file. + +``subject``: + The subject used when the user posted the file to the newsgroup. + +``groups``: + A list of strings representing the newsgroups in which this file may be + found. + +``segments``: + A list of ``NZBSegment`` objects talking about where to get the contents + of this file. + + +NZBSegment Objects +================== + +Each ``NZBFile`` has a list of ``NZBSegment`` objects, which include information +on how to retrieve a part of a file. Here's what you can find on an +``NZBSegment`` object: + +``number``: + The number of the segment in the list of files. + +``bytes``: + The size of the segment, in bytes. + +``message_id``: + The Message-ID of the segment (useful for retrieving the full contents) + + +Example +-------- + +In this example, we will grab an Ubuntu NZB and parse the file, printing out +some information about each file and its segments. + +.. 
sourcecode:: python + + from pynzb import nzb_parser + from urllib2 import urlopen + + # Grab a sample Ubuntu NZB + ubuntu_nzb = urlopen('http://media.eflorenzano.com/misc/sample-ubuntu-nzb.nzb').read() + + # Parse the NZB into files + files = nzb_parser.parse(ubuntu_nzb) + + # Print out each file's subject and the first two segment message ids + for nzb_file in files: + print nzb_file.subject + for segment in nzb_file.segments[:2]: + print ' ' + segment.message_id + if len(nzb_file.segments) > 2: + print ' ...'
# \ No newline at end of file
# diff --git a/README.txt b/README.txt
# deleted file mode 100644
# index b68450f..0000000
# (deleted content of README.txt: "Nothing to see here, yet.")
# diff --git a/pynzb/__init__.py b/pynzb/__init__.py
# index 8b13789..b6f8e87 100644
"""Expose the NZB parser classes and a ready-made default ``nzb_parser``."""
from pynzb.expat_nzb import ExpatNZBParser

# The ElementTree and lxml backends are optional.  When one of them cannot be
# imported its class name is bound to None so callers can test availability.
try:
    from pynzb.etree_nzb import ETreeNZBParser
except ImportError:
    ETreeNZBParser = None

try:
    from pynzb.lxml_nzb import LXMLNZBParser
except ImportError:
    LXMLNZBParser = None

# Speed precedence: lxml first, then ElementTree, then expat.  A missing
# backend is None (falsy), so ``or`` falls through to the next candidate;
# the expat parser is always importable and terminates the chain.
nzb_parser = (LXMLNZBParser or ETreeNZBParser or ExpatNZBParser)()
# \ No newline at end of file
# diff --git a/pynzb/base.py b/pynzb/base.py
# new file mode 100644
# index 0000000..8539a2e
"""Data containers and the shared iterparse-driven parser for NZB files."""
import datetime
import time

try:
    _STRING_TYPES = basestring  # Python 2: matches both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3 has no basestring

# ElementTree-style ("Clark notation") prefix of every namespaced NZB tag.
_NZB_NAMESPACE = "{http://www.newzbin.com/DTD/2003/nzb}"


def parse_date(date):
    """Convert a Unix timestamp into a ``datetime.date`` (UTC).

    ``date`` may be a number or a string holding an integer, which is how
    the value arrives from an XML attribute.
    """
    if isinstance(date, _STRING_TYPES):
        date = int(date)
    gmtime = time.gmtime(date)
    return datetime.date(gmtime.tm_year, gmtime.tm_mon, gmtime.tm_mday)


def _nzb_local_name(tag):
    """Return the bare NZB element name for ``tag``, or None.

    Accepts tags in the NZB namespace as well as tags with no namespace at
    all, so documents lacking the ``xmlns`` declaration are still parsed.
    Tags from any other namespace, and non-string tags, yield None.
    """
    if not isinstance(tag, _STRING_TYPES):
        return None
    if tag.startswith(_NZB_NAMESPACE):
        return tag[len(_NZB_NAMESPACE):]
    if tag.startswith("{"):
        return None
    return tag


class NZBSegment(object):
    """One segment of a file: its size, its ordinal and its Message-ID."""

    def __init__(self, bytes, number, message_id=None):
        self.bytes = int(bytes)
        self.number = int(number)
        # Always define the attribute (None when unknown) so that reading
        # ``segment.message_id`` never raises AttributeError.
        self.message_id = message_id

    def set_message_id(self, message_id):
        """Attach the Message-ID once it is known."""
        self.message_id = message_id


class NZBFile(object):
    """A posted file: poster, date, subject, newsgroups and segments."""

    def __init__(self, poster, date, subject, groups=None, segments=None):
        self.poster = poster
        self.date = parse_date(date)
        self.subject = subject
        self.groups = groups or []
        self.segments = segments or []

    def add_group(self, group):
        """Append a newsgroup name to this file."""
        self.groups.append(group)

    def add_segment(self, segment):
        """Append an ``NZBSegment`` to this file."""
        self.segments.append(segment)


class BaseNZBParser(object):
    """Interface shared by every parser implementation."""

    def parse(self, xml):
        """Parse ``xml`` and return a list of ``NZBFile`` objects."""
        raise NotImplementedError


class BaseETreeNZBParser(BaseNZBParser):
    """Parser logic shared by the ElementTree-compatible backends."""

    def get_etree_iter(self, xml, et=None):
        """Return an iterator of ``(event, element)`` pairs for ``xml``.

        Subclasses must yield at least the "start" and "end" events.
        """
        raise NotImplementedError

    def parse(self, xml):
        """Parse ``xml`` and return a list of ``NZBFile`` objects."""
        files = []
        current_file = None

        for event, elem in self.get_etree_iter(xml):
            name = _nzb_local_name(elem.tag)

            if event == "start":
                # Attributes are already available on "start", so the file
                # object can be created before its children are seen.
                if name == "file":
                    current_file = NZBFile(
                        poster=elem.attrib['poster'],
                        date=elem.attrib['date'],
                        subject=elem.attrib['subject']
                    )

            elif event == "end":
                # Element text is only complete once "end" has fired.
                if name == "file":
                    files.append(current_file)
                elif name == "group":
                    current_file.add_group(elem.text)
                elif name == "segment":
                    current_file.add_segment(
                        NZBSegment(
                            bytes=elem.attrib['bytes'],
                            number=elem.attrib['number'],
                            message_id=elem.text
                        )
                    )
                # Clear the element, we don't need it any more.
                elem.clear()
        return files
# \ No newline at end of file
# diff --git a/pynzb/etree_nzb.py b/pynzb/etree_nzb.py
# new file mode 100644
# index 0000000..ebbb6e6
"""NZB parser backed by ElementTree (cElementTree when available)."""
from pynzb.base import BaseETreeNZBParser, NZBFile, NZBSegment

try:
    import cElementTree as etree
except ImportError:
    try:
        from xml.etree import cElementTree as etree
    except ImportError:
        try:
            from xml.etree import ElementTree as etree
        except ImportError:
            raise ImportError("You must have either Python 2.5 or cElementTree " +
                "installed before you can use the etree NZB parser.")

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Neither module exists on Python 3.  Without this fallback the whole
        # module fails to import and the package quietly drops this backend.
        import io

        def StringIO(data):
            """Wrap ``data`` in the in-memory file type matching its type."""
            if isinstance(data, bytes):
                return io.BytesIO(data)
            return io.StringIO(data)


class ETreeNZBParser(BaseETreeNZBParser):
    """Parse NZB documents with ElementTree's incremental ``iterparse``."""

    def get_etree_iter(self, xml, et=etree):
        """Return an iterator of ("start"/"end", element) pairs for ``xml``.

        ``et`` is the ElementTree-compatible module to use; it defaults to
        the implementation imported by this module.
        """
        return iter(et.iterparse(StringIO(xml), events=("start", "end")))
# \ No newline at end of file
# diff --git a/pynzb/expat_nzb.py b/pynzb/expat_nzb.py
# new file mode 100644
# index 0000000..5c1b096
"""NZB parser built directly on the expat bindings."""
from xml.parsers import expat

from pynzb.base import BaseNZBParser, NZBFile, NZBSegment


class ExpatNZBParser(BaseNZBParser):
    """Parse NZB documents with expat callbacks.

    Parsing state lives on the instance (``files``, ``current_file``,
    ``current_segment``, ``current_data``) and is rebuilt by ``parse``.
    """

    def _reset_text(self):
        """Start collecting character data for a new text node."""
        self._chunks = []

    def _take_text(self):
        """Return the text collected so far, or None when there is none.

        None (rather than an empty string) matches what the ElementTree
        based parsers pass along for an empty element.
        """
        self.current_data = "".join(self._chunks) or None
        return self.current_data

    def start_element(self, name, attrs):
        """expat StartElementHandler: open a file or a segment."""
        # Whatever text preceded this tag belongs to the parent element.
        self._reset_text()
        if name == 'file':
            self.current_file = NZBFile(
                poster=attrs['poster'],
                date=attrs['date'],
                subject=attrs['subject']
            )
        elif name == 'segment':
            self.current_segment = NZBSegment(
                bytes=attrs['bytes'],
                number=attrs['number']
            )

    def end_element(self, name):
        """expat EndElementHandler: store the element that just closed."""
        text = self._take_text()
        if name == 'file':
            self.files.append(self.current_file)
        elif name == 'group':
            self.current_file.add_group(text)
        elif name == 'segment':
            # message_id is a plain attribute, not a method; use the setter.
            self.current_segment.set_message_id(text)
            self.current_file.add_segment(self.current_segment)
        # Text following this closing tag must not leak into a later element.
        self._reset_text()

    def char_data(self, data):
        """expat CharacterDataHandler: collect one chunk of text.

        expat may report a single run of text through several calls, so the
        chunks are accumulated instead of keeping only the last one.
        """
        self._chunks.append(data)

    def parse(self, xml):
        """Parse ``xml`` and return a list of ``NZBFile`` objects.

        Raises ``expat.ExpatError`` when the document is not well formed,
        including when it is truncated.
        """
        self.files = []
        self.current_data = None
        self._reset_text()
        parser = expat.ParserCreate()
        parser.StartElementHandler = self.start_element
        parser.EndElementHandler = self.end_element
        parser.CharacterDataHandler = self.char_data
        # The final flag tells expat this is the whole document, so an
        # unterminated document is reported instead of silently accepted.
        parser.Parse(xml, True)
        return self.files
# \ No newline at end of file
# diff --git a/pynzb/lxml_nzb.py b/pynzb/lxml_nzb.py
# new file mode 100644
# index 0000000..790671d
"""NZB parser backed by lxml's ElementTree-compatible API."""
from pynzb.base import BaseETreeNZBParser, NZBFile, NZBSegment

try:
    from lxml import etree
except ImportError:
    raise ImportError("You must have lxml installed before you can use the " +
        "lxml NZB parser.")

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Neither module exists on Python 3.  Without this fallback the whole
        # module fails to import and the package quietly drops this backend.
        import io

        def StringIO(data):
            """Wrap ``data`` in the in-memory file type matching its type."""
            # NOTE(review): how lxml treats text (non-bytes) input that
            # carries an XML encoding declaration should be confirmed;
            # passing bytes avoids the question.
            if isinstance(data, bytes):
                return io.BytesIO(data)
            return io.StringIO(data)


class LXMLNZBParser(BaseETreeNZBParser):
    """Parse NZB documents with lxml's incremental ``iterparse``."""

    def get_etree_iter(self, xml, et=etree):
        """Return an iterator of ("start"/"end", element) pairs for ``xml``.

        ``et`` is the ElementTree-compatible module to use; it defaults to
        ``lxml.etree``.
        """
        return iter(et.iterparse(StringIO(xml), events=("start", "end")))
# \ No newline at end of file
# diff --git a/pynzb/tests.py b/pynzb/tests.py
# new file mode 100644
# index 0000000..eec5832
"""Tests for the date helper and for each NZB parser implementation."""
import datetime
import time

from pynzb.base import BaseNZBParser, parse_date
from pynzb import ExpatNZBParser, ETreeNZBParser, LXMLNZBParser

SAMPLE_NZB = """<?xml version="1.0" encoding="iso-8859-1" ?>
<!DOCTYPE nzb PUBLIC "-//newzBin//DTD NZB 1.0//EN" "http://www.newzbin.com/DTD/nzb/nzb-1.0.dtd">
<nzb xmlns="http://www.newzbin.com/DTD/2003/nzb">
  <file poster="Joe Bloggs (bloggs@nowhere.example)" date="1071674882" subject="Here's your file! abc-mr2a.r01 (1/2)">
    <groups>
      <group>alt.binaries.newzbin</group>
      <group>alt.binaries.mojo</group>
    </groups>
    <segments>
      <segment bytes="102394" number="1">123456789abcdef@news.newzbin.com</segment>
      <segment bytes="4501" number="2">987654321fedbca@news.newzbin.com</segment>
    </segments>
  </file>
</nzb>"""


def test_parse_date():
    """parse_date accepts the timestamp as a string or as an integer."""
    assert parse_date("1071674882") == datetime.date(2003, 12, 17)
    assert parse_date(1071674882) == datetime.date(2003, 12, 17)


def assert_sample_nzb(f):
    """Check that ``f`` is the single file described by SAMPLE_NZB."""
    assert f.poster == 'Joe Bloggs (bloggs@nowhere.example)'
    assert f.date == parse_date(1071674882)
    assert f.subject == "Here's your file! abc-mr2a.r01 (1/2)"
    assert sorted(f.groups) == sorted(['alt.binaries.newzbin', 'alt.binaries.mojo'])

    # Sort once; the parsers are not required to keep document order.
    segments = sorted(f.segments, key=lambda s: s.number)

    first_segment = segments[0]
    assert first_segment.bytes == 102394
    assert first_segment.number == 1
    assert first_segment.message_id == '123456789abcdef@news.newzbin.com'

    second_segment = segments[1]
    assert second_segment.bytes == 4501
    assert second_segment.number == 2
    assert second_segment.message_id == '987654321fedbca@news.newzbin.com'


def test_expat():
    """The expat parser reads the sample document."""
    parser = ExpatNZBParser()
    files = parser.parse(SAMPLE_NZB)
    assert_sample_nzb(files[0])


def test_etree():
    """The ElementTree parser reads the sample document."""
    parser = ETreeNZBParser()
    files = parser.parse(SAMPLE_NZB)
    assert_sample_nzb(files[0])


def test_lxml():
    """The lxml parser reads the sample document."""
    parser = LXMLNZBParser()
    files = parser.parse(SAMPLE_NZB)
    assert_sample_nzb(files[0])
# \ No newline at end of file
# diff --git a/setup.py b/setup.py
# new file mode 100644
# index 0000000..0fd9d51
"""Packaging script for pynzb."""
from setuptools import setup, find_packages

version = '0.1.0'

# Mirrors README.rst; keep the two in sync when editing either one.
LONG_DESCRIPTION = """
Introduction
------------

NZB is an XML-based file format for retrieving posts from NNTP (Usenet) servers.
Since NZB is XML-based, it's relatively easy to build one-off parsers to parse
NZB files.  This project is an attempt to consolidate those many one-off NZB
parsers into one simple interface.

This package includes three implementations: one based on expat, another based
on ElementTree, and a final implementation based on lxml.  The order in which
they were listed is in order of compatibility.  The expat version should work on
all versions of Python > 2.0, the ElementTree one will work on all versions
>= 2.5, and the lxml one will only work if you have lxml installed.


A Note on Installing lxml
-------------------------

While lxml is not a requirement, I have had a hard time installing lxml in the
past.  I have found this set of commands to work perfectly:

.. sourcecode:: bash

    STATIC_DEPS=true easy_install 'lxml>=2.2beta4'
    STATIC_DEPS=true sudo easy_install 'lxml>=2.2beta4'


API Documentation
-----------------


Accessing the Default Parser
============================

Simply import nzb_parser from the pynzb package.  It's an instantiated version
of the fastest available parser that your system can support.


Other Parser Locations
======================

``ExpatNZBParser``:
    Available in the ``pynzb.expat_nzb`` namespace.

``ETreeNZBParser``:
    Available in the ``pynzb.etree_nzb`` namespace.

``LXMLNZBParser``:
    Available in the ``pynzb.lxml_nzb`` namespace.


Using the NZB Parser
====================

If you're using a specific parser, like the ``ETreeNZBParser``, you will first
have to instantiate it:

.. sourcecode:: python

    nzb_parser = ETreeNZBParser()


Otherwise, you can just import the default parser for your system:

.. sourcecode:: python

    from pynzb import nzb_parser


Then, simply call the ``parse`` method, giving it the xml string as the only
argument:

.. sourcecode:: python

    files = nzb_parser.parse('<?xml ... my nzb file here ... </nzb>')


This will return a list of ``NZBFiles`` for you to use.


NZBFile Objects
===============

All of the parsers return ``NZBFile`` objects, which are objects with the
following properties:

``poster``:
    The name of the user who posted the file to the newsgroup.

``date``:
    A ``datetime.date`` representation of when the server first saw the file.

``subject``:
    The subject used when the user posted the file to the newsgroup.

``groups``:
    A list of strings representing the newsgroups in which this file may be
    found.

``segments``:
    A list of ``NZBSegment`` objects talking about where to get the contents
    of this file.


NZBSegment Objects
==================

Each ``NZBFile`` has a list of ``NZBSegment`` objects, which include information
on how to retrieve a part of a file.  Here's what you can find on an
``NZBSegment`` object:

``number``:
    The number of the segment in the list of files.

``bytes``:
    The size of the segment, in bytes.

``message_id``:
    The Message-ID of the segment (useful for retrieving the full contents)


Example
--------

In this example, we will grab an Ubuntu NZB and parse the file, printing out
some information about each file and its segments.

.. sourcecode:: python

    from pynzb import nzb_parser
    from urllib2 import urlopen

    # Grab a sample Ubuntu NZB
    ubuntu_nzb = urlopen('http://media.eflorenzano.com/misc/sample-ubuntu-nzb.nzb').read()

    # Parse the NZB into files
    files = nzb_parser.parse(ubuntu_nzb)

    # Print out each file's subject and the first two segment message ids
    for nzb_file in files:
        print nzb_file.subject
        for segment in nzb_file.segments[:2]:
            print ' ' + segment.message_id
        if len(nzb_file.segments) > 2:
            print ' ...'
"""

setup(
    name='pynzb',
    version=version,
    description="pynzb is a unified API for parsing NZB files, with several concrete implementations included",
    long_description=LONG_DESCRIPTION,
    classifiers=[
        "Programming Language :: Python",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    keywords='nzb,parser,xml',
    author='Eric Florenzano',
    author_email='floguy@gmail.com',
    url='http://github.com/ericflo/pynzb/tree/master',
    license='BSD',
    packages=find_packages(),
    include_package_data=True,
    zip_safe=False,
    install_requires=['setuptools'],
)
\ No newline at end of file |